In [1]:
import pickle 
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

In [2]:
def write_lines(fn, lines, mode='w'):
    """Write ``lines`` to the UTF-8 text file ``fn``, one entry per line.

    Every entry is terminated with a newline, so the file ends with a newline
    and a later call with ``mode='a'`` starts on a fresh line instead of being
    glued onto the previous last line.

    Args:
        fn: Path of the file to write.
        lines: Iterable of strings; an empty iterable writes nothing.
        mode: Mode passed to ``open`` ('w' to overwrite, 'a' to append).
    """
    with open(fn, encoding='utf-8', mode=mode) as f:
        f.writelines(line + "\n" for line in lines)

In [3]:
def _load_dump(pickle_path):
    """Unpickle and return the object stored in the file at ``pickle_path``."""
    with open(pickle_path, "rb") as handle:
        return pickle.load(handle)


# One dump per corpus; each is a dict (see the keys printed in the next cell for WL).
dump_wl = _load_dump("data/dump_wl_1.pickle")
dump_fce = _load_dump("data/dump_fce_1.pickle")
dump_nucle = _load_dump("data/dump_nucle_1.pickle")
dump_lang8 = _load_dump("data/dump_lang8_1.pickle")

In [4]:
dump_wl.keys()

dict_keys(['text_src', 'text_tgt', 'vectors', 'clusters', 'mean_vectors'])

In [5]:
len(dump_wl['text_src'])

34308

In [6]:
# Pull the per-row cluster assignments and the per-cluster mean vectors out of the WL dump.
wl_clusters, wl_mean_vectors = (dump_wl[field] for field in ("clusters", "mean_vectors"))

In [7]:
# Same "mean_vectors" entry, taken from each of the three other dumps.
fce_mean_vectors, nucle_mean_vectors, lang8_mean_vectors = (
    corpus_dump["mean_vectors"] for corpus_dump in (dump_fce, dump_nucle, dump_lang8)
)

In [8]:
def _ids_and_vectors(mean_vectors):
    """Split a mapping into two parallel arrays: its keys and its values.

    Position ``j`` of the first array is the key whose value sits at
    position ``j`` of the second array (dict iteration order).
    """
    keys = np.array(list(mean_vectors.keys()))
    values = np.array(list(mean_vectors.values()))
    return keys, values


wl_mean_cluster_ids, wl_mean_cluster_vectors = _ids_and_vectors(wl_mean_vectors)

fce_mean_cluster_ids, fce_mean_cluster_vectors = _ids_and_vectors(fce_mean_vectors)

nucle_mean_cluster_ids, nucle_mean_cluster_vectors = _ids_and_vectors(nucle_mean_vectors)

lang8_mean_cluster_ids, lang8_mean_cluster_vectors = _ids_and_vectors(lang8_mean_vectors)

### FCE

In [9]:
# Pairwise cosine similarity: one row per WL mean vector, one column per FCE mean vector.
cos_sim_wl_fce = cosine_similarity(wl_mean_cluster_vectors, fce_mean_cluster_vectors)

In [10]:
cos_sim_wl_fce.shape

(1515, 912)

In [101]:
# For every WL row of the similarity matrix, collect the FCE columns whose cosine
# similarity with that row exceeds 0.6, most similar first.
# The rows are taken from the matrix itself rather than from a hard-coded count, so the
# cell keeps working when the WL dump has a different number of mean vectors.
# NOTE(review): iteration starts at row 1, so row 0 (the first WL mean vector) is never
# matched -- confirm this is intentional.
# NOTE(review): the collected values are column positions of cos_sim_wl_fce, not keys of
# fce_mean_vectors; later cells compare them with the "clusters" labels -- verify that
# positions and labels coincide.
fce_sim_clusters = []
for row in cos_sim_wl_fce[1:]:
    order = row.argsort()[::-1]  # column positions, highest similarity first
    fce_sim_clusters.extend(order[row[order] > 0.6])

In [102]:
len(fce_sim_clusters)

14509

In [103]:
# Distinct matched FCE column indices; value_counts() orders them by how often they
# were matched, most frequent first.
fce_most_sim_cluster = list(pd.Series(fce_sim_clusters).value_counts().index)

In [104]:
len(set(fce_sim_clusters))

829

In [105]:
# FCE column indices that were never collected into fce_sim_clusters.
all_fce_columns = set(range(cos_sim_wl_fce.shape[1]))
diff_fce_class = all_fce_columns - set(fce_sim_clusters)
diff_fce_class

{12,
 15,
 16,
 21,
 22,
 27,
 29,
 40,
 51,
 63,
 89,
 98,
 113,
 118,
 125,
 185,
 238,
 248,
 265,
 280,
 299,
 309,
 330,
 333,
 342,
 357,
 375,
 381,
 382,
 389,
 397,
 409,
 427,
 475,
 479,
 480,
 484,
 487,
 493,
 510,
 521,
 528,
 536,
 546,
 564,
 581,
 592,
 601,
 605,
 613,
 619,
 628,
 635,
 657,
 681,
 692,
 699,
 703,
 710,
 721,
 723,
 724,
 729,
 747,
 750,
 754,
 781,
 787,
 800,
 807,
 812,
 815,
 836,
 840,
 849,
 875,
 879,
 881,
 884,
 888,
 889,
 890,
 910}

In [106]:
# Assemble the FCE frame from the dump and add the `has_change` flag immediately:
# the cells that follow read `has_change`, so the column must exist as soon as the
# frame does for a top-to-bottom run to work.
fce_df = pd.DataFrame({"text_src": dump_fce['text_src'], "text_tgt": dump_fce["text_tgt"], "clusters": dump_fce["clusters"]})
# 1 where text_tgt differs from text_src, 0 where they are identical.
fce_df["has_change"] = (fce_df["text_src"] != fce_df["text_tgt"]).astype(int)

In [108]:
len(fce_df)

28350

In [112]:
fce_df["has_change"].sum()

17742

In [109]:
# Flag each row: 1 where text_tgt differs from text_src, 0 where they are identical.
fce_df["has_change"] = (fce_df["text_src"] != fce_df["text_tgt"]).astype(int)

In [110]:
len(fce_df[~fce_df["clusters"].isin(diff_fce_class)])

25852

In [111]:
fce_df[~fce_df["clusters"].isin(diff_fce_class)]["has_change"].sum()

16414

### Nucle

In [53]:
# Pairwise cosine similarity: one row per WL mean vector, one column per NUCLE mean vector.
cos_sim_wl_nucle = cosine_similarity(wl_mean_cluster_vectors, nucle_mean_cluster_vectors)
cos_sim_wl_nucle.shape

(1515, 1798)

In [113]:
# For every WL row of the similarity matrix, collect the NUCLE columns whose cosine
# similarity with that row exceeds 0.6, most similar first.
# The rows are taken from the matrix itself rather than from a hard-coded count, so the
# cell keeps working when the WL dump has a different number of mean vectors.
# NOTE(review): iteration starts at row 1, so row 0 (the first WL mean vector) is never
# matched -- confirm this is intentional.
# NOTE(review): the collected values are column positions of cos_sim_wl_nucle, not keys of
# nucle_mean_vectors; later cells compare them with the "clusters" labels -- verify that
# positions and labels coincide.
nucle_sim_clusters = []
for row in cos_sim_wl_nucle[1:]:
    order = row.argsort()[::-1]  # column positions, highest similarity first
    nucle_sim_clusters.extend(order[row[order] > 0.6])

In [114]:
len(nucle_sim_clusters)

18844

In [115]:
len(set(nucle_sim_clusters))

1236

In [116]:
# Distinct matched NUCLE column indices; value_counts() orders them by how often they
# were matched, most frequent first.
nucle_most_sim_cluster = list(pd.Series(nucle_sim_clusters).value_counts().index)

In [117]:
# NUCLE column indices that were never collected into nucle_sim_clusters.
all_nucle_columns = set(range(cos_sim_wl_nucle.shape[1]))
diff_nucle_class = all_nucle_columns - set(nucle_sim_clusters)

In [118]:
#diff_nucle_class

In [123]:
def create_df(dump):
    """Build a DataFrame from a corpus dump and flag rows whose text changed.

    Args:
        dump: Mapping providing "text_src", "text_tgt" and "clusters" entries
            of equal length.

    Returns:
        DataFrame with columns text_src, text_tgt, clusters and has_change,
        where has_change is 1 when text_tgt differs from text_src and 0 otherwise.
    """
    frame = pd.DataFrame({name: dump[name] for name in ("text_src", "text_tgt", "clusters")})
    changed = frame["text_src"].ne(frame["text_tgt"])
    return frame.assign(has_change=changed.astype(int))

In [124]:
nucle_df = create_df(dump_nucle)

In [128]:
len(nucle_df)

57151

In [126]:
nucle_df.has_change.sum()

21834

In [127]:
len(nucle_df[~nucle_df["clusters"].isin(diff_nucle_class)])

46199

In [129]:
nucle_df[~nucle_df["clusters"].isin(diff_nucle_class)]["has_change"].sum()

19958

### Lang8

In [29]:
# Pairwise cosine similarity: one row per WL mean vector, one column per Lang8 mean vector.
cos_sim_wl_lang8 = cosine_similarity(wl_mean_cluster_vectors, lang8_mean_cluster_vectors)
cos_sim_wl_lang8.shape

(1515, 1427)

In [130]:
# For every WL row of the similarity matrix, collect the Lang8 columns whose cosine
# similarity with that row exceeds 0.6, most similar first.
# The rows are taken from the matrix itself rather than from a hard-coded count, so the
# cell keeps working when the WL dump has a different number of mean vectors.
# NOTE(review): iteration starts at row 1, so row 0 (the first WL mean vector) is never
# matched -- confirm this is intentional.
# NOTE(review): the collected values are column positions of cos_sim_wl_lang8, not keys of
# lang8_mean_vectors; later cells compare them with the "clusters" labels -- verify that
# positions and labels coincide.
lang8_sim_clusters = []
for row in cos_sim_wl_lang8[1:]:
    order = row.argsort()[::-1]  # column positions, highest similarity first
    lang8_sim_clusters.extend(order[row[order] > 0.6])

In [138]:
# Lang8 column indices that were never collected into lang8_sim_clusters.
all_lang8_columns = set(range(cos_sim_wl_lang8.shape[1]))
diff_lang8_class = all_lang8_columns - set(lang8_sim_clusters)

In [139]:
len(lang8_sim_clusters)

16653

In [140]:
len(set(lang8_sim_clusters))

1219

In [133]:
# Distinct matched Lang8 column indices; value_counts() orders them by how often they
# were matched, most frequent first.
lang8_most_sim_cluster = list(pd.Series(lang8_sim_clusters).value_counts().index)

In [135]:
lang8_df = create_df(dump_lang8)

In [136]:
len(lang8_df)

35000

In [137]:
lang8_df.has_change.sum()

16928

In [141]:
len(lang8_df[~lang8_df["clusters"].isin(diff_lang8_class)])

33097

In [142]:
lang8_df[~lang8_df["clusters"].isin(diff_lang8_class)]["has_change"].sum()

16265

In [140]:
def generate_text_dicts(dump):
    """Group a dump's source and target texts by cluster label.

    Args:
        dump: Mapping providing "text_src", "text_tgt" and "clusters" entries
            of equal length.

    Returns:
        Tuple ``(cluster_text_src, cluster_text_tgt)``: two dicts keyed by the
        values found in "clusters", each mapping a label to the list of
        source (resp. target) texts carrying that label, in row order.
    """
    frame = pd.DataFrame({name: dump[name] for name in ("text_src", "text_tgt", "clusters")})
    grouped = frame.groupby(['clusters'])

    def texts_by_cluster(column):
        # One list of texts per cluster label.
        return grouped[column].apply(list).to_dict()

    return texts_by_cluster('text_src'), texts_by_cluster('text_tgt')

In [141]:
# Per corpus: dicts mapping each "clusters" label to the list of its source / target texts.
fce_cluster_text_src, fce_cluster_text_tgt = generate_text_dicts(dump_fce)
nucle_cluster_text_src, nucle_cluster_text_tgt = generate_text_dicts(dump_nucle)
lang8_cluster_text_src, lang8_cluster_text_tgt = generate_text_dicts(dump_lang8)

In [142]:
# Concatenate the FCE texts cluster by cluster, in the order of fce_most_sim_cluster.
# Entries of fce_most_sim_cluster that are not keys of the text dicts contribute nothing.
fce_src = [text for cl in fce_most_sim_cluster for text in fce_cluster_text_src.get(cl, [])]
fce_tgt = [text for cl in fce_most_sim_cluster for text in fce_cluster_text_tgt.get(cl, [])]

In [143]:
len(fce_src)

8242

In [144]:
# Concatenate the NUCLE texts cluster by cluster, in the order of nucle_most_sim_cluster.
# Entries of nucle_most_sim_cluster that are not keys of the text dicts contribute nothing.
nucle_src = [text for cl in nucle_most_sim_cluster for text in nucle_cluster_text_src.get(cl, [])]
nucle_tgt = [text for cl in nucle_most_sim_cluster for text in nucle_cluster_text_tgt.get(cl, [])]

In [145]:
len(nucle_src)

4750

In [146]:
# Concatenate the Lang8 texts cluster by cluster, in the order of lang8_most_sim_cluster.
# Entries of lang8_most_sim_cluster that are not keys of the text dicts contribute nothing.
lang8_src = [text for cl in lang8_most_sim_cluster for text in lang8_cluster_text_src.get(cl, [])]
lang8_tgt = [text for cl in lang8_most_sim_cluster for text in lang8_cluster_text_tgt.get(cl, [])]

In [147]:
len(lang8_src)

9674

In [156]:
# Share of each corpus in the pooled selection, rounded to two decimals.
# All three sizes are measured on the *_src lists so the total is computed consistently
# (each *_src list is built in lockstep with its *_tgt list, so the lengths match).
# NOTE: because each share is rounded independently, the three shares need not sum to exactly 1.
total_len = len(fce_src) + len(nucle_src) + len(lang8_src)
fce_prop = round(len(fce_src)/total_len,2)
nucle_prop = round(len(nucle_src)/total_len,2)
lang8_prop = round(len(lang8_src)/total_len,2)

print(100*fce_prop,"%")
print(100*nucle_prop,"%")
print(100*lang8_prop,"%")

36.0 %
21.0 %
43.0 %


In [257]:
# c is the total number of pairs to draw from the three corpora together;
# each corpus receives a quota proportional to its share.
c = 20000
fce_get, nucle_get, lang8_get = c*fce_prop, c*nucle_prop, c*lang8_prop
for quota in (fce_get, nucle_get, lang8_get):
    print(quota)

7200.0
4200.0
8600.0


In [258]:
# Draw, without replacement, the row positions to keep from each corpus.
# The global NumPy seed is fixed so the selection is reproducible.
# The quotas are floats (c * proportion); they are rounded before the int conversion
# because a product such as 2849.9999999999995 would otherwise be truncated to 2849.
# np.random.choice raises ValueError if a quota exceeds the size of its corpus.
np.random.seed(4)
fce_ind = np.random.choice(np.arange(len(fce_src)), int(round(fce_get)), replace=False)
nucle_ind = np.random.choice(np.arange(len(nucle_src)), int(round(nucle_get)), replace=False)
lang8_ind = np.random.choice(np.arange(len(lang8_src)), int(round(lang8_get)), replace=False)

In [259]:
# Pool the sampled source texts (Lang8, then NUCLE, then FCE) and append every WL source text.
# The order must mirror the one used for the target side so the pairs stay aligned.
final_scr = (
    [lang8_src[i] for i in lang8_ind]
    + [nucle_src[i] for i in nucle_ind]
    + [fce_src[i] for i in fce_ind]
    + list(dump_wl['text_src'])
)

len(final_scr)

54308

In [260]:
# Pool the sampled target texts (Lang8, then NUCLE, then FCE) and append every WL target text.
# The order must mirror the one used for the source side so the pairs stay aligned.
final_tgt = (
    [lang8_tgt[i] for i in lang8_ind]
    + [nucle_tgt[i] for i in nucle_ind]
    + [fce_tgt[i] for i in fce_ind]
    + list(dump_wl['text_tgt'])
)

len(final_tgt)

54308

In [261]:
from sklearn.model_selection import train_test_split

In [262]:
# Hold out 2% of the pairs as a dev set with a fixed seed. train_test_split returns the
# train/test halves of each input in argument order, hence targets first, then sources;
# both lists are shuffled with the same permutation, so pairs stay aligned.
train_tgt, dev_tgt, train_src, dev_src = train_test_split(final_tgt, final_scr, test_size=0.02, random_state=4)

In [263]:
path = "../../data_parallel/clustering/"

In [264]:
# write_lines(path + "cluster_5k_train_src", train_src, mode='w')
# write_lines(path + "cluster_5k_train_tgt", train_tgt, mode='w')
# write_lines(path + "cluster_5k_dev_src", dev_src, mode='w')
# write_lines(path + "cluster_5k_dev_tgt", dev_tgt, mode='w')

In [248]:
# write_lines(path + "cluster_10k_train_src", train_src, mode='w')
# write_lines(path + "cluster_10k_train_tgt", train_tgt, mode='w')
# write_lines(path + "cluster_10k_dev_src", dev_src, mode='w')
# write_lines(path + "cluster_10k_dev_tgt", dev_tgt, mode='w')

In [265]:
# write_lines(path + "cluster_15k_train_src", train_src, mode='w')
# write_lines(path + "cluster_15k_train_tgt", train_tgt, mode='w')
# write_lines(path + "cluster_15k_dev_src", dev_src, mode='w')
# write_lines(path + "cluster_15k_dev_tgt", dev_tgt, mode='w')

In [266]:
# Write the four splits of the 20k configuration, each to path + "cluster_20k_" + split name.
for split_name, split_lines in (
    ("train_src", train_src),
    ("train_tgt", train_tgt),
    ("dev_src", dev_src),
    ("dev_tgt", dev_tgt),
):
    write_lines(path + "cluster_20k_" + split_name, split_lines, mode='w')