In [117]:
import pickle 
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

In [201]:
def write_lines(fn, lines, mode='w'):
    """Write the strings in ``lines`` to ``fn``, one per line, as UTF-8.

    Args:
        fn: Path of the file to open.
        lines: Iterable of strings. They are joined with a newline character;
            nothing is appended after the last element.
        mode: File mode handed to ``open`` ('w' overwrites, 'a' appends).
    """
    # Join before opening so a bad element fails before the file is touched.
    joined = "\n".join(lines)
    with open(fn, mode, encoding='utf-8') as sink:
        sink.write(joined)

In [118]:
def _read_pickle(pickle_path):
    """Unpickle and return the object stored at ``pickle_path``."""
    # pickle can execute arbitrary code on load; only use trusted dump files.
    with open(pickle_path, "rb") as handle:
        return pickle.load(handle)


# One dump per corpus, all loaded the same way.
dump_wl = _read_pickle("data/dump_wl_1.pickle")
dump_fce = _read_pickle("data/dump_fce_1.pickle")
dump_nucle = _read_pickle("data/dump_nucle_1.pickle")
dump_lang8 = _read_pickle("data/dump_lang8_1.pickle")

In [119]:
# Show which fields the WL dump provides.
dump_wl.keys()

dict_keys(['text_src', 'text_tgt', 'vectors', 'clusters', 'mean_vectors'])

In [120]:
# Number of entries in the WL dump's 'text_src' field.
len(dump_wl['text_src'])

34308

In [121]:
# Pull the cluster assignments and the cluster mean vectors out of the WL dump.
wl_clusters, wl_mean_vectors = dump_wl["clusters"], dump_wl["mean_vectors"]

In [122]:
# Same "mean_vectors" field, taken from each of the three other corpora.
fce_mean_vectors, nucle_mean_vectors, lang8_mean_vectors = (
    corpus_dump["mean_vectors"]
    for corpus_dump in (dump_fce, dump_nucle, dump_lang8)
)

In [123]:
def _ids_and_vectors(mean_vectors):
    """Split a mean-vector mapping into (keys array, values array).

    Both arrays follow the mapping's iteration order, so position ``j`` in the
    first array is the key of row ``j`` in the second.
    """
    ids = np.array(list(mean_vectors.keys()))
    vectors = np.array(list(mean_vectors.values()))
    return ids, vectors


wl_mean_cluster_ids, wl_mean_cluster_vectors = _ids_and_vectors(wl_mean_vectors)
fce_mean_cluster_ids, fce_mean_cluster_vectors = _ids_and_vectors(fce_mean_vectors)
nucle_mean_cluster_ids, nucle_mean_cluster_vectors = _ids_and_vectors(nucle_mean_vectors)
lang8_mean_cluster_ids, lang8_mean_cluster_vectors = _ids_and_vectors(lang8_mean_vectors)

### FCE

In [124]:
# Pairwise cosine similarity: one row per WL cluster mean vector, one column
# per FCE cluster mean vector.
cos_sim_wl_fce = cosine_similarity(wl_mean_cluster_vectors, fce_mean_cluster_vectors)

In [125]:
# Sanity check: (number of WL mean vectors, number of FCE mean vectors).
cos_sim_wl_fce.shape

(1515, 912)

In [126]:
# For each WL cluster (one row of the similarity matrix), collect the ids of
# the FCE clusters whose mean vector has cosine similarity above 0.85 with it.
# An FCE cluster matched by several WL clusters is appended once per match, so
# duplicates in this list carry information (how often a cluster was matched).
fce_sim_clusters = []
# NOTE(review): the loop starts at row 1, so the first WL cluster is never
# compared — confirm this is intentional (e.g. row 0 is a noise cluster) and
# not an off-by-one. The upper bound now follows the matrix instead of a
# hard-coded row count.
for i in range(1, cos_sim_wl_fce.shape[0]):
    # Column positions of this row, most similar first.
    ind = cos_sim_wl_fce[i].argsort()[::-1]
    good_ind = cos_sim_wl_fce[i][ind] > 0.85
    # argsort yields column positions, not cluster ids. Translate them through
    # fce_mean_cluster_ids so that later lookups keyed by cluster id hit the
    # intended cluster even when ids are not exactly 0..n-1 in order.
    fce_sim_clusters.extend(fce_mean_cluster_ids[ind[good_ind]])

In [127]:
# Total number of (WL cluster, FCE cluster) matches above the threshold.
len(fce_sim_clusters)

179

In [128]:
# value_counts() orders by descending frequency, so FCE clusters that were
# matched most often come first; each cluster appears once in the result.
fce_match_counts = pd.Series(fce_sim_clusters).value_counts()
fce_most_sim_cluster = fce_match_counts.index.tolist()

In [129]:
# Number of distinct FCE clusters that were matched at least once.
len(set(fce_sim_clusters))

165

### NUCLE

In [130]:
# Pairwise cosine similarity: one row per WL cluster mean vector, one column
# per NUCLE cluster mean vector. The shape is displayed as a sanity check.
cos_sim_wl_nucle = cosine_similarity(wl_mean_cluster_vectors, nucle_mean_cluster_vectors)
cos_sim_wl_nucle.shape

(1515, 1798)

In [131]:
# For each WL cluster (one row of the similarity matrix), collect the ids of
# the NUCLE clusters whose mean vector has cosine similarity above 0.85 with
# it. A NUCLE cluster matched by several WL clusters is appended once per
# match, so duplicates in this list carry information.
nucle_sim_clusters = []
# NOTE(review): the loop starts at row 1, so the first WL cluster is never
# compared — confirm this is intentional (e.g. row 0 is a noise cluster) and
# not an off-by-one. The upper bound now follows the matrix instead of a
# hard-coded row count.
for i in range(1, cos_sim_wl_nucle.shape[0]):
    # Column positions of this row, most similar first.
    ind = cos_sim_wl_nucle[i].argsort()[::-1]
    good_ind = cos_sim_wl_nucle[i][ind] > 0.85
    # argsort yields column positions, not cluster ids. Translate them through
    # nucle_mean_cluster_ids so that later lookups keyed by cluster id hit the
    # intended cluster even when ids are not exactly 0..n-1 in order.
    nucle_sim_clusters.extend(nucle_mean_cluster_ids[ind[good_ind]])

In [132]:
# Total number of (WL cluster, NUCLE cluster) matches above the threshold.
len(nucle_sim_clusters)

73

In [133]:
# Number of distinct NUCLE clusters that were matched at least once.
len(set(nucle_sim_clusters))

66

In [134]:
# value_counts() orders by descending frequency, so NUCLE clusters that were
# matched most often come first; each cluster appears once in the result.
nucle_match_counts = pd.Series(nucle_sim_clusters).value_counts()
nucle_most_sim_cluster = nucle_match_counts.index.tolist()

### Lang8

In [135]:
# Pairwise cosine similarity: one row per WL cluster mean vector, one column
# per Lang8 cluster mean vector. The shape is displayed as a sanity check.
cos_sim_wl_lang8 = cosine_similarity(wl_mean_cluster_vectors, lang8_mean_cluster_vectors)
cos_sim_wl_lang8.shape

(1515, 1427)

In [136]:
# For each WL cluster (one row of the similarity matrix), collect the ids of
# the Lang8 clusters whose mean vector has cosine similarity above 0.85 with
# it. A Lang8 cluster matched by several WL clusters is appended once per
# match, so duplicates in this list carry information.
lang8_sim_clusters = []
# NOTE(review): the loop starts at row 1, so the first WL cluster is never
# compared — confirm this is intentional (e.g. row 0 is a noise cluster) and
# not an off-by-one. The upper bound now follows the matrix instead of a
# hard-coded row count.
for i in range(1, cos_sim_wl_lang8.shape[0]):
    # Column positions of this row, most similar first.
    ind = cos_sim_wl_lang8[i].argsort()[::-1]
    good_ind = cos_sim_wl_lang8[i][ind] > 0.85
    # argsort yields column positions, not cluster ids. Translate them through
    # lang8_mean_cluster_ids so that later lookups keyed by cluster id hit the
    # intended cluster even when ids are not exactly 0..n-1 in order.
    lang8_sim_clusters.extend(lang8_mean_cluster_ids[ind[good_ind]])

In [137]:
# Total number of (WL cluster, Lang8 cluster) matches above the threshold.
len(lang8_sim_clusters)

224

In [138]:
# Number of distinct Lang8 clusters that were matched at least once.
len(set(lang8_sim_clusters))

204

In [139]:
# value_counts() orders by descending frequency, so Lang8 clusters that were
# matched most often come first; each cluster appears once in the result.
lang8_match_counts = pd.Series(lang8_sim_clusters).value_counts()
lang8_most_sim_cluster = lang8_match_counts.index.tolist()

In [140]:
def generate_text_dicts(dump):
    """Group a dump's source and target texts by cluster.

    Args:
        dump: Mapping with equally long 'text_src', 'text_tgt' and 'clusters'
            sequences; element ``k`` of each describes the same item.

    Returns:
        A pair ``(src_by_cluster, tgt_by_cluster)`` of dicts mapping each
        cluster value to the list of its source / target texts, in their
        original order.
    """
    frame = pd.DataFrame({
        "text_src": dump['text_src'],
        'text_tgt': dump['text_tgt'],
        'clusters': dump['clusters'],
    })
    # Group once and reuse the grouping for both text columns.
    by_cluster = frame.groupby(['clusters'])
    src_by_cluster = by_cluster['text_src'].apply(list).to_dict()
    tgt_by_cluster = by_cluster['text_tgt'].apply(list).to_dict()
    return src_by_cluster, tgt_by_cluster

In [141]:
# Build cluster -> list-of-texts lookups (source and target) for each of the
# three corpora the extra training data is drawn from.
fce_cluster_text_src, fce_cluster_text_tgt = generate_text_dicts(dump_fce)
nucle_cluster_text_src, nucle_cluster_text_tgt = generate_text_dicts(dump_nucle)
lang8_cluster_text_src, lang8_cluster_text_tgt = generate_text_dicts(dump_lang8)

In [142]:
# Flatten the texts of every selected FCE cluster, keeping the cluster ranking
# order. Clusters missing from the lookup contribute nothing. Source and
# target lists are built the same way so they stay index-aligned.
fce_src = [
    sentence
    for cl in fce_most_sim_cluster
    for sentence in fce_cluster_text_src.get(cl, [])
]
fce_tgt = [
    sentence
    for cl in fce_most_sim_cluster
    for sentence in fce_cluster_text_tgt.get(cl, [])
]

In [143]:
# Number of FCE source texts gathered from the selected clusters.
len(fce_src)

8242

In [144]:
# Flatten the texts of every selected NUCLE cluster, keeping the cluster
# ranking order. Clusters missing from the lookup contribute nothing. Source
# and target lists are built the same way so they stay index-aligned.
nucle_src = [
    sentence
    for cl in nucle_most_sim_cluster
    for sentence in nucle_cluster_text_src.get(cl, [])
]
nucle_tgt = [
    sentence
    for cl in nucle_most_sim_cluster
    for sentence in nucle_cluster_text_tgt.get(cl, [])
]

In [145]:
# Number of NUCLE source texts gathered from the selected clusters.
len(nucle_src)

4750

In [146]:
# Flatten the texts of every selected Lang8 cluster, keeping the cluster
# ranking order. Clusters missing from the lookup contribute nothing. Source
# and target lists are built the same way so they stay index-aligned.
lang8_src = [
    sentence
    for cl in lang8_most_sim_cluster
    for sentence in lang8_cluster_text_src.get(cl, [])
]
lang8_tgt = [
    sentence
    for cl in lang8_most_sim_cluster
    for sentence in lang8_cluster_text_tgt.get(cl, [])
]

In [147]:
# Number of Lang8 source texts gathered from the selected clusters.
len(lang8_src)

9674

In [156]:
# Share of each corpus in the pool of candidate texts. All three terms use the
# source-side lists so the denominator is consistent with the numerators.
total_len = len(fce_src) + len(nucle_src) + len(lang8_src)
# Proportions are rounded to two decimals; the three rounded values are not
# guaranteed to sum to exactly 1, so quotas derived from them may not add up
# to the requested total.
fce_prop = round(len(fce_src)/total_len,2)
nucle_prop = round(len(nucle_src)/total_len,2)
lang8_prop = round(len(lang8_src)/total_len,2)

print(100*fce_prop,"%")
print(100*nucle_prop,"%")
print(100*lang8_prop,"%")

36.0 %
21.0 %
43.0 %


In [257]:
# Total number of extra texts to draw, split across the corpora according to
# their share of the candidate pool.
c = 20000
fce_get, nucle_get, lang8_get = (
    c * share for share in (fce_prop, nucle_prop, lang8_prop)
)
for quota in (fce_get, nucle_get, lang8_get):
    print(quota)

7200.0
4200.0
8600.0


In [258]:
# Draw, without replacement, the positions of the texts to keep from each
# corpus. The seed is fixed so the same texts are selected on every run.
np.random.seed(4)
# The quotas are float products (c * proportion) and can land just below a
# whole number (e.g. 11399.999...), so round before converting to int instead
# of truncating, which would silently drop one text.
fce_ind = np.random.choice(np.arange(len(fce_src)), int(round(fce_get)), replace=False)
nucle_ind = np.random.choice(np.arange(len(nucle_src)), int(round(nucle_get)), replace=False)
lang8_ind = np.random.choice(np.arange(len(lang8_src)), int(round(lang8_get)), replace=False)

In [259]:
# Source side of the combined corpus: sampled Lang8, NUCLE and FCE texts
# followed by every WL text. The target side must be assembled in this same
# order to keep the pairs aligned.
final_scr = (
    [lang8_src[i] for i in lang8_ind]
    + [nucle_src[i] for i in nucle_ind]
    + [fce_src[i] for i in fce_ind]
    + list(dump_wl['text_src'])
)

len(final_scr)

54308

In [260]:
# Target side of the combined corpus: sampled Lang8, NUCLE and FCE texts
# followed by every WL text, using the same sampled positions and the same
# corpus order as the source side so the pairs stay aligned.
final_tgt = (
    [lang8_tgt[i] for i in lang8_ind]
    + [nucle_tgt[i] for i in nucle_ind]
    + [fce_tgt[i] for i in fce_ind]
    + list(dump_wl['text_tgt'])
)

len(final_tgt)

54308

In [261]:
from sklearn.model_selection import train_test_split

In [262]:
# Hold out 2% of the pairs as a dev set. Targets and sources are passed in the
# same call so both are split with the same shuffle and stay aligned. The
# result order is (train, dev) for the first input, then (train, dev) for the
# second. The fixed random_state makes the split repeatable.
train_tgt, dev_tgt, train_src, dev_src = train_test_split(final_tgt, final_scr, test_size=0.02, random_state=4)

In [263]:
# Output directory prefix. File names are appended to it by plain string
# concatenation, so the trailing slash is required.
path = "../../data_parallel/clustering/"

In [264]:
# write_lines(path + "cluster_5k_train_src", train_src, mode='w')
# write_lines(path + "cluster_5k_train_tgt", train_tgt, mode='w')
# write_lines(path + "cluster_5k_dev_src", dev_src, mode='w')
# write_lines(path + "cluster_5k_dev_tgt", dev_tgt, mode='w')

In [248]:
# write_lines(path + "cluster_10k_train_src", train_src, mode='w')
# write_lines(path + "cluster_10k_train_tgt", train_tgt, mode='w')
# write_lines(path + "cluster_10k_dev_src", dev_src, mode='w')
# write_lines(path + "cluster_10k_dev_tgt", dev_tgt, mode='w')

In [265]:
# write_lines(path + "cluster_15k_train_src", train_src, mode='w')
# write_lines(path + "cluster_15k_train_tgt", train_tgt, mode='w')
# write_lines(path + "cluster_15k_dev_src", dev_src, mode='w')
# write_lines(path + "cluster_15k_dev_tgt", dev_tgt, mode='w')

In [266]:
# Write the four split files. The size tag in the file names is derived from
# the sample size `c` (in thousands, rounded down) instead of being typed by
# hand, so re-running with a different `c` cannot overwrite the files of
# another size under the wrong name. With c = 20000 this yields "cluster_20k".
size_tag = "cluster_{}k".format(c // 1000)
for split_name, split_lines in (
    ("train_src", train_src),
    ("train_tgt", train_tgt),
    ("dev_src", dev_src),
    ("dev_tgt", dev_tgt),
):
    write_lines(path + size_tag + "_" + split_name, split_lines, mode='w')