In [1]:
import pickle
from scipy.cluster import hierarchy
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np


class Cluster_vectors:
    """Average-linkage hierarchical clustering of vectors, with optional post-processing.

    The class keeps no state; every method works only on its arguments.
    Typical use is ``Cluster_vectors().create_clusters(vectors, ...)`` followed by
    ``get_cluster_mean_vectors`` to obtain one representative vector per cluster.
    """

    def create_clusters(self, vectors, min_distance=0.4, metric="cosine", criterion="distance",
                        reindex_clusters=True, count_merge=1, min_count_in_cluster=5, k=20):
        """Cluster ``vectors`` and return one integer label per vector.

        Parameters
        ----------
        vectors : array-like
            Observations to cluster (one row per item). Converted with ``np.asarray``.
        min_distance : float
            Threshold passed to ``scipy.cluster.hierarchy.fcluster``.
        metric : str
            Distance metric passed to ``scipy.cluster.hierarchy.linkage``.
        criterion : str
            Criterion passed to ``fcluster``.
        reindex_clusters : bool
            If true, labels are renumbered by ``reindex_clusters`` (largest cluster
            gets 1, clusters smaller than ``min_count_in_cluster`` get -1).
        count_merge : int
            Number of ``merge_clusters`` passes applied after the initial cut.
        min_count_in_cluster : int
            Minimum size for a cluster to keep its own label when reindexing.
        k : int
            Number of representative members used per cluster during merging.

        Returns
        -------
        numpy.ndarray
            Cluster label for each input vector, in input order.
        """
        vectors = np.asarray(vectors)
        # Only a 2-D observation matrix has an unambiguous row count; any other
        # input is handed to scipy unchanged.
        n_observations = vectors.shape[0] if vectors.ndim == 2 else None

        if n_observations == 0:
            # Nothing to cluster; linkage would raise on empty input.
            return np.zeros(0, dtype=np.int64)

        if n_observations == 1:
            # linkage raises ValueError for a single observation. This case is
            # reached from merge_clusters whenever the first cut produced exactly
            # one cluster, so treat the lone vector as its own cluster.
            clusters = np.ones(1, dtype=np.int32)
        else:
            Z = hierarchy.linkage(vectors, "average", metric)  # average-linkage dendrogram
            clusters = hierarchy.fcluster(Z, min_distance, criterion)  # flat labels from the cut

            for _ in range(count_merge):
                # Use the same metric/criterion for merging as for the initial cut,
                # since min_distance is only meaningful relative to the metric.
                clusters = self.merge_clusters(clusters, vectors, min_distance, k=k,
                                               metric=metric, criterion=criterion)

        if reindex_clusters:
            clusters = self.reindex_clusters(clusters, min_count_in_cluster)

        return clusters

    @staticmethod
    def get_cluster_mean_vectors(clusters, vectors, k=20):
        """Return ``{cluster label: mean vector}`` for every label in ``clusters``.

        For each cluster the members are ranked by ``get_sorted_indexes`` and the
        mean is taken over the first ``k`` of them (or all members if there are
        fewer). Every distinct label gets an entry, including -1 if present.
        """
        vectors = np.asarray(vectors)  # list input would not support fancy indexing below

        cl_df = pd.DataFrame(clusters, columns=['cluster'])
        cl_df['index'] = cl_df.index
        # label -> list of row positions belonging to that label
        cluster_indexes = cl_df.groupby(['cluster'])['index'].apply(lambda grp: list(grp)).to_dict()

        cluster_mean_vectors = {}
        for cluster, indexes in cluster_indexes.items():
            cluster_vectors = vectors[indexes]
            sorted_indexes = Cluster_vectors.get_sorted_indexes(cluster_vectors)
            vectors_to_use = cluster_vectors[sorted_indexes[:k]]
            cluster_mean_vectors[cluster] = np.mean(vectors_to_use, axis=0)
        return cluster_mean_vectors

    def merge_clusters(self, clusters, vectors, min_distance=0.4, k=20, metric="cosine",
                       criterion="distance"):
        """Merge clusters whose mean vectors fall into the same cluster themselves.

        The per-cluster mean vectors are clustered again with the same threshold
        (no further merging, no reindexing) and every original label is replaced
        by the label of its mean vector.

        Returns
        -------
        numpy.ndarray
            New label for each original item, in the original order.
        """
        cluster_mean_vectors = self.get_cluster_mean_vectors(clusters, vectors, k=k)

        new_vectors = np.array(list(cluster_mean_vectors.values()))
        new_clusters = self.create_clusters(new_vectors, min_distance=min_distance, metric=metric,
                                            criterion=criterion, count_merge=0,
                                            reindex_clusters=False)
        # dict order of the mean vectors matches the row order of new_vectors
        map_old_new_cluster = dict(zip(cluster_mean_vectors.keys(), new_clusters))
        return pd.Series(clusters).map(map_old_new_cluster).values

    @staticmethod
    def reindex_clusters(original_clusters, min_count_in_cluster=1):
        """Renumber labels by cluster size.

        Clusters with at least ``min_count_in_cluster`` members are numbered
        1, 2, ... in the order returned by ``value_counts`` (largest first);
        every smaller cluster is mapped to -1.
        """
        serie_original_clusters = pd.Series(original_clusters)
        val_count = serie_original_clusters.value_counts()
        more_val_count = val_count[val_count >= min_count_in_cluster]
        less_val_count = val_count[val_count < min_count_in_cluster]

        cluster_map = dict(zip(more_val_count.index, range(1, len(more_val_count) + 1)))
        cluster_map_less = {ind: -1 for ind in less_val_count.index}
        cluster_map.update(cluster_map_less)
        renamed_clusters = serie_original_clusters.map(cluster_map)
        return renamed_clusters.values

    @staticmethod
    def get_sorted_indexes(cluster_vectors):
        """Rank the rows of ``cluster_vectors`` by how central they are.

        Returns row positions ordered from highest to lowest average cosine
        similarity to all rows of ``cluster_vectors`` (each row's similarity to
        itself is included in its average).
        """
        cos_cluster = cosine_similarity(cluster_vectors, cluster_vectors)
        text_sim_to_text_in_cluster = cos_cluster.sum(axis=0) / cos_cluster.shape[0]
        sorted_indexes = np.argsort(text_sim_to_text_in_cluster)[::-1]
        return sorted_indexes
    
import os
import glob
def read_lines(fn):
    """Read a UTF-8 text file and return its lines as a list of strings.

    A missing file yields an empty list. The content is split on "\n"; the single
    empty piece produced by a trailing newline (or by an empty file) is dropped,
    while empty lines elsewhere are kept.
    """
    if os.path.exists(fn):
        with open(fn, 'r', encoding='utf-8') as handle:
            pieces = handle.read().split("\n")
        # str.split always returns at least one element, so pieces[-1] is safe.
        return pieces[:-1] if pieces[-1] == '' else pieces
    return []

def write_lines(fn, lines, mode='w'):
    """Write the strings in ``lines`` to ``fn`` as UTF-8, separated by "\n".

    No newline is written after the last line. ``mode`` is passed to ``open``
    (for example 'w' to overwrite or 'a' to append).
    """
    payload = "\n".join(lines)
    with open(fn, mode=mode, encoding='utf-8') as out:
        out.write(payload)

In [2]:
# One clusterer instance reused for every corpus below (the class defines no instance state).
clustering = Cluster_vectors()

In [3]:
# Lines of the wi+locness `train_tgt` file (read_lines returns [] if the path does not exist).
# NOTE(review): this path starts with "../" while the lang8/1BW/PIE/blogs cells use "../../" — confirm.
wl_texts = read_lines("../data_parallel/wi+locness/train_tgt")

# Precomputed embeddings — presumably one row per entry of wl_texts, in the same order (TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open("data/wl_train_tgt_embed.pickle", "rb") as f:
    wl_vectors = pickle.load(f)

In [4]:
len(wl_texts)

34308

In [5]:
%%time
# Average-linkage clustering cut at cosine distance 0.5. count_merge=0 skips the merge pass
# (so k is not used here); clusters with fewer than 5 members are relabelled -1.
wl_clusters = clustering.create_clusters(wl_vectors, min_distance=0.5, metric="cosine", criterion="distance",
                        reindex_clusters=True, count_merge=0, min_count_in_cluster=5, k=20)

CPU times: user 8min 15s, sys: 10.2 s, total: 8min 25s
Wall time: 8min 25s


In [23]:
pd.Series(wl_clusters).value_counts()[:10]

-1    11630
 1      599
 2      341
 3      307
 4      282
 5      239
 6      200
 7      197
 8      188
 9      165
dtype: int64

In [51]:
#pd.Series(wl_texts)[pd.Series(wl_clusters) == 7].values

In [52]:
%%time
# One mean vector per cluster label (the -1 label gets one too), averaged over at most
# k=20 members ranked by average cosine similarity within the cluster.
wl_mean_vectors = clustering.get_cluster_mean_vectors(wl_clusters, wl_vectors, k=20)

CPU times: user 22.5 s, sys: 3.85 s, total: 26.3 s
Wall time: 3.63 s


In [53]:
# Bundle the inputs and the clustering results for this corpus into one pickle.
dump_wl = {
    "vectors": wl_vectors,
    "texts": wl_texts,
    "clusters": wl_clusters,
    "mean_vectors": wl_mean_vectors
}

# NOTE(review): written to the working directory, whereas the lang8/1BW/PIE/blogs dumps go under data/.
with open("dump_wl.pickle", "wb") as f:
    pickle.dump(dump_wl, f)

In [6]:
# Lines of the nucle `nucle_tgt` file (read_lines returns [] if the path does not exist).
nucle_texts = read_lines("../data_parallel/nucle/nucle_tgt")

# Precomputed embeddings — presumably one row per entry of nucle_texts, in the same order (TODO confirm).
# NOTE(review): unlike the wi+locness embeddings, this pickle is read from the working directory
# rather than data/ — confirm the intended location. Only unpickle trusted files.
with open("nucle_train_tgt_embed.pickle", "rb") as f:
    nucle_vectors = pickle.load(f)

In [17]:
len(nucle_texts)

57151

In [8]:
%%time
# Same settings as for wi+locness: cut at cosine distance 0.5, no merge pass (k unused),
# clusters with fewer than 5 members relabelled -1.
nucle_clusters = clustering.create_clusters(nucle_vectors, min_distance=0.5, metric="cosine", criterion="distance",
                        reindex_clusters=True, count_merge=0, min_count_in_cluster=5, k=20)

CPU times: user 22min 47s, sys: 34.1 s, total: 23min 21s
Wall time: 23min 21s


In [32]:
pd.Series(nucle_clusters).value_counts()

-1       9289
 1       1550
 2       1269
 3       1038
 4        756
         ... 
 1614       5
 1646       5
 1678       5
 1710       5
 1711       5
Length: 1798, dtype: int64

In [54]:
# One mean vector per cluster label (including -1), over at most k=20 ranked members.
nucle_mean_vectors = clustering.get_cluster_mean_vectors(nucle_clusters, nucle_vectors, k=20)

In [55]:
pd.Series(nucle_texts)[pd.Series(nucle_clusters) == 4].values[:10]

array(['Reference', 'Reference : 1 .', 'References :', 'References',
       'References :', 'References :', 'References', 'Reference',
       'References', 'Reference'], dtype=object)

In [57]:
pd.Series(nucle_texts)[pd.Series(nucle_clusters) == 6].values[:5]

array(['Retrieved June 29 , 2009 , from http : //www.iisme.org/etp/HS % 20Engineering- % 20Engineering.pdf',
       'Retrieved 14 : 47 , Sep 4 , 2009 , from http : //www.history.com/encyclopedia.do ? articleId = 201891 .',
       'Retrieved 15 : 00 , Sep 4 , 2009 , from http : //www.history.com/encyclopedia.do ? articleId = 200465 .',
       'Retrieved 29 June , 2009 from http : //www.iisme.org/etp/HS % 20Engineering- % 20Engineering.pdf',
       'Retrieved September 7 , 2009 , from USTC News : { http : //news1.ustc.edu.cn/Article_Show.asp ? ArticleID = 7635 } [ HYPERLINK : http : //news1.ustc.edu.cn/Article_Show.asp ? ArticleID = 7635 ] Xin .'],
      dtype=object)

In [58]:
pd.Series(nucle_texts)[pd.Series(nucle_clusters) == 6].values[:5]

array(['Retrieved June 29 , 2009 , from http : //www.iisme.org/etp/HS % 20Engineering- % 20Engineering.pdf',
       'Retrieved 14 : 47 , Sep 4 , 2009 , from http : //www.history.com/encyclopedia.do ? articleId = 201891 .',
       'Retrieved 15 : 00 , Sep 4 , 2009 , from http : //www.history.com/encyclopedia.do ? articleId = 200465 .',
       'Retrieved 29 June , 2009 from http : //www.iisme.org/etp/HS % 20Engineering- % 20Engineering.pdf',
       'Retrieved September 7 , 2009 , from USTC News : { http : //news1.ustc.edu.cn/Article_Show.asp ? ArticleID = 7635 } [ HYPERLINK : http : //news1.ustc.edu.cn/Article_Show.asp ? ArticleID = 7635 ] Xin .'],
      dtype=object)

In [59]:
pd.Series(nucle_texts)[pd.Series(nucle_clusters) == 15].values[:5]

array(['.', '.', '.', '', ''], dtype=object)

In [60]:
pd.Series(nucle_texts)[pd.Series(nucle_clusters) == 17].values[:5]

array(['( 2009 ) .', '( 2009 ) .', '( 2009 September 09 ) .',
       '( 2009 ) .',
       'Retrieved 9 September , 2009 , http : //www.water- technology.net/projects/tuas/'],
      dtype=object)

In [50]:
#pd.Series(nucle_texts)[pd.Series(nucle_clusters) == 18].values

In [61]:
# Bundle the inputs and the clustering results for this corpus into one pickle.
dump_nucle = {
    "vectors": nucle_vectors,
    "texts": nucle_texts,
    "clusters": nucle_clusters,
    "mean_vectors": nucle_mean_vectors
}

# NOTE(review): written to the working directory, whereas the lang8/1BW/PIE/blogs dumps go under data/.
with open("dump_nucle.pickle", "wb") as f:
    pickle.dump(dump_nucle, f)

In [10]:
# Lines of the fce `fce_train_tgt` file (read_lines returns [] if the path does not exist).
fce_texts = read_lines("../data_parallel/fce/fce_train_tgt")

# Precomputed embeddings — presumably one row per entry of fce_texts, in the same order (TODO confirm).
# NOTE(review): read from the working directory rather than data/ — confirm the intended
# location. Only unpickle trusted files.
with open("fce_train_tgt_embed.pickle", "rb") as f:
    fce_vectors = pickle.load(f)

In [19]:
len(fce_texts)

28350

In [11]:
%%time
# Same settings as for wi+locness and nucle: cut at cosine distance 0.5, no merge pass
# (k unused), clusters with fewer than 5 members relabelled -1.
fce_clusters = clustering.create_clusters(fce_vectors, min_distance=0.5, metric="cosine", criterion="distance",
                        reindex_clusters=True, count_merge=0, min_count_in_cluster=5, k=20)

CPU times: user 5min 26s, sys: 6.8 s, total: 5min 33s
Wall time: 5min 33s


In [62]:
pd.Series(fce_clusters).value_counts()

-1      5029
 1       903
 2       753
 3       713
 4       673
        ... 
 806       5
 822       5
 838       5
 854       5
 911       5
Length: 912, dtype: int64

In [85]:
pd.Series(fce_texts)[pd.Series(fce_clusters) == 3].values[:10]

array(['I have seen your programme for the trip and I think it is going to be a great trip .',
       'Thank you very much for organising this trip and putting all your spare time and effort into it .',
       'I assume that this fact attracted people very much .',
       'Moreover , most of the events I went to were fantastic .',
       "Firstly , I would like to say that I 'm very glad to have been chosen and I will do my best for this competition .",
       'It was unbelievable ! ! !', 'It was an absolutely great idea .',
       'First of all , I would like to thank you for the excellent organization of this trip which seems to be very interesting and useful for the students .',
       'Although modern progress has made our life easier , it may also be harmful .',
       'First of all , I wanted to thank you for giving me the first prize in your competition .'],
      dtype=object)

In [88]:
pd.Series(wl_texts)[pd.Series(wl_clusters) == 29].values

array(['But when I was there , I began to make new friends that I never thought I would have , and I never imagined the way that I was going to know them either . At the beginning , I felt very strange talking with them , but now we are very good friends .',
       'Even there , you can meet really nice people and talk with them .',
       'On the other hand , I have met a lot of people , and they are very friendly .',
       'I was very happy and , of course , I agreed .',
       'When I became , first a teenager and later an adult , I got to know the majority of my friends , good friends .',
       'It is hard to live overseas alone . Therefore , making friends from the same country means that they can help one another .',
       'Getting along with international friends is a great way to expand your horizons .',
       "I enjoy it because it 's well known and easy to find partners to play with .",
       "In addition , I love dealing with people . I 'm very sociable .",
       'They

In [70]:
# One mean vector per cluster label (including -1), over at most k=20 ranked members.
fce_mean_vectors = clustering.get_cluster_mean_vectors(fce_clusters, fce_vectors, k=20)

In [71]:
# Bundle the inputs and the clustering results for this corpus into one pickle.
dump_fce = {
    "vectors": fce_vectors,
    "texts": fce_texts,
    "clusters": fce_clusters,
    "mean_vectors": fce_mean_vectors
}

# NOTE(review): written to the working directory, whereas the lang8/1BW/PIE/blogs dumps go under data/.
with open("dump_fce.pickle", "wb") as f:
    pickle.dump(dump_fce, f)

## Cluster lang8

In [5]:
# Target and source sides of the lang8 files, read separately.
# presumably line-aligned with each other — TODO confirm
lang8_texts_tgt = read_lines("../../data_parallel/lang8/lang8_tgt")
lang8_texts_src = read_lines("../../data_parallel/lang8/lang8_src")

# Embeddings of the target side, judging by the file name (TODO confirm row order matches lang8_texts_tgt).
# Only unpickle trusted files.
with open("data/lang8_train_tgt_embed.pickle", "rb") as f:
    lang8_vectors = pickle.load(f)

In [6]:
len(lang8_vectors)

1037561

In [30]:
len(lang8_texts_tgt)/30

34585.36666666667

In [31]:
# Items clustered per batch; roughly the corpus size divided by 30 (see the previous cell), rounded up.
batch_size = 35000
# Running total of target sentences handled by the batching loop below.
count_processed = 0

In [32]:
# Corpus-neutral aliases used by the batching loop below.
text_src = lang8_texts_src
text_tgt = lang8_texts_tgt
text_vectors = lang8_vectors

In [33]:
from tqdm.auto import tqdm

In [3]:
%%time
# Cluster the corpus in independent batches of batch_size items and write one pickle per batch.
# The batch count is derived from the data instead of being hard-coded: with batch_size = 35000
# and the 1037561 vectors loaded above, range(31) produced a 31st slice that starts past the end
# of the data, i.e. an empty batch (scipy's linkage rejects empty input).
n_batches = (len(text_vectors) + batch_size - 1) // batch_size  # ceiling division
for i in tqdm(range(n_batches)):
    start, end = (i * batch_size, (i + 1) * batch_size)
    # The last batch may be shorter; slicing clips `end` to the available length.
    # NOTE(review): assumes text_src, text_tgt and text_vectors are aligned row by row — confirm.
    src_text_batch = text_src[start:end]
    tgt_text_batch = text_tgt[start:end]
    vectors_batch = text_vectors[start:end]

    # Cluster labels are local to the batch: label 1 in one batch is unrelated to label 1 in another.
    batch_clusters = clustering.create_clusters(vectors_batch, min_distance=0.5, metric="cosine", criterion="distance",
                        reindex_clusters=True, count_merge=0, min_count_in_cluster=5, k=20)

    batch_mean_vectors = clustering.get_cluster_mean_vectors(batch_clusters, vectors_batch, k=20)

    dump = {
        "text_src": src_text_batch,
        "text_tgt": tgt_text_batch,
        "vectors": vectors_batch,
        "clusters": batch_clusters,
        "mean_vectors": batch_mean_vectors
    }

    # count_processed is initialised in an earlier cell; re-running this cell alone keeps adding to it.
    count_processed += len(tgt_text_batch)

    with open(f"data/dump_lang8_{i}.pickle", "wb") as f:
        pickle.dump(dump, f)
    

In [20]:
# %%time
# lang8_clusters = clustering.create_clusters(lang8_vectors, min_distance=0.5, metric="cosine", criterion="distance",
#                         reindex_clusters=True, count_merge=0, min_count_in_cluster=5, k=20)

In [21]:
# dump_lang8 = {
#     "vectors": lang8_vectors,
#     "texts": lang8_texts,
#     "clusters": lang8_clusters
# }

# with open("dump_lang8.pickle", "wb") as f:
#     pickle.dump(dump_lang8, f)

## 1BW

In [6]:
# NOTE(review): only the first 20000 lines are kept. This assumes bw_vectors holds the embeddings
# of exactly those 20000 lines, in order — nothing in this cell checks that.
# `texts` is reassigned by the PIE and Blogs sections, so these cells must be run in order.
texts = read_lines("../../data_parallel/new_1bw/train_target")
texts = texts[:20000]

# Only unpickle trusted files.
with open("data/1bw_train_tgt_embed.pickle", "rb") as f:
    bw_vectors = pickle.load(f)

In [7]:
%%time
# Looser settings than the wi+locness/nucle/fce/lang8 runs: cut at cosine distance 0.6 and
# relabel clusters with fewer than 10 members as -1. No merge pass (count_merge=0), so k is unused.
bw_clusters = clustering.create_clusters(bw_vectors, min_distance=0.6, metric="cosine", criterion="distance",
                        reindex_clusters=True, count_merge=0, min_count_in_cluster=10, k=20)

CPU times: user 2min 46s, sys: 1.16 s, total: 2min 47s
Wall time: 2min 47s


In [8]:
# One mean vector per cluster label (including -1), over at most k=20 ranked members.
bw_mean_vectors = clustering.get_cluster_mean_vectors(bw_clusters, bw_vectors, k=20)

In [9]:
# Bundle the inputs and clustering results; `texts` must still hold the 1BW lines at this point.
dump_bw = {
    "vectors": bw_vectors,
    "texts": texts,
    "clusters": bw_clusters,
    "mean_vectors": bw_mean_vectors
}

In [10]:
# Persist the 1BW bundle next to the other data/ dumps.
with open("data/dump_bw.pickle", "wb") as f:
    pickle.dump(dump_bw, f)

## PIE

In [11]:
# NOTE(review): only the first 20000 lines are kept. This assumes pie_vectors holds the embeddings
# of exactly those 20000 lines, in order — nothing in this cell checks that.
# `texts` overwrites the 1BW lines loaded earlier under the same name.
texts = read_lines("../../data_parallel/synthetic/a1/a1_corr_train_98.txt")
texts = texts[:20000]

# Only unpickle trusted files.
with open("data/pie_train_tgt_embed.pickle", "rb") as f:
    pie_vectors = pickle.load(f)

In [12]:
%%time
# Cut at cosine distance 0.6; clusters with fewer than 10 members are relabelled -1.
# No merge pass (count_merge=0), so k is unused.
pie_clusters = clustering.create_clusters(pie_vectors, min_distance=0.6, metric="cosine", criterion="distance",
                        reindex_clusters=True, count_merge=0, min_count_in_cluster=10, k=20)

CPU times: user 2min 44s, sys: 2.8 s, total: 2min 47s
Wall time: 2min 47s


In [13]:
# One mean vector per cluster label (including -1), over at most k=20 ranked members.
pie_mean_vectors = clustering.get_cluster_mean_vectors(pie_clusters, pie_vectors, k=20)

In [14]:
# Bundle the inputs and clustering results; `texts` must still hold the PIE lines at this point.
dump_pie = {
    "vectors": pie_vectors,
    "texts": texts,
    "clusters": pie_clusters,
    "mean_vectors": pie_mean_vectors
}

In [16]:
# Persist the PIE bundle next to the other data/ dumps.
with open("data/dump_pie.pickle", "wb") as f:
    pickle.dump(dump_pie, f)

## Blogs

In [17]:
# NOTE(review): only the first 30000 lines are kept. This assumes blogs_vectors holds the embeddings
# of exactly those 30000 lines, in order — nothing in this cell checks that.
# `texts` overwrites the PIE lines loaded earlier under the same name.
texts = read_lines("../../data_parallel/blogs/train_tgt")
texts = texts[:30000]

# Only unpickle trusted files.
with open("data/blogs_train_tgt_embed.pickle", "rb") as f:
    blogs_vectors = pickle.load(f)

In [18]:
%%time
# Cut at cosine distance 0.6; clusters with fewer than 10 members are relabelled -1.
# No merge pass (count_merge=0), so k is unused.
blogs_clusters = clustering.create_clusters(blogs_vectors, min_distance=0.6, metric="cosine", criterion="distance",
                        reindex_clusters=True, count_merge=0, min_count_in_cluster=10, k=20)

CPU times: user 6min 27s, sys: 9.27 s, total: 6min 36s
Wall time: 6min 36s


In [19]:
# One mean vector per cluster label (including -1), over at most k=20 ranked members.
blogs_mean_vectors = clustering.get_cluster_mean_vectors(blogs_clusters, blogs_vectors, k=20)

In [20]:
# Bundle the inputs and clustering results; `texts` must still hold the blogs lines at this point.
dump_blogs = {
    "vectors": blogs_vectors,
    "texts": texts,
    "clusters": blogs_clusters,
    "mean_vectors": blogs_mean_vectors
}

In [21]:
# Persist the blogs bundle next to the other data/ dumps.
with open("data/dump_blogs.pickle", "wb") as f:
    pickle.dump(dump_blogs, f)