In [1]:
import re
from gensim import utils
import csv
import nltk

nltk.download('stopwords')

csv.field_size_limit(100000000)

class TCUCorpus(object):
    """An interator that yields sentences (lists of str)."""
    
    def clean_text(self, text):
        #source: https://medium.com/ml2vec/using-word2vec-to-analyze-reddit-comments-28945d8cee57
        
        #Normalize tabs and remove newlines
        no_tabs = str(text).replace('\t', ' ').replace('\n', '');
        
        #Remove all characters except A-Z
        alphas_only = re.sub("[^a-zA-Z]", " ", no_tabs);
        
        #Normalize spaces to 1
        multi_spaces = re.sub(" +", " ", alphas_only);
        
        #Strip trailing and leading spaces
        no_spaces = multi_spaces.strip();
        
        #Remove stopwords 
        stopwords = nltk.corpus.stopwords.words('portuguese')
        clean_text = [w for w in no_spaces.split() if not w in stopwords] 

        return ' '.join(clean_text)
                

    def __iter__(self):
        filename = 'datasets/acordaos_relator_5k.csv'
        with open(filename, "r") as csvfile:
            corpus = csv.reader(csvfile)
            next(corpus) #ignores header
            for doc in corpus:
                yield  utils.simple_preprocess(self.clean_text(doc[1]))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/danieljunior/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import gensim.models

sentences = TCUCorpus()
model = gensim.models.Word2Vec(sentences=sentences, min_count=50, size=300, workers=4)

In [3]:
model.save('model.bin')
# loaded_model = gensim.models.Word2Vec.load('model.bin')

In [26]:
# source: https://github.com/v1shwa/document-similarity/blob/master/DocSim.py
import numpy as np
from gensim import utils
import re
import nltk 

nltk.download('stopwords')

class DocSim:
    
    def __init__(self, w2v_model):
        self.w2v_model = w2v_model
    
    def clean_text(self, text):
        #source: https://medium.com/ml2vec/using-word2vec-to-analyze-reddit-comments-28945d8cee57
        
        #Normalize tabs and remove newlines
        no_tabs = str(text).replace('\t', ' ').replace('\n', '');
        
        #Remove all characters except A-Z
        alphas_only = re.sub("[^a-zA-Z]", " ", no_tabs);
        
        #Normalize spaces to 1
        multi_spaces = re.sub(" +", " ", alphas_only);
        
        #Strip trailing and leading spaces
        no_spaces = multi_spaces.strip();
        
        #Remove stopwords
        stopwords = nltk.corpus.stopwords.words('portuguese')
        clean_text = [w for w in no_spaces.split() if not w in stopwords] 

        return ' '.join(clean_text)


    def vectorize(self, doc: str) -> np.ndarray:
        """
        Identify the vector values for each word in the given document
        :param doc:
        :return:
        """
#         words = utils.simple_preprocess(self.clean_text(doc))
        word_vecs = []
        for word in doc:
            try:
                vec = self.w2v_model.wv[word]
                word_vecs.append(vec)
            except KeyError:
                # Ignore, if the word doesn't exist in the vocabulary
                pass

        # Assuming that document vector is the mean of all the word vectors
        # PS: There are other & better ways to do it.
        vector = np.mean(word_vecs, axis=0)
        return vector

    def _cosine_sim(self, vecA, vecB):
        """Find the cosine similarity distance between two vectors."""
        csim = np.dot(vecA, vecB) / (np.linalg.norm(vecA) * np.linalg.norm(vecB))
        if np.isnan(np.sum(csim)):
            return 0
        return csim

    def calculate_similarity(self, source_doc, target_docs=None, threshold=0):
        """Calculates & returns similarity scores between given source document & all
        the target documents."""
        if not target_docs:
            return []

        if isinstance(target_docs, str):
            target_docs = [target_docs]

        source_vec = self.vectorize(source_doc)
        results = []
        for doc in target_docs:
            target_vec = self.vectorize(doc)
            sim_score = self._cosine_sim(source_vec, target_vec)
            if sim_score > threshold:
                results.append({"score": sim_score, "doc": doc})
            # Sort results by score in desc order
            results.sort(key=lambda k: k["score"], reverse=True)

        return results

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/danieljunior/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [33]:
from annoy import AnnoyIndex

class TCUAnnoyIndexer:
    
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.indexer = None

    def build(self, corpus, n_tree=10, metric='angular'):
        f = self.word2vec.vector_size
        t = AnnoyIndex(f, metric)  # Length of item vector that will be indexed
        for i, doc in enumerate(corpus):
            v = DocSim(self.word2vec).vectorize(doc)
            t.add_item(i, v)

        t.build(n_tree)
        self.indexer = t
    
    def save(self, path='test.ann'):
        self.indexer.save(path)
    
    def load(self, path='test.ann', metric='angular'):
        self.indexer = AnnoyIndex(self.word2vec.vector_size, metric)
        self.indexer.load(path) # super fast, will just mmap the file

    def get_k_neighbors(self, item_id, k):
        return self.indexer.get_nns_by_item(item_id, k) # will find the 1000 nearest neighbors

In [28]:
docs = TCUCorpus()
indexer = TCUAnnoyIndexer(model)
indexer.build(docs, n_tree=100)

In [29]:
indexer.save()

In [34]:
indexer = TCUAnnoyIndexer(model)
indexer.load()

In [36]:
indexer.get_k_neighbors(10, 5)

[10, 301, 623, 227140, 414]

In [38]:
indexer.indexer.get_item_vector(10)

[-0.41857677698135376,
 0.594915509223938,
 0.5671983957290649,
 -0.8950641751289368,
 -0.16800275444984436,
 -0.006473747082054615,
 -0.18669268488883972,
 0.32550209760665894,
 -0.10208466649055481,
 -0.1160338893532753,
 0.7734190821647644,
 -0.10675669461488724,
 -0.43691563606262207,
 0.21447418630123138,
 -0.7320036888122559,
 0.23761135339736938,
 -0.055727001279592514,
 0.6368032693862915,
 -0.15061238408088684,
 -0.07796695083379745,
 0.22029219567775726,
 0.5957158207893372,
 -0.2652522325515747,
 0.07908584177494049,
 0.29124340415000916,
 0.02937070094048977,
 0.3197613060474396,
 -0.7666438817977905,
 0.13228237628936768,
 0.19182248413562775,
 -0.05887952819466591,
 0.29060548543930054,
 0.43367335200309753,
 -0.08448410779237747,
 -0.18387064337730408,
 -0.120962955057621,
 0.018008029088377953,
 0.16499193012714386,
 -0.2880810797214508,
 0.21324284374713898,
 -0.09496060758829117,
 -0.10431899130344391,
 0.8365148901939392,
 -0.3765210509300232,
 -0.11423588544130325,


In [39]:
indexer.indexer.get_distance(10, 301)

0.0960145965218544