In [9]:
!pip install glove-python-binary
import nltk
nltk.download('punkt')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [123]:
import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize
import time
from glove import Glove,Corpus
import pandas as pd


def data_preprocessing(raw_file):
    data = []
    # iterate through each sentence in the file
    for i in sent_tokenize(raw_file):
        temp = []
        # tokenize the sentence into words and avoid stop words
        for j in word_tokenize(i):
            if not j.isalpha():
                continue

            temp.append(j.lower())
        data.append(temp)

    flat_list = list(set(np.concatenate(data).flat))
    return data, flat_list

def prepare_data( data, limit_data = None):
    db = []
    words = []
    for i, patent in enumerate(data[:limit_data]):
        print(f"training on patent num: {i}", end = '\r')
        f = patent.replace("\n", " ")
        db_file, words_file = data_preprocessing(f)
        db = db + db_file
        words = words + words_file

    flat_list = list(set(words))

    return db,flat_list


path = r'C:\Users\Chen\Desktop\Masters_degree\NLP'  # PC Chen
# path = r'C:/Users/Chen/Documents/Masters_degree/word2vec_vs_glove' # Laptop Chen
class W2V:
    def __init__(self, vector_size, window_size, sg = 1, lr = 0.05, workers = 4,data=[]):
        self.vector_size = vector_size
        self.window_size = window_size
        self.sg = sg
        self.learning_rate = lr
        self.workers = workers

        self.data = data
        self.model = None
        self.vocab = None

    def train_word2vec(self, num_epochs, model_path = ""):
        print("\n\nTraining word2vec model")
        # Create Skip Gram model ( sg=1 )  or CBOW ( sg=0 )
        start = time.perf_counter()

        word2vec = Word2Vec(sentences=self.data,size=self.vector_size,
                            window=self.window_size,sg=self.sg,min_count =0,
                            # epochs=num_epochs,
                            workers=self.workers,alpha=self.learning_rate)
                            
            # self.data, self.vector_size,self.window_size,
            #                 self.sg,num_epochs,self.workers, self.learning_rate)

        end = time.perf_counter()

        print("Finished training word2vec model\n")

        self.model = word2vec
        self.vocab = self.model.wv.index2word
        # self.vocab = self.model.wv.index_to_key
        if model_path != "":
            self.model.save(model_path)

        return end - start

    def check_one_word(self, word):
        a = []
        for i in self.vocab:
            a.append(self.model.wv.similarity(word, i))
        return a

  
    def delete_unknown_words(self, words):

        known_words = [word for word in words if word in self.vocab]
        unknown_words = list(set(words) - set(known_words))
        return known_words, unknown_words

    def get_top_similar_words(self, word, num = 5):
        most_similar = np.array(self.model.wv.most_similar(word, topn = num)).transpose()[0]
        return most_similar


    def check_synonyms_in_model(self,target, words, num):
        count = 0
        similar_words = self.get_top_similar_words(target, num)
        for word in words:
            if word in similar_words:
                count += 1
        return count

    def load_model(self, path):
        self.model = Word2Vec.load(path)
        self.vocab = self.model.wv.index2word
        # self.vocab = self.model.wv.index_to_key


In [115]:
# path = r'C:\Users\Chen\Desktop\Masters_degree\NLP'  # PC Chen
class GV:
    def __init__(self,vector_size, window_size, lr = 0.05, workers = 4,data=[]):
        self.vector_size = vector_size
        self.window_size = window_size
        self.learning_rate = lr
        self.workers = workers

        self.data = data
        self.model = None
        self.vocab = None
        self.corpus=Corpus()
        if data!=[]:
            self.corpus.fit(self.data, window=self.window_size)
  
    def load_glove(self,path):
        self.model=Glove.load(path)
        self.vocab=self.model.dictionary.keys()

    def train_glove(self,num_epochs,model_path=""):
        # self.corpus.fit(self.data, window=self.window_size)
        glove = Glove(no_components=self.vector_size, learning_rate=self.learning_rate) 
        start = time.perf_counter()
        glove.fit(self.corpus.matrix, epochs=num_epochs, no_threads=self.workers, verbose=True)
        end = time.perf_counter()
        glove.add_dictionary(self.corpus.dictionary)
        self.model=glove
        self.vocab=self.model.dictionary.keys()
        if model_path != "":
            self.model.save(model_path)
        return end-start
    def get_top_similar_words(self, word, num = 5):
        most_similar = np.array(self.model.most_similar(word, number = num+1)).transpose()[0]
        return most_similar


    def check_synonyms_in_model(self,target, words, num):
        count = 0
        similar_words = self.get_top_similar_words(target, num)
        for word in words:
            if word in similar_words:
                count += 1
        return count


# path = r'C:/Users/Chen/Documents/Masters_degree/word2vec_vs_glove' # Laptop Chen

In [121]:
full_text=pd.read_pickle('/content/drive/MyDrive/Colab Notebooks/full_text.pkl')
db,a=prepare_data(full_text, limit_data = 100)
glove_model = GV(300, 2, 0.05, 4,data=db)
train_time = glove_model.train_glove(num_epochs = 30, model_path = '/content/drive/MyDrive/Colab Notebooks/output/glove_model')

print(f"training the GloVe model took : {train_time:0.4f} sec \n")



ValueError: ignored

In [116]:
# glove_model = GV(300, 2, 0.05, 4)
# glove_model.load_glove(path='/content/drive/MyDrive/Colab Notebooks/output/glove_model')

In [125]:

w2v_model = W2V(300, 2, 1, 0.05, 4,data=db)
train_time = w2v_model.train_word2vec(num_epochs = 30, model_path = '/content/drive/MyDrive/Colab Notebooks/output/word2vec_model')

print(f"training the model took : {train_time:0.4f} sec \n")





Training word2vec model
Finished training word2vec model

training the model took : 23.0080 sec 

