# Tutorial example
https://radimrehurek.com/gensim/models/word2vec.html

In [63]:
import pandas as pd
import gensim
from gensim.test.utils import common_texts
from gensim.models import Word2Vec, KeyedVectors
from gensim import downloader
import gzip 
import math
import itertools

#Input is a seires of n-grams
model = Word2Vec(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")

# model = Word2Vec.load("word2vec.model")
# model.train([["hello", "world"]], total_examples=1, epochs=1)
ex_vector = model.wv['computer'] 
sims = model.wv.most_similar('computer', topn=10) 

word_vectors = model.wv
word_vectors.save("word2vec.wordvectors")

# Load back with memory-mapping = read-only, shared across processes.
wv = KeyedVectors.load("word2vec.wordvectors", mmap='r')
vector = wv['computer']  # Get numpy vector of a word

del model # to alleviate pressure from memory, using the vectors instead (as I understand it)

OSError: [Errno 22] Invalid argument: 'word2vec.model'

In [2]:
# Train the model to recognise phrases
from gensim.models import Phrases

# Train a bigram detector.
bigram_transformer = Phrases(common_texts)

# Apply the trained MWE detector to a corpus, using the result to train a Word2vec model.
model = Word2Vec(bigram_transformer[common_texts], min_count=1)

In [9]:
#Pre trained models
pre_trained = list(gensim.downloader.info()['models'].keys())
print(pre_trained)
# glove_vectors = gensim.downloader.load('glove-twitter-25')

# glove_vectors.most_similar("hello")

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


NameError: name 'glove_vectors' is not defined

# Experimenting with ngrams

### Preparing Data

My first attempt - partitioning the data according to year (is this a good preliminary step to do before attempting the collab? It will easen the load on the data)

In [4]:
import gzip
import pandas as pd

#example file
ng = "googlebooks-eng-all-5gram-20120701-gp.gz"

# first try at opening
# f = gzip.open(ng, "rb")
# contents = f.readlines()
# for c in contents:
#     c.split("\t")

#Using pandas, first is if you want to chunk data
# for chunk in pd.read_csv(ng, sep='\t', chunksize=10**5):
#     #process
colnames = ["ngram", "year", "match_count", "vol_count"]
df = pd.read_csv(ng, sep='\t', names=colnames, header=None )
#[index] ngram TAB year TAB match_count TAB volume_count NEWLINE


ybins = [1800+i*10 for i in range(22)]
ylabels = [str(dec) + "'s" for dec in ybins[:-1]]

df["decade"] = pd.cut(df["year"], ybins, labels = ylabels)

token_list = []
twenties = df[df["decade"] == "1920's"]
for i, count in enumerate(twenties["match_count"]):
    #     print(i)
    for j in range(count):
        token_list.append(twenties["ngram"].iloc[i].split())



The GoC code - extracting ngrams by iterating through the file path

In [10]:
class ngram_extractor():
    def __init__(self, dir_path, start_yr, end_yr, limit):
        self.dir_path = dir_path
        self.start_yr = start_yr
        self.end_yr = end_yr
        self.limit = limit
    def __iter__(self): #Create generator for ngram counts - each ngram is multiplied by its counts. Easy on memory.
        for filname in os.listdir():
            with gensim.utils.smart_open(os.path.join(self.dir_path, filname)) as fil:
                for line in itertools.islice(fil, self.limit):
                    line = gensim.utils.to_unicode(line).split("\t")
                    ngram = line[0]
                    if len(line)<3: #Why this component?
                        continue
                    try:
                        year = int(line[1])
                    except ValueError:
                        continue
                    if year > self.end_yr or year < self.start_yr:
                        continue
                    match_count = int(line[2])
                    prcssd_ngram = [word.split("_")[0] for word in ngram.lower().split()] #Get rid of POS tagging on end of words
                    for i in range(match_count):
                        yield prcssd_ngram

Training the GoC model

In [12]:
assert gensim.models.word2vec.FAST_VERSION > -1 # check that it is the latest version of Word2vecngrams = ngram_extractor(dir_path, save_path, start_yr, end_yr, limit)

ngrams = ngram_extractor(dir_path, start_yr, end_yr, limit)

#Using the recommended parameters according to Radim Rehurek
model = gensim.models.word2vec.Word2Vec(ngrams,sg=1, size=300, window=5, min_count=10, workers=10, hs=0, negative=8)
model.save('w2vmodel_ng5_'+str(start_yr)+'_'+str(end_yr)+'_full')
syn0_object=model.wv.syn0

##output vector space##
numpy.savetxt('syn0_ngf_'+str(start_yr)+'_'+str(end_yr)+'_full.txt',syn0_object,delimiter=" ")

NameError: name 'dir_path' is not defined

Partitioning ngrams according to decade and saving in folders

In [61]:
class ngram_filer():
    def __init__(self, dir_path, save_path, start_yr, end_yr, limit):
        self.dir_path = dir_path
        self.save_path = save_path
        self.start_yr = start_yr
        self.end_yr = end_yr
        self.limit = limit
        self.decades = [start_yr+i*10 for i in range((end_yr-start_yr)/10)]
        
    def __iter__(self): 
        write_dic = {}
        for dec in self.decades:
            fname = str(dec) + ".gz"
            write_dic[dec] = gzip.open(fname, "wt")
            
        for filname in os.listdir():
            with gensim.utils.smart_open(os.path.join(self.dir_path, filname)) as fil:
                for line in itertools.islice(fil, self.limit):
                    line = gensim.utils.to_unicode(line).split("\t")
                    ngram = line[0]
                    if len(line)<3: #Why this component?
                        continue
                    try:
                        year = int(line[1])
                    except ValueError:
                        continue
                       u 
                    dec = math.floor(year/10)*10
                    prcssd_ngram = [word.split("_")[0] for word in ngram.lower().split()] #Get rid of POS tagging on end of words
                    
                    new_line = pr
                    try:
                        write_dic[dec].write('Contents of the example file go here.\n')
                    except:
                        continue #is this what I want?
                    for i in range(match_count):
                        yield prcssd_ngram


Mini section of this for testing

In [5]:
import gensim
import itertools
import gzip
import os
import math

os.chdir("C:/Users/user/Google Drive/KU/Thesis")
ng = "googlebooks-eng-all-5gram-20120701-gp.gz"
ng = '-aa'
fil = gensim.utils.open(ng)
fil = gzip.open(ng, "rt")
test_fil = gzip.open("test.gz", "wt")
for i,line in enumerate(fil):
#     print(f"jkl: {i}", line)

    line = gensim.utils.to_unicode(line).split("\t")
    ngram = line[0]
#     print(ngram)
    if len(line)<3: #Why this component?
        continue
    try:
        year = int(line[1])
    except ValueError:
        continue
    vol_count = [line[-1][:-2]]

    dec = math.floor(year/10)*10
    prcssd_ngram = [" ".join([word.split("_")[0] for word in ngram.lower().split()])] #Get rid of POS tagging on end of words
    new_line = "\t".join(prcssd_ngram + line[1:]) 
    if i <10:
        print(line, new_line)
    test_fil.write(new_line)
    
print("Here")

fil2 = gensim.utils.open("test.gz")   
test_fil = gzip.open("test.gz", "rt")
for i,line in enumerate(test_fil):
#     line = line.split("\n")
    if i <10:
        print(line[:-2])
    
#     g_line = gensim.utils.to_unicode(line).split("\t")
    
#     print(f"***{i}***", line)
# prcssd_ngram = [word.split("_")[0] for word in ng.lower().split()]

['AA Foundation for Road Safety', '1990', '4', '2\n'] aa foundation for road safety	1990	4	2

['AA Foundation for Road Safety', '1992', '1', '1\n'] aa foundation for road safety	1992	1	1

['AA Foundation for Road Safety', '1993', '1', '1\n'] aa foundation for road safety	1993	1	1

['AA Foundation for Road Safety', '1995', '1', '1\n'] aa foundation for road safety	1995	1	1

['AA Foundation for Road Safety', '1996', '20', '8\n'] aa foundation for road safety	1996	20	8

['AA Foundation for Road Safety', '1997', '3', '3\n'] aa foundation for road safety	1997	3	3

['AA Foundation for Road Safety', '1998', '8', '3\n'] aa foundation for road safety	1998	8	3

['AA Foundation for Road Safety', '1999', '3', '2\n'] aa foundation for road safety	1999	3	2

['AA Foundation for Road Safety', '2000', '2', '1\n'] aa foundation for road safety	2000	2	1

['AA Foundation for Road Safety', '2001', '12', '2\n'] aa foundation for road safety	2001	12	2

Here
aa foundation for road safety	1990	4	
aa foundation

In [60]:
import os
os.chdir("D:/google_ngrams/gb_12")
fil = gzip.open("1-1", mode = "rt")
count = 0
for i,line in enumerate(itertools.islice(fil, 100000000)):
    count += 1
print(count)
# colnames = ["ngram", "year", "match_count", "vol_count"]
# df = pd.read_csv("1-0.gz", sep='\t', names=colnames, header=None )
# df

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'D:/google_ngrams/gb_12'

In [59]:
total_time = ((30*5*100)/60)*2 
# total_time/(2*5)
total_time #250 for one GB decade 2.5k for 20th century
(50*5)/100

2.5

### Training model

In [None]:
model = Word2Vec(sentences=token_list, vector_size=100, window=5, min_count=1, workers=4)
model.save("example_ngram.model")
# word_vec = model.wv
# word_vec.most_similar("Government")

In [29]:
import pandas as pd
pd.read_csv("googlebooks-eng-all-totalcounts-20120701.txt", header = 0, names = ["Year", "match_count", "page_count", "volume_count"], sep = ",")


Unnamed: 0,Year,match_count,page_count,volume_count
