In [2]:
import gzip
import itertools
import math
import os
from time import time

import gensim
import numpy
import pandas as pd
import tqdm.notebook as tq
from gensim import downloader
from gensim.models import Word2Vec, KeyedVectors
from gensim.test.utils import common_texts

In [3]:
class ngram_extractor():
    """Restartable iterable over the n-grams stored in a directory of gzip files.

    Each line of each file is split on tabs and read as
    ``ngram, year, match_count, ...``. For every line whose year lies in
    ``[start_yr, end_yr]`` the lower-cased, whitespace-split n-gram is yielded
    ``match_count`` times, so the consumer sees each n-gram as often as it was
    counted without the expanded corpus ever being held in memory.

    Parameters
    ----------
    dir_path : str
        Directory containing the gzip-compressed n-gram files.
    start_yr, end_yr : int
        Inclusive year range; lines outside it are skipped.
    limit : int or None
        Maximum number of lines read from each file (``None`` reads everything).
    """

    def __init__(self, dir_path, start_yr, end_yr, limit):
        self.dir_path = dir_path
        self.start_yr = start_yr
        self.end_yr = end_yr
        self.limit = limit

    def __iter__(self):
        """Yield one token list per counted occurrence of each in-range n-gram."""
        # The directory is listed by path instead of os.chdir()-ing into it:
        # changing the working directory here made later relative saves land
        # inside the corpus directory (where the next pass would try to read
        # them as corpus files) and broke re-iteration for a relative dir_path.
        # Sorting makes the file order reproducible across runs and platforms.
        for filname in sorted(os.listdir(self.dir_path)):
            fil_path = os.path.join(self.dir_path, filname)
            if not os.path.isfile(fil_path):
                continue  # sub-directories cannot be opened as gzip files
            print(filname)
            with gzip.open(fil_path) as fil:
                for line in tq.tqdm(itertools.islice(fil, self.limit)):
                    line = gensim.utils.to_unicode(line).split("\t")
                    # Fewer than three fields means the year and/or count column
                    # is missing, so line[1] / line[2] below would not exist.
                    if len(line) < 3:
                        continue
                    try:
                        year = int(line[1])
                        match_count = int(line[2])
                    except ValueError:
                        # Non-numeric year or count: treat as a malformed line.
                        continue
                    if year > self.end_yr or year < self.start_yr:
                        continue
                    ngram = line[0]
                    # Keep only the text before the first "_" of every token,
                    # which drops POS tags appended to the words.
                    prcssd_ngram = [word.split("_")[0] for word in ngram.lower().split()]
                    for _ in range(match_count):
                        yield prcssd_ngram

# Classic version

In [4]:
# FAST_VERSION > -1 indicates that gensim's compiled training routines are
# available (this is not a check of the installed gensim version).
assert gensim.models.word2vec.FAST_VERSION > -1
dir_path = "D:/google_ngrams/raw_data/gb_12/"
start_yr, end_yr = 1900, 1910
ngrams = ngram_extractor(dir_path, start_yr=start_yr, end_yr=end_yr, limit=None)

# Using the recommended parameters according to Radim Rehurek
#model = gensim.models.word2vec.Word2Vec(ngrams,sg=1, vector_size=300, window=5, min_count=10, workers=10, hs=0, negative=8)
# Instead of training from scratch, start from a previously saved model.
model = Word2Vec.load('D:/google_ngrams/vectors/us_12/w2vmodel_ng5_1990.gz_vocab')
# NOTE(review): corpus_count is whatever the loaded model recorded when its
# vocabulary was built; it is only accurate here if that corpus has the same
# number of examples as `ngrams` -- confirm, otherwise progress reporting and
# learning-rate decay will be off.
model.train(ngrams, total_examples=model.corpus_count, epochs=5)

model.save('w2vmodel_ng5_' + str(start_yr) + '_' + str(end_yr) + '_full')
# gensim 4 exposes the word-vector matrix as `wv.vectors`; the gensim 3 name
# `wv.syn0` no longer exists.
syn0_object = model.wv.vectors

##output vector space##
# One row per vocabulary entry, values separated by single spaces.
numpy.savetxt('syn0_ngf_' + str(start_yr) + '_' + str(end_yr) + '_full.txt', syn0_object, delimiter=" ")

a_


0it [00:00, ?it/s]

KeyboardInterrupt: 

# Iterable

In [5]:
# FAST_VERSION > -1 indicates that gensim's compiled training routines are
# available (this is not a check of the installed gensim version).
assert gensim.models.word2vec.FAST_VERSION > -1

dir_path = "D:/google_ngrams/raw_data/gb_12/"
start_yr, end_yr = 1800, 1810
ngrams = ngram_extractor(dir_path, start_yr=start_yr, end_yr=end_yr, limit=None)

# Resumable version: the model is saved after every epoch, so an interrupted
# run can be continued by setting `completed` (or `vocab`) below.
epochs = 5         # total number of epochs wanted
vocab = False      # True: a vocab-only model has already been saved, reuse it
completed = 0      # number of epochs already trained and saved
method = 1         # 1 = skip-gram, 0 = CBOW
size = 300         # vector size
# NOTE(review): min_count is defined here but the model below is built with
# min_count=0 -- confirm which value is intended.
min_count = 25
data_dir = "D:/google_ngrams/processed_data/us_12_latest8"
out_dir = "D:/google_ngrams/vectors/us_12/"
os.chdir(data_dir)
print(os.listdir())
filname = "1990.gz"

# Checkpoints are written to and read from the same place (out_dir); saving
# with a bare relative name while loading from out_dir meant a resumed run
# could not find the files written by the previous one.
checkpoint_prefix = os.path.join(out_dir, 'w2vmodel_ng5_' + filname + '_')

# Obtain the starting model exactly once. Doing this inside the epoch loop
# reloaded the same file on every iteration and threw away the epoch that had
# just been trained. A trained checkpoint takes priority over the vocab-only
# model because it already contains the vocabulary plus the finished epochs.
if completed != 0:
    model = Word2Vec.load(checkpoint_prefix + str(completed))
    print("Model loaded.")
elif vocab:
    model = Word2Vec.load(checkpoint_prefix + "vocab")
    print("Vocab loaded.")
else:
    model = gensim.models.word2vec.Word2Vec(sg=method, vector_size=size, window=5, min_count=0, workers=10, hs=0, negative=8, epochs=1)
    model.build_vocab(ngrams)
    print("Vocab built.")
    model.save(checkpoint_prefix + "vocab")

# Train the remaining epochs one at a time, checkpointing after each.
for epoch in range(completed + 1, epochs + 1):
    model.train(ngrams, total_examples=model.corpus_count, epochs=1)
    model.save(checkpoint_prefix + str(epoch))
    print("Epoch " + str(epoch) + " completed.")


['1500.gz', '1510.gz', '1520.gz', '1530.gz', '1540.gz', '1550.gz', '1560.gz', '1570.gz', '1580.gz', '1590.gz', '1600.gz', '1610.gz', '1620.gz', '1630.gz', '1640.gz', '1650.gz', '1660.gz', '1670.gz', '1680.gz', '1690.gz', '1700.gz', '1710.gz', '1720.gz', '1730.gz', '1740.gz', '1750.gz', '1760.gz', '1770.gz', '1780.gz', '1790.gz', '1800.gz', '1810.gz', '1820.gz', '1830.gz', '1840.gz', '1850.gz', '1860.gz', '1870.gz', '1880.gz', '1890.gz', '1910.gz', '1930.gz', '1940.gz', '1950.gz', '1960.gz', '1970.gz', '1980.gz', '1990.gz', '2000.gz', '1930', '1910', '1900', '1900.gz', '1920', '1920.gz']
a_


0it [00:00, ?it/s]

KeyboardInterrupt: 