In [1]:
import gzip
import itertools
import math
import os
from time import time

import gensim
import numpy
import pandas as pd
import tqdm.notebook as tq
from gensim import downloader
from gensim.models import Word2Vec, KeyedVectors
from gensim.test.utils import common_texts

# Switch between the Google Colab environment (True) and a local machine (False).
colab = True

if colab:
    # Mount Google Drive into the Colab filesystem.
    from google.colab import files, drive
    drive.mount('/content/drive/')

    # Authenticate the Colab user and build a PyDrive client from the
    # application-default credentials.
    from pydrive.auth import GoogleAuth
    from pydrive.drive import GoogleDrive
    from google.colab import auth
    from oauth2client.client import GoogleCredentials
    auth.authenticate_user()
    gauth = GoogleAuth()
    gauth.credentials = GoogleCredentials.get_application_default()
    # NOTE: this rebinds `drive` from the google.colab module to the PyDrive client.
    drive = GoogleDrive(gauth)

    # Corpus directory on the mounted Drive.
    dir = "/content/drive/MyDrive/google_ngrams/gb_12_processed_full"
else:
    # Corpus directory on the local machine.
    dir = "D:/google_ngrams/gb_12_processed_filtered"

In [2]:
 #Create generator for ngram counts - each ngram is multiplied by its counts. Easy on memory.
class ngram_extractor():
    """Re-iterable stream of tokenised n-grams read from a tab-separated count file.

    Every input line must carry at least three tab-separated fields: field 0 is
    the n-gram text and field 2 is its integer match count.  The n-gram is
    lower-cased and split on whitespace, and each token is cut at its first
    underscore (this removes a trailing POS tag such as ``cat_NOUN``).  The
    resulting token list is yielded once per match, so frequent n-grams appear
    proportionally more often without the expanded corpus ever being held in
    memory.

    Parameters
    ----------
    filname : str
        Path of the file to read.
    extracted : bool, default False
        If true the file is read as plain UTF-8 text; otherwise it is read as
        gzip-compressed UTF-8 text.
    limit : int or None, default None
        Maximum number of input lines to read; ``None`` reads the whole file.
    """

    def __init__(self, filname, extracted=False, limit=None):
        self.limit = limit
        self.filname = filname
        self.extracted = extracted

    def _open(self):
        """Open ``self.filname`` in UTF-8 text mode, gzip-decoding unless already extracted."""
        opener = open if self.extracted else gzip.open
        return opener(self.filname, "rt", encoding="utf-8")

    def __iter__(self):
        """Yield one token list per match; prints the file name and the elapsed time.

        Lines with fewer than three tab-separated fields (for example a blank
        trailing line) are skipped instead of aborting the whole stream.

        Raises
        ------
        ValueError
            If the match-count field of a line is not an integer.
        """
        start = time()
        print(self.filname)
        with self._open() as fil:
            # The file is opened in text mode, so each line is already a str.
            for line in tq.tqdm(itertools.islice(fil, self.limit)):
                fields = line.split("\t")
                if len(fields) < 3:
                    continue
                match_count = int(fields[2])
                # Drop everything from the first "_" of each token (POS tag).
                prcssd_ngram = [word.split("_")[0] for word in fields[0].lower().split()]
                # The same list object is yielded every time; consumers must not mutate it.
                for _ in range(match_count):
                    yield prcssd_ngram
            print(f"Time taken: {time()-start}")

In [14]:
class ngram_extractor_lite():
    """Re-iterable stream of n-grams read from a gzip-compressed, tab-separated count file.

    Field 0 of every line is the n-gram (split on whitespace, otherwise used
    as-is: no lower-casing, no POS-tag stripping) and the last field is its
    integer match count.  The token list is yielded once per match.

    Parameters
    ----------
    filname : str
        Path of the gzip file to read.
    limit : int or None, default None
        Maximum number of input lines to read; ``None`` reads the whole file.
    """

    def __init__(self, filname, limit=None):
        self.limit = limit
        self.filname = filname

    def __iter__(self):
        """Yield one token list per match; prints the file name and the elapsed time.

        Lines with fewer than two tab-separated fields (for example a blank
        trailing line) are skipped instead of aborting the whole stream.

        Raises
        ------
        ValueError
            If the last field of a line is not an integer.
        """
        start = time()
        print(self.filname)
        with gzip.open(self.filname, "rt", encoding="utf-8") as fil:
            # The file is opened in text mode, so each line is already a str.
            for line in tq.tqdm(itertools.islice(fil, self.limit)):
                fields = line.split("\t")
                if len(fields) < 2:
                    continue
                ngram = fields[0].split()
                match_count = int(fields[-1])
                # The same list object is yielded every time; consumers must not mutate it.
                for _ in range(match_count):
                    yield ngram
            print(f"Time taken: {time()-start}")

In [None]:
# Stop early unless gensim's optimised word2vec routines are in use
# (per the gensim docs, FAST_VERSION is -1 when only the slow fallback is available).
assert gensim.models.word2vec.FAST_VERSION > -1

# Work inside the corpus directory so the relative file names below resolve there.
os.chdir(dir)
print(os.listdir())

# One gzip file per decade, 1900.gz .. 1990.gz.
# Author's size notes: "1900.gz" - 4gb, ci - 45mb, cl - 100 mb
filnames = [f"{decade}.gz" for decade in range(1900, 2000, 10)]
start = time()

# Train and save one model per decade, most recent decade first.
for filname in reversed(filnames):
    # Streaming corpus for this decade.
    ngrams = ngram_extractor_lite(filname=filname)

    # Parameters recommended by Radim Rehurek (author's note).
    # Author's note: `vector_size` is called `size` when running on Google Colab.
    # The number of epochs is left at its default (5 per the author's note).
    model = gensim.models.word2vec.Word2Vec(
        ngrams,
        sg=1,             # 1 = skip-gram, 0 = CBOW
        vector_size=300,  # dimensionality of the word vectors
        window=5,
        min_count=25,     # minimum count for a word to be considered
        workers=10,       # worker threads for multiprocessing
        hs=0,             # hs=1 would use softmax
        negative=8,       # number of noise words used in negative sampling
    )
    # filname[:-3] strips the ".gz" suffix from the file name.
    model.save(f"w2vmodel_ng5_{filname[:-3]}_full")


['1500.gz', '1510.gz', '1520.gz', '1530.gz', '1540.gz', '1550.gz', '1560.gz', '1570.gz', '1580.gz', '1590.gz', '1600.gz', '1610.gz', '1620.gz', '1630.gz', '1640.gz', '1650.gz', '1660.gz', '1670.gz', '1680.gz', '1690.gz', '1700.gz', '1710.gz', '1720.gz', '1730.gz', '1740.gz', '1750.gz', '1760.gz', '1770.gz', '1780.gz', '1790.gz', '1800.gz', '1810.gz', '1820.gz', '1830.gz', '1840.gz', '1850.gz', '1860.gz', '1870.gz', '1880.gz', '1890.gz', '1900.gz', '1910.gz', '1920.gz', '1930.gz', '1940.gz', '1950.gz', '1960.gz', '1970.gz', '1980.gz', '1990.gz', '2000.gz', 'w2vmodel_ng5_1800.gz_full', 'w2vmodel_ng5_1980.gz_full.wv.vectors.npy', 'w2vmodel_ng5_1980.gz_full.syn1neg.npy', 'w2vmodel_ng5_1980.gz_full', 'w2vmodel_ng5_1970.gz_full.wv.vectors.npy', 'w2vmodel_ng5_1970.gz_full.syn1neg.npy', 'w2vmodel_ng5_1970.gz_full', 'w2vmodel_ng5_1900_full']
1900.gz


0it [00:00, ?it/s]

In [None]:

# Checkpointed variant of the training: run one epoch at a time and save the
# model after every epoch, so an interrupted run loses at most one epoch.
# Uses the gensim 4 keyword names (vector_size / epochs), like the per-decade
# training call that passes vector_size=300.
#
# Names taken from earlier cells: `ngrams` (corpus stream) and `start` (timer).
# NOTE(review): `start_yr` and `end_yr` are not defined anywhere in the visible
# notebook - define them in an earlier cell before running this one.
# NOTE(review): each train() call restarts the learning-rate decay from
# model.alpha down to model.min_alpha, so five one-epoch calls are not
# numerically identical to a single five-epoch run; pass start_alpha/end_alpha
# to train() if one continuous schedule is wanted - confirm intent.
epochs = 5
run_tag = str(start_yr) + '_' + str(end_yr)
for i in range(epochs):
    if i > 0:
        # Resume from the checkpoint written at the end of the previous epoch.
        model = Word2Vec.load('w2vmodel_ng5_' + run_tag + '_' + str(i - 1))
        # train() does not take model hyperparameters (they are fixed at
        # construction); it needs the corpus size and the number of epochs.
        model.train(ngrams, total_examples=model.corpus_count, epochs=1)
    else:
        # First epoch: build the vocabulary and train once.
        model = Word2Vec(ngrams, sg=1, vector_size=300, window=5, min_count=10,
                         workers=10, hs=0, negative=8, epochs=1)
    # Save after every epoch, under the same name pattern the load above expects.
    model.save('w2vmodel_ng5_' + run_tag + '_' + str(i))

##output vector space##
# model.wv.vectors is the word-vector matrix (one row per vocabulary word).
numpy.savetxt('syn0_ngf_' + run_tag + '_full.txt', model.wv.vectors, delimiter=" ")

# Elapsed time since `start` was set in the training cell.
print(f"Time to download: {time() - start}")