### Logging

In [2]:
import logging
#logging.getLogger('').handlers = []  #To delete previous logging configuration

# Route log records of level INFO and above to the notebook output,
# rendered as "timestamp : level : message".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
    # To log to a file in addition to standard output, pass explicit handlers.
    # NOTE(review): `uniName` is not defined in this notebook - set it before enabling.
    #handlers=[logging.FileHandler("{0}/{1}.log".format('./', uniName)),  # path, file name
    #          logging.StreamHandler()]
)

# Batch Model Training

In [3]:
import os
#from word2vecTraining import preprocessing, training, evaluation

# Directory holding the text corpus; the batch loop reads one '<year>.csv' file from it per year.
inputPath = os.path.normpath(r'D:\data\nyt') 
#inputPath = os.path.normpath(r'.')  # alternative: read from the current working directory

# Optional: collect every CSV corpus file found in inputPath instead of a fixed list of years.
# NOTE(review): listdir / isfile / join are not imported in this notebook; import them
# from os and os.path before enabling the lines below.
#onlyfiles = [f for f in listdir(inputPath) if isfile(join(inputPath, f)) and f.endswith('.csv')]
#onlyfiles = onlyfiles[:2] #For testing
# print(onlyfiles)
# len(onlyfiles)

# Directory where the trained word2vec models are stored.
outputPath = os.path.normpath(r'D:\data\nyt\word2vecModels') # raw string, so the backslashes need no escaping
#outputPath = os.path.normpath(r'.\models')  # alternative: a 'models' folder next to the notebook

## Preprocessing

In [4]:
import re
import csv
import time
from gensim.parsing.preprocessing import * #provides a number of convenience preprocessing functions optimized for speed
from gensim.models.word2vec import Word2Vec
import gensim
from multiprocessing import cpu_count




def preprocessing(file, wordThreshold=5):
    """Read a CSV text corpus and return it as a list of tokenised sentences.

    Every CSV row with more than one column is processed: the text in its
    first column is split on newlines, each resulting line is cleaned with
    the filter chain below, and lines that keep more than ``wordThreshold``
    tokens are collected. Timing and simple corpus statistics are printed
    as a sanity check.

    Parameters
    ----------
    file : str
        Path to a UTF-8 encoded, comma-separated CSV file.
    wordThreshold : int, optional
        A line is kept only if it has MORE than this many tokens after
        filtering (default 5, i.e. at least 6 tokens).

    Returns
    -------
    list of list of str
        One token list per kept line; empty if nothing passed the filter.
    """
    CUSTOM_FILTERS = [
        lambda x: x.lower(),  # To lowercase
        # Strip URLs. Match only the URL's own non-whitespace characters: a
        # greedy '.*\s' would delete everything up to the last whitespace on
        # the line and would miss a URL sitting at the very end of a line.
        lambda text: re.sub(r'https?://\S+', '', text),
        #split_alphanum, #Add spaces between digits & letters in s using RE_AL_NUM.
        strip_tags,  # Remove tags from s using RE_TAGS.
        strip_non_alphanum,  # Remove non-alphabetic characters from s using RE_NONALPHA.
        strip_punctuation,  # Replace punctuation characters with spaces in s using RE_PUNCT.
        strip_numeric,  # Remove digits from s using RE_NUMERIC.
        strip_multiple_whitespaces,  # Collapse repeated whitespace (spaces, tabs, line breaks) using RE_WHITESPACE.
        remove_stopwords,  # Set of 339 stopwords from Stone, Denis, Kwantes (2010).
        #lambda x:" ".join(w for w in x.split() if w not in stopword_file) #Custom stopwords
        lambda x: strip_short(x, minsize=3),  # Remove words shorter than minsize from s.
        #stem_text #Transform s into lowercase and stem it.
    ]

    tic = time.time()  # Start timing

    # Corpus cells hold long texts, so raise the csv module's per-field size
    # limit. 2147483647 (2**31 - 1) is used instead of sys.maxsize.
    csv.field_size_limit(2147483647)
    #csv.field_size_limit(sys.maxsize)

    sentences = []
    with open(file, 'r', newline='', encoding="utf-8") as inpFile:
        csvObject = csv.reader(inpFile, delimiter=',', quotechar='"')

        for csvEntry in csvObject:
            if len(csvEntry) <= 1:
                continue  # Skip empty or single-column rows

            # Word2vec gathers co-occurrence statistics sentence by sentence,
            # so one huge "sentence" per article would make training very
            # slow. Each CSV text is therefore split into paragraphs on '\n'.
            # NOTE(review): an older comment said column 0 is the URL and
            # column 1 the fetched text, but the code reads column 0 as the
            # text - confirm the column layout against the corpus files.
            for line in csvEntry[0].split('\n'):
                words = preprocess_string(line, CUSTOM_FILTERS)

                # Keep only lines with more than wordThreshold tokens
                if len(words) > wordThreshold:
                    sentences.append(words)

    toc = time.time()  # Stop timing
    computationTime = toc - tic

    print("Reading Corpus file and preprocessing time:" + str(computationTime) + " seconds")

    print(" printing Top 2 and last sentences For sanity check")
    for i, s in enumerate(sentences[0:2]):
        print(i, s)
    if sentences:
        print(len(sentences), sentences[-1])
    else:
        # An empty result used to crash on sentences[-1]; report it instead.
        print(0, "no sentence passed the length filter")

    print(" stats about Corpus read from file")
    wordsInCorpus = sum(len(s) for s in sentences)
    print("Number of words in corpus:", wordsInCorpus)
    print("Number of sentences in corpus:", len(sentences))
    return sentences

def training(sentences):
    """Train a CBOW Word2Vec model with negative sampling on ``sentences``.

    Prints the training time, the throughput in words per second, the model
    summary and the first ten entries of the model's vocabulary index.

    Parameters
    ----------
    sentences : list of list of str
        Tokenised sentences. The corpus is traversed again after training to
        count its words, so it must support ``len`` on each sentence and
        repeated iteration.

    Returns
    -------
    Word2Vec
        The trained model.

    Notes
    -----
    NOTE(review): ``size``, ``iter`` and ``wv.index2word`` are the names used
    by the gensim release this notebook was run with; newer gensim releases
    rename them - confirm before upgrading.
    """
    tic = time.time()  # Start timing

    # Leave three cores free for the rest of the system, but never ask for
    # fewer than one worker: cpu_count()-3 is zero or negative on machines
    # with three cores or less.
    workerCount = max(1, cpu_count() - 3)

    # NOTE(review): the original note said the score method needs hs and
    # negative to be specified; whether scoring works with hs=0 should be
    # confirmed against the gensim documentation.
    # A frequently used heuristic for the vector dimensionality `size` is the
    # square root of the vocabulary length after pre-processing.
    model = Word2Vec(
        sentences,  # A list of token lists; for larger corpora consider an iterable that streams sentences from disk/network
        sg=0,  # Training algorithm: 1 = skip-gram, otherwise CBOW
        size=300,  # Dimensionality of the feature vectors
        window=10,  # Maximum distance between the current and predicted word within a sentence
        min_count=5,  # Ignore all words with total frequency lower than this
        workers=workerCount,  # Number of worker threads used for training
        hs=0,  # 1 = hierarchical softmax; 0 with non-zero `negative` = negative sampling
        negative=10,  # Number of "noise words" drawn for negative sampling (usually 5-20); 0 disables it
        sample=0.001,  # Threshold for randomly downsampling higher-frequency words, useful range is (0, 1e-5)
        iter=5,  # Number of iterations (epochs) over the corpus
    )

    toc = time.time()  # Stop timing
    computationTime = toc - tic
    print("Computing time for training the model:" + str(computationTime) + " seconds")

    wordsInCorpus = sum(len(sentence) for sentence in sentences)
    # Guard against a zero-length measurement (coarse clocks, tiny corpora).
    wordsPerSecond = wordsInCorpus / computationTime if computationTime > 0 else float('inf')
    print("Number of words processed per second:", wordsPerSecond)
    print(model)
    print("Most frequent words In model: ", model.wv.index2word[:10])

    return model

2018-10-15 11:46:11,511 : INFO : 'pattern' package not found; tag filters are not available for English


In [8]:
# Corpus years to process, newest first: '1999' down to '1985'.
years = [str(corpusYear) for corpusYear in range(1999, 1984, -1)]

# Create the output directory up front: otherwise model.save() fails only
# after a complete preprocessing + training run when the directory is missing.
os.makedirs(outputPath, exist_ok=True)

for year in years:
    print("starting the processing of " + year)
    inputFilePath = os.path.join(inputPath, year+'.csv')
    sentences = preprocessing(inputFilePath)
    model = training(sentences)
    # Discard parameters that are only used in training and scoring. Use only when training is finished.
    # If replace_word_vectors_with_normalized is set, the original vectors are forgotten and only
    # the normalized ones are kept, which saves a lot of memory.
    model.delete_temporary_training_data(replace_word_vectors_with_normalized=False)
    # Evaluation: word-similarity correlation against the wordsim353 pairs.
    # The evaluation files are resolved relative to the current working directory.
    print(" similarity evaluation:", model.wv.evaluate_word_pairs('wordsim353.tsv', restrict_vocab=50000))
    # Analogies: per-section accuracies are written to the log; the result is kept in `r` for inspection.
    r = model.wv.accuracy('questions-words.txt', restrict_vocab=30000)
    # One model file per year, named after the year.
    outputFilePath = os.path.join(outputPath, year)
    model.save(outputFilePath)
    print("----------------------------------------------------")

starting the processing of 1999
Reading Corpus file and preprocessing time:68.53386998176575 seconds
 printing Top 2 and last sentences For sanity check
0 ['years', 'ago', 'american', 'company', 'seeking', 'toehold', 'newly', 'privatized', 'potentially', 'lucrative', 'russian', 'telecommunications', 'market', 'cut', 'secret', 'deal', 'directors', 'officers', 'moscow', 'telephone', 'company', 'arrangement', 'raised', 'legal', 'ethical', 'questions', 'thrown', 'light', 'western', 'businessmen', 'operated', 'russia', 'american', 'company', 'global', 'telesystems', 'paid', 'million', 'stock', 'cash', 'global', 'paid', 'money', 'company', 'insiders', 'set', 'bahamas', 'avoid', 'public', 'scrutiny', 'taxes', 'exchange', 'global', 'telesystems', 'ended', 'ownership', 'russian', 'telecommunications', 'company', 'generates', 'millions', 'dollars', 'year', 'revenues', 'providing', 'service', 'moscow', 'customers', 'businesses', 'willing', 'pay', 'premium', 'russians', 'took', 'deal', 'walked', '

2018-10-15 12:50:04,951 : INFO : collecting all words and their counts
2018-10-15 12:50:04,952 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-10-15 12:50:05,508 : INFO : PROGRESS: at sentence #10000, processed 3297755 words, keeping 94810 word types
2018-10-15 12:50:06,053 : INFO : PROGRESS: at sentence #20000, processed 6440084 words, keeping 124716 word types
2018-10-15 12:50:06,584 : INFO : PROGRESS: at sentence #30000, processed 9518989 words, keeping 153234 word types
2018-10-15 12:50:07,146 : INFO : PROGRESS: at sentence #40000, processed 12663848 words, keeping 174160 word types
2018-10-15 12:50:07,733 : INFO : PROGRESS: at sentence #50000, processed 15803772 words, keeping 194587 word types
2018-10-15 12:50:08,292 : INFO : PROGRESS: at sentence #60000, processed 18886648 words, keeping 209957 word types
2018-10-15 12:50:08,863 : INFO : PROGRESS: at sentence #70000, processed 22040075 words, keeping 224773 word types
2018-10-15 12:50:09,429 : INF

2018-10-15 12:50:52,797 : INFO : worker thread finished; awaiting finish of 8 more threads
2018-10-15 12:50:52,798 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-10-15 12:50:52,799 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-10-15 12:50:52,802 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-10-15 12:50:52,815 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-10-15 12:50:52,819 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-10-15 12:50:52,821 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-10-15 12:50:52,823 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-10-15 12:50:52,830 : INFO : EPOCH 2 - PROGRESS: at 100.00% examples, 1259909 words/s, in_qsize 0, out_qsize 1
2018-10-15 12:50:52,831 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-10-15 12:50:52,832 : INFO : EPOCH - 2 : training on 26087311 

2018-10-15 12:51:34,426 : INFO : EPOCH 5 - PROGRESS: at 4.64% examples, 1196770 words/s, in_qsize 24, out_qsize 1
2018-10-15 12:51:35,427 : INFO : EPOCH 5 - PROGRESS: at 9.19% examples, 1221757 words/s, in_qsize 23, out_qsize 2
2018-10-15 12:51:36,435 : INFO : EPOCH 5 - PROGRESS: at 14.10% examples, 1236528 words/s, in_qsize 24, out_qsize 1
2018-10-15 12:51:37,445 : INFO : EPOCH 5 - PROGRESS: at 18.97% examples, 1238239 words/s, in_qsize 22, out_qsize 3
2018-10-15 12:51:38,452 : INFO : EPOCH 5 - PROGRESS: at 24.04% examples, 1240793 words/s, in_qsize 26, out_qsize 3
2018-10-15 12:51:39,459 : INFO : EPOCH 5 - PROGRESS: at 29.43% examples, 1246228 words/s, in_qsize 24, out_qsize 1
2018-10-15 12:51:40,468 : INFO : EPOCH 5 - PROGRESS: at 34.17% examples, 1246777 words/s, in_qsize 23, out_qsize 2
2018-10-15 12:51:41,479 : INFO : EPOCH 5 - PROGRESS: at 39.09% examples, 1245912 words/s, in_qsize 24, out_qsize 1
2018-10-15 12:51:42,492 : INFO : EPOCH 5 - PROGRESS: at 44.10% examples, 1244631 w

Computing time for training the model:108.93075013160706 seconds
Number of words processed per second: 239485.27820181215
Word2Vec(vocab=95145, size=300, alpha=0.025)
Most frequent words In model:  ['said', 'new', 'year', 'like', 'people', 'time', 'years', 'york', 'company', 'percent']


  if np.issubdtype(vec.dtype, np.int):
2018-10-15 12:51:54,207 : INFO : Pearson correlation coefficient against wordsim353.tsv: 0.6034
2018-10-15 12:51:54,208 : INFO : Spearman rank-order correlation coefficient against wordsim353.tsv: 0.6352
2018-10-15 12:51:54,208 : INFO : Pairs with unknown words ratio: 4.8%
  from ipykernel import kernelapp as app
2018-10-15 12:51:54,241 : INFO : precomputing L2-norms of word weight vectors


 similarity evaluation: ((0.6033708946192443, 1.0661791626241528e-34), SpearmanrResult(correlation=0.6352495404801989, pvalue=2.273777221661854e-39), 4.815864022662889)


2018-10-15 12:51:55,514 : INFO : capital-common-countries: 60.5% (254/420)
2018-10-15 12:51:57,803 : INFO : capital-world: 51.6% (446/864)
2018-10-15 12:51:58,144 : INFO : currency: 22.7% (29/128)
2018-10-15 12:52:03,400 : INFO : city-in-state: 16.2% (325/2004)
2018-10-15 12:52:04,297 : INFO : family: 74.3% (254/342)
2018-10-15 12:52:06,589 : INFO : gram1-adjective-to-adverb: 13.7% (119/870)
2018-10-15 12:52:08,045 : INFO : gram2-opposite: 18.1% (100/552)
2018-10-15 12:52:11,535 : INFO : gram3-comparative: 67.2% (895/1332)
2018-10-15 12:52:13,520 : INFO : gram4-superlative: 29.2% (221/756)
2018-10-15 12:52:15,229 : INFO : gram5-present-participle: 47.1% (306/650)
2018-10-15 12:52:18,632 : INFO : gram6-nationality-adjective: 68.4% (888/1299)
2018-10-15 12:52:22,718 : INFO : gram7-past-tense: 55.0% (858/1560)
2018-10-15 12:52:25,487 : INFO : gram8-plural: 56.1% (592/1056)
2018-10-15 12:52:26,704 : INFO : gram9-plural-verbs: 35.7% (165/462)
2018-10-15 12:52:26,705 : INFO : total: 44.3% (5

----------------------------------------------------
starting the processing of 1998
Reading Corpus file and preprocessing time:67.73431754112244 seconds
 printing Top 2 and last sentences For sanity check
0 ['keyshawn', 'johnson', 'said', 'accomplished', 'things', 'set', 'turned', 'pro', 'snaring', 'jets', 'opening', 'pass', 'yards', 'johnson', 'reached', 'receiving', 'yards', 'season', 'making', 'playoffs', 'voted', 'pro', 'bowl', 'week', 'reached', 'percent', 'goals', 'guess', 'said', 'moment', 'like', 'jets', 'clinching', 'division', 'title', 'johnson', 'reflective', 'rookie', 'season', 'thinking', 'locker', 'room', 'today', 'numbskulls', 'got', 'said', 'mind', 'shifting', 'tomorrow', 'night', 'game', 'going', 'biggest', 'sports', 'fan', 'history', 'jets', 'interested', 'jaguars', 'vikings', 'game', 'jaguar', 'loss', 'propels', 'jets', 'week', 'bye', 'playoffs', 'critically', 'important', 'hoping', 'reach', 'super', 'bowl', 'clubs', 'played', 'super', 'bowl', 'enjoying', 'bye', 'we

2018-10-15 12:53:35,657 : INFO : collecting all words and their counts
2018-10-15 12:53:35,658 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-10-15 12:53:36,185 : INFO : PROGRESS: at sentence #10000, processed 3021040 words, keeping 87928 word types
2018-10-15 12:53:36,734 : INFO : PROGRESS: at sentence #20000, processed 5983265 words, keeping 116954 word types
2018-10-15 12:53:37,311 : INFO : PROGRESS: at sentence #30000, processed 9126375 words, keeping 141323 word types
2018-10-15 12:53:37,932 : INFO : PROGRESS: at sentence #40000, processed 12382639 words, keeping 162345 word types
2018-10-15 12:53:38,572 : INFO : PROGRESS: at sentence #50000, processed 15532879 words, keeping 189775 word types
2018-10-15 12:53:39,181 : INFO : PROGRESS: at sentence #60000, processed 18640325 words, keeping 205634 word types
2018-10-15 12:53:39,769 : INFO : PROGRESS: at sentence #70000, processed 21629877 words, keeping 219031 word types
2018-10-15 12:53:40,321 : INF

2018-10-15 12:54:22,234 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-10-15 12:54:22,235 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-10-15 12:54:22,255 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-10-15 12:54:22,261 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-10-15 12:54:22,266 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-10-15 12:54:22,270 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-10-15 12:54:22,272 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-10-15 12:54:22,275 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-10-15 12:54:22,275 : INFO : EPOCH - 2 : training on 25252749 raw words (24652060 effective words) took 19.6s, 1257661 effective words/s
2018-10-15 12:54:23,291 : INFO : EPOCH 3 - PROGRESS: at 4.74% examples, 1172956 words/s, in_qsize 26, out_qsize 3
2018-10-15 12:54:

2018-10-15 12:55:06,432 : INFO : EPOCH 5 - PROGRESS: at 20.60% examples, 1215822 words/s, in_qsize 26, out_qsize 1
2018-10-15 12:55:07,442 : INFO : EPOCH 5 - PROGRESS: at 25.85% examples, 1215210 words/s, in_qsize 23, out_qsize 2
2018-10-15 12:55:08,456 : INFO : EPOCH 5 - PROGRESS: at 30.49% examples, 1208260 words/s, in_qsize 26, out_qsize 1
2018-10-15 12:55:09,456 : INFO : EPOCH 5 - PROGRESS: at 35.45% examples, 1210790 words/s, in_qsize 24, out_qsize 1
2018-10-15 12:55:10,464 : INFO : EPOCH 5 - PROGRESS: at 39.99% examples, 1201958 words/s, in_qsize 25, out_qsize 0
2018-10-15 12:55:11,470 : INFO : EPOCH 5 - PROGRESS: at 44.37% examples, 1192406 words/s, in_qsize 24, out_qsize 1
2018-10-15 12:55:12,495 : INFO : EPOCH 5 - PROGRESS: at 49.02% examples, 1194706 words/s, in_qsize 25, out_qsize 0
2018-10-15 12:55:13,503 : INFO : EPOCH 5 - PROGRESS: at 53.93% examples, 1194902 words/s, in_qsize 23, out_qsize 2
2018-10-15 12:55:14,531 : INFO : EPOCH 5 - PROGRESS: at 58.73% examples, 1196698

Computing time for training the model:107.60802507400513 seconds
Number of words processed per second: 234673.47330863995
Word2Vec(vocab=92789, size=300, alpha=0.025)
Most frequent words In model:  ['said', 'new', 'year', 'like', 'people', 'years', 'time', 'york', 'president', 'company']
 similarity evaluation: ((0.5907816769975799, 4.340519179468931e-33), SpearmanrResult(correlation=0.6176988196356281, pvalue=7.754545266535148e-37), 4.53257790368272)


2018-10-15 12:55:25,046 : INFO : capital-common-countries: 55.0% (231/420)
2018-10-15 12:55:27,196 : INFO : capital-world: 47.6% (382/803)
2018-10-15 12:55:27,612 : INFO : currency: 21.1% (32/152)
2018-10-15 12:55:32,880 : INFO : city-in-state: 14.8% (287/1943)
2018-10-15 12:55:33,803 : INFO : family: 73.4% (251/342)
2018-10-15 12:55:35,989 : INFO : gram1-adjective-to-adverb: 14.0% (114/812)
2018-10-15 12:55:37,602 : INFO : gram2-opposite: 16.5% (99/600)
2018-10-15 12:55:41,209 : INFO : gram3-comparative: 67.8% (903/1332)
2018-10-15 12:55:43,429 : INFO : gram4-superlative: 29.4% (239/812)
2018-10-15 12:55:45,208 : INFO : gram5-present-participle: 38.6% (251/650)
2018-10-15 12:55:48,312 : INFO : gram6-nationality-adjective: 72.4% (841/1161)
2018-10-15 12:55:52,510 : INFO : gram7-past-tense: 52.9% (826/1560)
2018-10-15 12:55:55,161 : INFO : gram8-plural: 52.7% (523/992)
2018-10-15 12:55:56,388 : INFO : gram9-plural-verbs: 35.7% (165/462)
2018-10-15 12:55:56,390 : INFO : total: 42.7% (514

----------------------------------------------------
starting the processing of 1997
Reading Corpus file and preprocessing time:62.97072958946228 seconds
 printing Top 2 and last sentences For sanity check
0 ['doctor', 'coach', 'pete', 'sampras', 'pleaded', 'guilty', 'yesterday', 'norwalk', 'calif', 'sexually', 'molesting', 'young', 'male', 'patients', 'peter', 'bruce', 'fischer', 'pediatric', 'endocrinologist', 'rolling', 'hills', 'calif', 'pleaded', 'guilty', 'counts', 'molestation', 'anal', 'genital', 'penetration', 'said', 'sandi', 'gibbons', 'spokeswoman', 'los', 'angeles', 'county', 'district', 'attorney', 'office', 'fischer', 'faces', 'years', 'prison', 'scheduled', 'sentencing', 'feb', 'norwalk', 'superior', 'court', 'fischer', 'originally', 'pleaded', 'innocent', 'charges', 'molesting', 'young', 'patients', 'office', 'home', 'pleaded', 'guilty', 'exchange', 'charges', 'dropped', 'alleged', 'victims', 'testified', 'fischer', 'molested', 'time', 'years', 'old', 'months', 'came',

2018-10-15 12:57:00,530 : INFO : collecting all words and their counts
2018-10-15 12:57:00,531 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-10-15 12:57:01,072 : INFO : PROGRESS: at sentence #10000, processed 2967883 words, keeping 87557 word types
2018-10-15 12:57:01,594 : INFO : PROGRESS: at sentence #20000, processed 5836443 words, keeping 116095 word types
2018-10-15 12:57:02,135 : INFO : PROGRESS: at sentence #30000, processed 8799769 words, keeping 139043 word types
2018-10-15 12:57:02,711 : INFO : PROGRESS: at sentence #40000, processed 11936578 words, keeping 160386 word types
2018-10-15 12:57:03,253 : INFO : PROGRESS: at sentence #50000, processed 14830167 words, keeping 175865 word types
2018-10-15 12:57:03,785 : INFO : PROGRESS: at sentence #60000, processed 17697040 words, keeping 190371 word types
2018-10-15 12:57:04,334 : INFO : PROGRESS: at sentence #70000, processed 20611966 words, keeping 203664 word types
2018-10-15 12:57:04,777 : INF

2018-10-15 12:57:44,069 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-10-15 12:57:44,069 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-10-15 12:57:44,070 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-10-15 12:57:44,079 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-10-15 12:57:44,080 : INFO : EPOCH - 2 : training on 22878500 raw words (22310647 effective words) took 18.5s, 1204261 effective words/s
2018-10-15 12:57:45,091 : INFO : EPOCH 3 - PROGRESS: at 5.13% examples, 1149286 words/s, in_qsize 25, out_qsize 0
2018-10-15 12:57:46,098 : INFO : EPOCH 3 - PROGRESS: at 10.33% examples, 1146399 words/s, in_qsize 24, out_qsize 1
2018-10-15 12:57:47,116 : INFO : EPOCH 3 - PROGRESS: at 15.76% examples, 1169329 words/s, in_qsize 23, out_qsize 2
2018-10-15 12:57:48,121 : INFO : EPOCH 3 - PROGRESS: at 21.40% examples, 1176833 words/s, in_qsize 26, out_qsize 1
2018-10-15 12:57:49,140 : INFO : EPO

2018-10-15 12:58:30,475 : INFO : EPOCH 5 - PROGRESS: at 56.16% examples, 1255473 words/s, in_qsize 23, out_qsize 2
2018-10-15 12:58:31,488 : INFO : EPOCH 5 - PROGRESS: at 61.96% examples, 1256215 words/s, in_qsize 26, out_qsize 0
2018-10-15 12:58:32,502 : INFO : EPOCH 5 - PROGRESS: at 67.82% examples, 1257925 words/s, in_qsize 25, out_qsize 0
2018-10-15 12:58:33,506 : INFO : EPOCH 5 - PROGRESS: at 73.49% examples, 1256380 words/s, in_qsize 26, out_qsize 1
2018-10-15 12:58:34,511 : INFO : EPOCH 5 - PROGRESS: at 79.36% examples, 1256658 words/s, in_qsize 24, out_qsize 1
2018-10-15 12:58:35,514 : INFO : EPOCH 5 - PROGRESS: at 85.04% examples, 1256749 words/s, in_qsize 25, out_qsize 0
2018-10-15 12:58:36,529 : INFO : EPOCH 5 - PROGRESS: at 90.74% examples, 1255890 words/s, in_qsize 24, out_qsize 1
2018-10-15 12:58:37,532 : INFO : EPOCH 5 - PROGRESS: at 96.58% examples, 1256682 words/s, in_qsize 26, out_qsize 1
2018-10-15 12:58:38,038 : INFO : worker thread finished; awaiting finish of 12 m

Computing time for training the model:97.57793307304382 seconds
Number of words processed per second: 234463.87189687512
Word2Vec(vocab=87541, size=300, alpha=0.025)
Most frequent words In model:  ['said', 'new', 'year', 'like', 'people', 'years', 'time', 'york', 'percent', 'million']


2018-10-15 12:58:38,897 : INFO : Pearson correlation coefficient against wordsim353.tsv: 0.6036
2018-10-15 12:58:38,898 : INFO : Spearman rank-order correlation coefficient against wordsim353.tsv: 0.6360
2018-10-15 12:58:38,899 : INFO : Pairs with unknown words ratio: 3.7%
2018-10-15 12:58:38,935 : INFO : precomputing L2-norms of word weight vectors


 similarity evaluation: ((0.6035868452888292, 3.998263238471482e-35), SpearmanrResult(correlation=0.6360423282138952, pvalue=6.035164600854575e-40), 3.6827195467422094)


2018-10-15 12:58:40,093 : INFO : capital-common-countries: 56.1% (213/380)
2018-10-15 12:58:42,591 : INFO : capital-world: 52.2% (491/941)
2018-10-15 12:58:42,932 : INFO : currency: 15.6% (20/128)
2018-10-15 12:58:48,215 : INFO : city-in-state: 15.5% (311/2004)
2018-10-15 12:58:49,018 : INFO : family: 72.9% (223/306)
2018-10-15 12:58:51,313 : INFO : gram1-adjective-to-adverb: 9.8% (85/870)
2018-10-15 12:58:52,768 : INFO : gram2-opposite: 13.6% (75/552)
2018-10-15 12:58:56,279 : INFO : gram3-comparative: 66.2% (882/1332)
2018-10-15 12:58:58,422 : INFO : gram4-superlative: 31.9% (259/812)
2018-10-15 12:59:00,277 : INFO : gram5-present-participle: 37.5% (263/702)
2018-10-15 12:59:03,330 : INFO : gram6-nationality-adjective: 74.6% (866/1161)
2018-10-15 12:59:07,433 : INFO : gram7-past-tense: 50.1% (781/1560)
2018-10-15 12:59:10,048 : INFO : gram8-plural: 50.4% (500/992)
2018-10-15 12:59:11,383 : INFO : gram9-plural-verbs: 36.0% (182/506)
2018-10-15 12:59:11,384 : INFO : total: 42.1% (5151/

----------------------------------------------------
starting the processing of 1996
Reading Corpus file and preprocessing time:58.533310890197754 seconds
 printing Top 2 and last sentences For sanity check
0 ['sept', 'martin', 'luther', 'king', 'blumstein', 'department', 'store', 'street', 'signing', 'copies', 'new', 'book', 'stride', 'freedom', 'woman', 'lunged', 'stabbed', 'chest', 'letter', 'opener', 'rushed', 'harlem', 'hospital', 'surgeon', 'named', 'aubre', 'maynard', 'saved', 'life', 'interview', 'maynard', 'lives', 'lower', 'east', 'recalled', 'day', 'finished', 'rounds', 'morning', 'left', 'noon', 'downtown', 'hospital', 'private', 'patients', 'way', 'stopped', 'movie', 'tired', 'wanted', 'relax', 'saw', 'interested', 'bridget', 'bardot', 'parisienne', 'left', 'right', 'movie', 'ended', 'went', 'hospital', 'soon', 'got', 'rushed', 'told', 'wanted', 'badly', 'harlem', 'hospital', 'told', 'somebody', 'great', 'importance', 'injured', 'want', 'went', 'car', 'uptown', 'finally', 

2018-10-15 13:00:11,009 : INFO : collecting all words and their counts
2018-10-15 13:00:11,010 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-10-15 13:00:11,498 : INFO : PROGRESS: at sentence #10000, processed 2887671 words, keeping 87163 word types
2018-10-15 13:00:12,005 : INFO : PROGRESS: at sentence #20000, processed 5816878 words, keeping 117439 word types
2018-10-15 13:00:12,515 : INFO : PROGRESS: at sentence #30000, processed 8720476 words, keeping 139357 word types
2018-10-15 13:00:13,030 : INFO : PROGRESS: at sentence #40000, processed 11667892 words, keeping 158605 word types
2018-10-15 13:00:13,575 : INFO : PROGRESS: at sentence #50000, processed 14634081 words, keeping 175695 word types
2018-10-15 13:00:14,105 : INFO : PROGRESS: at sentence #60000, processed 17589751 words, keeping 190363 word types
2018-10-15 13:00:14,646 : INFO : PROGRESS: at sentence #70000, processed 20533488 words, keeping 204613 word types
2018-10-15 13:00:14,936 : INF

2018-10-15 13:00:51,558 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-10-15 13:00:51,563 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-10-15 13:00:51,564 : INFO : EPOCH - 2 : training on 22122888 raw words (21572594 effective words) took 17.3s, 1250446 effective words/s
2018-10-15 13:00:52,575 : INFO : EPOCH 3 - PROGRESS: at 5.54% examples, 1161588 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:00:53,578 : INFO : EPOCH 3 - PROGRESS: at 11.37% examples, 1190444 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:00:54,588 : INFO : EPOCH 3 - PROGRESS: at 17.06% examples, 1204506 words/s, in_qsize 26, out_qsize 1
2018-10-15 13:00:55,589 : INFO : EPOCH 3 - PROGRESS: at 22.72% examples, 1206695 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:00:56,590 : INFO : EPOCH 3 - PROGRESS: at 28.03% examples, 1196707 words/s, in_qsize 24, out_qsize 1
2018-10-15 13:00:57,600 : INFO : EPOCH 3 - PROGRESS: at 33.90% examples, 1203355 words/s, in_qsize 25, 

2018-10-15 13:01:39,859 : INFO : EPOCH 5 - PROGRESS: at 75.15% examples, 1233474 words/s, in_qsize 23, out_qsize 2
2018-10-15 13:01:40,866 : INFO : EPOCH 5 - PROGRESS: at 80.72% examples, 1233293 words/s, in_qsize 24, out_qsize 1
2018-10-15 13:01:41,882 : INFO : EPOCH 5 - PROGRESS: at 86.50% examples, 1233847 words/s, in_qsize 23, out_qsize 2
2018-10-15 13:01:42,884 : INFO : EPOCH 5 - PROGRESS: at 92.33% examples, 1234394 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:01:43,892 : INFO : EPOCH 5 - PROGRESS: at 97.93% examples, 1233675 words/s, in_qsize 23, out_qsize 2
2018-10-15 13:01:44,138 : INFO : worker thread finished; awaiting finish of 12 more threads
2018-10-15 13:01:44,146 : INFO : worker thread finished; awaiting finish of 11 more threads
2018-10-15 13:01:44,148 : INFO : worker thread finished; awaiting finish of 10 more threads
2018-10-15 13:01:44,155 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-10-15 13:01:44,168 : INFO : worker thread finished; a

Computing time for training the model:93.19513177871704 seconds
Number of words processed per second: 237382.44238474482
Word2Vec(vocab=85781, size=300, alpha=0.025)
Most frequent words In model:  ['said', 'new', 'year', 'like', 'people', 'years', 'time', 'york', 'million', 'percent']


2018-10-15 13:01:44,960 : INFO : Pearson correlation coefficient against wordsim353.tsv: 0.5970
2018-10-15 13:01:44,961 : INFO : Spearman rank-order correlation coefficient against wordsim353.tsv: 0.6219
2018-10-15 13:01:44,961 : INFO : Pairs with unknown words ratio: 4.8%
2018-10-15 13:01:44,996 : INFO : precomputing L2-norms of word weight vectors


 similarity evaluation: ((0.5970155813458635, 7.892593145775794e-34), SpearmanrResult(correlation=0.6219487418197314, pvalue=2.3503171431221636e-37), 4.815864022662889)


2018-10-15 13:01:46,148 : INFO : capital-common-countries: 60.3% (229/380)
2018-10-15 13:01:48,564 : INFO : capital-world: 50.1% (457/912)
2018-10-15 13:01:48,848 : INFO : currency: 7.5% (8/106)
2018-10-15 13:01:54,298 : INFO : city-in-state: 18.5% (384/2072)
2018-10-15 13:01:55,200 : INFO : family: 77.2% (264/342)
2018-10-15 13:01:57,342 : INFO : gram1-adjective-to-adverb: 9.4% (76/812)
2018-10-15 13:01:58,925 : INFO : gram2-opposite: 12.2% (73/600)
2018-10-15 13:02:02,438 : INFO : gram3-comparative: 63.4% (844/1332)
2018-10-15 13:02:04,292 : INFO : gram4-superlative: 35.2% (247/702)
2018-10-15 13:02:06,144 : INFO : gram5-present-participle: 42.5% (298/702)
2018-10-15 13:02:09,194 : INFO : gram6-nationality-adjective: 71.1% (825/1161)
2018-10-15 13:02:13,295 : INFO : gram7-past-tense: 52.2% (815/1560)
2018-10-15 13:02:16,070 : INFO : gram8-plural: 46.8% (494/1056)
2018-10-15 13:02:17,403 : INFO : gram9-plural-verbs: 35.2% (178/506)
2018-10-15 13:02:17,405 : INFO : total: 42.4% (5192/1

----------------------------------------------------
starting the processing of 1995
Reading Corpus file and preprocessing time:56.76971101760864 seconds
 printing Top 2 and last sentences For sanity check
0 ['gunmen', 'shot', 'killed', 'reporter', 'algeria', 'state', 'television', 'station', 'journalist', 'die', 'week', 'muslim', 'rebellion', 'began', 'years', 'ago', 'mourad', 'hamazia', 'killed', 'saturday', 'night', 'got', 'car', 'near', 'home', 'algiers', 'suburb', 'baraki', 'government', 'official', 'said', 'today', 'speaking', 'condition', 'anonymity', 'group', 'immediately', 'claimed', 'responsibility', 'official', 'blamed', 'killing', 'group', 'terrorists', 'term', 'government', 'muslim', 'militants', 'members', 'armed', 'islamic', 'group', 'journalists', 'intellectuals', 'teachers', 'public', 'officials', 'targets', 'fight', 'topple', 'algeria', 'military', 'backed', 'government', 'algerian', 'communication', 'minister', 'lamine', 'bechichi', 'said', 'recently', 'protecting', 

2018-10-15 13:03:15,250 : INFO : collecting all words and their counts
2018-10-15 13:03:15,250 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-10-15 13:03:15,709 : INFO : PROGRESS: at sentence #10000, processed 2765695 words, keeping 84935 word types
2018-10-15 13:03:16,176 : INFO : PROGRESS: at sentence #20000, processed 5473088 words, keeping 112890 word types
2018-10-15 13:03:16,648 : INFO : PROGRESS: at sentence #30000, processed 8186600 words, keeping 133403 word types
2018-10-15 13:03:17,104 : INFO : PROGRESS: at sentence #40000, processed 10804816 words, keeping 150650 word types
2018-10-15 13:03:17,543 : INFO : PROGRESS: at sentence #50000, processed 13365456 words, keeping 164573 word types
2018-10-15 13:03:18,019 : INFO : PROGRESS: at sentence #60000, processed 15999562 words, keeping 178243 word types
2018-10-15 13:03:18,507 : INFO : PROGRESS: at sentence #70000, processed 18734639 words, keeping 191547 word types
2018-10-15 13:03:18,897 : INF

2018-10-15 13:03:55,019 : INFO : EPOCH 3 - PROGRESS: at 11.83% examples, 1236219 words/s, in_qsize 24, out_qsize 1
2018-10-15 13:03:56,031 : INFO : EPOCH 3 - PROGRESS: at 17.95% examples, 1246958 words/s, in_qsize 25, out_qsize 1
2018-10-15 13:03:57,046 : INFO : EPOCH 3 - PROGRESS: at 24.24% examples, 1255717 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:03:58,058 : INFO : EPOCH 3 - PROGRESS: at 30.45% examples, 1252314 words/s, in_qsize 24, out_qsize 1
2018-10-15 13:03:59,075 : INFO : EPOCH 3 - PROGRESS: at 36.67% examples, 1255040 words/s, in_qsize 26, out_qsize 1
2018-10-15 13:04:00,084 : INFO : EPOCH 3 - PROGRESS: at 42.89% examples, 1258338 words/s, in_qsize 24, out_qsize 1
2018-10-15 13:04:01,096 : INFO : EPOCH 3 - PROGRESS: at 49.33% examples, 1257404 words/s, in_qsize 24, out_qsize 1
2018-10-15 13:04:02,106 : INFO : EPOCH 3 - PROGRESS: at 56.00% examples, 1260335 words/s, in_qsize 26, out_qsize 0
2018-10-15 13:04:03,118 : INFO : EPOCH 3 - PROGRESS: at 62.66% examples, 1263222

2018-10-15 13:04:41,288 : INFO : worker thread finished; awaiting finish of 8 more threads
2018-10-15 13:04:41,290 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-10-15 13:04:41,291 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-10-15 13:04:41,303 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-10-15 13:04:41,307 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-10-15 13:04:41,311 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-10-15 13:04:41,321 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-10-15 13:04:41,327 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-10-15 13:04:41,330 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-10-15 13:04:41,331 : INFO : EPOCH - 5 : training on 20899132 raw words (20377982 effective words) took 16.2s, 1254989 effective words/s
2018-10-15 13:04:41,331 : INFO : trainin

Computing time for training the model:86.08268523216248 seconds
Number of words processed per second: 242779.7407066898
Word2Vec(vocab=82794, size=300, alpha=0.025)
Most frequent words In model:  ['said', 'new', 'year', 'like', 'people', 'years', 'time', 'york', 'city', 'million']
 similarity evaluation: ((0.5825725739786306, 2.7542191802806346e-32), SpearmanrResult(correlation=0.6128962232276682, pvalue=1.8879630913256664e-36), 3.6827195467422094)


2018-10-15 13:04:42,107 : INFO : precomputing L2-norms of word weight vectors
2018-10-15 13:04:43,257 : INFO : capital-common-countries: 60.5% (230/380)
2018-10-15 13:04:45,306 : INFO : capital-world: 54.1% (418/772)
2018-10-15 13:04:45,649 : INFO : currency: 10.9% (14/128)
2018-10-15 13:04:50,955 : INFO : city-in-state: 20.2% (407/2014)
2018-10-15 13:04:51,858 : INFO : family: 66.7% (228/342)
2018-10-15 13:04:54,154 : INFO : gram1-adjective-to-adverb: 9.8% (85/870)
2018-10-15 13:04:55,737 : INFO : gram2-opposite: 16.5% (99/600)
2018-10-15 13:04:59,282 : INFO : gram3-comparative: 63.4% (844/1332)
2018-10-15 13:05:01,428 : INFO : gram4-superlative: 28.8% (234/812)
2018-10-15 13:05:03,148 : INFO : gram5-present-participle: 40.3% (262/650)
2018-10-15 13:05:06,386 : INFO : gram6-nationality-adjective: 69.3% (852/1229)
2018-10-15 13:05:10,513 : INFO : gram7-past-tense: 49.6% (773/1560)
2018-10-15 13:05:13,128 : INFO : gram8-plural: 50.4% (500/992)
2018-10-15 13:05:14,241 : INFO : gram9-plur

----------------------------------------------------
starting the processing of 1994
Reading Corpus file and preprocessing time:57.10536026954651 seconds
 printing Top 2 and last sentences For sanity check
0 ['capital', 'homeland', 'holdout', 'flipped', 'intently', 'file', 'mangosuthu', 'buthelezi', 'chief', 'minister', 'zulu', 'state', 'kwazulu', 'agreed', 'interview', 'questions', 'air', 'today', 'momentous', 'threats', 'civil', 'war', 'carnage', 'downtown', 'johannesburg', 'allegations', 'lieutenants', 'complicity', 'police', 'hit', 'squads', 'boycott', 'election', 'existence', 'domain', 'scheduled', 'disappear', 'apartheid', 'homelands', 'new', 'constitution', 'takes', 'effect', 'april', 'moment', 'things', 'interested', 'buthelezi', 'file', 'held', 'assortment', 'interviewer', 'writings', 'buthelezi', 'scouring', 'insults', 'remember', 'petulant', 'described', 'said', 'temperamental', 'terms', 'course', 'minutes', 'buthelezi', 'answered', 'deflected', 'questions', 'kept', 'returni

2018-10-15 13:06:12,375 : INFO : collecting all words and their counts
2018-10-15 13:06:12,375 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-10-15 13:06:12,847 : INFO : PROGRESS: at sentence #10000, processed 2800237 words, keeping 84399 word types
2018-10-15 13:06:13,338 : INFO : PROGRESS: at sentence #20000, processed 5679793 words, keeping 112880 word types
2018-10-15 13:06:13,856 : INFO : PROGRESS: at sentence #30000, processed 8577656 words, keeping 145224 word types
2018-10-15 13:06:14,327 : INFO : PROGRESS: at sentence #40000, processed 11308855 words, keeping 162273 word types
2018-10-15 13:06:14,827 : INFO : PROGRESS: at sentence #50000, processed 14144248 words, keeping 178564 word types
2018-10-15 13:06:15,321 : INFO : PROGRESS: at sentence #60000, processed 16974342 words, keeping 192816 word types
2018-10-15 13:06:15,833 : INFO : PROGRESS: at sentence #70000, processed 19824161 words, keeping 206405 word types
2018-10-15 13:06:16,079 : INF

2018-10-15 13:06:51,828 : INFO : EPOCH 3 - PROGRESS: at 5.79% examples, 1168087 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:06:52,829 : INFO : EPOCH 3 - PROGRESS: at 12.06% examples, 1221907 words/s, in_qsize 24, out_qsize 1
2018-10-15 13:06:53,840 : INFO : EPOCH 3 - PROGRESS: at 18.21% examples, 1240918 words/s, in_qsize 24, out_qsize 1
2018-10-15 13:06:54,849 : INFO : EPOCH 3 - PROGRESS: at 24.22% examples, 1248142 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:06:55,861 : INFO : EPOCH 3 - PROGRESS: at 30.47% examples, 1250839 words/s, in_qsize 25, out_qsize 2
2018-10-15 13:06:56,874 : INFO : EPOCH 3 - PROGRESS: at 36.52% examples, 1253953 words/s, in_qsize 26, out_qsize 1
2018-10-15 13:06:57,875 : INFO : EPOCH 3 - PROGRESS: at 42.70% examples, 1253422 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:06:58,884 : INFO : EPOCH 3 - PROGRESS: at 49.29% examples, 1255470 words/s, in_qsize 26, out_qsize 1
2018-10-15 13:06:59,909 : INFO : EPOCH 3 - PROGRESS: at 55.45% examples, 1256705 

2018-10-15 13:07:39,661 : INFO : worker thread finished; awaiting finish of 11 more threads
2018-10-15 13:07:39,663 : INFO : worker thread finished; awaiting finish of 10 more threads
2018-10-15 13:07:39,668 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-10-15 13:07:39,671 : INFO : worker thread finished; awaiting finish of 8 more threads
2018-10-15 13:07:39,673 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-10-15 13:07:39,679 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-10-15 13:07:39,692 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-10-15 13:07:39,694 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-10-15 13:07:39,696 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-10-15 13:07:39,701 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-10-15 13:07:39,714 : INFO : worker thread finished; awaiting finish of 1 more threa

Computing time for training the model:87.34233093261719 seconds
Number of words processed per second: 242211.6833167746
Word2Vec(vocab=83508, size=300, alpha=0.025)
Most frequent words In model:  ['said', 'new', 'year', 'like', 'people', 'years', 'york', 'time', 'percent', 'city']
 similarity evaluation: ((0.5892583229400608, 8.560604941084824e-33), SpearmanrResult(correlation=0.6100215083185451, pvalue=1.2500342563435321e-35), 4.815864022662889)


2018-10-15 13:07:41,092 : INFO : capital-common-countries: 60.5% (207/342)
2018-10-15 13:07:43,594 : INFO : capital-world: 50.4% (475/942)
2018-10-15 13:07:43,938 : INFO : currency: 11.7% (15/128)
2018-10-15 13:07:49,426 : INFO : city-in-state: 19.8% (412/2076)
2018-10-15 13:07:50,235 : INFO : family: 72.5% (222/306)
2018-10-15 13:07:52,539 : INFO : gram1-adjective-to-adverb: 11.4% (99/870)
2018-10-15 13:07:54,125 : INFO : gram2-opposite: 15.2% (91/600)
2018-10-15 13:07:57,634 : INFO : gram3-comparative: 64.6% (860/1332)
2018-10-15 13:07:59,776 : INFO : gram4-superlative: 32.1% (261/812)
2018-10-15 13:08:01,636 : INFO : gram5-present-participle: 40.6% (285/702)
2018-10-15 13:08:04,699 : INFO : gram6-nationality-adjective: 76.7% (890/1161)
2018-10-15 13:08:08,816 : INFO : gram7-past-tense: 54.3% (847/1560)
2018-10-15 13:08:11,434 : INFO : gram8-plural: 49.5% (491/992)
2018-10-15 13:08:12,891 : INFO : gram9-plural-verbs: 35.3% (195/552)
2018-10-15 13:08:12,893 : INFO : total: 43.2% (5350

----------------------------------------------------
starting the processing of 1993
Reading Corpus file and preprocessing time:56.65894937515259 seconds
 printing Top 2 and last sentences For sanity check
0 ['yankees', 'survivors', 'yesterday', 'overcoming', 'twists', 'turns', 'day', 'injury', 'plagued', 'season', 'cast', 'errors', 'got', 'erratic', 'outing', 'bob', 'wickman', 'lineup', 'laden', 'players', 'season', 'began', 'expected', 'spend', 'time', 'chomping', 'sunflower', 'seeds', 'dugout', 'flinch', 'chicago', 'jack', 'mcdowell', 'tried', 'game', 'winner', 'american', 'league', 'yanks', 'ignored', 'errors', 'led', 'unearned', 'run', 'thrived', 'wickman', 'relievers', 'followed', 'shut', 'white', 'sox', 'sixth', 'inning', 'thrilled', 'new', 'prime', 'time', 'players', 'robust', 'bats', 'powered', 'yankees', 'victory', 'game', 'sweep', 'white', 'sox', 'yankee', 'stadium', 'surprising', 'yankees', 'displayed', 'survival', 'techniques', 'afternoon', 'wickman', 'ultimate', 'survivor

2018-10-15 13:09:10,559 : INFO : collecting all words and their counts
2018-10-15 13:09:10,560 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-10-15 13:09:11,030 : INFO : PROGRESS: at sentence #10000, processed 2798986 words, keeping 82630 word types
2018-10-15 13:09:11,516 : INFO : PROGRESS: at sentence #20000, processed 5608141 words, keeping 111005 word types
2018-10-15 13:09:12,019 : INFO : PROGRESS: at sentence #30000, processed 8433495 words, keeping 134228 word types
2018-10-15 13:09:12,521 : INFO : PROGRESS: at sentence #40000, processed 11282369 words, keeping 153658 word types
2018-10-15 13:09:13,022 : INFO : PROGRESS: at sentence #50000, processed 14074180 words, keeping 169705 word types
2018-10-15 13:09:13,525 : INFO : PROGRESS: at sentence #60000, processed 16861089 words, keeping 183753 word types
2018-10-15 13:09:14,044 : INFO : PROGRESS: at sentence #70000, processed 19715844 words, keeping 197155 word types
2018-10-15 13:09:14,332 : INF

2018-10-15 13:09:50,333 : INFO : EPOCH 3 - PROGRESS: at 5.78% examples, 1165514 words/s, in_qsize 24, out_qsize 1
2018-10-15 13:09:51,344 : INFO : EPOCH 3 - PROGRESS: at 11.93% examples, 1220218 words/s, in_qsize 26, out_qsize 2
2018-10-15 13:09:52,346 : INFO : EPOCH 3 - PROGRESS: at 18.15% examples, 1234509 words/s, in_qsize 24, out_qsize 1
2018-10-15 13:09:53,360 : INFO : EPOCH 3 - PROGRESS: at 24.37% examples, 1242240 words/s, in_qsize 26, out_qsize 0
2018-10-15 13:09:54,372 : INFO : EPOCH 3 - PROGRESS: at 30.25% examples, 1243580 words/s, in_qsize 23, out_qsize 2
2018-10-15 13:09:55,380 : INFO : EPOCH 3 - PROGRESS: at 36.37% examples, 1249548 words/s, in_qsize 24, out_qsize 1
2018-10-15 13:09:56,395 : INFO : EPOCH 3 - PROGRESS: at 42.59% examples, 1250888 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:09:57,399 : INFO : EPOCH 3 - PROGRESS: at 48.42% examples, 1248338 words/s, in_qsize 26, out_qsize 0
2018-10-15 13:09:58,411 : INFO : EPOCH 3 - PROGRESS: at 54.53% examples, 1246530 

2018-10-15 13:10:39,283 : INFO : worker thread finished; awaiting finish of 11 more threads
2018-10-15 13:10:39,286 : INFO : worker thread finished; awaiting finish of 10 more threads
2018-10-15 13:10:39,289 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-10-15 13:10:39,290 : INFO : worker thread finished; awaiting finish of 8 more threads
2018-10-15 13:10:39,302 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-10-15 13:10:39,315 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-10-15 13:10:39,318 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-10-15 13:10:39,319 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-10-15 13:10:39,324 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-10-15 13:10:39,332 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-10-15 13:10:39,335 : INFO : worker thread finished; awaiting finish of 1 more threa

Computing time for training the model:88.7825038433075 seconds
Number of words processed per second: 240029.78151653468
Word2Vec(vocab=82996, size=300, alpha=0.025)
Most frequent words In model:  ['said', 'new', 'year', 'like', 'people', 'york', 'years', 'time', 'president', 'percent']


2018-10-15 13:10:40,065 : INFO : Pearson correlation coefficient against wordsim353.tsv: 0.5758
2018-10-15 13:10:40,066 : INFO : Spearman rank-order correlation coefficient against wordsim353.tsv: 0.6085
2018-10-15 13:10:40,066 : INFO : Pairs with unknown words ratio: 4.0%
2018-10-15 13:10:40,101 : INFO : precomputing L2-norms of word weight vectors


 similarity evaluation: ((0.5758307101459837, 2.4870379538634234e-31), SpearmanrResult(correlation=0.6085194955877656, pvalue=1.0136108503538948e-35), 3.9660056657223794)


2018-10-15 13:10:41,253 : INFO : capital-common-countries: 70.5% (268/380)
2018-10-15 13:10:44,023 : INFO : capital-world: 57.9% (604/1044)
2018-10-15 13:10:44,428 : INFO : currency: 11.8% (18/152)
2018-10-15 13:10:49,724 : INFO : city-in-state: 16.7% (334/2006)
2018-10-15 13:10:50,626 : INFO : family: 79.5% (272/342)
2018-10-15 13:10:52,927 : INFO : gram1-adjective-to-adverb: 9.9% (86/870)
2018-10-15 13:10:54,510 : INFO : gram2-opposite: 14.7% (88/600)
2018-10-15 13:10:58,027 : INFO : gram3-comparative: 62.0% (826/1332)
2018-10-15 13:11:00,029 : INFO : gram4-superlative: 31.7% (240/756)
2018-10-15 13:11:01,892 : INFO : gram5-present-participle: 40.7% (286/702)
2018-10-15 13:11:04,777 : INFO : gram6-nationality-adjective: 66.3% (725/1094)
2018-10-15 13:11:08,907 : INFO : gram7-past-tense: 49.6% (774/1560)
2018-10-15 13:11:11,699 : INFO : gram8-plural: 50.7% (535/1056)
2018-10-15 13:11:12,923 : INFO : gram9-plural-verbs: 35.7% (165/462)
2018-10-15 13:11:12,924 : INFO : total: 42.3% (522

----------------------------------------------------
starting the processing of 1992
Reading Corpus file and preprocessing time:58.943642139434814 seconds
 printing Top 2 and last sentences For sanity check
0 ['women', 'run', 'countries', 'domineering', 'spiteful', 'mothering', 'skills', 'enter', 'formula', 'women', 'leadership', 'roles', 'women', 'ruled', 'world', 'documentaries', 'mothers', 'charge', 'countries', 'mary', 'scott', 'port', 'jefferson', 'starting', 'examine', 'questions', 'film', 'chief', 'wilma', 'mankiller', 'pilot', 'series', 'minute', 'examination', 'elected', 'female', 'chief', 'cherokee', 'nation', 'oklahoma', 'filmed', 'stilwell', 'okla', 'amazing', 'days', 'mrs', 'scott', 'recalled', 'treated', 'kindly', 'respectfully', 'folks', 'new', 'york', 'learned', 'lot', 'oklahoma', 'time', 'added', 'tell', 'yes', 'comes', 'time', 'important', 'mrs', 'scott', 'said', 'crew', 'year', 'old', 'chief', 'mankiller', 'amazing', 'story', 'envisioned', 'story', 'unfolded', 'shoot

2018-10-15 13:12:12,862 : INFO : collecting all words and their counts
2018-10-15 13:12:12,862 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-10-15 13:12:13,322 : INFO : PROGRESS: at sentence #10000, processed 2788819 words, keeping 82624 word types
2018-10-15 13:12:13,795 : INFO : PROGRESS: at sentence #20000, processed 5600611 words, keeping 111731 word types
2018-10-15 13:12:14,286 : INFO : PROGRESS: at sentence #30000, processed 8426035 words, keeping 133993 word types
2018-10-15 13:12:14,784 : INFO : PROGRESS: at sentence #40000, processed 11301492 words, keeping 153360 word types
2018-10-15 13:12:15,275 : INFO : PROGRESS: at sentence #50000, processed 14057787 words, keeping 169422 word types
2018-10-15 13:12:15,786 : INFO : PROGRESS: at sentence #60000, processed 16873881 words, keeping 183857 word types
2018-10-15 13:12:16,288 : INFO : PROGRESS: at sentence #70000, processed 19686603 words, keeping 197752 word types
2018-10-15 13:12:16,693 : INF

2018-10-15 13:12:53,663 : INFO : EPOCH 3 - PROGRESS: at 5.49% examples, 1187150 words/s, in_qsize 26, out_qsize 1
2018-10-15 13:12:54,663 : INFO : EPOCH 3 - PROGRESS: at 11.58% examples, 1230785 words/s, in_qsize 24, out_qsize 1
2018-10-15 13:12:55,670 : INFO : EPOCH 3 - PROGRESS: at 17.52% examples, 1238547 words/s, in_qsize 22, out_qsize 3
2018-10-15 13:12:56,671 : INFO : EPOCH 3 - PROGRESS: at 23.30% examples, 1240730 words/s, in_qsize 22, out_qsize 3
2018-10-15 13:12:57,694 : INFO : EPOCH 3 - PROGRESS: at 29.38% examples, 1247557 words/s, in_qsize 25, out_qsize 3
2018-10-15 13:12:58,697 : INFO : EPOCH 3 - PROGRESS: at 35.05% examples, 1247281 words/s, in_qsize 23, out_qsize 2
2018-10-15 13:12:59,704 : INFO : EPOCH 3 - PROGRESS: at 41.36% examples, 1253115 words/s, in_qsize 24, out_qsize 1
2018-10-15 13:13:00,714 : INFO : EPOCH 3 - PROGRESS: at 47.21% examples, 1252821 words/s, in_qsize 25, out_qsize 2
2018-10-15 13:13:01,726 : INFO : EPOCH 3 - PROGRESS: at 52.93% examples, 1256044 

2018-10-15 13:13:43,565 : INFO : worker thread finished; awaiting finish of 11 more threads
2018-10-15 13:13:43,573 : INFO : worker thread finished; awaiting finish of 10 more threads
2018-10-15 13:13:43,583 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-10-15 13:13:43,584 : INFO : worker thread finished; awaiting finish of 8 more threads
2018-10-15 13:13:43,590 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-10-15 13:13:43,602 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-10-15 13:13:43,603 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-10-15 13:13:43,604 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-10-15 13:13:43,608 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-10-15 13:13:43,612 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-10-15 13:13:43,616 : INFO : worker thread finished; awaiting finish of 1 more threa

Computing time for training the model:90.76216435432434 seconds
Number of words processed per second: 241921.5116365154
Word2Vec(vocab=83701, size=300, alpha=0.025)
Most frequent words In model:  ['said', 'new', 'year', 'years', 'people', 'like', 'york', 'percent', 'time', 'president']
 similarity evaluation: ((0.5823743803431207, 5.46459921239915e-32), SpearmanrResult(correlation=0.5983813737450715, pvalue=4.122140027063514e-34), 4.53257790368272)


2018-10-15 13:13:45,652 : INFO : capital-common-countries: 71.2% (299/420)
2018-10-15 13:13:49,184 : INFO : capital-world: 53.3% (704/1322)
2018-10-15 13:13:49,527 : INFO : currency: 7.8% (10/128)
2018-10-15 13:13:54,849 : INFO : city-in-state: 24.9% (503/2018)
2018-10-15 13:13:55,658 : INFO : family: 85.3% (261/306)
2018-10-15 13:13:57,953 : INFO : gram1-adjective-to-adverb: 13.1% (114/870)
2018-10-15 13:13:59,671 : INFO : gram2-opposite: 17.1% (111/650)
2018-10-15 13:14:03,182 : INFO : gram3-comparative: 63.7% (849/1332)
2018-10-15 13:14:05,033 : INFO : gram4-superlative: 28.6% (201/702)
2018-10-15 13:14:06,887 : INFO : gram5-present-participle: 37.0% (260/702)
2018-10-15 13:14:10,310 : INFO : gram6-nationality-adjective: 70.4% (915/1299)
2018-10-15 13:14:14,417 : INFO : gram7-past-tense: 51.6% (805/1560)
2018-10-15 13:14:17,205 : INFO : gram8-plural: 50.8% (536/1056)
2018-10-15 13:14:18,427 : INFO : gram9-plural-verbs: 37.0% (171/462)
2018-10-15 13:14:18,429 : INFO : total: 44.7% (5

----------------------------------------------------
starting the processing of 1991
Reading Corpus file and preprocessing time:57.929203510284424 seconds
 printing Top 2 and last sentences For sanity check
0 ['mitsubishi', 'international', 'corporation', 'suing', 'telephone', 'broken', 'hackers', 'thousands', 'illegal', 'calls', 'pakistan', 'egypt', 'places', 'lawsuit', 'filed', 'late', 'friday', 'federal', 'district', 'court', 'manhattan', 'contends', 'american', 'telephone', 'telegraph', 'company', 'failed', 'provide', 'secure', 'failed', 'warn', 'mitsubishi', 'potential', 'unauthorized', 'use', 'mitsubishi', 'seeking', 'million', 'punitive', 'damages', 'dismissal', 'billed', 'phone', 'calls', 'attributed', 'unauthorized', 'users', 'mistubishi', 'andrew', 'myers', 'telephone', 'company', 'spokesman', 'declined', 'comment', 'suit', 'yesterday', 'said', 'federal', 'communications', 'law', 'customers', 'clearly', 'responsible', 'authorized', 'unauthorized', 'service', 'called', 'privat

2018-10-15 13:15:17,358 : INFO : collecting all words and their counts
2018-10-15 13:15:17,359 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-10-15 13:15:17,825 : INFO : PROGRESS: at sentence #10000, processed 2791369 words, keeping 86181 word types
2018-10-15 13:15:18,299 : INFO : PROGRESS: at sentence #20000, processed 5471179 words, keeping 115283 word types
2018-10-15 13:15:18,758 : INFO : PROGRESS: at sentence #30000, processed 8097915 words, keeping 136887 word types
2018-10-15 13:15:19,247 : INFO : PROGRESS: at sentence #40000, processed 10901771 words, keeping 155952 word types
2018-10-15 13:15:19,739 : INFO : PROGRESS: at sentence #50000, processed 13696320 words, keeping 172429 word types
2018-10-15 13:15:20,237 : INFO : PROGRESS: at sentence #60000, processed 16489885 words, keeping 187692 word types
2018-10-15 13:15:20,722 : INFO : PROGRESS: at sentence #70000, processed 19239591 words, keeping 201543 word types
2018-10-15 13:15:21,141 : INF

2018-10-15 13:15:57,401 : INFO : EPOCH 3 - PROGRESS: at 5.90% examples, 1210425 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:15:58,408 : INFO : EPOCH 3 - PROGRESS: at 11.67% examples, 1240883 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:15:59,410 : INFO : EPOCH 3 - PROGRESS: at 17.76% examples, 1254724 words/s, in_qsize 23, out_qsize 2
2018-10-15 13:16:00,432 : INFO : EPOCH 3 - PROGRESS: at 24.18% examples, 1255773 words/s, in_qsize 26, out_qsize 3
2018-10-15 13:16:01,448 : INFO : EPOCH 3 - PROGRESS: at 30.76% examples, 1263212 words/s, in_qsize 24, out_qsize 1
2018-10-15 13:16:02,462 : INFO : EPOCH 3 - PROGRESS: at 37.04% examples, 1264624 words/s, in_qsize 24, out_qsize 1
2018-10-15 13:16:03,465 : INFO : EPOCH 3 - PROGRESS: at 43.08% examples, 1264222 words/s, in_qsize 23, out_qsize 2
2018-10-15 13:16:04,487 : INFO : EPOCH 3 - PROGRESS: at 49.25% examples, 1267007 words/s, in_qsize 24, out_qsize 1
2018-10-15 13:16:05,487 : INFO : EPOCH 3 - PROGRESS: at 54.88% examples, 1267002 

2018-10-15 13:16:45,878 : INFO : worker thread finished; awaiting finish of 11 more threads
2018-10-15 13:16:45,884 : INFO : worker thread finished; awaiting finish of 10 more threads
2018-10-15 13:16:45,886 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-10-15 13:16:45,886 : INFO : worker thread finished; awaiting finish of 8 more threads
2018-10-15 13:16:45,889 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-10-15 13:16:45,890 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-10-15 13:16:45,899 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-10-15 13:16:45,913 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-10-15 13:16:45,914 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-10-15 13:16:45,922 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-10-15 13:16:45,927 : INFO : worker thread finished; awaiting finish of 1 more threa

Computing time for training the model:88.57302165031433 seconds
Number of words processed per second: 243730.56939650414
Word2Vec(vocab=83258, size=300, alpha=0.025)
Most frequent words In model:  ['said', 'new', 'year', 'years', 'people', 'york', 'percent', 'like', 'time', 'million']
 similarity evaluation: ((0.5971098449995156, 7.664184449991872e-34), SpearmanrResult(correlation=0.6201458320610114, pvalue=4.3327249281799506e-37), 4.815864022662889)


2018-10-15 13:16:47,862 : INFO : capital-common-countries: 53.4% (203/380)
2018-10-15 13:16:51,040 : INFO : capital-world: 49.4% (591/1197)
2018-10-15 13:16:51,332 : INFO : currency: 5.6% (6/108)
2018-10-15 13:16:57,016 : INFO : city-in-state: 25.2% (540/2142)
2018-10-15 13:16:57,919 : INFO : family: 76.0% (260/342)
2018-10-15 13:17:00,225 : INFO : gram1-adjective-to-adverb: 12.9% (112/870)
2018-10-15 13:17:01,815 : INFO : gram2-opposite: 17.3% (104/600)
2018-10-15 13:17:05,329 : INFO : gram3-comparative: 61.9% (825/1332)
2018-10-15 13:17:07,327 : INFO : gram4-superlative: 27.1% (205/756)
2018-10-15 13:17:09,048 : INFO : gram5-present-participle: 40.5% (263/650)
2018-10-15 13:17:11,938 : INFO : gram6-nationality-adjective: 73.9% (808/1094)
2018-10-15 13:17:16,058 : INFO : gram7-past-tense: 49.5% (772/1560)
2018-10-15 13:17:18,845 : INFO : gram8-plural: 50.5% (533/1056)
2018-10-15 13:17:20,069 : INFO : gram9-plural-verbs: 39.2% (181/462)
2018-10-15 13:17:20,071 : INFO : total: 43.1% (54

----------------------------------------------------
starting the processing of 1990
Reading Corpus file and preprocessing time:61.07842206954956 seconds
 printing Top 2 and last sentences For sanity check
0 ['sir', 'edmund', 'hillary', 'climbed', 'mount', 'everest', 'said', 'player', 'price', 'high', 'mountain', 'claims', 'attention', 'reason', 'addition', 'raises', 'questions', 'relative', 'value', 'addressed', 'comparison', 'model', 'cheaper', 'sounds', 'accuphase', 'lists', 'madrigal', 'proceed', 'common', 'including', 'frankly', 'promulgated', 'claims', 'best', 'look', 'madrigal', 'player', 'confirms', 'extraordinary', 'shape', 'sets', 'apart', 'seen', 'nearly', 'cubical', 'measuring', 'inches', 'inches', 'deep', 'dimensions', 'impossible', 'stack', 'unit', 'stereo', 'components', 'drawback', 'listeners', 'shape', 'mere', 'caprice', 'allows', 'circuit', 'boards', 'inside', 'player', 'stand', 'edge', 'right', 'angles', 'lying', 'flat', 'parallel', 'madrigal', 'engineers', 'internal

2018-10-15 13:18:22,155 : INFO : collecting all words and their counts
2018-10-15 13:18:22,156 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-10-15 13:18:22,664 : INFO : PROGRESS: at sentence #10000, processed 2976544 words, keeping 87759 word types
2018-10-15 13:18:23,148 : INFO : PROGRESS: at sentence #20000, processed 5774251 words, keeping 117363 word types
2018-10-15 13:18:23,659 : INFO : PROGRESS: at sentence #30000, processed 8657915 words, keeping 142294 word types
2018-10-15 13:18:24,140 : INFO : PROGRESS: at sentence #40000, processed 11414600 words, keeping 161224 word types
2018-10-15 13:18:24,662 : INFO : PROGRESS: at sentence #50000, processed 14280409 words, keeping 179527 word types
2018-10-15 13:18:25,169 : INFO : PROGRESS: at sentence #60000, processed 17107514 words, keeping 194493 word types
2018-10-15 13:18:25,689 : INFO : PROGRESS: at sentence #70000, processed 19938680 words, keeping 208448 word types
2018-10-15 13:18:26,212 : INF

2018-10-15 13:19:03,665 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-10-15 13:19:03,666 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-10-15 13:19:03,667 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-10-15 13:19:03,668 : INFO : EPOCH - 2 : training on 22962915 raw words (22406356 effective words) took 17.6s, 1272585 effective words/s
2018-10-15 13:19:04,687 : INFO : EPOCH 3 - PROGRESS: at 5.15% examples, 1197919 words/s, in_qsize 26, out_qsize 2
2018-10-15 13:19:05,695 : INFO : EPOCH 3 - PROGRESS: at 10.77% examples, 1239754 words/s, in_qsize 26, out_qsize 2
2018-10-15 13:19:06,703 : INFO : EPOCH 3 - PROGRESS: at 16.18% examples, 1249514 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:19:07,709 : INFO : EPOCH 3 - PROGRESS: at 22.32% examples, 1256186 words/s, in_qsize 23, out_qsize 2
2018-10-15 13:19:08,717 : INFO : EPOCH 3 - PROGRESS: at 28.17% examples, 1264642 words/s, in_qsize 24, out_qsize 1
2018-10-15 1

2018-10-15 13:19:51,381 : INFO : EPOCH 5 - PROGRESS: at 67.12% examples, 1247983 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:19:52,381 : INFO : EPOCH 5 - PROGRESS: at 72.62% examples, 1246932 words/s, in_qsize 23, out_qsize 2
2018-10-15 13:19:53,391 : INFO : EPOCH 5 - PROGRESS: at 78.30% examples, 1246212 words/s, in_qsize 26, out_qsize 1
2018-10-15 13:19:54,406 : INFO : EPOCH 5 - PROGRESS: at 84.09% examples, 1246078 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:19:55,413 : INFO : EPOCH 5 - PROGRESS: at 89.65% examples, 1245188 words/s, in_qsize 24, out_qsize 1
2018-10-15 13:19:56,430 : INFO : EPOCH 5 - PROGRESS: at 95.34% examples, 1244306 words/s, in_qsize 24, out_qsize 1
2018-10-15 13:19:57,194 : INFO : worker thread finished; awaiting finish of 12 more threads
2018-10-15 13:19:57,198 : INFO : worker thread finished; awaiting finish of 11 more threads
2018-10-15 13:19:57,198 : INFO : worker thread finished; awaiting finish of 10 more threads
2018-10-15 13:19:57,200 : INFO : w

Computing time for training the model:95.10757303237915 seconds
Number of words processed per second: 241441.4990085209
Word2Vec(vocab=86603, size=300, alpha=0.025)
Most frequent words In model:  ['said', 'new', 'year', 'years', 'people', 'percent', 'york', 'time', 'like', 'million']
 similarity evaluation: ((0.5697363507395432, 2.1493536377907095e-30), SpearmanrResult(correlation=0.5965179608443081, pvalue=7.384737254286604e-34), 4.53257790368272)


2018-10-15 13:19:59,315 : INFO : capital-common-countries: 55.5% (233/420)
2018-10-15 13:20:02,830 : INFO : capital-world: 53.2% (705/1325)
2018-10-15 13:20:03,018 : INFO : currency: 8.6% (6/70)
2018-10-15 13:20:08,160 : INFO : city-in-state: 26.6% (518/1947)
2018-10-15 13:20:09,061 : INFO : family: 68.1% (233/342)
2018-10-15 13:20:11,373 : INFO : gram1-adjective-to-adverb: 16.1% (140/870)
2018-10-15 13:20:13,093 : INFO : gram2-opposite: 12.9% (84/650)
2018-10-15 13:20:16,604 : INFO : gram3-comparative: 67.0% (893/1332)
2018-10-15 13:20:18,460 : INFO : gram4-superlative: 39.7% (279/702)
2018-10-15 13:20:20,176 : INFO : gram5-present-participle: 42.6% (277/650)
2018-10-15 13:20:23,231 : INFO : gram6-nationality-adjective: 70.8% (822/1161)
2018-10-15 13:20:27,336 : INFO : gram7-past-tense: 49.3% (769/1560)
2018-10-15 13:20:29,955 : INFO : gram8-plural: 53.8% (534/992)
2018-10-15 13:20:31,176 : INFO : gram9-plural-verbs: 36.8% (170/462)
2018-10-15 13:20:31,177 : INFO : total: 45.4% (5663/

----------------------------------------------------
starting the processing of 1989
Reading Corpus file and preprocessing time:61.62369084358215 seconds
 printing Top 2 and last sentences For sanity check
0 ['college', 'game', 'come', 'pro', 'basketball', 'arrived', 'new', 'york', 'way', 'providence', 'college', 'rick', 'pitino', 'knicks', 'coach', 'july', 'came', 'trimmings', 'point', 'basket', 'uptempo', 'offense', 'court', 'trap', 'pressure', 'defenses', 'slice', 'seconds', 'team', 'ability', 'run', 'offense', 'featured', 'enthusiasm', 'camaraderie', 'familiar', 'college', 'campus', 'national', 'basketball', 'association', 'skepticism', 'pitino', 'coaching', 'style', 'caught', 'knicks', 'past', 'midway', 'point', 'season', 'knicks', 'fifth', 'best', 'record', 'league', 'lead', 'atlantic', 'division', 'games', 'patrick', 'ewing', 'charles', 'oakley', 'mark', 'jackson', 'trent', 'tucker', 'rank', 'seven', 'league', 'individual', 'statistical', 'categories', 'home', 'knicks', 'road', 

2018-10-15 13:21:33,857 : INFO : collecting all words and their counts
2018-10-15 13:21:33,858 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-10-15 13:21:34,333 : INFO : PROGRESS: at sentence #10000, processed 2870114 words, keeping 85917 word types
2018-10-15 13:21:34,817 : INFO : PROGRESS: at sentence #20000, processed 5694075 words, keeping 117596 word types
2018-10-15 13:21:35,311 : INFO : PROGRESS: at sentence #30000, processed 8490555 words, keeping 141200 word types
2018-10-15 13:21:35,792 : INFO : PROGRESS: at sentence #40000, processed 11219495 words, keeping 158394 word types
2018-10-15 13:21:36,318 : INFO : PROGRESS: at sentence #50000, processed 14177674 words, keeping 176487 word types
2018-10-15 13:21:36,805 : INFO : PROGRESS: at sentence #60000, processed 16959750 words, keeping 191362 word types
2018-10-15 13:21:37,317 : INFO : PROGRESS: at sentence #70000, processed 19800480 words, keeping 205541 word types
2018-10-15 13:21:37,805 : INF

2018-10-15 13:22:15,540 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-10-15 13:22:15,545 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-10-15 13:22:15,551 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-10-15 13:22:15,552 : INFO : EPOCH - 2 : training on 23193809 raw words (22637493 effective words) took 17.8s, 1272879 effective words/s
2018-10-15 13:22:16,571 : INFO : EPOCH 3 - PROGRESS: at 5.22% examples, 1198420 words/s, in_qsize 26, out_qsize 2
2018-10-15 13:22:17,571 : INFO : EPOCH 3 - PROGRESS: at 10.90% examples, 1241980 words/s, in_qsize 26, out_qsize 1
2018-10-15 13:22:18,573 : INFO : EPOCH 3 - PROGRESS: at 16.26% examples, 1246701 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:22:19,596 : INFO : EPOCH 3 - PROGRESS: at 21.89% examples, 1246872 words/s, in_qsize 23, out_qsize 2
2018-10-15 13:22:20,603 : INFO : EPOCH 3 - PROGRESS: at 28.02% examples, 1253770 words/s, in_qsize 22, out_qsize 3
2018-10-15 1

2018-10-15 13:23:03,503 : INFO : EPOCH 5 - PROGRESS: at 67.44% examples, 1259203 words/s, in_qsize 26, out_qsize 0
2018-10-15 13:23:04,519 : INFO : EPOCH 5 - PROGRESS: at 73.16% examples, 1259745 words/s, in_qsize 22, out_qsize 3
2018-10-15 13:23:05,529 : INFO : EPOCH 5 - PROGRESS: at 79.13% examples, 1262092 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:23:06,530 : INFO : EPOCH 5 - PROGRESS: at 84.30% examples, 1260665 words/s, in_qsize 24, out_qsize 1
2018-10-15 13:23:07,534 : INFO : EPOCH 5 - PROGRESS: at 90.33% examples, 1261252 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:23:08,550 : INFO : EPOCH 5 - PROGRESS: at 95.92% examples, 1262944 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:23:09,249 : INFO : worker thread finished; awaiting finish of 12 more threads
2018-10-15 13:23:09,262 : INFO : worker thread finished; awaiting finish of 11 more threads
2018-10-15 13:23:09,263 : INFO : worker thread finished; awaiting finish of 10 more threads
2018-10-15 13:23:09,270 : INFO : w

Computing time for training the model:95.4626145362854 seconds
Number of words processed per second: 242962.2225691715
Word2Vec(vocab=87055, size=300, alpha=0.025)
Most frequent words In model:  ['said', 'new', 'year', 'years', 'people', 'time', 'percent', 'york', 'like', 'company']


2018-10-15 13:23:10,090 : INFO : Pearson correlation coefficient against wordsim353.tsv: 0.5623
2018-10-15 13:23:10,091 : INFO : Spearman rank-order correlation coefficient against wordsim353.tsv: 0.5848
2018-10-15 13:23:10,091 : INFO : Pairs with unknown words ratio: 4.5%
2018-10-15 13:23:10,131 : INFO : precomputing L2-norms of word weight vectors


 similarity evaluation: ((0.5622862912589404, 1.7403870543557219e-29), SpearmanrResult(correlation=0.584757890852303, pvalue=2.684996494281017e-32), 4.53257790368272)


2018-10-15 13:23:11,414 : INFO : capital-common-countries: 66.4% (279/420)
2018-10-15 13:23:14,200 : INFO : capital-world: 50.8% (529/1042)
2018-10-15 13:23:14,489 : INFO : currency: 2.8% (3/108)
2018-10-15 13:23:20,169 : INFO : city-in-state: 22.1% (474/2146)
2018-10-15 13:23:21,084 : INFO : family: 71.3% (244/342)
2018-10-15 13:23:23,405 : INFO : gram1-adjective-to-adverb: 13.1% (114/870)
2018-10-15 13:23:25,135 : INFO : gram2-opposite: 17.4% (113/650)
2018-10-15 13:23:28,655 : INFO : gram3-comparative: 67.1% (894/1332)
2018-10-15 13:23:30,515 : INFO : gram4-superlative: 33.3% (234/702)
2018-10-15 13:23:32,245 : INFO : gram5-present-participle: 33.4% (217/650)
2018-10-15 13:23:35,329 : INFO : gram6-nationality-adjective: 74.2% (862/1161)
2018-10-15 13:23:39,454 : INFO : gram7-past-tense: 47.4% (739/1560)
2018-10-15 13:23:42,246 : INFO : gram8-plural: 55.9% (590/1056)
2018-10-15 13:23:43,467 : INFO : gram9-plural-verbs: 45.2% (209/462)
2018-10-15 13:23:43,469 : INFO : total: 44.0% (55

----------------------------------------------------
starting the processing of 1988
Reading Corpus file and preprocessing time:63.77541255950928 seconds
 printing Top 2 and last sentences For sanity check
0 ['arm', 'discussed', 'reverential', 'tones', 'yankee', 'camp', 'notable', 'pitchers', 'whitey', 'ford', 'eddie', 'lopat', 'cautiously', 'nurtured', 'springs', 'speak', 'glowingly', 'young', 'pitcher', 'attached', 'arm', 'hesitation', 'happen', 'year', 'leiter', 'years', 'old', 'leiter', 'handsome', 'friendly', 'self', 'assured', 'gifted', 'verge', 'finding', 'place', 'yankee', 'pitching', 'staff', 'time', 'club', 'breaks', 'camp', 'weeks', 'start', 'relieve', 'matter', 'growing', 'job', 'arm', 'said', 'billy', 'martin', 'yankee', 'manager', 'ultimately', 'final', 'decision', 'leiter', 'working', 'mind', 'building', 'confidence', 'putting', 'pressure', 'patting', 'lot', 'harnessing', 'speed', 'yankees', 'doubts', 'leiter', 'ability', 'throw', 'ball', 'hard', 'season', 'called', 'sep

2018-10-15 13:24:49,233 : INFO : collecting all words and their counts
2018-10-15 13:24:49,234 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-10-15 13:24:49,700 : INFO : PROGRESS: at sentence #10000, processed 2811963 words, keeping 85206 word types
2018-10-15 13:24:50,184 : INFO : PROGRESS: at sentence #20000, processed 5616234 words, keeping 115427 word types
2018-10-15 13:24:50,670 : INFO : PROGRESS: at sentence #30000, processed 8424705 words, keeping 138151 word types
2018-10-15 13:24:51,179 : INFO : PROGRESS: at sentence #40000, processed 11302559 words, keeping 159043 word types
2018-10-15 13:24:51,681 : INFO : PROGRESS: at sentence #50000, processed 14104610 words, keeping 175860 word types
2018-10-15 13:24:52,188 : INFO : PROGRESS: at sentence #60000, processed 16951178 words, keeping 191409 word types
2018-10-15 13:24:52,682 : INFO : PROGRESS: at sentence #70000, processed 19728168 words, keeping 205394 word types
2018-10-15 13:24:53,210 : INF

2018-10-15 13:25:32,876 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-10-15 13:25:32,893 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-10-15 13:25:32,896 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-10-15 13:25:32,898 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-10-15 13:25:32,907 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-10-15 13:25:32,909 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-10-15 13:25:32,909 : INFO : EPOCH - 2 : training on 23879758 raw words (23315992 effective words) took 18.5s, 1263552 effective words/s
2018-10-15 13:25:33,926 : INFO : EPOCH 3 - PROGRESS: at 5.35% examples, 1203075 words/s, in_qsize 26, out_qsize 1
2018-10-15 13:25:34,939 : INFO : EPOCH 3 - PROGRESS: at 10.76% examples, 1242209 words/s, in_qsize 26, out_qsize 2
2018-10-15 13:25:35,961 : INFO : EPOCH 3 - PROGRESS: at 16.50% examples, 1250330 wor

2018-10-15 13:26:17,664 : INFO : EPOCH 5 - PROGRESS: at 43.58% examples, 1266371 words/s, in_qsize 24, out_qsize 1
2018-10-15 13:26:18,666 : INFO : EPOCH 5 - PROGRESS: at 49.12% examples, 1267118 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:26:19,670 : INFO : EPOCH 5 - PROGRESS: at 54.54% examples, 1267631 words/s, in_qsize 23, out_qsize 2
2018-10-15 13:26:20,675 : INFO : EPOCH 5 - PROGRESS: at 60.22% examples, 1267773 words/s, in_qsize 23, out_qsize 2
2018-10-15 13:26:21,676 : INFO : EPOCH 5 - PROGRESS: at 65.70% examples, 1269339 words/s, in_qsize 23, out_qsize 2
2018-10-15 13:26:22,686 : INFO : EPOCH 5 - PROGRESS: at 71.18% examples, 1269706 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:26:23,706 : INFO : EPOCH 5 - PROGRESS: at 76.93% examples, 1269209 words/s, in_qsize 23, out_qsize 2
2018-10-15 13:26:24,710 : INFO : EPOCH 5 - PROGRESS: at 82.25% examples, 1268661 words/s, in_qsize 23, out_qsize 2
2018-10-15 13:26:25,724 : INFO : EPOCH 5 - PROGRESS: at 87.41% examples, 1268688

Computing time for training the model:98.73383951187134 seconds
Number of words processed per second: 241859.9146762524
Word2Vec(vocab=87926, size=300, alpha=0.025)
Most frequent words In model:  ['said', 'new', 'year', 'years', 'percent', 'people', 'time', 'york', 'state', 'company']
 similarity evaluation: ((0.5882682593051178, 9.327500455311314e-33), SpearmanrResult(correlation=0.6152781238723405, pvalue=1.7436486934475453e-36), 4.53257790368272)


2018-10-15 13:26:29,581 : INFO : capital-common-countries: 49.5% (208/420)
2018-10-15 13:26:32,970 : INFO : capital-world: 48.1% (616/1280)
2018-10-15 13:26:33,253 : INFO : currency: 5.7% (6/106)
2018-10-15 13:26:39,219 : INFO : city-in-state: 21.4% (484/2260)
2018-10-15 13:26:40,121 : INFO : family: 76.3% (261/342)
2018-10-15 13:26:42,423 : INFO : gram1-adjective-to-adverb: 15.4% (134/870)
2018-10-15 13:26:44,142 : INFO : gram2-opposite: 19.7% (128/650)
2018-10-15 13:26:47,650 : INFO : gram3-comparative: 65.9% (878/1332)
2018-10-15 13:26:49,793 : INFO : gram4-superlative: 30.0% (244/812)
2018-10-15 13:26:51,517 : INFO : gram5-present-participle: 43.8% (285/650)
2018-10-15 13:26:54,410 : INFO : gram6-nationality-adjective: 75.8% (830/1095)
2018-10-15 13:26:58,520 : INFO : gram7-past-tense: 47.7% (744/1560)
2018-10-15 13:27:01,147 : INFO : gram8-plural: 49.9% (495/992)
2018-10-15 13:27:02,484 : INFO : gram9-plural-verbs: 37.4% (189/506)
2018-10-15 13:27:02,486 : INFO : total: 42.7% (550

----------------------------------------------------
starting the processing of 1987
Reading Corpus file and preprocessing time:63.17447328567505 seconds
 printing Top 2 and last sentences For sanity check
0 ['seymour', 'hersh', 'charges', 'principal', 'purpose', 'libya', 'air', 'raid', 'assassinate', 'col', 'muammar', 'qaddafi', 'family', 'right', 'article', 'fails', 'case', 'virtually', 'fact', 'hersh', 'adduces', 'attributed', 'nameless', 'source', 'hersh', 'states', 'outset', 'interviewed', 'current', 'federal', 'officials', 'agreed', 'talk', 'names', 'strains', 'credulity', 'acceptable', 'limits', 'recognize', 'necessary', 'rely', 'nameless', 'source', 'facts', 'including', 'longer', 'government', 'service', 'afraid', 'identify', 'mind', 'boggles', 'william', 'mclean', 'ann', 'arbor', 'mich']
1 ['public', 'officials', 'scholars', 'experts', 'gathering', 'forums', 'new', 'york', 'city', 'nation', 'discuss', 'constitution', 'anniversary', 'signing', 'week', 'panel', 'moderated', 'ma

2018-10-15 13:28:06,758 : INFO : collecting all words and their counts
2018-10-15 13:28:06,759 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-10-15 13:28:07,228 : INFO : PROGRESS: at sentence #10000, processed 2764932 words, keeping 84117 word types
2018-10-15 13:28:07,701 : INFO : PROGRESS: at sentence #20000, processed 5505694 words, keeping 113906 word types
2018-10-15 13:28:08,206 : INFO : PROGRESS: at sentence #30000, processed 8374626 words, keeping 138365 word types
2018-10-15 13:28:08,709 : INFO : PROGRESS: at sentence #40000, processed 11272084 words, keeping 158403 word types
2018-10-15 13:28:09,222 : INFO : PROGRESS: at sentence #50000, processed 14030050 words, keeping 174995 word types
2018-10-15 13:28:09,724 : INFO : PROGRESS: at sentence #60000, processed 16828356 words, keeping 190662 word types
2018-10-15 13:28:10,222 : INFO : PROGRESS: at sentence #70000, processed 19631031 words, keeping 204645 word types
2018-10-15 13:28:10,716 : INF

2018-10-15 13:28:49,388 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-10-15 13:28:49,392 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-10-15 13:28:49,393 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-10-15 13:28:49,394 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-10-15 13:28:49,399 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-10-15 13:28:49,400 : INFO : EPOCH - 2 : training on 23594786 raw words (23039677 effective words) took 18.2s, 1269269 effective words/s
2018-10-15 13:28:50,407 : INFO : EPOCH 3 - PROGRESS: at 5.22% examples, 1213761 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:28:51,426 : INFO : EPOCH 3 - PROGRESS: at 11.07% examples, 1243049 words/s, in_qsize 23, out_qsize 2
2018-10-15 13:28:52,426 : INFO : EPOCH 3 - PROGRESS: at 16.76% examples, 1256809 words/s, in_qsize 23, out_qsize 3
2018-10-15 13:28:53,434 : INFO : EPOCH 3 - PROGRESS: at 22.4

2018-10-15 13:29:36,656 : INFO : EPOCH 5 - PROGRESS: at 60.44% examples, 1260396 words/s, in_qsize 26, out_qsize 0
2018-10-15 13:29:37,669 : INFO : EPOCH 5 - PROGRESS: at 66.01% examples, 1260443 words/s, in_qsize 24, out_qsize 1
2018-10-15 13:29:38,674 : INFO : EPOCH 5 - PROGRESS: at 71.40% examples, 1259190 words/s, in_qsize 23, out_qsize 2
2018-10-15 13:29:39,683 : INFO : EPOCH 5 - PROGRESS: at 77.26% examples, 1260220 words/s, in_qsize 26, out_qsize 0
2018-10-15 13:29:40,705 : INFO : EPOCH 5 - PROGRESS: at 82.45% examples, 1259822 words/s, in_qsize 23, out_qsize 2
2018-10-15 13:29:41,709 : INFO : EPOCH 5 - PROGRESS: at 88.01% examples, 1260650 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:29:42,709 : INFO : EPOCH 5 - PROGRESS: at 93.56% examples, 1260403 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:29:43,723 : INFO : EPOCH 5 - PROGRESS: at 99.44% examples, 1260017 words/s, in_qsize 16, out_qsize 1
2018-10-15 13:29:43,757 : INFO : worker thread finished; awaiting finish of 12 m

Computing time for training the model:97.0613124370575 seconds
Number of words processed per second: 243091.5614838898
Word2Vec(vocab=86896, size=300, alpha=0.025)
Most frequent words In model:  ['said', 'new', 'year', 'years', 'people', 'time', 'york', 'percent', 'president', 'american']
 similarity evaluation: ((0.6024327140246603, 9.126539780257895e-35), SpearmanrResult(correlation=0.6283161488690189, pvalue=1.5834629975384862e-38), 4.2492917847025495)


2018-10-15 13:29:45,892 : INFO : capital-common-countries: 56.9% (239/420)
2018-10-15 13:29:48,384 : INFO : capital-world: 54.7% (515/941)
2018-10-15 13:29:48,569 : INFO : currency: 8.8% (6/68)
2018-10-15 13:29:54,372 : INFO : city-in-state: 22.4% (491/2196)
2018-10-15 13:29:55,182 : INFO : family: 68.6% (210/306)
2018-10-15 13:29:57,482 : INFO : gram1-adjective-to-adverb: 16.4% (143/870)
2018-10-15 13:29:59,197 : INFO : gram2-opposite: 19.4% (126/650)
2018-10-15 13:30:02,719 : INFO : gram3-comparative: 68.7% (915/1332)
2018-10-15 13:30:04,575 : INFO : gram4-superlative: 34.8% (244/702)
2018-10-15 13:30:06,435 : INFO : gram5-present-participle: 42.7% (300/702)
2018-10-15 13:30:09,320 : INFO : gram6-nationality-adjective: 74.2% (812/1095)
2018-10-15 13:30:13,446 : INFO : gram7-past-tense: 52.2% (815/1560)
2018-10-15 13:30:16,239 : INFO : gram8-plural: 53.4% (564/1056)
2018-10-15 13:30:17,467 : INFO : gram9-plural-verbs: 35.7% (165/462)
2018-10-15 13:30:17,469 : INFO : total: 44.9% (5545

----------------------------------------------------
starting the processing of 1986
Reading Corpus file and preprocessing time:65.56159973144531 seconds
 printing Top 2 and last sentences For sanity check
0 ['protestant', 'leaders', 'hope', 'portray', 'results', 'parliamentary', 'elections', 'thursday', 'referendum', 'decision', 'thatcher', 'government', 'irish', 'republic', 'consultative', 'role', 'affairs', 'northern', 'ireland', 'vote', 'seats', 'house', 'commons', 'held', 'unionist', 'members', 'parliament', 'want', 'northern', 'ireland', 'stay', 'british', 'resigned', 'masse', 'force', 'vote', 'referendum', 'british', 'irish', 'accord', 'signed', 'november', 'prime', 'minister', 'margaret', 'thatcher', 'irish', 'prime', 'minister', 'garret', 'fitzgerald', 'approved', 'parliaments', 'voting', 'mark', 'largest', 'number', 'simultaneous', 'elections', 'recent', 'memory', 'campaign', 'probably', 'strangest', 'unionists', 'nominated', 'unidentified', 'candidate', 'irish', 'foreign', '

2018-10-15 13:31:24,132 : INFO : collecting all words and their counts
2018-10-15 13:31:24,133 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-10-15 13:31:24,616 : INFO : PROGRESS: at sentence #10000, processed 2908052 words, keeping 87240 word types
2018-10-15 13:31:25,066 : INFO : PROGRESS: at sentence #20000, processed 5533518 words, keeping 114247 word types
2018-10-15 13:31:25,548 : INFO : PROGRESS: at sentence #30000, processed 8283432 words, keeping 137035 word types
2018-10-15 13:31:26,017 : INFO : PROGRESS: at sentence #40000, processed 10974117 words, keeping 155196 word types
2018-10-15 13:31:26,479 : INFO : PROGRESS: at sentence #50000, processed 13607444 words, keeping 171393 word types
2018-10-15 13:31:26,971 : INFO : PROGRESS: at sentence #60000, processed 16360939 words, keeping 186810 word types
2018-10-15 13:31:27,464 : INFO : PROGRESS: at sentence #70000, processed 19129613 words, keeping 201506 word types
2018-10-15 13:31:27,933 : INF

2018-10-15 13:32:06,831 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-10-15 13:32:06,841 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-10-15 13:32:06,845 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-10-15 13:32:06,848 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-10-15 13:32:06,848 : INFO : EPOCH - 2 : training on 23743421 raw words (23185726 effective words) took 18.1s, 1278469 effective words/s
2018-10-15 13:32:07,852 : INFO : EPOCH 3 - PROGRESS: at 5.07% examples, 1197081 words/s, in_qsize 24, out_qsize 1
2018-10-15 13:32:08,863 : INFO : EPOCH 3 - PROGRESS: at 9.88% examples, 1217904 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:32:09,871 : INFO : EPOCH 3 - PROGRESS: at 15.45% examples, 1236019 words/s, in_qsize 24, out_qsize 1
2018-10-15 13:32:10,887 : INFO : EPOCH 3 - PROGRESS: at 21.37% examples, 1246693 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:32:11,900 : INFO : EPOC

2018-10-15 13:32:52,726 : INFO : EPOCH 5 - PROGRESS: at 49.04% examples, 1250175 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:32:53,726 : INFO : EPOCH 5 - PROGRESS: at 54.29% examples, 1250678 words/s, in_qsize 23, out_qsize 2
2018-10-15 13:32:54,740 : INFO : EPOCH 5 - PROGRESS: at 60.06% examples, 1250551 words/s, in_qsize 24, out_qsize 1
2018-10-15 13:32:55,756 : INFO : EPOCH 5 - PROGRESS: at 65.61% examples, 1249490 words/s, in_qsize 26, out_qsize 2
2018-10-15 13:32:56,758 : INFO : EPOCH 5 - PROGRESS: at 70.94% examples, 1250245 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:32:57,764 : INFO : EPOCH 5 - PROGRESS: at 76.27% examples, 1249777 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:32:58,773 : INFO : EPOCH 5 - PROGRESS: at 81.41% examples, 1248727 words/s, in_qsize 24, out_qsize 1
2018-10-15 13:32:59,780 : INFO : EPOCH 5 - PROGRESS: at 87.09% examples, 1248398 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:33:00,783 : INFO : EPOCH 5 - PROGRESS: at 92.62% examples, 1248682

Computing time for training the model:98.03175783157349 seconds
Number of words processed per second: 242201.31848286474
Word2Vec(vocab=86302, size=300, alpha=0.025)
Most frequent words In model:  ['said', 'new', 'year', 'years', 'time', 'people', 'percent', 'york', 'million', 'president']
 similarity evaluation: ((0.5979055378456114, 3.0657839905829225e-34), SpearmanrResult(correlation=0.6223591573354281, pvalue=9.757376814410778e-38), 3.9660056657223794)


2018-10-15 13:33:04,258 : INFO : capital-common-countries: 51.4% (216/420)
2018-10-15 13:33:06,986 : INFO : capital-world: 55.2% (566/1025)
2018-10-15 13:33:07,170 : INFO : currency: 13.2% (9/68)
2018-10-15 13:33:13,178 : INFO : city-in-state: 24.6% (559/2273)
2018-10-15 13:33:13,987 : INFO : family: 69.3% (212/306)
2018-10-15 13:33:16,294 : INFO : gram1-adjective-to-adverb: 14.5% (126/870)
2018-10-15 13:33:18,152 : INFO : gram2-opposite: 17.1% (120/702)
2018-10-15 13:33:21,677 : INFO : gram3-comparative: 66.9% (891/1332)
2018-10-15 13:33:23,539 : INFO : gram4-superlative: 31.8% (223/702)
2018-10-15 13:33:25,402 : INFO : gram5-present-participle: 34.3% (241/702)
2018-10-15 13:33:28,653 : INFO : gram6-nationality-adjective: 78.4% (964/1229)
2018-10-15 13:33:32,775 : INFO : gram7-past-tense: 53.9% (841/1560)
2018-10-15 13:33:35,744 : INFO : gram8-plural: 43.4% (487/1122)
2018-10-15 13:33:36,972 : INFO : gram9-plural-verbs: 37.4% (173/462)
2018-10-15 13:33:36,974 : INFO : total: 44.1% (56

----------------------------------------------------
starting the processing of 1985
Reading Corpus file and preprocessing time:62.19493556022644 seconds
 printing Top 2 and last sentences For sanity check
0 ['things', 'ray', 'williams', 'arriving', 'boston', 'garden', 'yesterday', 'afternoon', 'look', 'floor', 'new', 'locker', 'sitting', 'neatly', 'sneakers', 'new', 'green', 'sneakers', 'moment', 'stood', 'stared', 'took', 'old', 'white', 'ones', 'painted', 'said', 'looked', 'weird', 'pair', 'good', 'job', 'looked', 'brand', 'new', 'thing', 'williams', 'officially', 'boston', 'celtic', 'friday', 'knick', 'signed', 'offer', 'sheet', 'team', 'pay', 'remainder', 'season', 'week', 'williams', 'endured', 'dizzying', 'schedule', 'included', 'excursions', 'boston', 'home', 'englewood', 'cliffs', 'tend', 'personal', 'matters', 'scoreless', 'debut', 'debut', 'night', 'san', 'antonio', 'spurs', 'playing', 'little', 'minute', 'second', 'quarter', 'scoring', 'marked', 'time', 'played', 'professio

2018-10-15 13:34:40,242 : INFO : collecting all words and their counts
2018-10-15 13:34:40,243 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-10-15 13:34:40,668 : INFO : PROGRESS: at sentence #10000, processed 2558056 words, keeping 80017 word types
2018-10-15 13:34:41,165 : INFO : PROGRESS: at sentence #20000, processed 5403381 words, keeping 114503 word types
2018-10-15 13:34:41,642 : INFO : PROGRESS: at sentence #30000, processed 8056685 words, keeping 137683 word types
2018-10-15 13:34:42,141 : INFO : PROGRESS: at sentence #40000, processed 10901259 words, keeping 159042 word types
2018-10-15 13:34:42,652 : INFO : PROGRESS: at sentence #50000, processed 13652779 words, keeping 177617 word types
2018-10-15 13:34:43,129 : INFO : PROGRESS: at sentence #60000, processed 16285765 words, keeping 193182 word types
2018-10-15 13:34:43,610 : INFO : PROGRESS: at sentence #70000, processed 18953818 words, keeping 207177 word types
2018-10-15 13:34:44,093 : INF

2018-10-15 13:35:21,423 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-10-15 13:35:21,426 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-10-15 13:35:21,435 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-10-15 13:35:21,436 : INFO : EPOCH - 2 : training on 22593750 raw words (22047590 effective words) took 17.8s, 1241515 effective words/s
2018-10-15 13:35:22,449 : INFO : EPOCH 3 - PROGRESS: at 5.71% examples, 1136564 words/s, in_qsize 23, out_qsize 2
2018-10-15 13:35:23,452 : INFO : EPOCH 3 - PROGRESS: at 11.32% examples, 1166871 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:35:24,469 : INFO : EPOCH 3 - PROGRESS: at 16.11% examples, 1171540 words/s, in_qsize 25, out_qsize 0
2018-10-15 13:35:25,469 : INFO : EPOCH 3 - PROGRESS: at 21.16% examples, 1168314 words/s, in_qsize 24, out_qsize 1
2018-10-15 13:35:26,481 : INFO : EPOCH 3 - PROGRESS: at 27.09% examples, 1175729 words/s, in_qsize 25, out_qsize 0
2018-10-15 1

2018-10-15 13:36:09,471 : INFO : EPOCH 5 - PROGRESS: at 67.88% examples, 1242838 words/s, in_qsize 22, out_qsize 3
2018-10-15 13:36:10,483 : INFO : EPOCH 5 - PROGRESS: at 73.62% examples, 1243186 words/s, in_qsize 26, out_qsize 2
2018-10-15 13:36:11,489 : INFO : EPOCH 5 - PROGRESS: at 79.54% examples, 1243101 words/s, in_qsize 23, out_qsize 2
2018-10-15 13:36:12,492 : INFO : EPOCH 5 - PROGRESS: at 84.97% examples, 1242956 words/s, in_qsize 26, out_qsize 0
2018-10-15 13:36:13,518 : INFO : EPOCH 5 - PROGRESS: at 91.16% examples, 1242113 words/s, in_qsize 23, out_qsize 2
2018-10-15 13:36:14,522 : INFO : EPOCH 5 - PROGRESS: at 96.79% examples, 1242538 words/s, in_qsize 26, out_qsize 0
2018-10-15 13:36:15,017 : INFO : worker thread finished; awaiting finish of 12 more threads
2018-10-15 13:36:15,020 : INFO : worker thread finished; awaiting finish of 11 more threads
2018-10-15 13:36:15,030 : INFO : worker thread finished; awaiting finish of 10 more threads
2018-10-15 13:36:15,035 : INFO : w

Computing time for training the model:94.84030890464783 seconds
Number of words processed per second: 238229.40120023958
Word2Vec(vocab=85042, size=300, alpha=0.025)
Most frequent words In model:  ['said', 'new', 'year', 'years', 'percent', 'time', 'york', 'people', 'state', 'million']


2018-10-15 13:36:15,857 : INFO : Pearson correlation coefficient against wordsim353.tsv: 0.5905
2018-10-15 13:36:15,858 : INFO : Spearman rank-order correlation coefficient against wordsim353.tsv: 0.6088
2018-10-15 13:36:15,859 : INFO : Pairs with unknown words ratio: 4.5%
2018-10-15 13:36:15,899 : INFO : precomputing L2-norms of word weight vectors


 similarity evaluation: ((0.5904819378166569, 4.756788310271214e-33), SpearmanrResult(correlation=0.6088101209747272, pvalue=1.468680064795531e-35), 4.53257790368272)


2018-10-15 13:36:16,958 : INFO : capital-common-countries: 57.0% (195/342)
2018-10-15 13:36:19,474 : INFO : capital-world: 55.5% (526/947)
2018-10-15 13:36:19,660 : INFO : currency: 7.4% (5/68)
2018-10-15 13:36:25,343 : INFO : city-in-state: 25.7% (550/2144)
2018-10-15 13:36:26,248 : INFO : family: 67.0% (229/342)
2018-10-15 13:36:28,554 : INFO : gram1-adjective-to-adverb: 14.5% (126/870)
2018-10-15 13:36:30,276 : INFO : gram2-opposite: 16.8% (109/650)
2018-10-15 13:36:33,817 : INFO : gram3-comparative: 63.9% (851/1332)
2018-10-15 13:36:35,678 : INFO : gram4-superlative: 34.0% (239/702)
2018-10-15 13:36:37,404 : INFO : gram5-present-participle: 38.0% (247/650)
2018-10-15 13:36:40,309 : INFO : gram6-nationality-adjective: 84.7% (928/1095)
2018-10-15 13:36:44,442 : INFO : gram7-past-tense: 50.5% (788/1560)
2018-10-15 13:36:47,404 : INFO : gram8-plural: 50.1% (562/1122)
2018-10-15 13:36:48,744 : INFO : gram9-plural-verbs: 31.4% (159/506)
2018-10-15 13:36:48,745 : INFO : total: 44.7% (5514

----------------------------------------------------


## Sanity check to make sure models have been saved

In [6]:
#Sanity check to make sure models have been saved
from gensim.models.word2vec import Word2Vec
import gensim

# Pick one trained year and reload its model from outputPath to confirm it was persisted.
year = '2002'
print(year)
filePath = os.path.join(outputPath, year)
# The reloaded object is a full Word2Vec model, so training could be resumed on it.
model = Word2Vec.load(filePath)
print(model)
# First ten vocabulary entries of the reloaded model (displayed as the cell's output).
model.wv.index2word[:10]

2018-10-15 11:55:10,585 : INFO : loading Word2Vec object from D:\data\nyt\word2vecModels\2002


2002


2018-10-15 11:55:11,227 : INFO : loading wv recursively from D:\data\nyt\word2vecModels\2002.wv.* with mmap=None
2018-10-15 11:55:11,228 : INFO : loading vectors from D:\data\nyt\word2vecModels\2002.wv.vectors.npy with mmap=None
2018-10-15 11:55:11,285 : INFO : setting ignored attribute vectors_norm to None
2018-10-15 11:55:11,286 : INFO : loading vocabulary recursively from D:\data\nyt\word2vecModels\2002.vocabulary.* with mmap=None
2018-10-15 11:55:11,287 : INFO : loading trainables recursively from D:\data\nyt\word2vecModels\2002.trainables.* with mmap=None
2018-10-15 11:55:11,287 : INFO : setting ignored attribute cum_table to None
2018-10-15 11:55:11,288 : INFO : loaded D:\data\nyt\word2vecModels\2002


Word2Vec(vocab=97437, size=300, alpha=0.025)


['said',
 'new',
 'year',
 'like',
 'people',
 'time',
 'years',
 'company',
 'york',
 'percent']

In [51]:
# Nearest neighbours of 'man' in the loaded model; spelled out with the default topn.
model.wv.most_similar(positive=['man'], topn=10)

  if np.issubdtype(vec.dtype, np.int):


[('woman', 0.657943069934845),
 ('policeman', 0.5610487461090088),
 ('gentleman', 0.5473320484161377),
 ('men', 0.5011500120162964),
 ('guy', 0.4981834292411804),
 ('balding', 0.4937747120857239),
 ('gruff', 0.4927574396133423),
 ('assailant', 0.4926893711090088),
 ('drifter', 0.49006587266921997),
 ('lover', 0.4891093969345093)]

In [3]:
def similarityMetricsAlternative(model, listDiversityTerms=None,
                                 demographicsTerms=None, intellectualTerms=None):
    """Compare the 'diversity' terms with demographic and intellectual word lists.

    Both comparison lists are first reduced to the words present in
    ``model.wv.vocab``; the diversity terms are passed to ``n_similarity``
    unfiltered, so an out-of-vocabulary diversity term surfaces whatever error
    ``n_similarity`` raises for it.

    Args:
        model: Object exposing ``wv.vocab`` (supports ``in``) and
            ``wv.n_similarity(words_a, words_b)``, e.g. a loaded gensim
            Word2Vec model.
        listDiversityTerms: Words forming the 'diversity' side of both
            comparisons. Defaults to ``['diversity']``.
        demographicsTerms: Candidate demographic words. Defaults to the
            notebook-level ``listDemographicsTerms``.
        intellectualTerms: Candidate intellectual words. Defaults to the
            notebook-level ``listIntellectualTerms``.

    Returns:
        Tuple ``(similarityDemographics, similarityIntellectual)`` holding the
        two values returned by ``model.wv.n_similarity``.
    """
    # None sentinel instead of a list default, to avoid sharing one mutable list across calls.
    if listDiversityTerms is None:
        #listDiversityTerms=['diversity', 'diverse']
        listDiversityTerms = ['diversity']

    # The notebook-level lists are only looked up when the caller supplies none,
    # so the function no longer requires them to exist when explicit lists are given.
    if demographicsTerms is None:
        demographicsTerms = listDemographicsTerms
    if intellectualTerms is None:
        intellectualTerms = listIntellectualTerms

    vocab = model.wv.vocab
    # Keep only the comparison words this particular model knows about.
    listDemographicsTermsUpdated = [w for w in demographicsTerms if w in vocab]
    listIntellectualTermsUpdated = [w for w in intellectualTerms if w in vocab]

    similarityDemographics = model.wv.n_similarity(listDiversityTerms, listDemographicsTermsUpdated)
    similarityIntellectual = model.wv.n_similarity(listDiversityTerms, listIntellectualTermsUpdated)
    return (similarityDemographics, similarityIntellectual)

In [4]:
from commonVariables import listDemographicsTerms, listIntellectualTerms
# Diversity-vs-demographics and diversity-vs-intellectual similarity for the loaded model.
(similarityDemographics2, similarityIntellectual2) = similarityMetricsAlternative(model)
(similarityDemographics2, similarityIntellectual2)

(0.3289945841610287, 0.09222000420760601)