### Logging

In [20]:
import logging
#logging.getLogger('').handlers = []  #To delete previous logging configuration

# Root-logger setup: timestamped records at INFO level and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
    # To log to a file as well as to standard output, pass handlers instead, e.g.:
    # handlers=[logging.FileHandler("{0}/{1}.log".format('./', uniName)),  # path, file name
    #           logging.StreamHandler()]
)

# Model Training

In [52]:
import os
#from word2vecTraining import preprocessing, training, evaluation

# Directory that holds the text corpus; later cells read <inputPath>/<year>.csv from it.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
inputPath = os.path.normpath(r'D:\data\nyt') 
# Alternative: read the corpus from the current working directory.
#inputPath = os.path.normpath(r'.') 

# Disabled: list every CSV corpus file in inputPath.
# (listdir / isfile / join are not imported in this notebook; import them before re-enabling.)
#onlyfiles = [f for f in listdir(inputPath) if isfile(join(inputPath, f)) and f.endswith('.csv')]
#onlyfiles = onlyfiles[:2] #For testing
# print(onlyfiles)
# len(onlyfiles)

# Directory where the trained word2vec models are stored.
outputPath = os.path.normpath(r'D:\data\nyt\word2vecModels') # raw string, so the backslashes need no escaping
# Alternative: store the models in a local ./models folder.
#outputPath = os.path.normpath(r'.\models') 

## Preprocessing

In [59]:
import re
import csv
import time
from gensim.parsing.preprocessing import * #provides a number of convenience preprocessing functions optimized for speed
from gensim.models.word2vec import Word2Vec
import gensim
from multiprocessing import cpu_count




def _strip_urls(text):
    """Delete http(s) URLs from ``text`` and leave the surrounding words alone."""
    # ``\S+`` stops at the first whitespace, so only the URL itself is removed.
    # The previous pattern r'https?:\/\/.*\s' was greedy: it erased everything
    # from the URL up to the LAST whitespace of the line, and it did not match
    # a URL that ended the line (no trailing whitespace).
    return re.sub(r'https?://\S+', '', text)


def preprocessing(file):
    """Read a CSV corpus file and return it as tokenised sentences.

    For every CSV row that has more than one column, the first column is
    split on newlines; each resulting line is cleaned and tokenised with
    gensim's ``preprocess_string`` using ``CUSTOM_FILTERS`` below, and kept
    only when more than ``wordThreshold`` tokens remain. Timing information
    and a few corpus statistics are printed as a sanity check.

    NOTE(review): an earlier comment described column 0 as the URL and
    column 1 as the fetched text, yet the code (and the printed output)
    takes the text from column 0 — confirm the column layout of the files.

    Parameters
    ----------
    file : str
        Path to a UTF-8 encoded, comma-delimited CSV file.

    Returns
    -------
    list of list of str
        One token list per retained line; empty if nothing passes the filters.
    """
    # Filters are applied in order by preprocess_string.
    CUSTOM_FILTERS = [lambda x: x.lower(),  # to lowercase
                      _strip_urls,  # strip away URLs (before punctuation removal breaks them up)
                      #split_alphanum,  # add spaces between digits & letters
                      strip_tags,  # remove tags
                      strip_non_alphanum,  # remove non-alphanumeric characters
                      strip_punctuation,  # replace punctuation characters with spaces
                      strip_numeric,  # remove digits
                      strip_multiple_whitespaces,  # collapse repeated whitespace, tabs and line breaks
                      remove_stopwords,  # gensim's built-in stopword list
                      #lambda x:" ".join(w for w in x.split() if w not in stopword_file)  # custom stopwords
                      lambda x: strip_short(x, minsize=3),  # remove words shorter than minsize
                      #stem_text  # lowercase and stem
                     ]

    tic = time.time()  # start timing

    # Raise the csv module's field size limit to 2**31 - 1 for very long cells.
    csv.field_size_limit(2147483647)
    #csv.field_size_limit(sys.maxsize)

    wordThreshold = 5  # important: lines with this many tokens or fewer are dropped

    sentences = []
    with open(file, 'r', newline='', encoding="utf-8") as inpFile:
        csvObject = csv.reader(inpFile, delimiter=',', quotechar='"')

        for csvEntry in csvObject:
            if len(csvEntry) > 1:
                # Word2vec gathers co-occurrence statistics sentence by sentence,
                # so feeding a whole article as one "sentence" distorts the
                # context windows and slows training. Each newline-separated
                # paragraph is therefore treated as its own sentence.
                for line in csvEntry[0].split('\n'):
                    words = preprocess_string(line, CUSTOM_FILTERS)

                    if len(words) > wordThreshold:
                        sentences.append(words)

    toc = time.time()  # stop timing
    computationTime = toc - tic

    print("Reading Corpus file and preprocessing time:" +str(computationTime)+" seconds")

    print(" printing Top 2 and last sentences For sanity check")
    for i, s in enumerate(sentences[0:2]):
        print(i,s)
    if sentences:
        print(len(sentences),sentences[-1])
    else:
        # sentences[-1] would raise IndexError on an empty corpus.
        print(0, "no sentence passed the filters")

    print(" stats about Corpus read from file")
    wordsInCorpus = sum(len(s) for s in sentences)
    print("Number of words in corpus:",wordsInCorpus)
    print("Number of sentences in corpus:",len(sentences))
    return sentences

# Corpus slice to process; the input file is expected at <inputPath>/<year>.csv.
year = '2001'


print("starting the processing of " + year)
inputFilePath = os.path.join(inputPath, year+'.csv')
# The full tokenised corpus is kept in memory as `sentences` for the training step.
sentences = preprocessing(inputFilePath)

starting the processing of 2001
Reading Corpus file and preprocessing time:74.14247250556946 seconds
 printing Top 2 and last sentences For sanity check
0 ['drug', 'explosives', 'detection', 'equipment', 'maker', 'barringer', 'technologies', 'said', 'yesterday', 'agreed', 'acquired', 'diversified', 'manufacturer', 'smiths', 'group', 'million', 'barringer', 'said', 'agreed', 'acquired', 'smiths', 'share', 'represents', 'percent', 'premium', 'barringer', 'shares', 'based', 'thursday', 'closing', 'stock', 'price', 'share', 'separately', 'barringer', 'reported', 'surge', 'fourth', 'quarter', 'net', 'income', 'cents', 'diluted', 'share', 'cents', 'share', 'year', 'earlier', 'revenue', 'rose', 'million', 'million']
1 ['nanny', 'upper', 'east', 'work', 'schedule', 'cut', 'days', 'week', 'wife', 'household', 'lost', 'job', 'wall', 'street', 'limousine', 'drivers', 'complain', 'carrying', 'fewer', 'passengers', 'earning', 'percent', 'restaurants', 'waiters', 'working', 'days', 'week', 'instead'

## Training

In [60]:
def training(sentences):
    """Train a CBOW Word2Vec model on ``sentences`` and print timing statistics.

    Parameters
    ----------
    sentences : list of list of str
        Tokenised sentences. The object is iterated more than once (by
        Word2Vec and for the word count below), so it must be re-iterable.

    Returns
    -------
    Word2Vec
        The trained model.

    NOTE(review): ``size``, ``iter`` and ``wv.index2word`` are gensim 3.x names;
    gensim 4 renamed them (``vector_size``, ``epochs``, ``wv.index_to_key``) —
    confirm the installed version before upgrading.
    """
    tic = time.time()  # start timing

    # Leave three cores free for the rest of the system, but never ask for fewer
    # than one worker: cpu_count()-3 is <= 0 on machines with three or fewer cores.
    workerCount = max(1, cpu_count() - 3)

    # For the score method to work, the hs and negative parameters need to be specified.
    # A frequently used heuristic for the vector dimension `size` is the square root
    # of the vocabulary length after pre-processing.
    model = Word2Vec(sentences,  # a list of token lists; for larger corpora consider an iterable that streams from disk
                     sg=0,  # training algorithm: 1 = skip-gram, otherwise CBOW
                     size=300,  # dimensionality of the feature vectors
                     window=10,  # maximum distance between the current and predicted word within a sentence
                     min_count=5,  # ignore all words with total frequency lower than this
                     workers=workerCount,  # number of worker threads used for training
                     hs=0,  # 1 = hierarchical softmax; 0 with negative > 0 = negative sampling
                     negative=10,  # number of "noise words" drawn for negative sampling (usually 5-20)
                     sample=0.001,  # threshold for randomly downsampling higher-frequency words
                     iter=5,  # number of iterations (epochs) over the corpus
                    )

    toc = time.time()  # stop timing
    computationTime = toc - tic
    print("Computing time for training the model:" +str(computationTime)+" seconds")
    wordsInCorpus = sum(len(l) for l in sentences)
    # Guard against a zero elapsed time (coarse clocks, tiny corpora).
    wordsPerSecond = wordsInCorpus/computationTime if computationTime > 0 else float('inf')
    print("Number of words processed per second:",wordsPerSecond)
    print(model)
    print("Most frequent words In model: ", model.wv.index2word[:10])

    return model

# Train on the sentences produced by the preprocessing step; progress is reported through the logging module.
model = training(sentences)

2018-10-15 11:38:38,000 : INFO : collecting all words and their counts
2018-10-15 11:38:38,001 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-10-15 11:38:38,520 : INFO : PROGRESS: at sentence #10000, processed 3069108 words, keeping 90254 word types
2018-10-15 11:38:39,121 : INFO : PROGRESS: at sentence #20000, processed 6464838 words, keeping 126750 word types
2018-10-15 11:38:39,651 : INFO : PROGRESS: at sentence #30000, processed 9542519 words, keeping 148851 word types
2018-10-15 11:38:40,197 : INFO : PROGRESS: at sentence #40000, processed 12697773 words, keeping 167860 word types
2018-10-15 11:38:40,750 : INFO : PROGRESS: at sentence #50000, processed 15824708 words, keeping 185342 word types
2018-10-15 11:38:41,274 : INFO : PROGRESS: at sentence #60000, processed 18758872 words, keeping 201594 word types
2018-10-15 11:38:41,803 : INFO : PROGRESS: at sentence #70000, processed 21710308 words, keeping 217517 word types
2018-10-15 11:38:42,334 : INF

2018-10-15 11:39:27,315 : INFO : worker thread finished; awaiting finish of 11 more threads
2018-10-15 11:39:27,323 : INFO : worker thread finished; awaiting finish of 10 more threads
2018-10-15 11:39:27,324 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-10-15 11:39:27,332 : INFO : worker thread finished; awaiting finish of 8 more threads
2018-10-15 11:39:27,334 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-10-15 11:39:27,339 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-10-15 11:39:27,349 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-10-15 11:39:27,356 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-10-15 11:39:27,359 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-10-15 11:39:27,365 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-10-15 11:39:27,366 : INFO : worker thread finished; awaiting finish of 1 more threa

2018-10-15 11:40:10,456 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-10-15 11:40:10,462 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-10-15 11:40:10,468 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-10-15 11:40:10,473 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-10-15 11:40:10,474 : INFO : EPOCH - 4 : training on 26454392 raw words (25788572 effective words) took 21.5s, 1202220 effective words/s
2018-10-15 11:40:11,486 : INFO : EPOCH 5 - PROGRESS: at 4.52% examples, 1133431 words/s, in_qsize 24, out_qsize 1
2018-10-15 11:40:12,498 : INFO : EPOCH 5 - PROGRESS: at 9.36% examples, 1178690 words/s, in_qsize 25, out_qsize 0
2018-10-15 11:40:13,500 : INFO : EPOCH 5 - PROGRESS: at 13.71% examples, 1184543 words/s, in_qsize 26, out_qsize 1
2018-10-15 11:40:14,518 : INFO : EPOCH 5 - PROGRESS: at 17.91% examples, 1187352 words/s, in_qsize 23, out_qsize 2
2018-10-15 11:40:15,520 : INFO : EPOC

Computing time for training the model:114.142902135849 seconds
Number of words processed per second: 231765.54568863934
Word2Vec(vocab=95917, size=300, alpha=0.025)
Most frequent words In model:  ['said', 'new', 'year', 'like', 'people', 'time', 'years', 'york', 'city', 'percent']


## Free memory

In [61]:
# Discard the parameters that are only needed for training and scoring.
# Use this only when you are sure you are done training the model.
# replace_word_vectors_with_normalized=True would additionally drop the original vectors
# and keep only the normalized ones, which saves a lot of memory; here the originals are kept.
# NOTE(review): the model is saved after this call, so the saved file presumably
# cannot be used to resume training — confirm if that matters.
model.delete_temporary_training_data(replace_word_vectors_with_normalized=False)    

## Evaluation

In [62]:
# Word-similarity benchmark against the pairs in wordsim353.tsv (file expected in the working directory).
# restrict_vocab=50000 limits the check to the first 50,000 vocabulary words.
# The printed tuple is (Pearson result, Spearman result, percentage of pairs with unknown words).
print(" similarity evaluation:", model.wv.evaluate_word_pairs('wordsim353.tsv', restrict_vocab=50000))

  if np.issubdtype(vec.dtype, np.int):
2018-10-15 11:40:32,520 : INFO : Pearson correlation coefficient against wordsim353.tsv: 0.5760
2018-10-15 11:40:32,521 : INFO : Spearman rank-order correlation coefficient against wordsim353.tsv: 0.6124
2018-10-15 11:40:32,521 : INFO : Pairs with unknown words ratio: 4.8%


 similarity evaluation: ((0.5759757665432773, 4.38315812303549e-31), SpearmanrResult(correlation=0.6124050885215764, pvalue=5.72660038108291e-36), 4.815864022662889)


In [63]:
# Analogy benchmark on questions-words.txt, limited to the first 30,000 vocabulary words.
# Per-section accuracies are written to the log; the returned value is kept in `r`.
# NOTE(review): newer gensim releases replace `accuracy` with `evaluate_word_analogies`
# (different return value) — check before upgrading gensim.
r = model.wv.accuracy('questions-words.txt', restrict_vocab=30000)

  
2018-10-15 11:40:33,086 : INFO : precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):
2018-10-15 11:40:34,467 : INFO : capital-common-countries: 57.8% (267/462)
2018-10-15 11:40:37,137 : INFO : capital-world: 54.6% (555/1016)
2018-10-15 11:40:37,367 : INFO : currency: 12.8% (11/86)
2018-10-15 11:40:42,769 : INFO : city-in-state: 15.2% (313/2064)
2018-10-15 11:40:43,662 : INFO : family: 80.1% (274/342)
2018-10-15 11:40:45,942 : INFO : gram1-adjective-to-adverb: 14.8% (129/870)
2018-10-15 11:40:47,267 : INFO : gram2-opposite: 16.2% (82/506)
2018-10-15 11:40:50,742 : INFO : gram3-comparative: 70.9% (945/1332)
2018-10-15 11:40:52,872 : INFO : gram4-superlative: 25.9% (210/812)
2018-10-15 11:40:54,707 : INFO : gram5-present-participle: 47.0% (330/702)
2018-10-15 11:40:58,099 : INFO : gram6-nationality-adjective: 71.3% (926/1299)
2018-10-15 11:41:02,179 : INFO : gram7-past-tense: 57.1% (891/1560)
2018-10-15 11:41:04,779 : INFO : gram8-plural: 58.1% (576/992)

## Save model

In [64]:
# Persist the whole Word2Vec object under <outputPath>/<year>; the log shows the
# vectors being stored next to it as a separate .npy file.
# NOTE(review): outputPath presumably has to exist already — confirm, or create it first.
outputFilePath = os.path.join(outputPath, year)
model.save(outputFilePath) # NOTE(review): the old `binary=False` remark looks like it belongs to the word2vec text-format export, not to save() — confirm
print("----------------------------------------------------")

2018-10-15 11:41:05,895 : INFO : saving Word2Vec object under D:\data\nyt\word2vecModels\2001, separately None
2018-10-15 11:41:05,896 : INFO : storing np array 'vectors' to D:\data\nyt\word2vecModels\2001.wv.vectors.npy
2018-10-15 11:41:06,346 : INFO : not storing attribute vectors_norm
2018-10-15 11:41:06,347 : INFO : not storing attribute cum_table
2018-10-15 11:41:06,556 : INFO : saved D:\data\nyt\word2vecModels\2001


----------------------------------------------------


## Sanity check to make sure models have been saved

In [48]:
# Sanity check: reload the model saved for `year` and look at its most frequent words.
# NOTE(review): the stored output of this cell is for year 2000 (execution count 48,
# i.e. an earlier run than the 2001 training cells) — re-run after saving to refresh it.
from gensim.models.word2vec import Word2Vec
import gensim


print(year)
filePath = os.path.join(outputPath, year)
# This rebinds the notebook-level `model` to the object loaded from disk.
# NOTE(review): the remark about continued training may not hold for models saved
# after delete_temporary_training_data() — confirm.
model = gensim.models.Word2Vec.load(filePath) # you can continue training with the loaded model!
print(model)
# Last expression of the cell: displays the first ten entries of the vocabulary index.
model.wv.index2word[:10]

2018-10-15 11:31:30,203 : INFO : loading Word2Vec object from D:\data\nyt\word2vecModels\2000


2000


2018-10-15 11:31:30,860 : INFO : loading wv recursively from D:\data\nyt\word2vecModels\2000.wv.* with mmap=None
2018-10-15 11:31:30,861 : INFO : loading vectors from D:\data\nyt\word2vecModels\2000.wv.vectors.npy with mmap=None
2018-10-15 11:31:30,925 : INFO : setting ignored attribute vectors_norm to None
2018-10-15 11:31:30,925 : INFO : loading vocabulary recursively from D:\data\nyt\word2vecModels\2000.vocabulary.* with mmap=None
2018-10-15 11:31:30,926 : INFO : loading trainables recursively from D:\data\nyt\word2vecModels\2000.trainables.* with mmap=None
2018-10-15 11:31:30,926 : INFO : setting ignored attribute cum_table to None
2018-10-15 11:31:30,927 : INFO : loaded D:\data\nyt\word2vecModels\2000


Word2Vec(vocab=97229, size=300, alpha=0.025)


['said',
 'new',
 'year',
 'like',
 'people',
 'time',
 'years',
 'york',
 'state',
 'company']

In [51]:
# Quick qualitative check: the words the model places closest to 'man', with their similarity scores.
model.wv.most_similar('man')

  if np.issubdtype(vec.dtype, np.int):


[('woman', 0.657943069934845),
 ('policeman', 0.5610487461090088),
 ('gentleman', 0.5473320484161377),
 ('men', 0.5011500120162964),
 ('guy', 0.4981834292411804),
 ('balding', 0.4937747120857239),
 ('gruff', 0.4927574396133423),
 ('assailant', 0.4926893711090088),
 ('drifter', 0.49006587266921997),
 ('lover', 0.4891093969345093)]

In [3]:
def similarityMetricsAlternative(model):
    """Score the term 'diversity' against two externally defined term lists.

    ``listDemographicsTerms`` and ``listIntellectualTerms`` are read from the
    enclosing (notebook) namespace. Terms missing from ``model.wv.vocab`` are
    dropped before each list is handed to ``model.wv.n_similarity`` together
    with ``['diversity']``.

    Returns
    -------
    tuple
        ``(similarityDemographics, similarityIntellectual)`` exactly as
        returned by ``n_similarity``.
    """
    #anchorTerms = ['diversity', 'diverse']
    anchorTerms = ['diversity']

    def keepKnown(candidates):
        # Keep only the candidates the model has a vector for, preserving order.
        return [term for term in candidates if term in model.wv.vocab]

    demographicsInVocab = keepKnown(listDemographicsTerms)
    intellectualInVocab = keepKnown(listIntellectualTerms)

    return (
        model.wv.n_similarity(anchorTerms, demographicsInVocab),
        model.wv.n_similarity(anchorTerms, intellectualInVocab),
    )

In [4]:
# The term lists come from the project-local commonVariables module (not shown in this notebook).
from commonVariables import listDemographicsTerms, listIntellectualTerms
similarityDemographics2, similarityIntellectual2= similarityMetricsAlternative(model)
# Bare tuple as the last expression so the notebook displays both scores.
similarityDemographics2,similarityIntellectual2

(0.3289945841610287, 0.09222000420760601)