### Logging

In [2]:
import logging
#logging.getLogger('').handlers = []  #To delete previous logging configuration

# Configure the root logger: INFO and above, one timestamped line per record.
# To log to both the console and a file, additionally pass something like
#   handlers=[logging.FileHandler("./<name>.log"), logging.StreamHandler()]
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Batch Model Training

In [1]:
import os
#from word2vecTraining import preprocessing, training, evaluation

# Where the text corpus is located.
# The NYT_CORPUS_DIR environment variable overrides the default, so the notebook
# can run on machines that do not have this exact drive layout.
inputPath = os.path.normpath(os.environ.get('NYT_CORPUS_DIR', r'D:\data\nyt'))
#inputPath = os.path.normpath(r'.')

# Get all CSV Text Corpus files In target Path
#onlyfiles = [f for f in listdir(inputPath) if isfile(join(inputPath, f)) and f.endswith('.csv')]
#onlyfiles = onlyfiles[:2] #For testing
# print(onlyfiles)
# len(onlyfiles)

# Where the output word2vec models will be stored.
# The NYT_MODEL_DIR environment variable overrides the default.
outputPath = os.path.normpath(os.environ.get('NYT_MODEL_DIR', r'D:\data\nyt\word2vecModels'))
#outputPath = os.path.normpath(r'.\models')

## Preprocessing

In [4]:
import re
import csv
import time
from gensim.parsing.preprocessing import * #provides a number of convenience preprocessing functions optimized for speed
from gensim.models.word2vec import Word2Vec
import gensim
from multiprocessing import cpu_count




def preprocessing(file):
    """Read one CSV corpus file and return it as a list of tokenised sentences.

    For every CSV row that has more than one column, the first column is split
    on newlines; each resulting line is run through the filter chain below and
    kept as one sentence if more than ``wordThreshold`` tokens survive.

    Parameters
    ----------
    file : str
        Path of a UTF-8, comma-separated CSV file (fields quoted with ``"``).

    Returns
    -------
    list of list of str
        One token list per retained line; empty if nothing passed the filters.

    Notes
    -----
    Raises the ``csv`` module's field size limit and prints timing and corpus
    statistics as a sanity check.
    """
    CUSTOM_FILTERS = [lambda x: x.lower(), #To lowercase
                      # Strip URLs. \S+ stops at the first whitespace, so only the URL itself
                      # is removed (the previous greedy '.*\s' also swallowed the text that
                      # followed the URL and missed URLs at the very end of a line).
                      lambda text: re.sub(r'https?://\S+', '', text),
                      #split_alphanum, #Add spaces between digits & letters in s using RE_AL_NUM.
                      strip_tags, #Remove tags from s using RE_TAGS.
                      strip_non_alphanum,#Remove non-alphabetic characters from s using RE_NONALPHA.
                      strip_punctuation, #Replace punctuation characters with spaces in s using RE_PUNCT.
                      strip_numeric, #Remove digits from s using RE_NUMERIC.
                      strip_multiple_whitespaces,#Collapse repeated whitespace (spaces, tabs, line breaks) using RE_WHITESPACE.
                      remove_stopwords, # Set of 339 stopwords from Stone, Denis, Kwantes (2010).
                      #lambda x:" ".join(w for w in x.split() if w not in stopword_file) #Custom stopwords
                      lambda x: strip_short(x, minsize=3), #Remove words with length lesser than minsize from s.
                      #stem_text #Transform s into lowercase and stem it.
                     ]

    # A line is kept only when it has MORE than this many tokens after filtering
    # (i.e. lines with exactly wordThreshold tokens are dropped too).
    wordThreshold = 5

    tic = time.time() # Start timing

    # Some CSV fields are very large; lift the csv module's default field size cap.
    csv.field_size_limit(2147483647)
    #csv.field_size_limit(sys.maxsize)

    sentences = []
    with open(file, 'r', newline='', encoding="utf-8") as inpFile:
        csvObject = csv.reader(inpFile, delimiter=',', quotechar='"')

        for csvEntry in csvObject:
            if len(csvEntry) <= 1:
                continue

            # IMPORTANT: Word2vec gathers co-occurrence statistics sentence by sentence, so
            # sentence boundaries matter. Loading a whole document as one "sentence" would make
            # training run a full window pass over a very long token list. Hence each CSV entry
            # is split into paragraphs on '\n' and every paragraph is treated as one sentence.
            # NOTE(review): the original comment said column 0 is the URL and column 1 the text,
            # yet the code reads column 0 -- confirm the column layout of the CSV files.
            lines = csvEntry[0].split('\n')

            for line in lines: #Different elements appear in their own line
                words = preprocess_string(line, CUSTOM_FILTERS)
                if len(words) > wordThreshold:
                    sentences.append(words)

    toc = time.time() # Stop timing
    computationTime = toc-tic

    print("Reading Corpus file and preprocessing time:" +str(computationTime)+" seconds")

    print(" printing Top 2 and last sentences For sanity check")
    for i, s in enumerate(sentences[0:2]):
        print(i,s)
    # Guard the "last sentence" peek: sentences[-1] raises IndexError on an empty corpus.
    if sentences:
        print(len(sentences),sentences[-1])
    else:
        print(len(sentences),None)

    print(" stats about Corpus read from file")
    wordsInCorpus = sum(len(s) for s in sentences)
    print("Number of words in corpus:",wordsInCorpus)
    print("Number of sentences in corpus:",len(sentences))
    return sentences

def training(sentences):
    """Train a CBOW Word2Vec model with negative sampling on ``sentences``.

    Parameters
    ----------
    sentences : list of list of str
        Tokenised sentences. The collection is traversed again after training
        to count words, so it must be re-iterable (a list, not a one-shot
        generator).

    Returns
    -------
    Word2Vec
        The trained model.

    Notes
    -----
    Prints the training time, throughput, the model summary and the ten most
    frequent words.
    NOTE(review): the keyword names ``size`` / ``iter`` and ``wv.index2word``
    look like the gensim 3.x API -- confirm against the installed version.
    """
    tic = time.time() # Start timing

    # Leave six cores free for other work, but never ask for fewer than one worker:
    # cpu_count()-6 is zero or negative on machines with six or fewer cores.
    workerCount = max(1, cpu_count()-6)

    #For the Score method to work hs And negative Parameters need to be specified
    #A good heuristic For Word vectors dimensions `size` thats frequently used is the square-root of the length of the vocabulary, after pre-processing
    model = Word2Vec(sentences, # A list of token lists; for larger corpora consider an iterable that streams sentences from disk/network
                     sg=0, #Training algorithm. If 1, skip-gram is employed; otherwise, CBOW is used
                     size=300,#Dimensionality of the feature vectors
                     window=10,# The maximum distance between the current and predicted word within a sentence
                     min_count=5, #Ignores all words with total frequency lower than this
                     workers=workerCount, #Number of worker threads used to train the model
                     hs = 0, # If 1, hierarchical softmax is used. If 0 and negative is non-zero, negative sampling is used.
                     negative = 10, # If > 0, negative sampling is used; how many "noise words" are drawn (usually between 5-20)
                     sample = 0.001, # Threshold for randomly downsampling higher-frequency words, useful range is (0, 1e-5).
                     iter = 5, # Number of iterations (epochs) over the corpus
                    )

    toc = time.time() # Stop timing
    computationTime = toc-tic
    print("Computing time for training the model:" +str(computationTime)+" seconds")
    wordsInCorpus = sum(len(l) for l in sentences)
    print("Number of words processed per second:",wordsInCorpus/computationTime)
    print(model)
    print("Most frequent words In model: ", model.wv.index2word[:10])

    return model

In [5]:
# Train, evaluate and save one Word2Vec model per yearly corpus file (<inputPath>/<year>.csv).
years = ['1984','1983','1982','1981']

# Make sure the target folder exists up front, so model.save() cannot fail on a
# missing directory after minutes of preprocessing and training.
os.makedirs(outputPath, exist_ok=True)

for year in years:
    print("starting the processing of " + year)
    inputFilePath = os.path.join(inputPath, year+'.csv')
    sentences = preprocessing(inputFilePath)
    model = training(sentences)
    #Discard parameters that are used in training and score. Use if you’re sure you’re done training a model.
    #If replace_word_vectors_with_normalized is set, forget the original vectors and only keep
    #the normalized ones = saves lots of memory!
    model.delete_temporary_training_data(replace_word_vectors_with_normalized=False)
    # Evaluation (the .tsv / .txt benchmark files are read relative to the working directory)
    print(" similarity evaluation:", model.wv.evaluate_word_pairs('wordsim353.tsv', restrict_vocab=50000))
    # Analogies; the returned value is kept in `r` but not used further in this cell
    r = model.wv.accuracy('questions-words.txt', restrict_vocab=30000)
    outputFilePath = os.path.join(outputPath, year)
    model.save(outputFilePath) # stores the full model under <outputPath>/<year>
    print("----------------------------------------------------")

starting the processing of 1984
Reading Corpus file and preprocessing time:61.15896129608154 seconds
 printing Top 2 and last sentences For sanity check
0 ['women', 'long', 'role', 'models', 'road', 'equality', 'acquired', 'share', 'heroes', 'year', 'woman', 'run', 'vice', 'presidency', 'ticket', 'major', 'party', 'american', 'woman', 'walk', 'space', 'team', 'american', 'female', 'athletes', 'ran', 'swam', 'cycled', 'somersaulted', 'dribbled', 'basketballs', 'competed', 'win', 'gold', 'medals', 'summer', 'olympics', 'risks', 'taken', 'women', 'barriers', 'broken', 'victories', 'savored', 'political', 'defeats', 'suffered', 'visible', 'seeing', 'faces', 'television', 'tremendous', 'high', 'said', 'joanne', 'finney', 'brooklyn', 'radiologist', 'speaking', 'geraldine', 'ferraro', 'democratic', 'candidate', 'kathryn', 'sullivan', 'astronaut', 'joan', 'benoit', 'winner', 'olympic', 'women', 'marathon', 'felt', 'good', 'women', 'taking', 'chances', 'sweating', 'going', 'said', 'won', 'winne

Computing time for training the model:102.06591749191284 seconds
Number of words processed per second: 221268.83836408405
Word2Vec(vocab=83491, size=300, alpha=0.025)
Most frequent words In model:  ['said', 'new', 'year', 'years', 'percent', 'president', 'york', 'million', 'time', 'people']
 similarity evaluation: ((0.574472005049634, 3.694875638649784e-31), SpearmanrResult(correlation=0.5956204217495568, pvalue=6.282775383672726e-34), 3.9660056657223794)


  if np.issubdtype(vec.dtype, np.int):
  from ipykernel import kernelapp as app


----------------------------------------------------
starting the processing of 1983
Reading Corpus file and preprocessing time:58.68063259124756 seconds
 printing Top 2 and last sentences For sanity check
0 ['room', 'christopher', 'gruhn', 'classrooms', 'agnes', 'cathedral', 'elementary', 'school', 'morning', 'sound', 'sniffling', 'mixed', 'rain', 'pounding', 'windows', 'particularly', 'sad', 'place', 'discuss', 'murder', 'christopher', 'gruhn', 'michael', 'glaubinger', 'chief', 'security', 'rockville', 'centre', 'roman', 'catholic', 'diocese', 'told', 'children', 'tugged', 'nervously', 'school', 'uniforms', 'lot', 'sick', 'people', 'chris', 'death', 'reality', 'glaubinger', 'spent', 'morning', 'going', 'room', 'room', 'agnes', 'describing', 'briefly', 'students', 'stabbing', 'year', 'old', 'eighth', 'grader', 'killed', 'newspaper', 'delivery', 'route', 'sunday', 'morning', 'classroom', 'glaubinger', 'ended', 'asking', 'chris', 'classmates', 'questions', 'boys', 'raised', 'hands', 'gi

Computing time for training the model:96.13812279701233 seconds
Number of words processed per second: 224291.7000317184
Word2Vec(vocab=82695, size=300, alpha=0.025)
Most frequent words In model:  ['said', 'new', 'year', 'years', 'percent', 'million', 'york', 'president', 'state', 'time']
 similarity evaluation: ((0.5611929177337013, 2.8497407633005855e-29), SpearmanrResult(correlation=0.5917034200198885, pvalue=4.066290246953666e-33), 4.815864022662889)
----------------------------------------------------
starting the processing of 1982
Reading Corpus file and preprocessing time:59.27297115325928 seconds
 printing Top 2 and last sentences For sanity check
0 ['years', 'martha', 'badger', 'lived', 'area', 'queens', 'called', 'crescents', 'burglaries', 'near', 'years', 'ago', 'blocks', 'away', 'second', 'closer', 'time', 'place', 'occurred', 'month', 'street', 'kicked', 'door', 'said', 'shudder', 'looking', 'door', 'hard', 'think', 'crescents', 'high', 'crime', 'area', 'said', 'capt', 'wi

Computing time for training the model:97.46725821495056 seconds
Number of words processed per second: 224853.6011105144
Word2Vec(vocab=84564, size=300, alpha=0.025)
Most frequent words In model:  ['said', 'new', 'year', 'years', 'percent', 'president', 'state', 'york', 'million', 'time']
 similarity evaluation: ((0.6002562361039608, 2.2839067883364727e-34), SpearmanrResult(correlation=0.6123652795100302, pvalue=4.5801935352469934e-36), 4.53257790368272)
----------------------------------------------------
starting the processing of 1981
Reading Corpus file and preprocessing time:60.480207204818726 seconds
 printing Top 2 and last sentences For sanity check
0 ['saudi', 'arabia', 'minister', 'oil', 'said', 'yesterday', 'country', 'considered', 'israel', 'immediate', 'threat', 'soviet', 'union', 'sheik', 'ahmed', 'zaki', 'yamani', 'told', 'standing', 'room', 'audience', 'plaza', 'hotel', 'saudi', 'arabia', 'faced', 'threats', 'international', 'communism', 'israel', 'said', 'threat', 'bols

Computing time for training the model:101.7693829536438 seconds
Number of words processed per second: 222719.79393178047
Word2Vec(vocab=85934, size=300, alpha=0.025)
Most frequent words In model:  ['said', 'new', 'year', 'years', 'percent', 'president', 'york', 'million', 'state', 'time']
 similarity evaluation: ((0.588820297228682, 9.775371129377608e-33), SpearmanrResult(correlation=0.6070154987506634, pvalue=3.314366412352933e-35), 4.815864022662889)
----------------------------------------------------


## Sanity check to make sure models have been saved

In [6]:
#Sanity check to make sure models have been saved
from gensim.models.word2vec import Word2Vec
import gensim

# Reload one previously saved model from outputPath and peek at it.
year = '2002'
print(year)

filePath = os.path.join(outputPath, year)
# Word2Vec is the class imported at the top of this cell; the loaded model
# could be trained further if needed.
model = Word2Vec.load(filePath)
print(model)

# Last expression of the cell: the first ten entries of the model's word index.
model.wv.index2word[:10]

2018-10-15 11:55:10,585 : INFO : loading Word2Vec object from D:\data\nyt\word2vecModels\2002


2002


2018-10-15 11:55:11,227 : INFO : loading wv recursively from D:\data\nyt\word2vecModels\2002.wv.* with mmap=None
2018-10-15 11:55:11,228 : INFO : loading vectors from D:\data\nyt\word2vecModels\2002.wv.vectors.npy with mmap=None
2018-10-15 11:55:11,285 : INFO : setting ignored attribute vectors_norm to None
2018-10-15 11:55:11,286 : INFO : loading vocabulary recursively from D:\data\nyt\word2vecModels\2002.vocabulary.* with mmap=None
2018-10-15 11:55:11,287 : INFO : loading trainables recursively from D:\data\nyt\word2vecModels\2002.trainables.* with mmap=None
2018-10-15 11:55:11,287 : INFO : setting ignored attribute cum_table to None
2018-10-15 11:55:11,288 : INFO : loaded D:\data\nyt\word2vecModels\2002


Word2Vec(vocab=97437, size=300, alpha=0.025)


['said',
 'new',
 'year',
 'like',
 'people',
 'time',
 'years',
 'company',
 'york',
 'percent']

In [51]:
# Query the currently loaded `model` for the words most similar to 'man'.
model.wv.most_similar('man')

  if np.issubdtype(vec.dtype, np.int):


[('woman', 0.657943069934845),
 ('policeman', 0.5610487461090088),
 ('gentleman', 0.5473320484161377),
 ('men', 0.5011500120162964),
 ('guy', 0.4981834292411804),
 ('balding', 0.4937747120857239),
 ('gruff', 0.4927574396133423),
 ('assailant', 0.4926893711090088),
 ('drifter', 0.49006587266921997),
 ('lover', 0.4891093969345093)]

In [3]:
def similarityMetricsAlternative(model, demographicsTerms=None, intellectualTerms=None):
    """Similarity of the term 'diversity' to a demographic and an intellectual term list.

    Parameters
    ----------
    model
        Object exposing ``wv.vocab`` (membership test) and ``wv.n_similarity``.
    demographicsTerms : list of str, optional
        Demographic terms. Defaults to the notebook-level ``listDemographicsTerms``.
    intellectualTerms : list of str, optional
        Intellectual terms. Defaults to the notebook-level ``listIntellectualTerms``.

    Returns
    -------
    tuple
        ``(similarityDemographics, similarityIntellectual)`` as returned by
        ``model.wv.n_similarity``.

    Notes
    -----
    Terms missing from ``model.wv.vocab`` are dropped from the two term lists
    before comparing. The query list (``['diversity']``) is not filtered.
    """
    #listDiversityTerms=['diversity', 'diverse']
    listDiversityTerms = ['diversity']

    # Fall back to the lists defined at notebook level only when the caller
    # passes none, so the function no longer has to rely on names imported in a later cell.
    if demographicsTerms is None:
        demographicsTerms = listDemographicsTerms
    if intellectualTerms is None:
        intellectualTerms = listIntellectualTerms

    vocab = model.wv.vocab
    listDemographicsTermsUpdated = [w for w in demographicsTerms if w in vocab]
    listIntellectualTermsUpdated = [w for w in intellectualTerms if w in vocab]

    similarityDemographics = model.wv.n_similarity(listDiversityTerms, listDemographicsTermsUpdated)
    similarityIntellectual = model.wv.n_similarity(listDiversityTerms, listIntellectualTermsUpdated)
    return (similarityDemographics, similarityIntellectual)

In [4]:
from commonVariables import listDemographicsTerms, listIntellectualTerms
# Score the currently loaded `model` with the term lists imported above.
similarityDemographics2, similarityIntellectual2= similarityMetricsAlternative(model)
# Bare tuple as the last expression so the notebook displays both scores.
similarityDemographics2,similarityIntellectual2

(0.3289945841610287, 0.09222000420760601)