In [7]:
import pandas as pd
import numpy as np
from nltk.tokenize import regexp_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import gensim.corpora as corpora
import gensim
from datetime import datetime, timedelta
import time
import pickle
import pprint

import nltk
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fergu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\fergu\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\fergu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Model Topics Using LDA

In [2]:
''' This function takes in a df of pre-processed articles (or any other documents) and creates the
user-defined number of LDA topics. How many topics to use was a matter of trial and error, since any run of the function
took several hours, and so a rigorous determination as to the optimal number of topics was not practicable. 
There are  print statements throughout the code, just as a marker of progress and to show that this long-running 
algo is computing correctly'''

def lda_compute(df,num_topics,passes=7):
    start = time.time()
    print(f'The time is {datetime.now()}')
    # Define id2word
    id2word = corpora.Dictionary(df['article_words'])
    
    articles = df['article_words']
      
    corpus = [id2word.doc2bow(article) for article in articles]
         
    print('Now for LDA modeling...')
    
    # Instantiate LDA with desired # of topics and number of LDA passes
    lda_model = gensim.models.LdaModel(corpus=corpus,id2word=id2word,num_topics=num_topics,
                                           passes=passes,random_state=4)

    print('LDA done!')
    print('-' * 10)
    doc_lda = lda_model[corpus]
    
    
    lda_lists = [lda_model[article] for article in corpus]
    print(f'lda_lists[-1]: {lda_lists[-1]}')
    lda_coeffs = [[pair[1] for pair in article] for article in lda_lists]
    print(f'lda_coeffs[-1]: {lda_coeffs[-1]}')
    lda_inds = [[pair[0] for pair in article] for article in lda_lists]
    print(f'lda_inds[-1]: {lda_inds[-1]}')
    
    # Identify any 'rogue' rows where the LDA model was not able to assign any word to a topic
    rogues = [i for i in range(len(lda_inds)) if len(lda_inds[i])==0]
    
    lda_lists = [lda_lists[i] for i in range(len(lda_lists)) if i not in rogues]
    lda_coeffs = [lda_coeffs[i] for i in range(len(lda_coeffs)) if i not in rogues]
    lda_inds = [lda_inds[i] for i in range(len(lda_inds)) if i not in rogues]
    
    # Drop rogue rows as they will break further processing
    df.drop(rogues,inplace=True,axis=0)
    
    # Identify the lDA topic having the largest LDA coefficient
    # This will be considered the primary topic to which a given article 'belongs'
    lda_argmax = [np.argmax(tup) for tup in lda_coeffs]
      
    print(f'lda_argmax[-1]: {lda_argmax[-1]}')
    
    # Identify the main topic for every article
    lda_topics = [lda_inds[i][lda_argmax[i]] for i in range(len(lda_lists))]
    
    # Set a threshold for reporting any LDA topics beyond the primary one. 
    # Example: If primary topic has an LDA coeff of 0.60 and the sec_threshold is set to 0.5, 
    # any other topic would have to have an LDA coeff >= 0.3 to be considered a secondary topic. 
    # In this example, there can only be (at most) one secondary topic. If the primary topic had a 
    # lower LDA coeff, there could potentiallly be multiple secondary topics
    sec_threshold = 0.5
    

    # Define secondary topics (if any) according to the above criterion.
    secondary_topics = [[pair[0] for pair in lda_lists[i] if 1 > pair[1] / lda_coeffs[i][lda_argmax[i]] >= sec_threshold] for i in range(len(lda_lists))]
    
    # Put the primary and secondary topics into our dataframe
    df['lda_topic'] = lda_topics    
    df['other_topics'] = secondary_topics   
    
    interval = round((time.time() - start)/60,2)
    print(f'That  took {interval} mins.')
    return df, lda_model, corpus#, doc_lda

In [32]:
id2word = corpora.Dictionary(df_ht['article_words'])
filename = 'models/id2word_265.pkl'
pickle.dump(id2word,open(filename,'wb'))

In [34]:
# Run LDA on the dataset, setting desired number of topics and passes
df_ht, lda_model, corpus = lda_compute(df_ht,num_topics=265,passes=14)

Now for LDA modeling...
LDA done!
----------
lda_lists[-1]: [(26, 0.06111979), (33, 0.05380561), (60, 0.011781786), (61, 0.15894896), (93, 0.08945436), (96, 0.083074786), (110, 0.01735857), (114, 0.01212051), (120, 0.01918941), (158, 0.027230652), (167, 0.01825403), (188, 0.021116503), (201, 0.035287004), (222, 0.019063195), (235, 0.015335164), (252, 0.24858978), (257, 0.033509772), (260, 0.019460177)]
lda_coeffs[-1]: [0.06111979, 0.05380561, 0.011781786, 0.15894896, 0.08945436, 0.083074786, 0.01735857, 0.01212051, 0.01918941, 0.027230652, 0.01825403, 0.021116503, 0.035287004, 0.019063195, 0.015335164, 0.24858978, 0.033509772, 0.019460177]
lda_inds[-1]: [26, 33, 60, 61, 93, 96, 110, 114, 120, 158, 167, 188, 201, 222, 235, 252, 257, 260]
lda_argmax[-1]: 15
That  took 297.55 mins.


In [37]:
# Save df, model for later use
df_ht.to_csv('models/14passes_265_topics_df_ht_lda.csv')
lda_model.save('models/14passes_265_topics_ldamodel_ht')

In [38]:
df_ht.head(2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,date,year,month,day,author,title,article,url,section,publication,article_words,lda_topic,other_topics
0,7,7,2018-05-02 17:09:00,2018,5.0,2,Caroline Williams,You Can Trick Your Brain Into Being More Focused,If only every day could be like this. You can’...,https://www.vice.com/en_us/article/9kgp4v/how-...,Health,Vice,"[every, day, could, like, put, finger, maybe, ...",257,"[61, 166, 227]"
1,41,41,2018-10-05 19:35:00,2018,10.0,5,Caroline Haskins,Trash Geyser Spews Garbage In Yellowstone Nati...,Geyser eruptions are known as one of the most...,https://www.vice.com/en_us/article/evwq47/ear-...,Tech by VICE,Vice,"[geyser, eruption, know, one, beautiful, event...",227,[]


### Naming Topics
Once LDA modeling is complete, we have to read through a sample of article titles, in order to discern a unifying theme and give an appropriate name to each topic. These topic names are added to a master dictionary. When predicting on test data, the numerical model output in the form of an LDA topic number, is mapped to a meaningful name according to the topic dictionary.

In [6]:
df_ht = pd.read_csv('models/14passes_265_topics_df_ht_lda.csv')

In [149]:
filename = 'models/14passes_265_topics_dict.pkl'
with open(filename,'wb') as file:
    pickle.dump(topics_dict,file)  

In [92]:
# Import trained LDA model
lda_model = gensim.models.ldamodel.LdaModel.load('models/14passes_265_topics_ldamodel_ht') 

In [90]:
print(lda_model)

LdaModel(num_terms=430329, num_topics=185, decay=0.5, chunksize=2000)


In [71]:
# A version of the hand-labeled topic dictionary

topics_dict = {0:'mexican telecommunications',1:'education & student life',2:'tfr',3:'intl big business',
               4:'drugs, clinical trials, approvals',5:'robots & robotics',6:'indian tech & business',7:'meat substitutes',
               8:'fish',9:'israeli tech & business',10:'tech m&a',11:'11',12:'lodging marketplaces',13:'laptops, mobile devices, gadgets',
               14:'audio tech',15:'climate science',16:'intl govt relations',17:'digital advertising',18:'tfr',19:'marijuana & CBD',
               20:'ridesharing services e-scooters & e-bikes',21:'tfr',22:'wireless charging technology',23:'diversity & discrimination',
               24:'basketball',25:'mass transit',26:'e-commerce & online delivery',27:'european alternative energy',28:'menstrual health',
               29:'cancer treatments & trials',30:'tfr',31:'apple devices',32:'motor racing',33:'food poisoning, allergies, household chemicals',
               34:'health insurance & washington legislation',35:'asian business',36:'astrophysics',37:'chinese big business',
               38:'latino-american business',39:'international trade, subsidies & tariffs',40:'automotive technology',41:'tfr',
               42:'virtual reality',43:'organ donation',44:'audio streaming and apps',45:'photo & video sharing',
               46:'virtual assistants, smart home tech',47:'',48:'international oil & gas',
               49:'data breaches & data privacy',50:'livestock viral diseases',51:'skincare',
               52:'endangered species, illegal animal trading',53:'tfr',54:'data protection & govt intervention',55:'food & diet',
               56:'ice hockey',57:'tfr',58:'tfr',59:'incarceration',60:'government legislation',61:'6',62:'power & renewable energy',
               63:'drones & unmanned aerial tech',64:'5g % mobile networks',65:'global big business',66:'fitness, exercise & diet',
               67:'us politics',68:'mid-east business',69:'mood disorders',70:'studies and polls',71:'autonomous vehicles',
               72:'product recalls',73:'73',74:'big business earnings',75:'facebook govt intervention',76:'tv and movies',
               77:'hacks & data theft',78:'gaming platforms',79:'dakota',80:'astrophysics',81:'global currency, equity and fixed income',
               82:'brazilian/french business',83:'italian politics',84:'tfr',85:'battery technology',86:'tfr',87:'tech investments, vc, new initiatives',
               88:'plants & trees',89:'guns and shootings',90:'venezuelan business',91:'space exploration',92:'sleep health',
               93:'infectious diseases',94:'big tech govt intervention',95:'sexual assault & harassment',96:'fake news, misinformation, dangerous online content',
               97:'tfr',98:'tfr',99:'uk',100:'tfr',101:'cryptocurrency',102:'human rights in developing world',103:'health aspects of body art',104:'tfr',
               105:'asia activism',106:'twitter',107:'cybersecurity',108:'infant nutrition',109:'mining',110:'cities & urban life',111:'mid-east politics, terrorism',
               112:'art and design',113:'wearable devices',114:'food delivery',115:'tfr',116:'north korea politics',117:'smoking, vaping & e-cigs',
               118:'gene editing, genetic engineering',119:'water resources',120:'bloodborne diseases & vaccines',121:'australasian business',
               122:'international equities/derivatives',123:'sports/esports',124:'rare earth minerals',125:'intl politics',
               126:'cutting-edge mobile devices, gadgets',127:'disney',128:'tfr',129:'quantum computing',130:'intl macroeconomics',
               131:'fintech & payments',132:'weed & legal matters',133:'puerto rico',134:'tfr',135:'microsoft apps, os, devices',136:'medical devices',
               137:'telecoms networks & providers',138:'intl banking & finance',139:'financial markets',140:'smartphone tech',
               141:'medical devices',142:'earth sciences',143:'global currency, equity and fixed income',144:'faang',145:'tfr',146:'sports',
               147:'climate science & technology',148:'148',149:'price movements & fluctuations',150:'baseball',151:'meteorology',152:'151',
               153:'aliens',154:'tfr',155:'automotive technology',156:'tech investments, vc, new initiatives',157:'artificial intelligence',
               158:'158',159:'tfr',160:'chips & processors',161:'tfr',162:'astronomy & astrophysics',163:'bacteria & viruses',164:'intl stock markets',
               165:'tfr',166:'the human brain',167:'fashion & apparel',168:'tfr',169:'tfr',170:'tfr',171:'tfr',172:'blood and blood diseases',
               173:'tfr',174:'global currency, equity and fixed income',175:'software/service glitches, bugs & outages',176:'tfr',177:'video games',
               178:'natural & man-made disasters',179:'tech investments, vc, new initiatives',180:'tfr',181:'nuclear tech',182:'tfr',183:'artificial intelligence',
               184:'tfr',185:'sex and relationships',186:'tfr',187:'commodities, natural resources',188:'clinical care',189:'sports',190:'european macroeconomics',
               191:'19',192:'intl banking & finance',193:'african govt affairs',194:'canadian business',195:'tfr',196:'tfr',197:'mid-east oil & gas',
               198:'emoji',199:'big business earnings',200:'bloodborne diseases & vaccines',201:'tfr',202:'tfr',203:'litigation & court rulings',
               204:'tfr',205:'tfr',206:'messaging platforms',207:'aviation & aerospace',208:'video games',209:'tfr',210:'amazon',
               211:'birth control & reproductive health',212:'212',213:'european business',214:'marine life & maritime affairs',215:'215',
               216:'216',217:'217',218:'tfr',219:'code & software',220:'co-working spaces',221:'221',222:"children's health",
               223:'computer peripherals',224:'astronomy & astrophysics',225:'tfr',226:'animals',227:'video games',228:'tfr',229:'international oil & gas',
               230:'satellites & rockets',231:'marine life & maritime affairs',232:'dark web & online privacy',233:'tfr',234:'supermarkets',
               235:'global currency, equity and fixed income',236:'tfr',237:'tech investments, vc, new initiatives',238:'tfr',239:'autonomous vehicles',       
               240:'tfr',241:'lgbtq issues',242:'surveillance tech',243:'tfr',244:'ecology, environment & archaeology',245:'mobile networks, carriers & eqpt',
               246:'tfr',247:'tfr',248:'global currency, equity and fixed income5',249:'tfr',250:'space exploration',251:'tfr',
               252:'employment issues',253:'tfr',254:'tfr',255:'cutting-edge mobile devices, gadgets',256:'256',257:'mental health & disorders',
               258:'executive level management',259:'tfr',260:'prescription drugs',261:'bees',262:'tfr',263:'experimental studies',
               264:'apps, gadgets & devices'
    
              }
               

In [123]:
filename = 'models/topics_dict.pkl'
pickle.dump(topics_dict,open(filename,'wb'))  

In [109]:
# Iterate through the LDA topics
# For each topic, print out a number of article titles and try to discern a common theme 
# Name the topic accordingly
n = 61

test_df = df_ht[df_ht['lda_topic'] == n]
print(f'{len(test_df)} records:','\n','*' * 10)
for i in range(550):
    try:      
        print(test_df.iloc[i,8])
        print(test_df.iloc[i,11])
        print('-' * 15)
    except:
        break

7623 records: 
 **********
Discussing the Enduring Appeal of Abe with Oddworld Creator Lorne Lanning
Games
---------------
In Search for Any Excuse But Guns, Trump to Meet With Video Game Industry
Games
---------------
Shortages of Injectable Estrogen Are Screwing Over Trans Women
Health
---------------
What Are Whip Its? The Side Effects and Dangers of Doing Them
Health
---------------
I Worry That Everything I Eat Is Going to Give Me Food Poisoning
Health
---------------
Bloggers, Rejoice: Gage Skidmore Is at CPAC
Tech by VICE
---------------
​Letters to the Editor: Waze Politics and Very Expensive Bicycles
Tech by VICE
---------------
Meet the 28-Year-Old Geneticist Fighting the Future of Superbugs
Health
---------------
Is It Worse to Sit or Squat on a Public Toilet?
Health
---------------
19 Everyday Things That Trigger My OCD
Health
---------------
The Creator of the First Online Dating Site Is Still Dating Online
Tech by VICE
---------------
The Case for Giving Everyone Free Mon

In [74]:
''' These functions, as the names suggest, are for cleaning queries and predicting LDA topics on short user-input data. 
The output from the topic_predict function is the name of the primary topic, followed by the name(s) of any secondary
topic(s)'''

def clean_query(query):
    '''Tokenize, remove special characters, lemmatize '''
    tokenized = regexp_tokenize(query,pattern)
    indiv_words = [word for word in tokenized if word.isalpha()]
    lemmatized = func_lemmatize(indiv_words)
    words = [word.lower() for word in lemmatized if word not in stop]

    return words


def topic_predict(query):  
    tokenized_input = clean_query(query)
    
    corpus = id2word.doc2bow(tokenized_input)
    
    np.random.seed(4)
    output = list(lda_model[corpus])
    
    ordered = sorted(output,key=lambda x:x[1],reverse=True)
    
    # Determine primary topic
    primary_topic = ordered[0][0]
    # Set threshold for any secondary topics, as explained in greater detail in lda_compute function
    threshold = 0.5
    
    # Identify secondary topics, if any
    secondary_topics = [pair[0] for pair in ordered[1:] if pair[1] / ordered[0][1] > threshold]
    
    # Name the primary topic by reference to the topics_dict
    print(f'primary topic: {topics_dict[primary_topic]}')
    # Name any secondary topics if there are any
    if secondary_topics:
        print('-' * 10, '\n', 'other topics:')
        for topic in secondary_topics:
            print(topics_dict[topic])

In [122]:
p = 'hello how are you?'
#p = 'The 20-year US military presence in Afghanistan is over. The head of US Central Command, Gen Kenneth McKenzie, announced that the last flight out of Kabul “is now clearing the airspace above Afghanistan”.'
#p = 'Western powers have been forced to accept the reality of the Taliban’s control of Afghanistan as they swung behind a watered down UN resolution that says it “expects” the Taliban to honour a commitment to allow Afghans to leave the country and “requests” that Kabul airport be securely reopened, but falls short of demanding a UN-sponsored safe zone in the Afghan capital.'
#p = 'British troops and international allies could return to Kabul airport to help police a UN safe zone in the capital in order to allow safe passage for people trying to leave Afghanistan.'
#p = "If you're looking to drop your quarantine 15, lower your stress level, sleep better, or boost your immunity amid the pandemic, we've tested plenty of smart health and fitness products and apps to help you meet your goals."
#p = "Nervous about going to the gym amid the COVID-19 pandemic, or just too lazy to get your butt out of bed and travel there in the morning?"
#p = 'Gaining insight into your sleeping patterns is the first step toward identifying problems and fixing them. When you’re evaluating wearable options, look for a device with continuous heart rate monitoring and an SpO2 (or Pulse Ox) sensor that tracks your blood oxygen saturation levels as you sleep.'
#p = 'I’m not going to pretend that I know how to interpret the jobs and inflation data of the past few months. My view is that this is still an economy warped by the pandemic, and that the dynamics are so strange and so unstable that it will be some time before we know its true state. But the reaction to the early numbers and anecdotes has revealed something deeper and more constant in our politics.'

#p = 'The Defense Department’s space agency on Aug. 30 released a request for proposals from satellite manufacturers that would compete for contracts to build as many as 144 satellites. The satellites will make up the Space Development Agency’s Transport Layer Tranche 1 — a mesh network of communications satellites in low Earth orbit projected to start launching in late 2024.'
#p = "There are details from the R. Kelly trial intended to startle us: The accusation that the singer believed he had gotten then 15-year-old singer Aaliyah pregnant and arranged to marry her so she couldn't testify against him. The accusation that he knowingly spread herpes to several of his young victims. The accusation that one of the victims, then 16, was slapped and choked until she passed out because she texted a friend. \
#But sexual assault experts say some revelations should not be surprising: that Black girls were brutally victimized, that many individuals around Kelly were complicit and that it's taken more than two decades for much of the public to care." 
#158

#p = 'are you suffering from some mental illness?'
p = 'are robots going to take over the world?' 
#video games 2
p = 'what percentage of people in latin america have access to smartphones?'
p = 'Up to 2 million people in and around New Orleans were without power after Hurricane Ida, a 150mph monster storm that was the most powerful ever to hit Louisiana. At least one person was killed, by a falling tree, but the governor, John Bel Edwards, warned that the death toll will probably rise'
p = 'Igot chatting to an American consultant a few days ago who mentioned that her boyfriend was soon arriving in London from New York. “So I’m taking some PTO next week,” she said. PTO? Was this some kind of new sexual slang I didn’t know about? Best to ask and make sure. “Oh, no, no,” she explained, “it’s, like, paid time off.” I think I might have preferred it to be something more alarming and athletic. “Like a holiday?” I asked. The American nodded.'
#61
p = 'International donors including the World Bank and European Union froze funding to Afghanistan shortly afterwards. “One of the great risks for the health system here is basically to collapse because of lack of support,” said Filipe Ribeiro, Afghanistan representative for Doctors Without Borders (Medecins Sans Frontieres, or MSF), one of the largest medical aid agencies in the country.'
p = '‘Our hospital right now is at capacity’ Lee Health CEO calls for more vaccinations, unity\
On Sunday, Lee Health admitted 92 COVID-19 positive patients to its hospitals, the most in a single day throughout the pandemic. \
Lee Health CEO Larry Antonucci said, “That 92 is just chilling to me,” during a news conference Monday at Golisano Children’s Hospital.'
p = "In an effort to understand how the internal state of the body influences the brain's decision-making processes, scientists analyzed the data from a previous study pre-clinical study. They found that two of the brain's decision-making centers contain neurons that may exclusively monitor the body's internal dynamics. Furthermore, a heightened state of arousal appeared to rewire one of the centers by turning some decision-making neurons into internal state monitors."
p = 'The breast milk of lactating mothers vaccinated against COVID-19 contains a significant supply of antibodies that may help protect nursing infants from the illness, according to new research from the University of Florida. "Our findings show that vaccination results in a significant increase in antibodies against SARS-CoV-2 -- the virus that causes COVID-19 -- in breast milk, suggesting that vaccinated mothers can pass on this immunity to their babies, something we are working to confirm in our ongoing research," said Joseph Larkin III, Ph.D., senior author of the study and an associate professor in the UF/IFAS department of microbiology and cell science."'
p = 'what is the future of ai going to be?'
p = "Scientists at Cambridge and Leeds have successfully reversed age-related memory loss in mice and say their discovery could lead to the development of treatments to prevent memory loss in people as they age. In a study published today in Molecular Psychiatry, the team show that changes in the extracellular matrix of the brain -- 'scaffolding' around nerve cells -- lead to loss of memory with ageing, but that it is possible to reverse these using genetic treatments."
p = "People taking certain drugs to lower blood sugar for type 2 diabetes had less amyloid in the brain, a biomarker of Alzheimer's disease, when compared to both people with type 2 diabetes not taking the drugs and people without diabetes. The new study also found people taking these drugs, called dipeptidyl peptidase-4 inhibitors, showed slower cognitive decline than people in the other two groups."
p = "If you are forgetful or make mistakes when in a hurry, a new study from Michigan State University -- the largest of its kind to-date -- found that meditation could help you to become less error prone. The research, published in Brain Sciences, tested how open monitoring meditation -- or, meditation that focuses awareness on feelings, thoughts or sensations as they unfold in one's mind and body -- altered brain activity in a way that suggests increased error recognition."
p = 'i suffer from various sports injuries, since i used to play soccer'
p = 'Sending human travelers to Mars would require scientists and engineers to overcome a range of technological and safety obstacles. One of them is the grave risk posed by particle radiation from the sun, distant stars and galaxies.'
p = "Your smart device could soon be even smarter with a new infrared light emitting diode (LED) that is 'tuneable' to different wavelengths of light -- it could enable your fridge to tell you when your food is going off and your phone to tell you if that Gucci purse is real. The technology has been developed by the University of Melbourne, the Lawrence Berkeley National Laboratory, the University of California, Berkeley, and the Australian Research Council Centre of Excellence for Transformative Meta-Optical Systems (TMOS). They have come up with a device that could identify a suite of gases, potentially including lethal ones, improving the safety of firefighters, miners, the military, and your local plumber. The work appeared in the journal, Nature."



topic_predict(p)

primary topic: artificial intelligence
---------- 
 other topics:
???
education & student life
astronomy & astrophysics


### Word2vec Alternative to LDA
Several different versions of LDA were tried as an alternative to the LDA approach. Most of the iterations were attempts to make it run faster. For this, a portion of the article was used (specifically the title plus a short 'intro', typically 30 words. Attempts to cluster the resulting vectors did not produce good results; we later found that it would have been worth creating vectors only for the most commonly occurring words, rather than for each and every word. This may well have produced better 
clustering results, and significantly reduced processing time. Due to time constraints, this avenue was left unexplored.

In [81]:
def create_w2v(col1,col2,max_epochs,vec_size,window,min_count,resume=False):
    now  = datetime.now()
    start_time_str = now.strftime('%H:%M:%S')
    print('started at',start_time_str)
        
    start_time = time.time()
    # Create a list of words for each article
    word_list = [[words for words in line] for line in col1]
    # unpack it to create a flat list of all words in the corpus
    unpacked_word_list = [word for line in word_list for word in line]
    
    filename = 'model.sav'
    
    # Either train a new model, or open a saved model and load pre-calculated vectors 
    # depending on whether resume=True or False
    if resume == False:
        model = Word2Vec(col1,vector_size=vec_size,window=window,min_count=min_count,workers=-1)
        model.train(col1,total_examples=len(col1),epochs=max_epochs)
        print('word2vec model has been trained.')
        pickle.dump(model,open(filename,'wb'))
    else:
        file = open(filename,'rb')
        model = pickle.load(file)
        
    averaged_vectors = [] if resume == False else pd.read_csv('intermediate_results.csv',index_col=0)
    
    if resume == True:
        averaged_vectors = [list(averaged_vectors.loc[val,:]) for val in averaged_vectors.index]
    
    j = start_index= len(averaged_vectors)
    # Report on percentage completion before starting
    if j != 0:
        print(f'{round(j/len(col1),3) * 100}% complete.')
    start_checkpoint_time = time.time()
    
    progress_indicators = []
    
    print('Now to create vectors...')
    for row in range(j,len(col1)):
        list_of_words = col1[row]
        list_of_vectors = [model.wv[word] for word in list_of_words if unpacked_word_list.count(word) >= min_count]
        # Compute an average vector of all vectors in the document
        avg_vector = np.mean(list_of_vectors,axis=0) if list_of_vectors else np.zeros(vec_size)
        averaged_vectors.append(avg_vector)
        
        j += 1
        
        # Every j records, we save the results so far
        if j % 10000000 == 0:
            intermediate_results = pd.DataFrame(averaged_vectors)
            intermediate_results.to_csv('intermediate_results.csv')
            
        percent_complete = round(j/len(col1) * 100)
        if (percent_complete % 10 == 0) & (percent_complete not in progress_indicators):
            cum_time_raw = (time.time() - start_time)/60
            cum_time = round((time.time() - start_time)/60,1)
            print('{}% complete. Time elapsed: {} min'.format(percent_complete,cum_time))
            progress_indicators.append(percent_complete)
            
            remaining_time = cum_time_raw * (len(col1) - j) / (j - start_index)
            
            est_finish_time = datetime.now() + timedelta(minutes=remaining_time)
            est_finish_time = est_finish_time.strftime('%D:%H:%M:%S')
            print(f'Estimated finish time: {est_finish_time}')
            print('-' * 15)
    
    # Create df of averaged vectors and append the col of titles to create final output df
    df_of_vectors = pd.DataFrame(averaged_vectors)
        
    intermediate_table = pd.concat([col2,df_of_vectors],axis=1)
        
    rows_to_remove = [ind for ind in intermediate_table.index if intermediate_table.iloc[ind,2] == 0]
        
    removed_comments = pd.DataFrame(col2[rows_to_remove])
    final_table = intermediate_table.drop(rows_to_remove,axis=0)
        
    interval = round((time.time() - start_time)/60,1)
    print('-' * 25)
    print(f'Finished. That took {interval} min.')
    now = datetime.now()
    end_time_str = now.strftime('%H:%M:%S')
    print(f'Actual finish time: {end_time_str}')
        
    return model, final_table, removed_comments

In [51]:
df_clean_test = df[:10000]

In [52]:
df_clean_test.shape

(10000, 15)

In [82]:
model, final_table, removed_comments = create_w2v(df_clean_test['article_words'],df_clean_test['title'],12,32,5,6)

started at 15:40:24
word2vec model has been trained.
Now to create vectors...
0% complete. Time elapsed: 0.0 min
Estimated finish time: 09/01/21:15:48:28
---------------
10% complete. Time elapsed: 0.6 min
Estimated finish time: 09/01/21:15:46:58
---------------
20% complete. Time elapsed: 1.3 min
Estimated finish time: 09/01/21:15:47:10
---------------
30% complete. Time elapsed: 2.0 min
Estimated finish time: 09/01/21:15:47:11
---------------
40% complete. Time elapsed: 2.7 min
Estimated finish time: 09/01/21:15:47:13
---------------
50% complete. Time elapsed: 3.4 min
Estimated finish time: 09/01/21:15:47:15
---------------
60% complete. Time elapsed: 4.0 min
Estimated finish time: 09/01/21:15:47:11
---------------
70% complete. Time elapsed: 4.7 min
Estimated finish time: 09/01/21:15:47:08
---------------
80% complete. Time elapsed: 5.3 min
Estimated finish time: 09/01/21:15:47:06
---------------
90% complete. Time elapsed: 6.1 min
Estimated finish time: 09/01/21:15:47:12
---------

In [116]:
file = open('model.sav','rb')
model = pickle.load(file)

In [128]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')



IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





In [134]:
filename = 'wv.pkl'
pickle.dump(wv,open(filename,'wb'))

In [108]:
final_table

Unnamed: 0,title,0,1,2,3,4,5,6,7,8,...,22,23,24,25,26,27,28,29,30,31
0,Blood-Drinking White Nationalist Augustus Sol ...,-0.000605,-0.006243,0.004921,0.008215,-0.003610,-0.009190,0.012097,0.007827,-0.007459,...,0.013707,-0.007273,0.004699,-0.001444,-0.003752,0.006744,-0.005175,-0.007744,-0.007225,-0.003199
1,"7'6"" Tacko Fall Hilariously Crushing NBA All-S...",-0.001506,-0.004733,0.006476,0.009223,-0.004918,-0.007279,0.012278,0.008090,-0.004250,...,0.013761,-0.006336,0.005625,-0.001943,-0.002035,0.007474,-0.002311,-0.003758,-0.005967,0.000006
2,"Diddy and Future Ride Jet Skis in Miami, No Ba...",-0.001536,-0.005365,0.006279,0.010038,-0.006711,-0.006252,0.011247,0.009217,-0.004643,...,0.013012,-0.005645,0.005994,-0.002998,-0.002464,0.004610,-0.003649,-0.007745,-0.005570,-0.002505
3,Teresa Giudice Says Joe Made Her Sign a Prenup,-0.003308,-0.004476,0.005751,0.010023,-0.005718,-0.007607,0.011692,0.008058,-0.004116,...,0.013897,-0.006695,0.007948,-0.002395,-0.000758,0.004718,-0.001802,-0.004820,-0.005878,-0.001522
4,Deshaun Watson Parties For Smokin' Hot Model G...,-0.000638,-0.005164,0.006295,0.008395,-0.004369,-0.005744,0.011664,0.008124,-0.003841,...,0.012784,-0.004430,0.006240,-0.003484,-0.001750,0.005462,-0.001104,-0.005720,-0.006495,0.000040
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,"Thousands of protesters march in Hong Kong, ta...",-0.001606,-0.003971,0.005529,0.010431,-0.006318,-0.007483,0.014894,0.005884,-0.004427,...,0.012890,-0.005857,0.006109,-0.005710,-0.000500,0.004571,-0.001157,-0.005949,-0.006397,-0.001053
996,"GRAINS-U.S. wheat, soybeans maintain trade aro...",-0.001754,-0.003692,0.006071,0.008879,-0.004978,-0.007814,0.012594,0.007965,-0.003782,...,0.014035,-0.006440,0.005894,-0.004394,-0.002331,0.006427,-0.001155,-0.004889,-0.007471,-0.000932
997,Indonesia's top court bars plantation activity...,-0.000900,-0.006288,0.007222,0.008562,-0.006824,-0.005623,0.011263,0.006428,-0.004865,...,0.012968,-0.006415,0.005289,-0.000614,-0.001714,0.003032,-0.003538,-0.007346,-0.007920,-0.003392
998,Louisville pounds Mississippi State in Music C...,0.000169,-0.007635,0.004418,0.010149,-0.003550,-0.006456,0.010407,0.008568,-0.004324,...,0.013198,-0.004455,0.006592,-0.001808,-0.001230,0.003866,-0.003047,-0.006924,-0.007418,-0.001899


### W2V Other Versions

In [80]:
######EXPERIMENTAL VERSION!!!!!!!!!!!!!
def create_w2vexp(col1,col2,max_epochs,vec_size,window,min_count,resume=False):
    now  = datetime.now()
    start_time_str = now.strftime('%H:%M:%S')
    print('started at',start_time_str)
        
    start_time = time.time()
    word_list = [[words for words in line] for line in col1]
    unpacked_word_list = [word for line in word_list for word in line]
    
    filename = 'model.sav'
    
    if resume == False:
        model = Word2Vec(col1,vector_size=vec_size,window=window,min_count=min_count,workers=3)
        model.train(col1,total_examples=len(col1),epochs=max_epochs)
        print('word2vec model has been trained.')
        pickle.dump(model,open(filename,'wb'))
    else:
        file = open(filename,'rb')
        model = pickle.load(file)
        
    averaged_vectors = [] if resume == False else pd.read_csv('intermediate_results.csv',index_col=0)
    
    if resume == True:
        averaged_vectors = [list(averaged_vectors.loc[val,:]) for val in averaged_vectors.index]
    
    j = start_index= len(averaged_vectors)
    # Report on percentage completion before starting
    if j != 0:
        print(f'{round(j/len(col1),3) * 100}% complete.')
    start_checkpoint_time = time.time()
    
    progress_indicators = []
    
    print('Now to create vectors...'

    
    averaged_vectors = [np.mean([model.wv[word] for word in list_of_words if unpacked_word_list.count(word) >= min_count],axis=0) for list_of_words in col1] 
    
    
    
    
    print('vectors created!')
            
    df_of_vectors = pd.DataFrame(averaged_vectors)
    print(f'df_of_vectors.shape:{df_of_vectors.shape}')
        
    intermediate_table = pd.concat([col2,df_of_vectors],axis=1)
    print(f'intermediate_table.shape:{intermediate_table.shape}')
        
    rows_to_remove = [ind for ind in intermediate_table.index if intermediate_table.iloc[ind,2] == 0]
        
    removed_comments = pd.DataFrame(col2[rows_to_remove])
    final_table = intermediate_table.drop(rows_to_remove,axis=0)
        
    interval = round((time.time() - start_time)/60,1)
    print('-' * 25)
    print(f'Finished. That took {interval} min.')
    now = datetime.now()
    end_time_str = now.strftime('%H:%M:%S')
    print(f'Actual finish time: {end_time_str}')
        
    return final_table, removed_comments

In [66]:
######EXPERIMENTAL VERSION!!!!!!!!!!!!!
def create_w2vexp2(col1,col2,max_epochs,vec_size,window,min_count,resume=False):
    now  = datetime.now()
    start_time_str = now.strftime('%H:%M:%S')
    print('started at',start_time_str)
        
    start_time = time.time()
    word_list = [[words for words in line] for line in col1]
    unpacked_word_list = [word for line in word_list for word in line]
    
    filename = 'model.sav'
    
    if resume == False:
        model = Word2Vec(col1,vector_size=vec_size,window=window,min_count=min_count,workers=-1)
        model.train(col1,total_examples=len(col1),epochs=max_epochs)
        print('word2vec model has been trained.')
        pickle.dump(model,open(filename,'wb'))
    else:
        file = open(filename,'rb')
        model = pickle.load(file)
        
    averaged_vectors = [] if resume == False else pd.read_csv('intermediate_results.csv',index_col=0)
    
    if resume == True:
        averaged_vectors = [list(averaged_vectors.loc[val,:]) for val in averaged_vectors.index]
    
    j = start_index= len(averaged_vectors)
    # Report on percentage completion before starting
    if j != 0:
        print(f'{round(j/len(col1),3) * 100}% complete.')
    start_checkpoint_time = time.time()
    
    progress_indicators = []
    
    vectors_dict = {}
 
    for row in range(j,len(col1)):
        list_of_words = [word for word in col1[row] if unpacked_word_list.count(word) >= min_count]
        list_of_vectors = [vectors_dict[word] if word in vectors_dict.keys() else model.wv[word] for word in list_of_words]
        
        vectors_dict.update({word:vector for word,vector in zip(list_of_words,list_of_vectors)})
        
        avg_vector = np.mean(list_of_vectors,axis=0) if list_of_vectors else np.zeros(vec_size)
               
        
        averaged_vectors.append(avg_vector)
        
        j += 1

        if j % 1000 == 0:
            intermediate_results = pd.DataFrame(averaged_vectors)
            intermediate_results.to_csv('intermediate_results.csv')
            
        percent_complete = round(j/len(col1) * 100)
        if (percent_complete % 10 == 0) & (percent_complete not in progress_indicators):
            cum_time_raw = (time.time() - start_time)/60
            cum_time = round((time.time() - start_time)/60,1)
            print('{}% complete. Time elapsed: {} min'.format(percent_complete,cum_time))
            progress_indicators.append(percent_complete)
            
            remaining_time = cum_time_raw * (len(col1) - j) / (j - start_index)
            
            est_finish_time = datetime.now() + timedelta(minutes=remaining_time)
            est_finish_time = est_finish_time.strftime('%D:%H:%M:%S')
            print(f'Estimated finish time: {est_finish_time}')
            print('-' * 15)
            
    df_of_vectors = pd.DataFrame(averaged_vectors)
    print(f'df_of_vectors.shape:{df_of_vectors.shape}')
        
    intermediate_table = pd.concat([col2,df_of_vectors],axis=1)
    print(f'intermediate_table.shape:{intermediate_table.shape}')
        
    rows_to_remove = [ind for ind in intermediate_table.index if intermediate_table.iloc[ind,2] == 0]
        
    removed_comments = pd.DataFrame(col2[rows_to_remove])
    final_table = intermediate_table.drop(rows_to_remove,axis=0)
        
    interval = round((time.time() - start_time)/60,1)
    print('-' * 25)
    print(f'Finished. That took {interval} min.')
    now = datetime.now()
    end_time_str = now.strftime('%H:%M:%S')
    print(f'Actual finish time: {end_time_str}')
        
    return final_table, removed_comments

In [75]:
######EXPERIMENTAL VERSION!!!!!!!!!!!!!
def create_w2vexp3(col1,col2,max_epochs,vec_size,window,min_count,resume=False):
    now  = datetime.now()
    start_time_str = now.strftime('%H:%M:%S')
    print('started at',start_time_str)
        
    start_time = time.time()
    word_list = [[words for words in line] for line in col1]
    unpacked_word_list = [word for line in word_list for word in line]
    
    filename = 'model.sav'
    
    if resume == False:
        model = Word2Vec(col1,vector_size=vec_size,window=window,min_count=min_count,workers=-1)
        model.train(col1,total_examples=len(col1),epochs=max_epochs)
        print('word2vec model has been trained.')
        pickle.dump(model,open(filename,'wb'))
    else:
        file = open(filename,'rb')
        model = pickle.load(file)
        
    averaged_vectors = [] if resume == False else pd.read_csv('intermediate_results.csv',index_col=0)
    
    
    
    if resume == True:
        averaged_vectors = [list(averaged_vectors.loc[val,:]) for val in averaged_vectors.index]
    
    j = start_index= len(averaged_vectors)
    # Report on percentage completion before starting
    if j != 0:
        print(f'{round(j/len(col1),3) * 100}% complete.')
    start_checkpoint_time = time.time()
    
    progress_indicators = []

    for row in range(j,len(col1)):
        list_of_words = [word for word in col1[row] if unpacked_word_list.count(word) >= min_count]

        avg_vector = [model[' '.join(words)] for words in list_of_words]
               
        
        averaged_vectors.append(avg_vector)

        j += 1

        if j % 1000 == 0:
            intermediate_results = pd.DataFrame(averaged_vectors)
            intermediate_results.to_csv('intermediate_results.csv')
            
        percent_complete = round(j/len(col1) * 100)
        if (percent_complete % 10 == 0) & (percent_complete not in progress_indicators):
            cum_time_raw = (time.time() - start_time)/60
            cum_time = round((time.time() - start_time)/60,1)
            print('{}% complete. Time elapsed: {} min'.format(percent_complete,cum_time))
            progress_indicators.append(percent_complete)
            
            remaining_time = cum_time_raw * (len(col1) - j) / (j - start_index)
            
            est_finish_time = datetime.now() + timedelta(minutes=remaining_time)
            est_finish_time = est_finish_time.strftime('%D:%H:%M:%S')
            print(f'Estimated finish time: {est_finish_time}')
            print('-' * 15)
            
    df_of_vectors = pd.DataFrame(averaged_vectors)
    print(f'df_of_vectors.shape:{df_of_vectors.shape}')
        
    intermediate_table = pd.concat([col2,df_of_vectors],axis=1)
    print(f'intermediate_table.shape:{intermediate_table.shape}')
        
    rows_to_remove = [ind for ind in intermediate_table.index if intermediate_table.iloc[ind,2] == 0]
        
    removed_comments = pd.DataFrame(col2[rows_to_remove])
    final_table = intermediate_table.drop(rows_to_remove,axis=0)
        
    interval = round((time.time() - start_time)/60,1)
    print('-' * 25)
    print(f'Finished. That took {interval} min.')
    now = datetime.now()
    end_time_str = now.strftime('%H:%M:%S')
    print(f'Actual finish time: {end_time_str}')
        
    return final_table, removed_comments

In [69]:
final_table_orig, removed_comments = create_w2v(df_clean_test['article_words'],df_clean_test['title'],12,32,5,6)

started at 14:56:08
word2vec model has been trained.
Now to create vectors...
0% complete. Time elapsed: 0.0 min
Estimated finish time: 09/01/21:15:05:23
---------------
10% complete. Time elapsed: 0.7 min
Estimated finish time: 09/01/21:15:03:00
---------------
20% complete. Time elapsed: 1.4 min
Estimated finish time: 09/01/21:15:03:10
---------------
30% complete. Time elapsed: 2.1 min
Estimated finish time: 09/01/21:15:03:11
---------------
40% complete. Time elapsed: 2.8 min
Estimated finish time: 09/01/21:15:03:12
---------------
50% complete. Time elapsed: 3.5 min
Estimated finish time: 09/01/21:15:03:15
---------------
60% complete. Time elapsed: 4.2 min
Estimated finish time: 09/01/21:15:03:11
---------------
70% complete. Time elapsed: 4.9 min
Estimated finish time: 09/01/21:15:03:09
---------------
80% complete. Time elapsed: 5.6 min
Estimated finish time: 09/01/21:15:03:07
---------------
90% complete. Time elapsed: 6.3 min
Estimated finish time: 09/01/21:15:03:10
---------

In [67]:
final_table, removed_comments = create_w2vexp2(df_clean_test['article_words'],df_clean_test['title'],12,32,5,6)

started at 13:29:51
word2vec model has been trained.
0% complete. Time elapsed: 0.0 min
Estimated finish time: 09/01/21:13:38:57
---------------
10% complete. Time elapsed: 0.7 min
Estimated finish time: 09/01/21:13:37:12
---------------
20% complete. Time elapsed: 1.5 min
Estimated finish time: 09/01/21:13:37:41
---------------
30% complete. Time elapsed: 2.3 min
Estimated finish time: 09/01/21:13:37:49
---------------
40% complete. Time elapsed: 3.3 min
Estimated finish time: 09/01/21:13:38:18
---------------
50% complete. Time elapsed: 4.2 min
Estimated finish time: 09/01/21:13:38:23
---------------
60% complete. Time elapsed: 5.1 min
Estimated finish time: 09/01/21:13:38:26
---------------
70% complete. Time elapsed: 5.8 min
Estimated finish time: 09/01/21:13:38:15
---------------
80% complete. Time elapsed: 6.6 min
Estimated finish time: 09/01/21:13:38:06
---------------
90% complete. Time elapsed: 7.3 min
Estimated finish time: 09/01/21:13:38:03
---------------
100% complete. Tim

In [164]:
######EXPERIMENTAL VERSION!!!!!!!!!!!!!
def create_w2vexp4(col1,col2,min_count,resume=False):
    now  = datetime.now()
    start_time_str = now.strftime('%H:%M:%S')
    print('started at',start_time_str)
        
    start_time = time.time()
    word_list = [[words for words in line] for line in col1]
    unpacked_word_list = [word for line in word_list for word in line]
    
    filename = 'model.sav'
    
    averaged_vectors = [] if resume == False else pd.read_csv('intermediate_results.csv',index_col=0)
    
    if resume == True:
        averaged_vectors = [list(averaged_vectors.loc[val,:]) for val in averaged_vectors.index]
    
    j = start_index = len(averaged_vectors)
    # Report on percentage completion before starting
    if j != 0:
        print(f'{round(j/len(col1),3) * 100}% complete.')
    start_checkpoint_time = time.time()
    
    progress_indicators = []
    
    print('Now to create vectors...')
        
    filename = 'wv.pkl'
    file = open(filename,'rb')
    wv = pickle.load(file)
    
    
    for row in range(j,len(col1)):

        list_of_words = [word for word in list_of_words if (unpacked_word_list.count(word) >= min_count) & \
                                                                   (word in wv.key_to_index)]
        avg_vector = wv[list_of_words]
        
        averaged_vectors.append(avg_vector)
        
        j += 1
        print(f'{j} done')
        if j % 10000000 == 0:
            intermediate_results = pd.DataFrame(averaged_vectors)
            intermediate_results.to_csv('intermediate_results.csv')
            
        percent_complete = round(j/len(col1) * 100)
        if (percent_complete % 10 == 0) & (percent_complete not in progress_indicators):
            cum_time_raw = (time.time() - start_time)/60
            cum_time = round((time.time() - start_time)/60,1)
            print('{}% complete. Time elapsed: {} min'.format(percent_complete,cum_time))
            progress_indicators.append(percent_complete)
            
            remaining_time = cum_time_raw * (len(col1) - j) / (j - start_index)
            
            est_finish_time = datetime.now() + timedelta(minutes=remaining_time)
            est_finish_time = est_finish_time.strftime('%D:%H:%M:%S')
            print(f'Estimated finish time: {est_finish_time}')
            print('-' * 15)
            
    df_of_vectors = pd.DataFrame(averaged_vectors)
        
    intermediate_table = pd.concat([col2,df_of_vectors],axis=1)
        
    rows_to_remove = [ind for ind in intermediate_table.index if intermediate_table.iloc[ind,2] == 0]
        
    removed_comments = pd.DataFrame(col2[rows_to_remove])
    final_table = intermediate_table.drop(rows_to_remove,axis=0)
        
    interval = round((time.time() - start_time)/60,1)
    print('-' * 25)
    print(f'Finished. That took {interval} min.')
    now = datetime.now()
    end_time_str = now.strftime('%H:%M:%S')
    print(f'Actual finish time: {end_time_str}')
        
    return model, final_table, removed_comments

In [None]:
final_table_new, removed_comments = create_w2vexp4(df_clean_test['article_words'],df_clean_test['title'],6)

In [173]:
######EXPERIMENTAL VERSION!!!!!!!!!!!!!
def create_w2vexp5(col1,col2,resume=False):
    now  = datetime.now()
    start_time_str = now.strftime('%H:%M:%S')
    print('started at',start_time_str)
        
    start_time = time.time()
    word_list = [[words for words in line] for line in col1]
    unpacked_word_list = [word for line in word_list for word in line]
    
#    filename = 'model.sav'
    
    filename = 'wv.pkl'
    file = open(filename,'rb')
    wv = pickle.load(file)
    
    print('Now to create vectors...')
    
    averaged_vectors = [[wv[word] for word in list_of_words if word in wv.key_to_index] for list_of_words in col1] 
    
    
    
    
    print('vectors created!')
 
    df_of_vectors = pd.DataFrame(averaged_vectors)
    print(f'df_of_vectors.shape:{df_of_vectors.shape}')
    
    return df_of_vectors
        
    intermediate_table = pd.concat([col2,df_of_vectors],axis=1)
    print(f'intermediate_table.shape:{intermediate_table.shape}')
        
    rows_to_remove = [ind for ind in intermediate_table.index if intermediate_table.iloc[ind,2] == 0]
        
    removed_comments = pd.DataFrame(col2[rows_to_remove])
    final_table = intermediate_table.drop(rows_to_remove,axis=0)
        
    interval = round((time.time() - start_time)/60,1)
    print('-' * 25)
    print(f'Finished. That took {interval} min.')
    now = datetime.now()
    end_time_str = now.strftime('%H:%M:%S')
    print(f'Actual finish time: {end_time_str}')
        
    return final_table, removed_comments

In [174]:
df_of_vectors = create_w2vexp5(df_clean_test['article_words'],df_clean_test['title'])

started at 17:52:09
Now to create vectors...
vectors created!
df_of_vectors.shape:(1000, 223)


In [10]:
######EXPERIMENTAL VERSION!!!!!!!!!!!!!
def create_w2vexp6(col1,col2,max_epochs,vec_size,window,min_count,resume=False):
    now  = datetime.now()
    start_time_str = now.strftime('%H:%M:%S')
    print('started at',start_time_str)
        
    start_time = time.time()
    word_list = [[words for words in line] for line in col1]
    unpacked_word_list = [word for line in word_list for word in line]

    filename = 'model.sav'

    if resume == False:
        model = Word2Vec(col1,vector_size=vec_size,window=window,min_count=min_count,workers=3)
        model.train(col1,total_examples=len(col1),epochs=max_epochs)
        print('word2vec model has been trained.')
        pickle.dump(model,open(filename,'wb'))
    else:
        file = open(filename,'rb')
        model = pickle.load(file)
        
    print('Now to create vectors...')
    
    averaged_vectors = [np.mean([model.wv[word] for word in list_of_words if word in model.wv.key_to_index],axis=0) for list_of_words in col1]
    
    filename = 'models/intermediate_pickle.pkl'
    pickle.dump(averaged_vectors,open(filename,'wb'))
    
    
    print('vectors created!')

    df_of_vectors = pd.DataFrame(averaged_vectors)
    print(f'df_of_vectors.shape:{df_of_vectors.shape}')
    
    df_of_vectors.to_csv('df_of_vectors.csv')

    df_of_vectors['title'] = col2
    intermediate_table = df_of_vectors.copy()
    
    
    print(f'intermediate_table.shape:{intermediate_table.shape}')
        
    rows_to_remove = [ind for ind in intermediate_table.index if intermediate_table.iloc[ind,2] == 0]
        
    removed_comments = pd.DataFrame(col2[rows_to_remove])
    final_table = intermediate_table.drop(rows_to_remove,axis=0)
        
    interval = round((time.time() - start_time)/60,1)
    print('-' * 25)
    print(f'Finished. That took {interval} min.')
    now = datetime.now()
    end_time_str = now.strftime('%H:%M:%S')
    print(f'Actual finish time: {end_time_str}')
        
    return final_table, removed_comments

In [None]:
# Est finish time: 10:30pm
final_table,removed_comments = create_w2vexp6(df_clean_summ['article_words'],df_clean_summ['title'],20,256,5,6)

started at 12:36:51
word2vec model has been trained.
Now to create vectors...
vectors created!


## Try LDA on Summaries

In [6]:
df_clean_summ = pd.read_csv('df_clean_summ.csv')

In [7]:
inds = np.random.choice(df_clean_summ.index,1000000,replace=False)
df_clean_summ = df_clean_summ.loc[inds,:]
df_clean_summ.reset_index(drop=True,inplace=True)

In [8]:
df_clean_summ.shape

(1000000, 13)

In [9]:
df_clean_summ['article_words'] = [eval(words) for words in df_clean_summ['article_words']]
df_clean_summ.head(1)

Unnamed: 0.1,Unnamed: 0,date,year,month,day,author,title,url,section,publication,intro,title_intro,article_words
0,1239833,2016-03-08,2016,3.0,8,"Bozorgmehr Sharafedin, Doina Chiacu","Iran fires ballistic missiles, U.S. hints at d...",http://www.reuters.com/article/iran-missiles-i...,World News,Reuters,DUBAI/WASHINGTON (Reuters) - Iran’s Islamic Re...,"Iran fires ballistic missiles, U.S. hints at d...","[iran, fire, ballistic, missile, u, hint, dipl..."


In [None]:
#22:58
df_clean_summ, lda_model, corpus = lda_compute(df_clean_summ,num_topics=265,passes=14)

The time is 2021-09-04 22:58:10.800612
Now for LDA modeling...


## Try LDA on 500k (full) Records

In [3]:
df_9 = pd.read_csv('df9_clean.csv')
df_10 = pd.read_csv('df10_clean.csv')

In [4]:
dfs = [df_9,df_10]
df_9_10 = pd.DataFrame()
for df in dfs:
    df_9_10 = df_9_10.append(df)
df_9_10.reset_index(drop=True,inplace=True)
df_9_10['article_words'] = [eval(words) for words in df_9_10['article_words']]

In [14]:
df_9_10.shape

(584165, 14)

In [15]:
df_test = df_9_10.loc[:,['title','article_words']]
df_test.info(memory_usage='deep',verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 584165 entries, 0 to 584164
Data columns (total 2 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   title          584165 non-null  object
 1   article_words  584165 non-null  object
dtypes: object(2)
memory usage: 1.7 GB


In [17]:
# 1000 records, 4.0 MB, 5 passes: 1.4 mins
# 10,000 records, 37.9 MB, 5 passes: 10.59 mins
# All 584k records, 1.7 GB, 5 passes: ...     mins. Est runtime: 618 mins. Est finish time: 00:48
df_test, lda_model, corpus = lda_compute(df_test,num_topics=265,passes=5)

The time is 2021-09-05 14:29:40.032143
Now for LDA modeling...
LDA done!
----------
lda_lists[-1]: [(18, 0.040069945), (38, 0.04205921), (58, 0.033842996), (62, 0.22275679), (65, 0.04956388), (70, 0.047659423), (124, 0.12391218), (132, 0.016120192), (133, 0.036833566), (145, 0.16715892), (149, 0.010586675), (227, 0.029774087), (234, 0.020300869), (261, 0.06283979), (263, 0.0126904175)]
lda_coeffs[-1]: [0.040069945, 0.04205921, 0.033842996, 0.22275679, 0.04956388, 0.047659423, 0.12391218, 0.016120192, 0.036833566, 0.16715892, 0.010586675, 0.029774087, 0.020300869, 0.06283979, 0.0126904175]
lda_inds[-1]: [18, 38, 58, 62, 65, 70, 124, 132, 133, 145, 149, 227, 234, 261, 263]
lda_argmax[-1]: 3
That  took 1092.74 mins.


In [21]:
df_test.to_csv('models/5passes_df_test.csv')

In [22]:
lda_model.save('models/5passes_265_topics_ldamodel')

In [23]:
id2word = corpora.Dictionary(df_test['article_words'])
filename = 'models/5passes_id2word_265.pkl'
pickle.dump(id2word,open(filename,'wb'))