In [65]:
%reload_ext autoreload
import numpy as np
import pandas as pd

from multiprocessing import Pool
from tqdm import tqdm_notebook

import matplotlib.pyplot as plt
%matplotlib inline

# import own scripts
import sys
sys.path.insert(0, '/src/')

%autoreload 2
from utils import format_raw_documents, myNLP, merge_2_string_lists, add_top_5_topics, validate_topics

In [2]:
print("Number of available CPU cores: ")
!sysctl -n hw.ncpu

Number of available CPU cores: 
4


In [3]:
# Notebook functions
def parallelize(function, tasks, workers=4):
    '''
    Performs a task as defined by `function` in parallel and returns the result.
    '''
    if __name__ == '__main__':
        with Pool(processes=workers) as p:
            results = list(tqdm_notebook(p.imap(function, tasks), total=len(tasks)))   
        return results 

In [4]:
# load news articles
a1 = pd.read_csv('data/Articles.csv', encoding = "ISO-8859-1")
a2 = pd.read_csv('data/all-the-news/articles1.csv', encoding = "ISO-8859-1")
a3 = pd.read_csv('data/all-the-news/articles2.csv', encoding = "ISO-8859-1")
a4 = pd.read_csv('data/all-the-news/articles3.csv', encoding = "ISO-8859-1")
raw_documents = [a1, a2, a3, a4]

# Format
docs = format_raw_documents(raw_documents)

# Clear memory
del a1, a2, a3, a4, raw_documents

docs.tail()

Unnamed: 0,Date,Title,Content
142616,9/9/2015,oil prices up in asian trad,Singapore: Oil prices edged higher in quiet As...
142617,9/9/2015,cost of quaid e azam solar power project cut b...,LAHORE: Putting stout defence of the solar po...
142618,9/9/2015,china pakistan set to sign cpec zone d,KARACHI: Pakistan is set to sign a 40-year-lea...
142619,9/9/2016,Global airport traffic 64 percent 2015 fastest...,strong>MONTREAL: Global airport traffic grew a...
142620,9/9/2016,Taxpayers directory to be launched today,strong>ISLAMABAD: Finance Minister Ishaq Dar w...


## Topic Extraction
> 1. Merge `Title` & `Content`
> 2. Process text. I set-up to run in parallel and it's much faster but still takes ~30 minutes on my laptop.
> 3. Run `NMF`. Running `NMF` in parallel took ~30 hours.
> 4. Ready to Run `LDA`. Tried running `LDA` but it take will a long time, more than 5 hrs. 

**Next: try optimizing code for efficiency using `Dask` and `Cython`.**

In [5]:
# instantiate `myNLP` object
myNLP = myNLP()

In [6]:
# merge `Topics` and `Content`
merged = merge_2_string_lists(docs['Title'], docs['Content'])

# clean and prep text
prep_func = myNLP.prep_docs_stem
docs['Processed Text'] = parallelize(prep_func, merged)




In [None]:
# run NMF
top_5_topics_NMF, top_words_in_topic_NMF, nmf, tfidf, tfidf_vect = myNLP.fit_nmf(docs['Processed Text'])

In [91]:
# add `Top 5 Topics (NMF)`
docs = add_top_5_topics(docs, top_5_topics_NMF, 'NMF')
docs.head()

Unnamed: 0,Date,Title,Content,Processed Text,Top #1 topic (NMF),Top #2 topic (NMF),Top #3 topic (NMF),Top #4 topic (NMF),Top #5 topic (NMF)
0,1/1/2015,sindh govt decides to cut public transport far...,KARACHI: The Sindh government has decided to b...,sindh govt decid cut public transport fare 7pc...,80,29,46,11,42
1,1/1/2016,Australia set to test twin spin attack against...,strong>SYDNEY: Australia look set to field two...,australia set test twin spin attack windi stro...,59,52,9,94,27
2,1/1/2016,Oil ends 2015 down 35 per,strong>NEW YORK/LONDON: Oil prices rose on Thu...,oil end 2015 35 per strong new york london oil...,80,23,84,97,24
3,1/1/2016,Amir returns to Pakistan squad after 5 year b,strong>LAHORE: Left arm fast bowler Mohammad A...,amir return pakistan squad 5 year b strong lah...,59,70,19,9,94
4,1/10/2016,NA Special Committee on PIA to meet on Monday,ISLAMABAD: The National Assembly (NA) Special ...,na special committe pia meet monday islamabad ...,59,41,77,83,49


In [92]:
# save tops words t
top_words = pd.DataFrame(top_words_in_topic_NMF)
top_words.to_csv('src/results/top_words_in_topic_NMF.csv', sep=',')
top_words

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,one,like,even,would,book,itâ,way,make,seem,much,might,mani,stori,polit,write,could,time,thing,read,idea
1,trump,trumpâ,donald,heâ,support,ralli,presid,him,would,busi,nomine,tweet,polici,comment,ivanka,presidenti,tower,call,elect,critic
2,polic,offic,shoot,shot,kill,suspect,arrest,man,cop,depart,dalla,fire,investig,incid,enforc,fatal,report,chief,crime,scene
3,rate,fed,market,growth,economi,inflat,econom,hike,stock,expect,interest,dollar,increas,economist,polici,reserv,year,rise,index,data
4,clinton,hillari,clintonâ,foundat,bill,democrat,presidenti,nomine,debat,secretari,state,sheâ,speech,candid,her,donald,former,kain,donor,donat
5,mr,said,york,time,new,unit,interview,brief,lawyer,would,state,advis,chief,like,televis,wrote,republican,him,news,trumpâ
6,court,suprem,justic,judg,rule,case,scalia,gorsuch,appeal,constitut,decis,circuit,law,courtâ,legal,nomin,judici,nomine,lawyer,right
7,turkey,turkish,erdogan,coup,kurdish,militari,attempt,minist,govern,syria,nato,state,islam,prime,alli,detain,an,power,countri,fail
8,email,server,classifi,clintonâ,depart,fbi,privat,inform,releas,hack,wikileak,secretari,state,use,sent,document,account,dnc,person,aid
9,game,player,team,play,season,leagu,coach,win,footbal,nfl,ball,yanke,final,score,first,fan,second,bowl,sport,last


### Interpretation of Results: Topic Extraction
> Randomly pull up a few articles in each topic and assess if the model works

In [95]:
# read
validate_topics(docs, top_words, 1,'NMF', 3)

Random Topic #:  3
Top words in topic:  ['rate', 'fed', 'market', 'growth', 'economi', 'inflat', 'econom', 'hike', 'stock', 'expect', 'interest', 'dollar', 'increas', 'economist', 'polici', 'reserv', 'year', 'rise', 'index', 'data']

Article 1:
     Donald Trump may be a federal employee now, but fellow government workers wonât be getting discounts at his Pennsylvania Avenue hotel  â   even if the hotelâs website promises otherwise. Scroll through the offers on the Trump International Hotelâs website and youâll see a section advertising special government rates, currently capped at $182 per night by the General Services Administration. âIt is our pleasure to welcome Retired and Active Duty members of the United States Military and Armed Forces,â the page reads. But good luck getting one of those discounted rates.    A screengrab of the Trump International Hotel, Washington, D. C. website.  The booking calendar shows âno availabilityâ for government employees and milit

In [None]:
'''
# run LDA 
top_5_topics_LDA, top_words_in_topic_LDA, lda, tf, cnt_vect = myNLP.fit_lda(docs['Processed Text'])

# add `Top 5 Topics (NMF)`
docs['Top 5 Topics (LDA)'] = top_5_topics_LDA

# save
top_words_in_topic_LDA.to_csv('data/top_words_in_topic_LDA.csv', sep=',')
'''

In [88]:
# save
docs.to_csv('src/results/nlp.csv', sep=',')