In [44]:
%reload_ext autoreload
import numpy as np
import pandas as pd

from multiprocessing import Pool
from tqdm import tqdm_notebook

import matplotlib.pyplot as plt
%matplotlib inline

# import own scripts
import sys
sys.path.insert(0, '/src/')

%autoreload 2
from utils import format_raw_documents, myNLP, merge_2_string_lists, add_top_5_topics, validate_topics, build_topic_df, clean_gdp, gdp_change

In [2]:
print("Number of available CPU cores: ")
!sysctl -n hw.ncpu

Number of available CPU cores: 
4


In [3]:
# Notebook functions
def parallelize(function, tasks, workers=4):
    '''
    Performs a task as defined by `function` in parallel and returns the result.
    '''
    if __name__ == '__main__':
        with Pool(processes=workers) as p:
            results = list(tqdm_notebook(p.imap(function, tasks), total=len(tasks)))   
        return results 

In [4]:
# load news articles
a1 = pd.read_csv('data/Articles.csv', encoding = "ISO-8859-1")
a2 = pd.read_csv('data/all-the-news/articles1.csv', encoding = "ISO-8859-1")
a3 = pd.read_csv('data/all-the-news/articles2.csv', encoding = "ISO-8859-1")
a4 = pd.read_csv('data/all-the-news/articles3.csv', encoding = "ISO-8859-1")
raw_documents = [a1, a2, a3, a4]

# Format
docs = format_raw_documents(raw_documents)

# Clear memory
del a1, a2, a3, a4, raw_documents

docs.tail()

Unnamed: 0,Date,Title,Content
142616,9/9/2015,oil prices up in asian trad,Singapore: Oil prices edged higher in quiet As...
142617,9/9/2015,cost of quaid e azam solar power project cut b...,LAHORE: Putting stout defence of the solar po...
142618,9/9/2015,china pakistan set to sign cpec zone d,KARACHI: Pakistan is set to sign a 40-year-lea...
142619,9/9/2016,Global airport traffic 64 percent 2015 fastest...,strong>MONTREAL: Global airport traffic grew a...
142620,9/9/2016,Taxpayers directory to be launched today,strong>ISLAMABAD: Finance Minister Ishaq Dar w...


## Topic Extraction
> 1. Merge `Title` & `Content`
> 2. Process text. I set-up to run in parallel and it's much faster but still takes ~30 minutes on my laptop.
> 3. Run `NMF`. Running `NMF` in parallel took ~30 minutes.
> 4. Ready to Run `LDA`. Tried running `LDA` but it take will a long time, more than 5 hrs. 

**Next: try optimizing code for efficiency using `Dask` and `Cython`.**

In [5]:
# instantiate `myNLP` object
myNLP = myNLP()

In [6]:
# merge `Topics` and `Content`
merged = merge_2_string_lists(docs['Title'], docs['Content'])

# clean and prep text
prep_func = myNLP.prep_docs_stem
docs['Processed Text'] = parallelize(prep_func, merged)




In [7]:
# run NMF
top_5_topics_NMF, top_words_in_topic_NMF, nmf, tfidf, tfidf_vect = myNLP.fit_nmf(docs['Processed Text'])

'''
# run LDA 
top_5_topics_LDA, top_words_in_topic_LDA, lda, tf, cnt_vect = myNLP.fit_lda(docs['Processed Text'])

# add `Top 5 Topics (NMF)`
docs['Top 5 Topics (LDA)'] = top_5_topics_LDA

# save
top_words_in_topic_LDA.to_csv('data/top_words_in_topic_LDA.csv', sep=',')
'''

In [8]:
# add `Top 5 Topics (NMF)`
docs = add_top_5_topics(docs, top_5_topics_NMF, 'NMF')
docs.head()

Unnamed: 0,Date,Title,Content,Processed Text,Top #1 topic (NMF),Top #2 topic (NMF),Top #3 topic (NMF),Top #4 topic (NMF),Top #5 topic (NMF)
0,1/1/2015,sindh govt decides to cut public transport far...,KARACHI: The Sindh government has decided to b...,sindh govt decid cut public transport fare 7pc...,80,29,46,11,42
1,1/1/2016,Australia set to test twin spin attack against...,strong>SYDNEY: Australia look set to field two...,australia set test twin spin attack windi stro...,59,52,9,94,27
2,1/1/2016,Oil ends 2015 down 35 per,strong>NEW YORK/LONDON: Oil prices rose on Thu...,oil end 2015 35 per strong new york london oil...,80,23,84,97,24
3,1/1/2016,Amir returns to Pakistan squad after 5 year b,strong>LAHORE: Left arm fast bowler Mohammad A...,amir return pakistan squad 5 year b strong lah...,59,70,19,9,94
4,1/10/2016,NA Special Committee on PIA to meet on Monday,ISLAMABAD: The National Assembly (NA) Special ...,na special committe pia meet monday islamabad ...,59,41,77,83,49


In [9]:
# save tops words t
top_words = pd.DataFrame(top_words_in_topic_NMF)
top_words.to_csv('src/results/top_words_in_topic_NMF.csv', sep=',')
top_words

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,one,like,even,would,book,itâ,way,make,seem,much,might,mani,stori,polit,write,could,time,thing,read,idea
1,trump,trumpâ,donald,heâ,support,ralli,presid,him,would,busi,nomine,tweet,polici,comment,ivanka,presidenti,tower,call,elect,critic
2,polic,offic,shoot,shot,kill,suspect,arrest,man,cop,depart,dalla,fire,investig,incid,enforc,fatal,report,chief,crime,scene
3,rate,fed,market,growth,economi,inflat,econom,hike,stock,expect,interest,dollar,increas,economist,polici,reserv,year,rise,index,data
4,clinton,hillari,clintonâ,foundat,bill,democrat,presidenti,nomine,debat,secretari,state,sheâ,speech,candid,her,donald,former,kain,donor,donat
5,mr,said,york,time,new,unit,interview,brief,lawyer,would,state,advis,chief,like,televis,wrote,republican,him,news,trumpâ
6,court,suprem,justic,judg,rule,case,scalia,gorsuch,appeal,constitut,decis,circuit,law,courtâ,legal,nomin,judici,nomine,lawyer,right
7,turkey,turkish,erdogan,coup,kurdish,militari,attempt,minist,govern,syria,nato,state,islam,prime,alli,detain,an,power,countri,fail
8,email,server,classifi,clintonâ,depart,fbi,privat,inform,releas,hack,wikileak,secretari,state,use,sent,document,account,dnc,person,aid
9,game,player,team,play,season,leagu,coach,win,footbal,nfl,ball,yanke,final,score,first,fan,second,bowl,sport,last


### Interpretation of Results: Topic Extraction
> Randomly pull up a few articles in each topic and assess if the model works

In [38]:
# read
validate_topics(docs, top_words, 1,'NMF', 3)

Random Topic #:  3
Top words in topic:  ['rate', 'fed', 'market', 'growth', 'economi', 'inflat', 'econom', 'hike', 'stock', 'expect', 'interest', 'dollar', 'increas', 'economist', 'polici', 'reserv', 'year', 'rise', 'index', 'data']

Article 1:
 Donald Trump went on CNBC this morning, and, over the course of a   interview, once again reminded the world of the most fundamental fact about his candidacy: He doesnât really seem to understand any aspect of American public policy.    Benefiting as he often does from a cable news format, he was allowed to ramble and dissemble across a variety of topics  â   including who sets interest rates, how monetary policy impacts the economy, and how his own money is invested, finding time for a racist personal attack against a rival politician. âI believe itâs a false market,â Trump said of the current state of American stocks. âI donât even invest in the stock market.â Trumpâs personal financial disclosures to the Federal Election Co

## Incorporate Sentiment Analysis
---

In [12]:
# pull in sentiment data
docs['Sentiment'] = pd.read_csv('src/results/sentiment.csv', sep=',', header=None)[1]
print(docs.shape)
docs.head()

(142619, 10)


Unnamed: 0,Date,Title,Content,Processed Text,Top #1 topic (NMF),Top #2 topic (NMF),Top #3 topic (NMF),Top #4 topic (NMF),Top #5 topic (NMF),Sentiment
0,1/1/2015,sindh govt decides to cut public transport far...,KARACHI: The Sindh government has decided to b...,sindh govt decid cut public transport fare 7pc...,80,29,46,11,42,-0.743
1,1/1/2016,Australia set to test twin spin attack against...,strong>SYDNEY: Australia look set to field two...,australia set test twin spin attack windi stro...,59,52,9,94,27,-0.5279
2,1/1/2016,Oil ends 2015 down 35 per,strong>NEW YORK/LONDON: Oil prices rose on Thu...,oil end 2015 35 per strong new york london oil...,80,23,84,97,24,-0.419
3,1/1/2016,Amir returns to Pakistan squad after 5 year b,strong>LAHORE: Left arm fast bowler Mohammad A...,amir return pakistan squad 5 year b strong lah...,59,70,19,9,94,0.5859
4,1/10/2016,NA Special Committee on PIA to meet on Monday,ISLAMABAD: The National Assembly (NA) Special ...,na special committe pia meet monday islamabad ...,59,41,77,83,49,0.6597


In [13]:
# save
docs.to_csv('src/results/nlp.csv', sep=',')

In [99]:
# build sentiment feature matrix
sentiment = build_topic_df(docs, 'Top #1 topic (NMF)')
sentiment.head()

Topic ID,Date,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2015-01-01,,,,,,,,,,...,,,,,,,,,,
1,2016-01-01,,,,,,,,,,...,,,,,,,,,,
2,2016-01-01,,,,,,,,,,...,,,,,,,,,,
3,2016-01-01,,,,,,,,,,...,,,,,,,,,,
4,2016-01-10,,,,,,,,,,...,,,,,,,,,,


In [100]:
# load, clean and calculate change in gdp data 
gdp = gdp_change(clean_gdp(pd.read_csv('data/GDPC1.csv')))
gdp.head()

Unnamed: 0_level_0,GDP,%Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1947-01-01,2033.061,
1947-04-01,2027.639,-0.002667
1947-07-01,2023.452,-0.002065
1947-10-01,2055.103,0.015642
1948-01-01,2086.017,0.015043


In [144]:
gdp_sentiment = pd.DataFrame()
gdp_sentiment['Period Start'], gdp_sentiment['Period End'] = gdp.index[:-1], gdp.index[1:]
idx = []
for period in range(0,gdp_sentiment.shape[0]):
    start_date = gdp_sentiment.iloc[period]['Period Start']
    end_date = gdp_sentiment.iloc[period]['Period End']
    idx.append(sentiment[(sentiment['Date']>=start_date) & (sentiment['Date']<end_date)].index.tolist())
gdp_sentiment['Article ID'] = idx

Unnamed: 0,Period Start,Period End,Article ID
280,2017-01-01,2017-04-01,"[10, 11, 18, 26, 32, 33, 38, 39, 46, 56, 61, 6..."
281,2017-04-01,2017-07-01,"[116851, 116852, 116853, 116854, 116855, 11685..."
282,2017-07-01,2017-10-01,"[138903, 138904, 138905, 138906, 138907, 13890..."
283,2017-10-01,2018-01-01,[]
284,2018-01-01,2018-04-01,[]


In [147]:
sentiment['Date'].min()

Timestamp('2000-05-15 00:00:00')

In [152]:
gdp_sentiment.tail(75)

Unnamed: 0,Period Start,Period End,Article ID
210,1999-07-01,1999-10-01,[]
211,1999-10-01,2000-01-01,[]
212,2000-01-01,2000-04-01,[]
213,2000-04-01,2000-07-01,[782]
214,2000-07-01,2000-10-01,[]
215,2000-10-01,2001-01-01,[]
216,2001-01-01,2001-04-01,[]
217,2001-04-01,2001-07-01,[]
218,2001-07-01,2001-10-01,[]
219,2001-10-01,2002-01-01,[]


In [16]:
topic_df = 
topic_df.head()

Unnamed: 0,index,Topic ID,Sentiment
0,0,80,-0.743
1,2,80,-0.419
2,19,80,-0.9578
3,21,80,-0.8614
4,26,80,0.8934


NameError: name 'topic_df' is not defined

In [None]:
topic_df = build_topic_df(docs, top_topic)
topic_df.shape

## Next Steps
> 1. labels: GDP up or down from period to period (1: 'if greater than previous period', 0: 'if less than previous period')
> 2. feature matrix: get average sentiment and number of articles per topic for each GDP reporting period
> 3. run a random forest model to see if sentiment per topic is a viable predictor of GDP
> 4. convert random forest regression probabilities to a time-series and incorporate as exogenous feature into the SARIMAX model

