In [1]:
%reload_ext autoreload
import numpy as np
import pandas as pd

from multiprocessing import Pool
from tqdm import tqdm_notebook

import matplotlib.pyplot as plt
%matplotlib inline

# import own scripts
import sys
sys.path.insert(0, '/src/')

%autoreload 2
from utils import format_raw_documents, myNLP, merge_2_string_lists, add_top_5_topics

In [3]:
# Report the number of logical CPU cores visible to Python. `os.cpu_count()`
# replaces the shell call `sysctl -n hw.ncpu`, which only exists on macOS/BSD,
# so this cell now runs on any platform. It may print `None` if the count
# cannot be determined.
import os

print("Number of available CPU cores: ")
print(os.cpu_count())

Number of available CPU cores: 
4


In [5]:
# Notebook functions
def parallelize(function, tasks, workers=4, chunksize=1):
    '''
    Performs a task as defined by `function` in parallel and returns the result.

    Parameters
    ----------
    function : callable
        Called once per element of `tasks` inside a worker process.
    tasks : iterable
        Inputs to process. It may be sized (list, Series, ...) or unsized
        (generator); unsized inputs simply get a progress bar without a total.
    workers : int, optional
        Number of worker processes handed to `Pool` (default 4).
    chunksize : int, optional
        Forwarded to `Pool.imap`; the default of 1 matches the previous
        behaviour. Larger values send tasks to workers in batches.

    Returns
    -------
    list or None
        One result per task, in the order `Pool.imap` yields them. Returns
        `None` without doing any work when this code is not running as
        `__main__`.
    '''
    # Only create a pool from the main module; anywhere else do nothing.
    if __name__ != '__main__':
        return None

    # `len()` is only defined for sized containers. Fall back to an open-ended
    # progress bar instead of raising TypeError on generators and iterators.
    try:
        total = len(tasks)
    except TypeError:
        total = None

    with Pool(processes=workers) as p:
        results = list(tqdm_notebook(p.imap(function, tasks, chunksize), total=total))
    return results

In [6]:
# Load every news-article CSV; all of them are read as ISO-8859-1.
article_paths = [
    'data/Articles.csv',
    'data/all-the-news/articles1.csv',
    'data/all-the-news/articles2.csv',
    'data/all-the-news/articles3.csv',
]
raw_documents = [pd.read_csv(path, encoding="ISO-8859-1") for path in article_paths]

# Format the raw frames with the project helper
docs = format_raw_documents(raw_documents)

# Release the raw frames; only `docs` is needed from here on
del raw_documents

docs.tail()

Unnamed: 0,Date,Title,Content
142616,9/9/2015,oil prices up in asian trad,Singapore: Oil prices edged higher in quiet As...
142617,9/9/2015,cost of quaid e azam solar power project cut b...,LAHORE: Putting stout defence of the solar po...
142618,9/9/2015,china pakistan set to sign cpec zone d,KARACHI: Pakistan is set to sign a 40-year-lea...
142619,9/9/2016,Global airport traffic 64 percent 2015 fastest...,strong>MONTREAL: Global airport traffic grew a...
142620,9/9/2016,Taxpayers directory to be launched today,strong>ISLAMABAD: Finance Minister Ishaq Dar w...


## Topic Extraction
> 1. Merge `Title` & `Content`
> 2. Process text. I set this up to run in parallel, which is much faster but still takes ~30 minutes on my laptop.
> 3. Run `NMF`. Running `NMF` in parallel took ~1.5 hours.
> 4. Ready to run `LDA`. I tried running `LDA`, but it will take a long time: more than 5 hours.

**Next: try optimizing code for efficiency using `Dask` and `Cython`.**

In [None]:
# Instantiate the `myNLP` object. The instance keeps the name `myNLP` because
# later cells call `myNLP.prep_docs_stem` and `myNLP.fit_nmf` on it.
# That rebinding shadows the imported class, so only instantiate while the
# name still refers to the class. Re-running this cell is then a no-op
# instead of an attempt to call the existing instance.
if isinstance(myNLP, type):
    myNLP = myNLP()

In [7]:
# merge `Title` and `Content` (the two columns passed to the helper below)
merged = merge_2_string_lists(docs['Title'], docs['Content'])

# clean and prep text: run `myNLP.prep_docs_stem` over every merged entry
# through the notebook's `parallelize` helper and store the result as a new
# `Processed Text` column on `docs`
prep_func = myNLP.prep_docs_stem
docs['Processed Text'] = parallelize(prep_func, merged)




In [None]:
# Fit NMF on the processed text; `fit_nmf` hands back five objects, unpacked
# one per line for readability
(
    top_5_topics_NMF,
    top_words_in_topic_NMF,
    nmf,
    tfidf,
    tfidf_vect,
) = myNLP.fit_nmf(docs['Processed Text'])

In [25]:
# add `Top 5 Topics (NMF)`
# NOTE(review): `docs` is rebound to the helper's return value, so re-running
# this cell feeds the already-updated frame back into `add_top_5_topics` --
# confirm the helper tolerates being applied twice.
docs = add_top_5_topics(docs, top_5_topics_NMF)

# save `top_words_in_topic_NMF` as a comma-separated file
top_words_in_topic_NMF.to_csv('data/top_words_in_topic_NMF.csv', sep=',')

KeyboardInterrupt: 

In [None]:
# The LDA run below is disabled by wrapping it in a string literal; the notes
# above record that it takes more than 5 hours. To enable it, remove the
# surrounding triple quotes.
# NOTE(review): the inner comment "add `Top 5 Topics (NMF)`" is stale -- the
# statement under it writes the `Top 5 Topics (LDA)` column.
'''
# run LDA 
top_5_topics_LDA, top_words_in_topic_LDA, lda, tf, cnt_vect = myNLP.fit_lda(docs['Processed Text'])

# add `Top 5 Topics (NMF)`
docs['Top 5 Topics (LDA)'] = top_5_topics_LDA

# save
top_words_in_topic_LDA.to_csv('data/top_words_in_topic_LDA.csv', sep=',')
'''

In [10]:
# save `docs` as a comma-separated file
# NOTE(review): assuming `docs` is a pandas DataFrame, the index is written as
# an unnamed first column because `index=False` is not passed -- confirm that
# downstream readers expect it.
docs.to_csv('data/nlp.csv', sep=',')

## Interpretation of Results
> Randomly pull up a few articles in each topic and assess if the model works

In [24]:
# Display the NMF top-5 topic ids as a transposed array. The output has five
# rows -- presumably one per rank, with one column per document; TODO confirm.
np.array(top_5_topics_NMF).T

array([[80, 59, 80, ..., 59, 41, 31],
       [29, 52, 23, ..., 37, 23, 59],
       [46,  9, 84, ..., 73,  3, 11],
       [11, 94, 97, ..., 97, 96, 42],
       [42, 27, 24, ..., 31, 37, 18]])