In [None]:
%reload_ext autoreload
import numpy as np
import pandas as pd

from multiprocessing import Pool
from tqdm import tqdm_notebook

import matplotlib.pyplot as plt
%matplotlib inline

# for NLP interpretation
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()


# import own scripts
import sys
sys.path.insert(0, '/src/')

%autoreload 2
from utils import format_raw_documents, myNLP, merge_2_string_lists, get_sentiment

In [None]:
# Report how many CPU cores are available for the worker pool.
# `os.cpu_count()` is used instead of shelling out to `sysctl -n hw.ncpu`,
# which only exists on macOS/BSD and fails on Linux and Windows.
import os

print("Number of available CPU cores: ")
print(os.cpu_count())

In [3]:
# Notebook functions
def parallelize(function, tasks, workers=4):
    '''
    Applies `function` to every item of `tasks` in a pool of worker processes
    and returns the results as a list, showing a notebook progress bar.

    Parameters
    ----------
    function : callable taking one item of `tasks`; it is sent to the worker
        processes, so it must be picklable.
    tasks : iterable of inputs. Sized containers give the progress bar a
        total; unsized iterables (e.g. generators) are accepted as well.
    workers : number of worker processes in the pool (default 4).

    Returns
    -------
    list of results in the same order as `tasks`, or `None` when the calling
    module is not `__main__` (in which case no pool is started).
    '''
    # Only the main module may start a pool; anywhere else nothing is run.
    if __name__ != '__main__':
        return None

    # `Pool.imap` accepts any iterable, but `len()` does not: fall back to a
    # progress bar without a known total rather than raising TypeError.
    try:
        total = len(tasks)
    except TypeError:
        total = None

    with Pool(processes=workers) as pool:
        # `list(...)` drains the lazy `imap` iterator before the pool is
        # torn down on leaving the `with` block.
        return list(tqdm_notebook(pool.imap(function, tasks), total=total))

In [4]:
# load news articles
# Every source CSV is read with the same encoding, in the order listed.
raw_documents = [
    pd.read_csv(csv_path, encoding="ISO-8859-1")
    for csv_path in (
        'data/Articles.csv',
        'data/all-the-news/articles1.csv',
        'data/all-the-news/articles2.csv',
        'data/all-the-news/articles3.csv',
    )
]

# Format
docs = format_raw_documents(raw_documents)

# Clear memory: the raw frames are only reachable through this list
del raw_documents

docs.tail()

Unnamed: 0,Date,Title,Content
142616,9/9/2015,oil prices up in asian trad,Singapore: Oil prices edged higher in quiet As...
142617,9/9/2015,cost of quaid e azam solar power project cut b...,LAHORE: Putting stout defence of the solar po...
142618,9/9/2015,china pakistan set to sign cpec zone d,KARACHI: Pakistan is set to sign a 40-year-lea...
142619,9/9/2016,Global airport traffic 64 percent 2015 fastest...,strong>MONTREAL: Global airport traffic grew a...
142620,9/9/2016,Taxpayers directory to be launched today,strong>ISLAMABAD: Finance Minister Ishaq Dar w...


## Sentiment Analysis
> 1. Merge `Title` & `Content`
> 2. Prep text. I set this up to run in parallel, which is much faster, but it still takes ~7 minutes on my laptop.
> 3. Perform Sentiment Analysis using `TextBlob`. This may take ~150 hours to run in parallel on my machine. Even a powerful AWS instance with 64 cores would still take ~9 hours. The best approach to optimize performance here is to convert the code to **C** using `Cython` and possibly even use distributed computing.

**Next: try optimizing code for efficiency using `Dask` and `Cython`.**

In [6]:
# instantiate `myNLP` object
# The instance deliberately reuses the imported class's name because a later
# cell calls `myNLP.prep_docs_lematize`. The rebinding is guarded so that
# re-running this cell does not try to call the instance as if it were still
# the class.
if isinstance(myNLP, type):
    myNLP = myNLP()

In [7]:
# merge `Title` and `Content`
merged = merge_2_string_lists(docs['Title'], docs['Content'])

# clean and prep text (each merged document is handled in a worker process)
parallel_tasks = merged
parallel_func = myNLP.prep_docs_lematize
docs['Processed Text'] = parallelize(parallel_func, parallel_tasks)




In [1]:
# perform Sentiment Analysis
# Each processed document is passed to `get_sentiment` through `parallelize`.
parallel_tasks, parallel_func = docs['Processed Text'], get_sentiment
docs['Sentiment'] = parallelize(parallel_func, parallel_tasks)

NameError: name 'docs' is not defined

In [None]:
# Try `get_sentiment` on a single document before running it on everything.
# `.iloc[0]` takes the first row by position; plain `[0]` is a label lookup
# and only means "first row" while the index still starts at label 0.
text = docs['Processed Text'].iloc[0]
sentiment_p_pos, polarity, subjectivity = get_sentiment(text)

In [22]:
from textblob import Blobber
from textblob.sentiments import NaiveBayesAnalyzer

# `tb` turns a string into a blob that is scored with the Naive Bayes analyzer.
tb = Blobber(analyzer=NaiveBayesAnalyzer())

# First line: the analyzer's `p_pos` score; second line: the text that was scored.
print(tb(text).sentiment.p_pos, tb(text), sep='\n')

0.999992782957725
sindh govt decides cut public transport fare 7pc kti rej karachi sindh government decided bring public transport fare 7 per cent due massive reduction petroleum product price federal government geo news reported source said reduction fare applicable public transport rickshaw taxi mean traveling meanwhile karachi transport ittehad kti refused abide government decision kti president irshad bukhari said commuter charged lowest fare karachi compare part country adding 80pc vehicle run compressed natural gas cng bukhari said karachi transporter cut fare decrease cng price made


In [26]:
# Keep a blob of the sample text so its attributes can be inspected.
# (The argument-less `print()` that used to follow only emitted a blank line.)
opinion = tb(text)

In [27]:
# Display the subjectivity score of the sample blob.
# NOTE(review): subjectivity is expected to be non-negative, yet the value
# recorded under this cell is negative — the saved output may come from a
# different expression (e.g. polarity). Re-run to confirm.
opinion.subjectivity

-0.048214285714285716

In [33]:
def add_top_5_topics(docs, top_5_topics_NMF):
    '''
    Writes the five leading NMF topics into `docs`, one new column per rank.

    Column `Top #k topic (NMF)` (k = 1..5) receives `top_5_topics_NMF[k - 1]`.
    `docs` is modified in place and is also returned.
    '''
    for rank in range(1, 6):
        docs['Top #{} topic (NMF)'.format(rank)] = top_5_topics_NMF[rank - 1]
    return docs

0 Top #1 topic (NMF)
1 Top #2 topic (NMF)
2 Top #3 topic (NMF)
3 Top #4 topic (NMF)
4 Top #5 topic (NMF)
