In [1]:
%reload_ext autoreload
import os
from multiprocessing import Pool

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

import matplotlib.pyplot as plt
%matplotlib inline

# for NLP interpretation
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()


# import own scripts
import sys
sys.path.insert(0, '/src/')

%autoreload 2
from utils import format_raw_documents, myNLP, merge_2_string_lists, get_sentiment

In [2]:
# Report the number of logical CPU cores on this machine.
# `os.cpu_count()` is used instead of the `sysctl -n hw.ncpu` shell call,
# which only exists on macOS/BSD and fails on Linux and Windows.
print("Number of available CPU cores: ")
print(os.cpu_count())

Number of available CPU cores: 
4


In [3]:
# Notebook functions
def parallelize(function, tasks, workers=4, chunksize=1):
    '''
    Apply `function` to every item of `tasks` using a pool of worker
    processes, showing a notebook progress bar while results arrive.

    Parameters
    ----------
    function : callable
        Called with one item of `tasks` at a time. It is handed to
        `Pool.imap`, so it has to be usable from a worker process.
    tasks : iterable
        Items to process. Iterables without a length (e.g. generators)
        are copied into a list first so the progress bar has a total.
    workers : int, default 4
        Number of processes in the pool.
    chunksize : int, default 1
        Number of tasks sent to a worker at once (forwarded to
        `Pool.imap`). Larger values reduce inter-process overhead when
        there are many small tasks.

    Returns
    -------
    list or None
        Results in the same order as `tasks`. `None` is returned, and no
        work is done, when this code is not running as `__main__`.
    '''
    # Guard kept from the original implementation: only the main module
    # may start a pool; anywhere else the call is a silent no-op.
    if __name__ != '__main__':
        return None

    # `total=len(tasks)` below needs a sized container.
    if not hasattr(tasks, '__len__'):
        tasks = list(tasks)

    with Pool(processes=workers) as p:
        # `imap` yields results lazily and in order, so the bar advances
        # as each result is produced.
        results = list(tqdm_notebook(p.imap(function, tasks, chunksize), total=len(tasks)))
    return results

In [4]:
# load news articles: one DataFrame per source CSV
csv_paths = [
    'data/Articles.csv',
    'data/all-the-news/articles1.csv',
    'data/all-the-news/articles2.csv',
    'data/all-the-news/articles3.csv',
]
raw_documents = [pd.read_csv(path, encoding="ISO-8859-1") for path in csv_paths]

# Format the raw frames with the project helper from `utils`
docs = format_raw_documents(raw_documents)

# Clear memory: the raw frames are no longer needed once `docs` exists
del raw_documents

docs.tail()

Unnamed: 0,Date,Title,Content
142616,9/9/2015,oil prices up in asian trad,Singapore: Oil prices edged higher in quiet As...
142617,9/9/2015,cost of quaid e azam solar power project cut b...,LAHORE: Putting stout defence of the solar po...
142618,9/9/2015,china pakistan set to sign cpec zone d,KARACHI: Pakistan is set to sign a 40-year-lea...
142619,9/9/2016,Global airport traffic 64 percent 2015 fastest...,strong>MONTREAL: Global airport traffic grew a...
142620,9/9/2016,Taxpayers directory to be launched today,strong>ISLAMABAD: Finance Minister Ishaq Dar w...


## Sentiment Analysis
> 1. Merge `Title` & `Content`
> 2. Prep the text. I set this up to run in parallel, which is much faster but still takes ~7 minutes on my laptop.
> 3. Perform Sentiment Analysis using `TextBlob`. This may take ~150 hours to run in parallel on my machine. A powerful AWS instance with 64 cores will still take ~9 hours to run. The best approach to optimize performance here is to convert the code to **C** using `Cython`, and possibly even to use distributed computing.

**Next: try optimizing code for efficiency using `Dask` and `Cython`.**

In [5]:
# instantiate `myNLP` object
# The instance is stored under the same name as the imported class, so a
# second run of this cell would try to call the instance instead of the
# class. Only instantiate while the name still refers to a class, which
# keeps the cell safe to re-run.
# NOTE(review): assumes `myNLP` imported from `utils` is a class - confirm.
if isinstance(myNLP, type):
    myNLP = myNLP()

In [6]:
# merge `Title` and `Content`
merged = merge_2_string_lists(docs['Title'], docs['Content'])

# clean and prep text, spreading the work over worker processes
parallel_tasks = merged
parallel_func = myNLP.prep_docs_lematize
# pass `parallel_tasks` (not `merged` directly) so this cell follows the
# same tasks/func pattern as the sentiment cell
docs['Processed Text'] = parallelize(parallel_func, parallel_tasks)




In [12]:
# perform Sentiment Analysis: score each processed document in parallel
# and keep the results in a new `Sentiment` column
parallel_tasks, parallel_func = docs['Processed Text'], get_sentiment
docs['Sentiment'] = parallelize(parallel_func, parallel_tasks)




In [11]:
# Spot-check a single article: show its raw content, then the sentiment
# computed from its processed text.
i = 3
content, text = docs['Content'][i], docs['Processed Text'][i]
print(content)
print(get_sentiment(text))

strong>LAHORE: Left arm fast bowler Mohammad Amir, who was banned for five years from International cricket in spot-fixing case, has returned to the national squad announced for upcoming New Zealand tour.</strongMohammad Amir has been included in One-Day International and Twenty20 squads.Chief selector Haroon Rasheed announced that Mohammad Amir, Ahmed Shahzad, Mohammad Hafeez, Sohaib Maqsood, Shahid Afridi, Emad Wasim, Anwar Ali, Amir Yamin, Sarfaraz Ahmed, Wahab Riaz, Umar Gul, Mohammad Rizwan, and Saad Nasim will be part of the T20 team.The ODI squad comprises Ahmed Shahzad, Azhar Ali, Mohammad Hafeez, Shoaib Malik, Asad Shafiq, Babar Azam, Sohaib Maqsood, Zafar Gohar, Emad Wasim, Anwar Ali, Sarfaraz Ahmed, Wahab Riaz, Rahat Ali, Mohammad Irfan, Mohammad Rizwan, and Mohammad Amir.Rasheed said that Amir's case has been forwarded to the New Zealand consulate and that his visa process will begin after the New Year's holidays.Amir, 23, resumed his career in March playing grade two crick

In [13]:
# Preview the first rows of `docs`, which now also carries the
# `Processed Text` and `Sentiment` columns added by the cells above.
docs.head()

Unnamed: 0,Date,Title,Content,Processed Text,Sentiment
0,1/1/2015,sindh govt decides to cut public transport far...,KARACHI: The Sindh government has decided to b...,sindh govt decides cut public transport fare 7...,-0.743
1,1/1/2016,Australia set to test twin spin attack against...,strong>SYDNEY: Australia look set to field two...,australia set test twin spin attack windi stro...,-0.5279
2,1/1/2016,Oil ends 2015 down 35 per,strong>NEW YORK/LONDON: Oil prices rose on Thu...,oil end 2015 35 per strong new york london oil...,-0.419
3,1/1/2016,Amir returns to Pakistan squad after 5 year b,strong>LAHORE: Left arm fast bowler Mohammad A...,amir return pakistan squad 5 year b strong lah...,0.5859
4,1/10/2016,NA Special Committee on PIA to meet on Monday,ISLAMABAD: The National Assembly (NA) Special ...,na special committee pia meet monday islamabad...,0.6597


In [14]:
# save the `Sentiment` column as CSV
out_path = 'src/results/sentiment.csv'
# `to_csv` does not create missing folders; make sure the target folder
# exists so the computed scores are not lost to a failed write.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
docs['Sentiment'].to_csv(out_path, sep=',')