# Sentiment Analysis

Generate a sentiment analysis for newspaper articles covering events in Syria from the years 2010-2017. 

In [1]:
%matplotlib inline

In [3]:
import pandas as pd
import numpy as np
from numpy import nan
import os

sns.set_context('notebook')
sns.set_style('whitegrid')

## Data Loading

In [4]:
df = pd.read_csv('CleanLexisNexis.csv', parse_dates=['date'])

In [5]:
df.dtypes

publication                object
date               datetime64[ns]
title                      object
length                      int64
publicationtype            object
text                       object
year                        int64
month                       int64
day                         int64
dtype: object

In [6]:
df.head(4)

Unnamed: 0,publication,date,title,length,publicationtype,text,year,month,day
0,The Atlanta Journal-Constitution,2010-01-03,Five pressing questions to answer in 2010,747,Newspapers,Will President Barack Obama regain his momentu...,2010,1,3
1,BBC,2010-01-04,"Saudi foreign minister says Israel ""spoiled ch...",2196,Transcript,Text of report by Saudi-owned leading pan-Arab...,2010,1,4
2,BBC,2010-01-08,Highlights of Iran parliamentary session.,1123,Transcript,Excerpt from report on parliamentary proceedin...,2010,1,8
3,Right Vision News,2010-01-09,Jordan:Way out for Obama,852,Newspaper,"Pakistan, Jan. 09 -- These are the worst of ti...",2010,1,9


## 1. Sentiment Analysis

Use NLTK to build sentiment scores. 

Use the positive/negative corpus provided by Andy Kim, author of *Can Big Data Forcast North Korean Military Aggression?* 

#### Append Harvard positive and negative list together

In [107]:
os.chdir('/Users/laurieottehenning/Documents/Georgetown Data Science /Capstone/Harvard Pos:Neg')

pos = pd.read_csv('Harvard_Positive.csv', names=['Word', 'positive'])
neg = pd.read_csv('Harvard_Negative.csv', names=['Word', 'negative'])

def col_lower(wordlist):
    for col in wordlist.columns:
        wordlist[col] = wordlist[col].str.lower()

col_lower(pos)
col_lower(neg)


#### Append diplomatic words together

In [13]:
dip_words = pd.read_csv('Vocab 250 Diplomatic Word List.csv')

In [34]:
# Random fraction of the word list
dip_words.sample(frac=.02)

Unnamed: 0,word,value,stem
30,breach,negative,breach
241,upheaval,negative,upheav
69,diplomacy,positive,diplomac
94,flux,neutral,flux
15,ambassador,neutral,ambassador


In [32]:
# Make all of the words in each column lower case
col_lower(dip_words)

In [70]:
# Add the stem from the diplomatic words list to the Harvard list
columns = ['negative']
dip_neg = pd.DataFrame(columns=columns)
dip_neg['negative'] = dip_words.loc[dip_words['value'] == 'negative', 'stem']

columns = ['positive']
dip_pos = pd.DataFrame(columns=columns)
dip_pos['positive'] = dip_words.loc[dip_words['value'] == 'positive', 'stem']

#### Count the number of positive or negative words within a text

In [122]:
# Append the list of words together
pos_list = []
for i in pos['Word']:
    pos_list.append(i)
for i in dip_pos['positive']:
    pos_list.append(i)
    
neg_list = []
for i in neg['Word']:
    neg_list.append(i)
for i in dip_neg['negative']:
    neg_list.append(i)

In [173]:
# Remove punctuation
df['text'] = df['text'].str.replace('[^\w\s]','')

In [174]:
# Get the sum of the number of positive and negative words in each article
df = df.assign(PositiveCount=df['text'].apply(lambda sentence: 
                                            sum(word.lower() in pos_list 
                                                for word in sentence.split())))
df = df.assign(NegativeCount=df['text'].apply(lambda sentence: 
                                            sum(word.lower() in neg_list 
                                                for word in sentence.split())))



In [175]:
df.head(4)

Unnamed: 0,publication,date,title,length,publicationtype,text,year,month,day,PositiveCount,NegativeCount
0,The Atlanta Journal-Constitution,2010-01-03,Five pressing questions to answer in 2010,747,Newspapers,Will President Barack Obama regain his momentu...,2010,1,3,45,60
1,BBC,2010-01-04,"Saudi foreign minister says Israel ""spoiled ch...",2196,Transcript,Text of report by Saudiowned leading panArab d...,2010,1,4,176,103
2,BBC,2010-01-08,Highlights of Iran parliamentary session.,1123,Transcript,Excerpt from report on parliamentary proceedin...,2010,1,8,60,42
3,Right Vision News,2010-01-09,Jordan:Way out for Obama,852,Newspaper,Pakistan Jan 09 These are the worst of times ...,2010,1,9,56,47


#### Create article polarity

Polarity is calculated by taking the (sum of positive words - sum of negative)/sum of all words

In [176]:
df['tone'] = (df['PositiveCount'] - df['NegativeCount'])/df['length']
df.head(4)

Unnamed: 0,publication,date,title,length,publicationtype,text,year,month,day,PositiveCount,NegativeCount,tone
0,The Atlanta Journal-Constitution,2010-01-03,Five pressing questions to answer in 2010,747,Newspapers,Will President Barack Obama regain his momentu...,2010,1,3,45,60,-0.02008
1,BBC,2010-01-04,"Saudi foreign minister says Israel ""spoiled ch...",2196,Transcript,Text of report by Saudiowned leading panArab d...,2010,1,4,176,103,0.033242
2,BBC,2010-01-08,Highlights of Iran parliamentary session.,1123,Transcript,Excerpt from report on parliamentary proceedin...,2010,1,8,60,42,0.016028
3,Right Vision News,2010-01-09,Jordan:Way out for Obama,852,Newspaper,Pakistan Jan 09 These are the worst of times ...,2010,1,9,56,47,0.010563


In [179]:
os.chdir('/Users/laurieottehenning/Documents/Georgetown Data Science /Capstone')
df.to_csv("Sentiment Data.csv")