In [1]:
# Import Libraries.
import nltk
from nltk import word_tokenize
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import sentiwordnet as swn
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import time

### Import the data

In [2]:
# Create location variables for data.
posts = "./data/facebook_congress_posts.csv"

responses01 = "./data/facebook_congress_responses01.csv"
responses02 = "./data/facebook_congress_responses02.csv"
responses03 = "./data/facebook_congress_responses03.csv"
responses04 = "./data/facebook_congress_responses04.csv"
responses05 = "./data/facebook_congress_responses05.csv"
responses06 = "./data/facebook_congress_responses06.csv"
responses07 = "./data/facebook_congress_responses07.csv"
responses08 = "./data/facebook_congress_responses08.csv"
responses09 = "./data/facebook_congress_responses09.csv"
responses10 = "./data/facebook_congress_responses10.csv"

responses_1st_half = "./data/congress_responses_1st_half.csv"
responses_2nd_half = "./data/congress_responses_2nd_half.csv"

annotations = "./data/annotations.csv"

In [3]:
# Download posts and annotations. This is relatively fast.
start_time = time.time()
postsDf = pd.read_csv(posts)
annotDf = pd.read_csv(annotations)
end_time = time.time()
print("run time:",end_time-start_time)

run time: 5.008749485015869


#### Dealing with the Responses

Figure out how much response data to load for the current task. Each responses01 to responses10 chunk is already 1.4 million posts. For the sentiment analysis, we can get sentiment in small segments and aggregate everything at the end of the analysis.

In [5]:
# This is enough response data for writing initial code, debugging, etc.
start_time = time.time()
respDf01 = pd.read_csv(responses01, index_col=0)
end_time = time.time()
print("run time:",end_time-start_time)

  mask |= (ar1 == a)


run time: 6.8802103996276855


In [9]:
# Response rows 0:6933254; if we want the first 6M responses.
# start_time = time.time()
# resp_1_df = pd.read_csv(responses_1st_half, index_col=0)
# end_time = time.time()
# print("run time:",end_time-start_time)

In [4]:
# Response rows 6933255:end; if we want the remaining 6M responses.
# start_time = time.time()
# resp_2_df = pd.read_csv(responses_2nd_half, index_col=0)
# end_time = time.time()
# print("run time:",end_time-start_time)

## Sentiment Analysis

In [13]:
# Need to make sure there are no NaNs.
posts_smt_Df = postsDf.fillna('0')

In [40]:
# Make sure NaNs don't throw error.
resps_smt_Df = respDf01.fillna('0')

In [11]:
# Take a small slice for testing.
resps_smt_Df = resps_smt_Df[:1000]

### Hu & Liu - pos/neg

This first lexicon is from the following paper:
Minqing Hu and Bing Liu. "Mining and Summarizing Customer Reviews." _Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD-2004)_, Aug 22-25, 2004, Seattle, Washington, USA.

This is a simple lexicon with two lists of 2006 positive words and 4785 negative words.

In [3]:
# Import HuLiu lexicon. (from http://www.cs.uic.edu/~liub/FBS/opinion-lexicon-English.rar)

positive = open('./data/positive-words.txt').read()
pos_tokens = word_tokenize(positive)
negative = open('./data/negative-words.txt',encoding = "ISO-8859-1").read()
neg_tokens = word_tokenize(negative)

In [22]:
# Simple sentiment analysis with pos/neg values given to words in
# pos/neg lists. Neutral words get no value
def get_HL_Sentiment(sentence_string):
    sentiment = 0
    tokens = word_tokenize(sentence_string)
    for token in tokens:
        if token in pos_tokens:
            sentiment += 1
        if token in neg_tokens:
            sentiment -= 1
    return sentiment

In [32]:
# Run the sentiment analysis on the posts.
HL_sentiment = []
for row in range(len(posts_smt_Df["post_text"])):
    HL_sentiment.append(get_HL_Sentiment(posts_smt_Df["post_text"][row]))
posts_smt_Df['HL_sentiment'] = HL_sentiment

In [33]:
posts_smt_Df

Unnamed: 0,op_id,op_gender,post_id,post_text,post_type,HL_sentiment
0,57265377,M,0,"Yesterday, my colleagues and I voted to protec...",video,2
1,57265377,M,1,Roses are red...and so is Texas. Let's keep it...,video,0
2,57265377,M,2,#TBT to this classic video. #DonkeyWhisperer,video,1
3,57265377,M,3,Since President Donald J. Trump was sworn in o...,video,2
4,57265377,M,4,Remembering our 40th president today. LIKE to ...,video,0
5,57265377,M,5,"After a phenomenal season, Jason Garrett has b...",photo,1
6,57265377,M,6,"In Texas, we honor our military heroes. #Chris...",photo,1
7,57265377,M,7,"Last night, President Donald J. Trump nominate...",video,0
8,57265377,M,8,President Trump has just nominated judge Neil ...,status,3
9,57265377,M,9,"""I agree 100% with President Trump's decision....",link,0


In [34]:
# Have this saved; no need to run again until/unless change algorithm.
# posts_smt_Df.to_csv("./data/congress_posts_HL-smt.csv", header = True)

### Sentiment Analysis with SentiWordNet

Not sure this is the best tool for FB responses... tabling for now; will come back. Need to do more research about specifically how to implement this for our unlabeled data. Maybe only useful for supervised learning?

Need to preprocess data with POS and usage score? Not sure of range for usage.

- n - NOUN 
- v - VERB 
- a - ADJECTIVE 
- s - ADJECTIVE SATELLITE 
- r - ADVERB



In [21]:
# "breakdown" is the word we are interested in; "n" is the POS, "03" is the usage value

our_word = swn.senti_synset('protect.v.02') 
print(our_word)
our_word.pos_score()
our_word.neg_score()
our_word.obj_score()

<protect.v.02: PosScore=0.0 NegScore=0.0>


1.0

### Sentiment Analysis with TextBlob polarity scores

In [23]:
def compute_TB_sentiment(text_column):
    '''Takes a dataframe column of strings and outputs a list of 
    sentiment scores using textBlob.''' 
    TB_sentiment = []
    for row in range(len(text_column)):
        sentiment = 0
        tokens = word_tokenize(text_column[row])
        for word in tokens:
            # Range of sentiment for each word is -1 to 1.
            sentiment += TextBlob(word).sentiment.polarity
        TB_sentiment.append(sentiment)
    return TB_sentiment

In [34]:
# Run TextBlob analysis on chunk of responses.
start_time = time.time()
resps_smt_Df['TB_sentiment'] = compute_TB_sentiment(resps_smt_Df["response_text"])
end_time = time.time()
print("run time:",end_time-start_time)

run time: 5.033034801483154


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [35]:
resps_smt_Df

Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,V_sentiment
0,57265377,M,0,Jerry,Protecting birth is not the same as protecting...,Roger Williams,Congress_Republican,-0.7458
1,57265377,M,0,Andrea,You need to protect children and leave my body...,Roger Williams,Congress_Republican,0.3302
2,57265377,M,0,Sherry,Thank you,Roger Williams,Congress_Republican,0.3612
3,57265377,M,0,Bob,Thank you Roger,Roger Williams,Congress_Republican,0.3612
4,57265377,M,0,Joy,Unwanted pregnancy is a sad and unfortunate si...,Roger Williams,Congress_Republican,-0.0821
5,57265377,M,0,Candice,"Women, the ""hosts"" of the unborn babies you're...",Roger Williams,Congress_Republican,1.5937
6,57265377,M,0,Cheri,I am Pro Choice and always will be,Roger Williams,Congress_Republican,0.0000
7,57265377,M,0,Julie,Thank you for taking a stand!!!!,Roger Williams,Congress_Republican,0.3612
8,57265377,M,0,Stephen,I seem to disagree with you again Mr.Williams....,Roger Williams,Congress_Republican,-0.1795
9,57265377,M,0,Tony,Please demonstrate your support by providing p...,Roger Williams,Congress_Republican,1.2140


In [19]:
# A sample look at sentiment for words in a post.

tokens = word_tokenize(posts_smt_Df["post_text"][1])
sentiment = 0
for i in tokens:
    token_score = TextBlob(i).sentiment.polarity
    print(i,token_score)
    sentiment += token_score
    
sentiment

Roses 0.0
are 0.0
red 0.0
... 0.0
and 0.0
so 0.0
is 0.0
Texas 0.0
. 0.0
Let 0.0
's 0.0
keep 0.0
it 0.0
that 0.0
way 0.0
. 0.0
Happy 0.8
Valentine 0.0
's 0.0
Day 0.0
, 0.0
Texas 0.0
. 0.0


0.8

### Sentiment analysis with VADER 

(Valence Aware Dictionary and Sentiment Reasoner, in NLTK)

In [33]:
sia = SentimentIntensityAnalyzer()

def compute_vader_sentiment(text_column):
    '''Takes a pandas series (dataframe column) of strings and outputs a list
    of sentiment scores using VADER from nltk package.'''
    V_sentiment = []
    for row in range(len(text_column)):
        sentiment = 0
        tokens = word_tokenize(text_column[row])
        for word in tokens:
            # Polarity score returns dictionary.
            ss = sia.polarity_scores(word)
            # Keep the 'compound' result. (for now? could tally pos/neg instead)
            sentiment += ss['compound']
        V_sentiment.append(sentiment)
    return V_sentiment

In [41]:
# Run Vader sentiment and store results in new column.
start_time = time.time()
resps_smt_Df['V_sentiment'] = compute_vader_sentiment(resps_smt_Df["response_text"])
end_time = time.time()
print("run time:",end_time-start_time)

# 1.3M responses took 18 minutes.

run time: 1074.4153938293457


In [42]:
1074.4153938293457/60

17.906923230489095

In [39]:
# Sample vader analysis for one response.
sia = SentimentIntensityAnalyzer()
tokens = word_tokenize(resps_smt_Df["response_text"][0])
for word in tokens:
    ss = sia.polarity_scores(word)
    print(word)
    for k in sorted(ss):    
        print('{0}: {1}, '.format(k, ss[k]), end='')
    print("\n---------------")

Protecting
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
---------------
birth
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
---------------
is
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
---------------
not
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
---------------
the
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
---------------
same
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
---------------
as
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
---------------
protecting
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
---------------
life
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
---------------
.
compound: 0.0, neg: 0.0, neu: 0.0, pos: 0.0, 
---------------
You
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
---------------
may
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
---------------
very
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
---------------
well
compound: 0.2732, neg: 0.0, neu: 0.0, pos: 1.0, 
---------------
pledge
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
----