In [1]:
# Import Libraries.
import nltk
from nltk import word_tokenize
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import sentiwordnet as swn
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import time



### Import the data

In [12]:
# File locations for every dataset used in this notebook.

# Raw responses, split into ten 1.4M chunks (numbered 01..10).
(
    responses01, responses02, responses03, responses04, responses05,
    responses06, responses07, responses08, responses09, responses10,
) = [
    "../data/facebook_congress_responses{:02d}.csv".format(chunk_no)
    for chunk_no in range(1, 11)
]

# Raw responses split in two halves; 6.9M each.
responses_1st_half = "../data/congress_responses_1st_half.csv"
responses_2nd_half = "../data/congress_responses_2nd_half.csv"

# Raw posts and annotations.
posts = "../data/facebook_congress_posts.csv"
annotations = "../data/annotations.csv"

# Response chunks with sentiment columns added; same ten 1.4M chunks.
(
    resps_smt_01, resps_smt_02, resps_smt_03, resps_smt_04, resps_smt_05,
    resps_smt_06, resps_smt_07, resps_smt_08, resps_smt_09, resps_smt_10,
) = [
    "../data/resps{:02d}_smt.csv".format(chunk_no)
    for chunk_no in range(1, 11)
]


In [3]:
# Load the posts and annotations tables from their local CSV files.
# This is relatively fast.
start_time = time.time()
postsDf = pd.read_csv(posts)
annotDf = pd.read_csv(annotations)
end_time = time.time()
# Elapsed wall-clock seconds for both reads.
print("run time:",end_time-start_time)

run time: 4.8103578090667725


#### Dealing with the Responses

Figure out how much response data to load for the current task. Each of the chunks `responses01` through `responses10` already holds about 1.4 million responses. For the sentiment analysis, we can score the data in small segments and aggregate everything at the end of the analysis.

In [5]:
# Load 1.4M responses (chunk 10, one tenth of the entire dataset.)
# This is enough response data for writing initial code, debugging, etc.
start_time = time.time()
# index_col=0: use the first CSV column as the DataFrame index.
respdf10 = pd.read_csv(responses10, index_col=0)
end_time = time.time()
# Elapsed wall-clock seconds for the read.
print("run time:",end_time-start_time)

  mask |= (ar1 == a)


run time: 8.884251594543457


In [12]:
# Response rows 0:6933254; if we want the first 6M responses.
# start_time = time.time()
# resp_1_df = pd.read_csv(responses_1st_half, index_col=0)
# end_time = time.time()
# print("run time:",end_time-start_time)

  mask |= (ar1 == a)


run time: 39.64879083633423


In [4]:
# Response rows 6933255:end; if we want the remaining 6M responses.
# start_time = time.time()
# resp_2_df = pd.read_csv(responses_2nd_half, index_col=0)
# end_time = time.time()
# print("run time:",end_time-start_time)

## Sentiment Analysis

In [14]:
# Load response chunk 01 with its previously saved sentiment columns.
# resps_smt_01 = "../data/resps01_smt.csv"
# NOTE(review): the commented-out cells above use the name resp_1_df for the
# first HALF of the responses; here it holds chunk 01 only.
start_time = time.time()
resp_1_df = pd.read_csv(resps_smt_01, index_col=0)
end_time = time.time()
print("run time:",end_time-start_time)

  mask |= (ar1 == a)


run time: 4.560492277145386


In [15]:
resp_1_df

Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,HL_sentiment,V_sentiment
0,57265377,M,0,Jerry,Protecting birth is not the same as protecting...,Roger Williams,Congress_Republican,-1,-0.7458
1,57265377,M,0,Andrea,You need to protect children and leave my body...,Roger Williams,Congress_Republican,1,0.3302
2,57265377,M,0,Sherry,Thank you,Roger Williams,Congress_Republican,0,0.3612
3,57265377,M,0,Bob,Thank you Roger,Roger Williams,Congress_Republican,0,0.3612
4,57265377,M,0,Joy,Unwanted pregnancy is a sad and unfortunate si...,Roger Williams,Congress_Republican,0,-0.0821
5,57265377,M,0,Candice,"Women, the ""hosts"" of the unborn babies you're...",Roger Williams,Congress_Republican,2,1.5937
6,57265377,M,0,Cheri,I am Pro Choice and always will be,Roger Williams,Congress_Republican,0,0.0000
7,57265377,M,0,Julie,Thank you for taking a stand!!!!,Roger Williams,Congress_Republican,0,0.3612
8,57265377,M,0,Stephen,I seem to disagree with you again Mr.Williams....,Roger Williams,Congress_Republican,1,-0.1795
9,57265377,M,0,Tony,Please demonstrate your support by providing p...,Roger Williams,Congress_Republican,1,1.2140


In [13]:
# Need to make sure there are no NaNs.
# posts_smt_Df = postsDf.fillna('0')

In [32]:
# Replace missing values with the string '0' so NaNs don't throw an error
# when the text is tokenized later.
# NOTE(review): fillna('0') is applied to every column of the frame, not only
# response_text -- confirm that is intended.
# Only the line for the chunk currently being processed is left uncommented.
# resps1_smt_df = resp_1_df.fillna('0') # first half of responses
# resps01_smt_df = respDf01.fillna('0') # 1/10 of responses
# resps02_smt_df = respdf02.fillna('0') # 1/10 of responses
# resps03_smt_df = respdf03.fillna('0') # 1/10 of responses
# resps04_smt_df = respdf04.fillna('0') # 1/10 of responses
# resps05_smt_df = respdf05.fillna('0') # 1/10 of responses
# resps06_smt_df = respdf06.fillna('0') # 1/10 of responses
# resps07_smt_df = respdf07.fillna('0') # 1/10 of responses
# resps08_smt_df = respdf08.fillna('0') # 1/10 of responses
# resps09_smt_df = respdf09.fillna('0') # 1/10 of responses
resps10_smt_df = respdf10.fillna('0') # 1/10 of responses


In [11]:
# Take a small slice for testing.
# resps_smt_Df = resps_smt_Df[:1000]

### Hu & Liu - pos/neg

This first lexicon is from the following paper:
Minqing Hu and Bing Liu. "Mining and Summarizing Customer Reviews." _Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD-2004)_, Aug 22-25, 2004, Seattle, Washington, USA.

This is a simple lexicon with two lists of 2006 positive words and 4785 negative words.

In [5]:
# Import HuLiu lexicon. (from http://www.cs.uic.edu/~liub/FBS/opinion-lexicon-English.rar)
#
# The files are read inside `with` blocks so the handles are closed as soon as
# the text has been read. Encodings are exactly as before: platform default
# for the positive list, ISO-8859-1 for the negative list.
# NOTE(review): the whole file is tokenized, so any header/comment text inside
# the lexicon files also ends up in the token sets -- confirm whether it
# should be stripped first.
with open('../data/positive-words.txt') as lexicon_file:
    positive = lexicon_file.read()
with open('../data/negative-words.txt', encoding="ISO-8859-1") as lexicon_file:
    negative = lexicon_file.read()

# Sets instead of lists: get_HL_Sentiment tests every token of every response
# with `in`, and a set answers that in constant time instead of scanning the
# whole word list. Membership results are identical.
pos_tokens = set(word_tokenize(positive))
neg_tokens = set(word_tokenize(negative))

In [6]:
# Simple sentiment analysis with pos/neg values given to words in
# pos/neg lists. Neutral words get no value
def get_HL_Sentiment(sentence_string, lowercase=False):
    '''Score one text with the Hu & Liu positive/negative word lists.

    Every token found in `pos_tokens` adds 1 and every token found in
    `neg_tokens` subtracts 1; tokens in neither list contribute nothing.

    Args:
        sentence_string: the text to score. Missing values (None or a float
            NaN) score 0 instead of raising inside the tokenizer.
        lowercase: if True, each token is lower-cased before the lookup.
            The default (False) keeps the original exact-case matching, so
            previously saved scores stay reproducible.
            NOTE(review): "Thank you" scores 0 in the saved output, which
            suggests capitalised words miss lower-case lexicon entries --
            confirm against the lexicon before changing the default.

    Returns:
        int: count of positive tokens minus count of negative tokens.
    '''
    # Treat missing cells as neutral; any other non-string still raises.
    is_nan = isinstance(sentence_string, float) and sentence_string != sentence_string
    if sentence_string is None or is_nan:
        return 0

    sentiment = 0
    for token in word_tokenize(sentence_string):
        if lowercase:
            token = token.lower()
        if token in pos_tokens:
            sentiment += 1
        if token in neg_tokens:
            sentiment -= 1
    return sentiment

In [7]:
def run_HL_sentiment(pandas_series): # e.g., posts_smt_Df["post_text"]
    '''Score every text in a Series with get_HL_Sentiment.

    Values are visited in positional order, so the Series may carry any
    index (for example one left over from slicing or filtering); it does
    not have to be reset to 0..n-1 first.

    Args:
        pandas_series: iterable of strings, typically a DataFrame column.

    Returns:
        list: one sentiment value per input element, in the same order.
    '''
    HL_sentiment = []
    for row, text in enumerate(pandas_series):
        HL_sentiment.append(get_HL_Sentiment(text))
        # Progress message every 50,000 rows (never for row 0).
        if row and row % 50000 == 0:
            print("completed",row)
    return HL_sentiment

In [32]:
# Run the sentiment analysis on the posts.
# Commented out because we've already saved this column.
# HL_sentiment = run_HL_sentiment(posts_smt_Df["post_text"])
# posts_smt_Df['HL_sentiment'] = HL_sentiment

Run times for the Hu Liu sentiment analysis in 1.4M chunks.

|   chunk#	| minutes  	| 
|---	|---	|
|   01	|   	52.73078450759252| 
|   02	|   78.45855053663254	| 
|   03	|   76.10809705257415	| 
|   04	|   66.12366257508596 |
|   05	|   78.52342672745387 |
|   06	|   69.76696998675665 |
|   07	|   65.30492475827535 |
|   08	|   76.82786030769348 |
|   09	|   72.33480822245279 |
|   10	|   65.53730296293894 |


In [33]:
##### To process another chunk, change the chunk number in the four places
##### it appears in this cell. ########
# Set index to start at 0 so function wont throw out of range error.
resps10_smt_df = resps10_smt_df.reset_index(drop=True)
# Run the sentiment analysis on 1/10 of responses.
start_time = time.time()
# Print the start timestamp (UTC, via gmtime) for this long-running cell.
print(time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(start_time)))
resps10_smt_df["HL_sentiment"] = run_HL_sentiment(resps10_smt_df["response_text"])
end_time = time.time()
print("run time minutes:",(end_time-start_time)/60)

2019-10-10 22:26:27
completed 50000
completed 100000
completed 150000
completed 200000
completed 250000
completed 300000
completed 350000
completed 400000
completed 450000
completed 500000
completed 550000
completed 600000
completed 650000
completed 700000
completed 750000
completed 800000
completed 850000
completed 900000
completed 950000
completed 1000000
completed 1050000
completed 1100000
completed 1150000
completed 1200000
completed 1250000
completed 1300000
completed 1350000
run time minutes: 65.53730296293894


In [34]:
resps10_smt_df

Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,HL_sentiment
0,40220308,M,467864,Don,"Somehow I can picture Barack, MicHELLe, Joe, H...",Joe Barton,Congress_Republican,2
1,40220308,M,467864,Julie,Why have these hearings if nobody has to take ...,Joe Barton,Congress_Republican,-4
2,40220308,M,467864,Donald,"So Terry, you never did tell how much it pays ...",Joe Barton,Congress_Republican,-3
3,40220308,M,467864,Johnny,They will never release these numbers because ...,Joe Barton,Congress_Republican,-1
4,40220308,M,467864,Dale,Yes the liberals on this page have absolutely ...,Joe Barton,Congress_Republican,-1
5,40220308,M,467864,Maria,Congressman Joe Barton do YOU know where that ...,Joe Barton,Congress_Republican,1
6,40220308,M,467864,Congressman,"Ms. Maria Cypriotis Little, Those numbers com...",Joe Barton,Congress_Republican,-2
7,40220308,M,467864,Kelli,http://www.cnbc.com/id/101218418,Joe Barton,Congress_Republican,0
8,40220308,M,467864,Jim,"""I know nothing!"" Sgt Schultz and Sec Duh?",Joe Barton,Congress_Republican,0
9,40220308,M,467864,Congressman,"Ms. Kelli Brethour, You bring up a good point...",Joe Barton,Congress_Republican,3


In [37]:
# Save chunk 10, now including its HL_sentiment column, to disk.
# This file is read back in the TextBlob section of this notebook.
start_time = time.time()
resps10_smt_df.to_csv("../data/resps10_HL-smt.csv", header = True)
end_time = time.time()
print("run time:",end_time-start_time)

run time: 11.03587555885315


In [13]:
resps01_smt_df

Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,HL_sentiment
0,57265377,M,0,Jerry,Protecting birth is not the same as protecting...,Roger Williams,Congress_Republican,-1
1,57265377,M,0,Andrea,You need to protect children and leave my body...,Roger Williams,Congress_Republican,1
2,57265377,M,0,Sherry,Thank you,Roger Williams,Congress_Republican,0
3,57265377,M,0,Bob,Thank you Roger,Roger Williams,Congress_Republican,0
4,57265377,M,0,Joy,Unwanted pregnancy is a sad and unfortunate si...,Roger Williams,Congress_Republican,0
5,57265377,M,0,Candice,"Women, the ""hosts"" of the unborn babies you're...",Roger Williams,Congress_Republican,2
6,57265377,M,0,Cheri,I am Pro Choice and always will be,Roger Williams,Congress_Republican,0
7,57265377,M,0,Julie,Thank you for taking a stand!!!!,Roger Williams,Congress_Republican,0
8,57265377,M,0,Stephen,I seem to disagree with you again Mr.Williams....,Roger Williams,Congress_Republican,1
9,57265377,M,0,Tony,Please demonstrate your support by providing p...,Roger Williams,Congress_Republican,1


In [None]:
# Run the sentiment analysis on first half of responses. Takes too long...
# start_time = time.time()
# HL_sentiment = []
# for row in range(len(resps_smt_df["response_text"])):
#     HL_sentiment.append(get_HL_Sentiment(resps_smt_df["response_text"][row]))
# resps_smt_df['HL_sentiment'] = HL_sentiment
# end_time = time.time()
# print("run time:",end_time-start_time)

In [33]:
posts_smt_Df

Unnamed: 0,op_id,op_gender,post_id,post_text,post_type,HL_sentiment
0,57265377,M,0,"Yesterday, my colleagues and I voted to protec...",video,2
1,57265377,M,1,Roses are red...and so is Texas. Let's keep it...,video,0
2,57265377,M,2,#TBT to this classic video. #DonkeyWhisperer,video,1
3,57265377,M,3,Since President Donald J. Trump was sworn in o...,video,2
4,57265377,M,4,Remembering our 40th president today. LIKE to ...,video,0
5,57265377,M,5,"After a phenomenal season, Jason Garrett has b...",photo,1
6,57265377,M,6,"In Texas, we honor our military heroes. #Chris...",photo,1
7,57265377,M,7,"Last night, President Donald J. Trump nominate...",video,0
8,57265377,M,8,President Trump has just nominated judge Neil ...,status,3
9,57265377,M,9,"""I agree 100% with President Trump's decision....",link,0


In [34]:
# Have this saved; no need to run again until/unless change algorithm.
# posts_smt_Df.to_csv("./data/congress_posts_HL-smt.csv", header = True)

### Sentiment Analysis with SentiWordNet

Not sure this is the best tool for FB responses, so this is tabled for now; will come back to it. Need to do more research on how specifically to implement this for our unlabeled data. Maybe it is only useful for supervised learning?

Need to preprocess data with POS and usage score? Not sure of range for usage.

- n - NOUN 
- v - VERB 
- a - ADJECTIVE 
- s - ADJECTIVE SATELLITE 
- r - ADVERB



In [21]:
# Look up a single SentiWordNet entry. The synset name has three dot-separated
# parts: "protect" is the word we are interested in, "v" is the POS (verb),
# and "02" is the usage value.

our_word = swn.senti_synset('protect.v.02') 
# The printed representation already shows PosScore and NegScore.
print(our_word)
# Only the last expression of a cell is displayed, so of the three calls
# below just obj_score() appears in the output.
our_word.pos_score()
our_word.neg_score()
our_word.obj_score()

<protect.v.02: PosScore=0.0 NegScore=0.0>


1.0

### Sentiment Analysis with TextBlob polarity scores

In [16]:
def compute_TB_sentiment(text_column):
    '''Score every text in a column with TextBlob, word by word.

    Each text is tokenized and the TextBlob polarity of every token is
    summed. Because the per-word polarities (each in -1..1) are added up,
    the score of a text is not limited to that range.

    Values are visited in positional order, so the column may carry any
    index; it does not have to be reset to 0..n-1 first.

    Args:
        text_column: iterable of strings, typically a DataFrame column
            (pandas Series).

    Returns:
        list: one summed polarity per input element, in the same order
        (0 for a text that yields no tokens).
    '''
    TB_sentiment = []
    for row, text in enumerate(text_column):
        # Progress message with a UTC clock time every 50,000 rows.
        if row and row % 50000 == 0:
            print("completed",row,time.strftime('%H:%M:%S', time.gmtime(time.time())))
        # Range of sentiment for each word is -1 to 1.
        sentiment = sum(TextBlob(word).sentiment.polarity
                        for word in word_tokenize(text))
        TB_sentiment.append(sentiment)
    return TB_sentiment

In [27]:
##### To process another chunk, change the chunk number in the four places
##### it appears in the uncommented lines of this cell. ########

# Load 1.4M responses (one tenth of the entire dataset.), including the
# HL_sentiment column saved earlier.
start_time = time.time()
resp10smt_df = pd.read_csv("../data/resps10_HL-smt.csv", index_col=0)
end_time = time.time()
print("run time:",end_time-start_time)
# Disabled leftover from the chunk-02 run:
# resp02smt_df = resp02smt_df.drop(resp02smt_df.columns[0], axis = 1)
# Replace missing values with the string '0' (applies to every column).
resps10_smt_df = resp10smt_df.fillna('0') # 1/10 of responses


  mask |= (ar1 == a)


run time: 7.801528692245483


In [17]:
##### TextBlob pass over chunk 01 (resp_1_df): compute, save, verify. ########

# Disabled leftover from the chunk-09 run (index reset before scoring):
# resps09_smt_df = resps09_smt_df.reset_index(drop=True)
# Run the TextBlob analysis on 1/10 of responses.
start_time = time.time()
# Print the start timestamp (UTC, via gmtime) for this long-running step.
print(time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(start_time)))
resp_1_df["TB_sentiment"] = compute_TB_sentiment(resp_1_df["response_text"])
end_time = time.time()
print("run time minutes:",(end_time-start_time)/60)


##### Save step: the chunk number appears in the frame name and the path. ########

# Save chunk to instance disk.
# Note: this is the same path resp_1_df was loaded from (resps_smt_01), so the
# file is overwritten with the extra TB_sentiment column.
start_time = time.time()
resp_1_df.to_csv("../data/resps01_smt.csv", header = True)
end_time = time.time()
print("run time for data save:",end_time-start_time)


##### Verify step. ########

# Verify textBlob sentiment is there.
resp_1_df.head()

2019-10-14 21:13:42
completed 50000 21:16:04
completed 100000 21:18:27
completed 150000 21:22:51
completed 200000 21:26:26
completed 250000 21:29:55
completed 300000 21:33:00
completed 350000 21:37:15
completed 400000 21:39:54
completed 450000 21:42:55
completed 500000 21:45:12
completed 550000 21:47:47
completed 600000 21:50:31
completed 650000 21:53:48
completed 700000 21:57:17
completed 750000 22:00:06
completed 800000 22:02:44
completed 850000 22:06:11
completed 900000 22:09:50
completed 950000 22:13:20
completed 1000000 22:16:29
completed 1050000 22:20:57
completed 1100000 22:23:50
completed 1150000 22:27:14
completed 1200000 22:30:25
completed 1250000 22:33:20
completed 1300000 22:37:04
completed 1350000 22:40:07
run time minutes: 88.4810378352801
run time for data save: 17.650113582611084


Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,HL_sentiment,V_sentiment,TB_sentiment
0,57265377,M,0,Jerry,Protecting birth is not the same as protecting...,Roger Williams,Congress_Republican,-1,-0.7458,-1.3
1,57265377,M,0,Andrea,You need to protect children and leave my body...,Roger Williams,Congress_Republican,1,0.3302,0.0
2,57265377,M,0,Sherry,Thank you,Roger Williams,Congress_Republican,0,0.3612,0.0
3,57265377,M,0,Bob,Thank you Roger,Roger Williams,Congress_Republican,0,0.3612,0.0
4,57265377,M,0,Joy,Unwanted pregnancy is a sad and unfortunate si...,Roger Williams,Congress_Republican,0,-0.0821,-0.8


In [18]:
# TextBlob pass over chunk 02: load, compute, save, verify.

# Load response chunk 02 with its previously saved sentiment columns.
start_time = time.time()
resp_2_df = pd.read_csv(resps_smt_02, index_col=0)
end_time = time.time()
print("run time:",end_time-start_time)
##### Compute step. ########

# Disabled leftover from the chunk-09 run (index reset before scoring):
# resps09_smt_df = resps09_smt_df.reset_index(drop=True)
# Run the TextBlob analysis on 1/10 of responses.
start_time = time.time()
# Print the start timestamp (UTC, via gmtime) for this long-running step.
print(time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(start_time)))
resp_2_df["TB_sentiment"] = compute_TB_sentiment(resp_2_df["response_text"])
end_time = time.time()
print("run time minutes:",(end_time-start_time)/60)


##### Save step: the chunk number appears in the frame name and the path. ########

# Save chunk to instance disk.
# Note: this is the same path resp_2_df was loaded from (resps_smt_02), so the
# file is overwritten with the extra TB_sentiment column.
start_time = time.time()
resp_2_df.to_csv("../data/resps02_smt.csv", header = True)
end_time = time.time()
print("run time for data save:",end_time-start_time)


##### Verify step. ########

# Verify textBlob sentiment is there.
resp_2_df.head()

  mask |= (ar1 == a)


run time: 15.88544511795044
2019-10-14 23:49:43
completed 50000 23:56:00
completed 100000 23:59:54
completed 150000 00:03:54
completed 200000 00:08:11
completed 250000 00:12:57
completed 300000 00:17:53
completed 350000 00:22:55
completed 400000 00:28:55
completed 450000 00:33:25
completed 500000 00:39:07
completed 550000 00:44:05
completed 600000 00:49:12
completed 650000 00:54:10
completed 700000 00:59:00
completed 750000 01:04:08
completed 800000 01:08:30
completed 850000 01:14:29
completed 900000 01:18:47
completed 950000 01:23:25
completed 1000000 01:28:40
completed 1050000 01:33:27
completed 1100000 01:37:50
completed 1150000 01:42:30
completed 1200000 01:46:52
completed 1250000 01:51:01
completed 1300000 01:55:41
completed 1350000 01:59:38
run time minutes: 132.87436611652373
run time for data save: 19.59464192390442


Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,HL_sentiment,V_sentiment,TB_sentiment
0,29687946,M,14192,Teresa,"The Obama administration can ""justify"" giving ...",Rand Paul,Congress_Republican,1,0.68,0.4
1,29687946,M,14192,Robert,wtf we always find money for low class goverme...,Rand Paul,Congress_Republican,0,-0.3652,0.1
2,29687946,M,14192,Bill,Kerry another Obama puppet. Egypt is no longer...,Rand Paul,Congress_Republican,-2,0.0636,-1.0
3,29687946,M,14192,Kathy,"Oh, yes; absolutely! Let's send even more mone...",Rand Paul,Congress_Republican,-1,-0.3345,1.3
4,29687946,M,14192,Russ,"Oh gezzzzzzzzzz it just get worse by the day, ...",Rand Paul,Congress_Republican,-1,-0.8167,-0.4


In [24]:
##### TextBlob pass over chunk 09. To process another chunk, change the chunk
##### number in the two places it appears in the scoring line. ########

# NOTE(review): resps09_smt_df is not assigned by any uncommented cell visible
# in this notebook; this cell presumably relied on kernel state from an
# earlier session -- confirm how the frame should be loaded.
# Set index to start at 0 so function wont throw out of range error.
# resps09_smt_df = resps09_smt_df.reset_index(drop=True)
# Run the TextBlob analysis on 1/10 of responses.
start_time = time.time()
# Print the start timestamp (UTC, via gmtime) for this long-running cell.
print(time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(start_time)))
resps09_smt_df["TB_sentiment"] = compute_TB_sentiment(resps09_smt_df["response_text"])
end_time = time.time()
print("run time minutes:",(end_time-start_time)/60)

2019-10-11 22:45:07
completed 50000
completed 100000
completed 150000
completed 200000
completed 250000
completed 300000
completed 350000
completed 400000
completed 450000
completed 500000
completed 550000
completed 600000
completed 650000
completed 700000
completed 750000
completed 800000
completed 850000
completed 900000
completed 950000
completed 1000000
completed 1050000
completed 1100000
completed 1150000
completed 1200000
completed 1250000
completed 1300000
completed 1350000
run time minutes: 96.82541060845057


In [31]:
##### To save another chunk, change the chunk number in the frame name and
##### in the file path. ########

# Save chunk 10 to instance disk. The path equals the resps_smt_10 constant.
start_time = time.time()
resps10_smt_df.to_csv("../data/resps10_smt.csv", header = True)
end_time = time.time()
print("run time:",end_time-start_time)

run time: 12.811877965927124


In [25]:
##### To save another chunk, change the chunk number in the frame name and
##### in the file path. ########

# Save chunk 09 to instance disk. The path equals the resps_smt_09 constant.
start_time = time.time()
resps09_smt_df.to_csv("../data/resps09_smt.csv", header = True)
end_time = time.time()
print("run time:",end_time-start_time)

run time: 12.692348957061768


In [26]:
# Verify sentiment value is there.
resps09_smt_df

Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,HL_sentiment,TB_sentiment
0,86569077,W,390566,Jan,"Oh like the ""political"" slogan that Obama said...",Kirsten Gillibrand,Congress_Democratic,1,0.000000
1,86569077,W,390566,Carole,affirmitive!,Kirsten Gillibrand,Congress_Democratic,0,0.000000
2,86569077,W,390566,Kevin,I DON'T CARE IF YOU SIT ON YOUR HEADS DURING T...,Kirsten Gillibrand,Congress_Democratic,0,0.000000
3,86569077,W,390566,Mark,What - and eliminate the visual for the pundit...,Kirsten Gillibrand,Congress_Democratic,0,0.800000
4,86569077,W,390566,Nikita,"Let's use our indoor voices, please.",Kirsten Gillibrand,Congress_Democratic,0,0.000000
5,86569077,W,390566,Jeffrey,Love it! Could you also hold on the obsessive ...,Kirsten Gillibrand,Congress_Democratic,-1,1.000000
6,86569077,W,390566,Jeanan,Good idea for sure. Maybe it will make them s...,Kirsten Gillibrand,Congress_Democratic,0,1.200000
7,86569077,W,390566,Laura,I think it's a great idea. I also think that y...,Kirsten Gillibrand,Congress_Democratic,2,2.511364
8,86569077,W,390566,Susan,Show is better than nothing. It's also better ...,Kirsten Gillibrand,Congress_Democratic,2,1.000000
9,86569077,W,390566,Mark,Might happen once. Sadly not more,Kirsten Gillibrand,Congress_Democratic,0,0.000000


In [30]:
# Verify sentiment value is there.
resps10_smt_df

Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,HL_sentiment,TB_sentiment
0,40220308,M,467864,Don,"Somehow I can picture Barack, MicHELLe, Joe, H...",Joe Barton,Congress_Republican,2,0.000000
1,40220308,M,467864,Julie,Why have these hearings if nobody has to take ...,Joe Barton,Congress_Republican,-4,-0.700000
2,40220308,M,467864,Donald,"So Terry, you never did tell how much it pays ...",Joe Barton,Congress_Republican,-3,-0.600000
3,40220308,M,467864,Johnny,They will never release these numbers because ...,Joe Barton,Congress_Republican,-1,-0.700000
4,40220308,M,467864,Dale,Yes the liberals on this page have absolutely ...,Joe Barton,Congress_Republican,-1,0.200000
5,40220308,M,467864,Maria,Congressman Joe Barton do YOU know where that ...,Joe Barton,Congress_Republican,1,0.500000
6,40220308,M,467864,Congressman,"Ms. Maria Cypriotis Little, Those numbers com...",Joe Barton,Congress_Republican,-2,3.871591
7,40220308,M,467864,Kelli,http://www.cnbc.com/id/101218418,Joe Barton,Congress_Republican,0,0.000000
8,40220308,M,467864,Jim,"""I know nothing!"" Sgt Schultz and Sec Duh?",Joe Barton,Congress_Republican,0,-0.400000
9,40220308,M,467864,Congressman,"Ms. Kelli Brethour, You bring up a good point...",Joe Barton,Congress_Republican,3,1.716667


In [34]:
# Run TextBlob analysis on chunk of responses.
# NOTE(review): resps_smt_Df only appears in a commented-out cell above
# (`resps_smt_Df = resps_smt_Df[:1000]`), so this cell depends on leftover
# kernel state. The SettingWithCopyWarning in the recorded output is
# presumably caused by assigning a column to such a slice; taking `.copy()`
# of the slice would avoid it -- confirm.
start_time = time.time()
resps_smt_Df['TB_sentiment'] = compute_TB_sentiment(resps_smt_Df["response_text"])
end_time = time.time()
print("run time:",end_time-start_time)

run time: 5.033034801483154


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [35]:
resps_smt_Df

Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,V_sentiment
0,57265377,M,0,Jerry,Protecting birth is not the same as protecting...,Roger Williams,Congress_Republican,-0.7458
1,57265377,M,0,Andrea,You need to protect children and leave my body...,Roger Williams,Congress_Republican,0.3302
2,57265377,M,0,Sherry,Thank you,Roger Williams,Congress_Republican,0.3612
3,57265377,M,0,Bob,Thank you Roger,Roger Williams,Congress_Republican,0.3612
4,57265377,M,0,Joy,Unwanted pregnancy is a sad and unfortunate si...,Roger Williams,Congress_Republican,-0.0821
5,57265377,M,0,Candice,"Women, the ""hosts"" of the unborn babies you're...",Roger Williams,Congress_Republican,1.5937
6,57265377,M,0,Cheri,I am Pro Choice and always will be,Roger Williams,Congress_Republican,0.0000
7,57265377,M,0,Julie,Thank you for taking a stand!!!!,Roger Williams,Congress_Republican,0.3612
8,57265377,M,0,Stephen,I seem to disagree with you again Mr.Williams....,Roger Williams,Congress_Republican,-0.1795
9,57265377,M,0,Tony,Please demonstrate your support by providing p...,Roger Williams,Congress_Republican,1.2140


In [19]:
# A sample look at sentiment for words in a post: print the TextBlob polarity
# of each token of the post at label 1 and display the summed score.
# NOTE(review): posts_smt_Df is only created in a commented-out cell above
# (`posts_smt_Df = postsDf.fillna('0')`) -- confirm it exists before running.

tokens = word_tokenize(posts_smt_Df["post_text"][1])
sentiment = 0
for i in tokens:
    # Polarity of the single token, scored in isolation.
    token_score = TextBlob(i).sentiment.polarity
    print(i,token_score)
    sentiment += token_score
    
# Last expression of the cell: the summed polarity is shown as the output.
sentiment

Roses 0.0
are 0.0
red 0.0
... 0.0
and 0.0
so 0.0
is 0.0
Texas 0.0
. 0.0
Let 0.0
's 0.0
keep 0.0
it 0.0
that 0.0
way 0.0
. 0.0
Happy 0.8
Valentine 0.0
's 0.0
Day 0.0
, 0.0
Texas 0.0
. 0.0


0.8

### Sentiment analysis with VADER 

(Valence Aware Dictionary and Sentiment Reasoner, in NLTK)

In [9]:
sia = SentimentIntensityAnalyzer()

def compute_vader_sentiment(text_column):
    '''Score every text in a column with VADER and return the scores as a list.

    Each text is split into tokens with word_tokenize, every token is scored
    on its own with the module-level analyzer ``sia``, and the tokens'
    'compound' values are summed to give the score for that text.  Because
    each token is passed to the analyzer alone, a token's score is not
    influenced by the words around it.

    Parameters
    ----------
    text_column : pandas Series (or any iterable) of str
        The texts to score.  Values are read in positional order, so the
        Series may carry any index (for example after filtering or sampling).

    Returns
    -------
    list
        One summed compound score per text, in input order.  A text that
        yields no tokens scores 0.
    '''
    V_sentiment = []
    # Iterate over the values themselves: indexing with text_column[row] is a
    # label lookup and breaks when the index is not exactly 0..n-1.
    for row, text in enumerate(text_column):
        # Progress report every 50,000 rows (clock time is UTC).
        if row and row % 50000 == 0:
            print("completed",row,time.strftime('%H:%M:%S', time.gmtime(time.time())))
        sentiment = 0
        for word in word_tokenize(text):
            # Polarity score returns dictionary.
            # Keep the 'compound' result. (for now? could tally pos/neg instead)
            sentiment += sia.polarity_scores(word)['compound']
        V_sentiment.append(sentiment)
    return V_sentiment

In [7]:
# When reusing this cell for another chunk, update the chunk number: it
# appears four times below.

# Load 1.4M responses (one tenth of the entire dataset).
start_time = time.time()
resp02smt_df = pd.read_csv("../data/resps02_HL-smt.csv", index_col=0)
end_time = time.time()
print("run time for loading responses:",end_time-start_time)
# fillna('0') puts the string '0' in place of missing values in every column,
# not just response_text -- presumably so the tokenizer never receives NaN.
resps02_smt_df = resp02smt_df.fillna('0') # 1/10 of responses


  mask |= (ar1 == a)


run time: 9.290826559066772


In [4]:
# When reusing this cell for another chunk, update the chunk number: it
# appears twice below.

# Run the Vader analysis on 1/10 of responses.
# The start time is printed in UTC (time.gmtime); the run time is in minutes.
start_time = time.time()
print(time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(start_time)))
resps01_smt_df["V_sentiment"] = compute_vader_sentiment(resps01_smt_df["response_text"])
end_time = time.time()
print("run time minutes:",(end_time-start_time)/60)

2019-10-12 17:39:17
run time minutes: 18.47495219310125


In [5]:
# When reusing this cell for another chunk, update the chunk number: it
# appears twice below.

# Save chunk to instance disk. The run time printed is in seconds.
start_time = time.time()
resps01_smt_df.to_csv("../data/resps01_smt.csv", header = True)
end_time = time.time()
print("run time:",end_time-start_time)

run time: 11.898149967193604


In [6]:
# Verify vader sentiment is there (preview only; the frame has ~1.4M rows).
resps01_smt_df.head(10)

Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,HL_sentiment,V_sentiment
0,57265377,M,0,Jerry,Protecting birth is not the same as protecting...,Roger Williams,Congress_Republican,-1,-0.7458
1,57265377,M,0,Andrea,You need to protect children and leave my body...,Roger Williams,Congress_Republican,1,0.3302
2,57265377,M,0,Sherry,Thank you,Roger Williams,Congress_Republican,0,0.3612
3,57265377,M,0,Bob,Thank you Roger,Roger Williams,Congress_Republican,0,0.3612
4,57265377,M,0,Joy,Unwanted pregnancy is a sad and unfortunate si...,Roger Williams,Congress_Republican,0,-0.0821
5,57265377,M,0,Candice,"Women, the ""hosts"" of the unborn babies you're...",Roger Williams,Congress_Republican,2,1.5937
6,57265377,M,0,Cheri,I am Pro Choice and always will be,Roger Williams,Congress_Republican,0,0.0000
7,57265377,M,0,Julie,Thank you for taking a stand!!!!,Roger Williams,Congress_Republican,0,0.3612
8,57265377,M,0,Stephen,I seem to disagree with you again Mr.Williams....,Roger Williams,Congress_Republican,1,-0.1795
9,57265377,M,0,Tony,Please demonstrate your support by providing p...,Roger Williams,Congress_Republican,1,1.2140


In [8]:
# When reusing this cell for another chunk, update the chunk number: it
# appears twice in this section.

# Run the Vader analysis on 1/10 of responses.
# The start time is printed in UTC (time.gmtime).
start_time = time.time()
print(time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(start_time)))
resps02_smt_df["V_sentiment"] = compute_vader_sentiment(resps02_smt_df["response_text"])
end_time = time.time()
print("run time (minutes) for analysis:",(end_time-start_time)/60)


# Chunk number appears twice in this section.

# Save chunk to instance disk. The run time printed is in seconds.
start_time = time.time()
resps02_smt_df.to_csv("../data/resps02_smt.csv", header = True)
end_time = time.time()
print("run time for data save:",end_time-start_time)


# Chunk number appears once in this section.

# Verify vader sentiment is there.
resps02_smt_df.head()

2019-10-12 17:58:08
run time (minutes) for analysis: 26.08641261657079
run time for data save: 16.10792636871338


Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,HL_sentiment,V_sentiment
0,29687946,M,14192,Teresa,"The Obama administration can ""justify"" giving ...",Rand Paul,Congress_Republican,1,0.68
1,29687946,M,14192,Robert,wtf we always find money for low class goverme...,Rand Paul,Congress_Republican,0,-0.3652
2,29687946,M,14192,Bill,Kerry another Obama puppet. Egypt is no longer...,Rand Paul,Congress_Republican,-2,0.0636
3,29687946,M,14192,Kathy,"Oh, yes; absolutely! Let's send even more mone...",Rand Paul,Congress_Republican,-1,-0.3345
4,29687946,M,14192,Russ,"Oh gezzzzzzzzzz it just get worse by the day, ...",Rand Paul,Congress_Republican,-1,-0.8167


In [12]:
# When reusing this cell for another chunk, update the chunk number: it
# appears four times in this section.

# Load 1.4M responses (one tenth of the entire dataset).
start_time = time.time()
resp05smt_df = pd.read_csv("../data/resps05_smt.csv", index_col=0)
end_time = time.time()
print("run time for loading responses:",end_time-start_time)
# fillna('0') puts the string '0' in place of missing values in every column,
# not just response_text -- presumably so the tokenizer never receives NaN.
resps05_smt_df = resp05smt_df.fillna('0') # 1/10 of responses


# Chunk number appears twice in this section.

# Run the Vader analysis on 1/10 of responses.
# The start time is printed in UTC (time.gmtime).
start_time = time.time()
print("analysis started:",time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(start_time)))
resps05_smt_df["V_sentiment"] = compute_vader_sentiment(resps05_smt_df["response_text"])
end_time = time.time()
print("run time (minutes) for analysis:",(end_time-start_time)/60)


# Chunk number appears twice in this section.

# Save chunk to instance disk. This writes to the same path the chunk was
# loaded from, replacing that file. The run time printed is in seconds.
start_time = time.time()
resps05_smt_df.to_csv("../data/resps05_smt.csv", header = True)
end_time = time.time()
print("run time for data save:",end_time-start_time)


# Chunk number appears once in this section.

# Verify vader sentiment is there.
resps05_smt_df.head()

  mask |= (ar1 == a)


run time for loading responses: 9.667893886566162
analysis started: 2019-10-12 20:35:47
completed 50000 20:36:43
completed 100000 20:37:40
completed 150000 20:38:32
completed 200000 20:39:18
completed 250000 20:40:14
completed 300000 20:41:05
completed 350000 20:41:55
completed 400000 20:42:48
completed 450000 20:43:41
completed 500000 20:44:41
completed 550000 20:45:49
completed 600000 20:46:45
completed 650000 20:47:48
completed 700000 20:48:49
completed 750000 20:49:56
completed 800000 20:50:40
completed 850000 20:51:47
completed 900000 20:52:56
completed 950000 20:53:48
completed 1000000 20:54:32
completed 1050000 20:55:28
completed 1100000 20:56:21
completed 1150000 20:57:31
completed 1200000 20:58:39
completed 1250000 20:59:48
completed 1300000 21:00:58
completed 1350000 21:01:58
run time (minutes) for analysis: 27.013674302895865
run time for data save: 17.369163751602173


Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,HL_sentiment,TB_sentiment,V_sentiment
0,87524765,M,181905,James,Once again thank you for all you do Congressman.,Morgan Griffith,Congress_Republican,1,0.0,0.3612
1,87524765,M,181905,Paulette,"Congressman Griffith , thank you and your coll...",Morgan Griffith,Congress_Republican,2,0.85,-0.1585
2,87524765,M,181905,Patrick,"I agree with your efforts, no real stretch on ...",Morgan Griffith,Congress_Republican,0,1.35,1.5586
3,87524765,M,181905,Pat,Vote NO on Fast Track! It would be insane to g...,Morgan Griffith,Congress_Republican,-1,-0.6,-0.6979
4,87524765,M,181905,Zhivko,http://www.nytimes.com/2015/06/19/world/europe...,Morgan Griffith,Congress_Republican,0,0.0,0.0


In [13]:
# When reusing this cell for another chunk, update the chunk number: it
# appears four times in this section.

# Load 1.4M responses (one tenth of the entire dataset).
start_time = time.time()
resp06smt_df = pd.read_csv("../data/resps06_smt.csv", index_col=0)
end_time = time.time()
print("run time for loading responses:",end_time-start_time)
# fillna('0') puts the string '0' in place of missing values in every column,
# not just response_text -- presumably so the tokenizer never receives NaN.
resps06_smt_df = resp06smt_df.fillna('0') # 1/10 of responses


# Chunk number appears twice in this section.

# Run the Vader analysis on 1/10 of responses.
# The start time is printed in UTC (time.gmtime).
start_time = time.time()
print("analysis started:",time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(start_time)))
resps06_smt_df["V_sentiment"] = compute_vader_sentiment(resps06_smt_df["response_text"])
end_time = time.time()
print("run time (minutes) for analysis:",(end_time-start_time)/60)


# Chunk number appears twice in this section.

# Save chunk to instance disk. This writes to the same path the chunk was
# loaded from, replacing that file. The run time printed is in seconds.
start_time = time.time()
resps06_smt_df.to_csv("../data/resps06_smt.csv", header = True)
end_time = time.time()
print("run time for data save:",end_time-start_time)


# Chunk number appears once in this section.

# Verify vader sentiment is there.
resps06_smt_df.head()

  mask |= (ar1 == a)


run time for loading responses: 9.414726734161377
analysis started: 2019-10-12 21:03:16
completed 50000 21:04:06
completed 100000 21:04:48
completed 150000 21:05:37
completed 200000 21:06:36
completed 250000 21:07:44
completed 300000 21:08:55
completed 350000 21:09:45
completed 400000 21:10:34
completed 450000 21:11:26
completed 500000 21:12:17
completed 550000 21:13:08
completed 600000 21:13:59
completed 650000 21:14:51
completed 700000 21:15:37
completed 750000 21:16:08
completed 800000 21:16:57
completed 850000 21:18:07
completed 900000 21:19:12
completed 950000 21:20:15
completed 1000000 21:21:23
completed 1050000 21:22:19
completed 1100000 21:23:12
completed 1150000 21:23:49
completed 1200000 21:24:35
completed 1250000 21:25:18
completed 1300000 21:25:57
completed 1350000 21:26:42
run time (minutes) for analysis: 24.02466559012731
run time for data save: 16.05329918861389


Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,HL_sentiment,TB_sentiment,V_sentiment
0,10532564,M,264460,Christopher,"BOY,THOSE GREEN SIGNS ARE EXPENSIVE",Frank Pallone,Congress_Democratic,0,-0.7,0.0
1,10532564,M,264460,Michael,"Paul, congress's approval rating hasn't been o...",Frank Pallone,Congress_Democratic,1,0.136364,0.2035
2,10532564,M,264460,Mark,Shall we look at the breakdown by Party?,Frank Pallone,Congress_Democratic,-1,0.0,0.4019
3,10532564,M,264460,Paul,I don't know what Mike is talking about. From ...,Frank Pallone,Congress_Democratic,-1,0.475,0.8003
4,10532564,M,264460,Colleen,OBAMA is wax fruit... looks yummy until you ta...,Frank Pallone,Congress_Democratic,-1,-0.5,0.05


In [14]:
# When reusing this cell for another chunk, update the chunk number: it
# appears four times in this section.

# Load 1.4M responses (one tenth of the entire dataset).
start_time = time.time()
resp07smt_df = pd.read_csv("../data/resps07_smt.csv", index_col=0)
end_time = time.time()
print("run time for loading responses:",end_time-start_time)
# fillna('0') puts the string '0' in place of missing values in every column,
# not just response_text -- presumably so the tokenizer never receives NaN.
resps07_smt_df = resp07smt_df.fillna('0') # 1/10 of responses


# Chunk number appears twice in this section.

# Run the Vader analysis on 1/10 of responses.
# The start time is printed in UTC (time.gmtime).
start_time = time.time()
print("analysis started:",time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(start_time)))
resps07_smt_df["V_sentiment"] = compute_vader_sentiment(resps07_smt_df["response_text"])
end_time = time.time()
print("run time (minutes) for analysis:",(end_time-start_time)/60)


# Chunk number appears twice in this section.

# Save chunk to instance disk. This writes to the same path the chunk was
# loaded from, replacing that file. The run time printed is in seconds.
start_time = time.time()
resps07_smt_df.to_csv("../data/resps07_smt.csv", header = True)
end_time = time.time()
print("run time for data save:",end_time-start_time)


# Chunk number appears once in this section.

# Verify vader sentiment is there.
resps07_smt_df.head()

  mask |= (ar1 == a)


run time for loading responses: 8.4635648727417
analysis started: 2019-10-12 21:27:43
completed 50000 21:28:24
completed 100000 21:29:10
completed 150000 21:29:49
completed 200000 21:30:23
completed 250000 21:31:03
completed 300000 21:31:48
completed 350000 21:32:29
completed 400000 21:33:14
completed 450000 21:33:49
completed 500000 21:34:28
completed 550000 21:35:22
completed 600000 21:36:23
completed 650000 21:37:23
completed 700000 21:38:33
completed 750000 21:39:31
completed 800000 21:40:40
completed 850000 21:41:32
completed 900000 21:42:22
completed 950000 21:43:10
completed 1000000 21:43:45
completed 1050000 21:44:33
completed 1100000 21:45:28
completed 1150000 21:46:24
completed 1200000 21:47:12
completed 1250000 21:47:57
completed 1300000 21:48:47
completed 1350000 21:49:37
run time (minutes) for analysis: 22.551532677809398
run time for data save: 14.169496059417725


Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,HL_sentiment,TB_sentiment,V_sentiment
0,83458649,W,300841,Amir,Hillary Clinton is no better whatsoever. It's ...,Elizabeth Warren,Congress_Democratic,1,0.5,0.1444
1,83458649,W,300841,Linda,"Think Ben Ghazi people, Trump maybe a lot of t...",Elizabeth Warren,Congress_Democratic,-2,-0.2,-1.5423
2,83458649,W,300841,Rachael,SO MUCH IGNORANCE IN THESE COMMENTS,Elizabeth Warren,Congress_Democratic,0,0.2,-0.3612
3,83458649,W,300841,Maggie,"Grow up & vote for Hillary, don't surrender to...",Elizabeth Warren,Congress_Democratic,-1,0.0,0.0
4,83458649,W,300841,Louis,*For the delusional Donald supporters* Trump i...,Elizabeth Warren,Congress_Democratic,-2,-0.7,-0.5423


In [15]:
# When reusing this cell for another chunk, update the chunk number: it
# appears four times in this section.

# Load 1.4M responses (one tenth of the entire dataset).
start_time = time.time()
resp08smt_df = pd.read_csv("../data/resps08_smt.csv", index_col=0)
end_time = time.time()
print("run time for loading responses:",end_time-start_time)
# fillna('0') puts the string '0' in place of missing values in every column,
# not just response_text -- presumably so the tokenizer never receives NaN.
resps08_smt_df = resp08smt_df.fillna('0') # 1/10 of responses


# Chunk number appears twice in this section.

# Run the Vader analysis on 1/10 of responses.
# The start time is printed in UTC (time.gmtime).
start_time = time.time()
print("analysis started:",time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(start_time)))
resps08_smt_df["V_sentiment"] = compute_vader_sentiment(resps08_smt_df["response_text"])
end_time = time.time()
print("run time (minutes) for analysis:",(end_time-start_time)/60)


# Chunk number appears twice in this section.

# Save chunk to instance disk. This writes to the same path the chunk was
# loaded from, replacing that file. The run time printed is in seconds.
start_time = time.time()
resps08_smt_df.to_csv("../data/resps08_smt.csv", header = True)
end_time = time.time()
print("run time for data save:",end_time-start_time)


# Chunk number appears once in this section.

# Verify vader sentiment is there.
resps08_smt_df.head()

  mask |= (ar1 == a)


run time for loading responses: 9.371558904647827
analysis started: 2019-10-12 21:50:41
completed 50000 21:51:30
completed 100000 21:52:21
completed 150000 21:53:19
completed 200000 21:54:12
completed 250000 21:55:16
completed 300000 21:56:31
completed 350000 21:57:36
completed 400000 21:58:48
completed 450000 22:00:06
completed 500000 22:01:05
completed 550000 22:01:57
completed 600000 22:03:04
completed 650000 22:04:08
completed 700000 22:05:04
completed 750000 22:06:09
completed 800000 22:07:03
completed 850000 22:07:51
completed 900000 22:08:41
completed 950000 22:09:36
completed 1000000 22:10:20
completed 1050000 22:11:05
completed 1100000 22:11:53
completed 1150000 22:12:51
completed 1200000 22:13:38
completed 1250000 22:14:30
completed 1300000 22:15:18
completed 1350000 22:16:13
run time (minutes) for analysis: 26.23079349597295
run time for data save: 15.176342248916626


Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,HL_sentiment,TB_sentiment,V_sentiment
0,91600525,M,351831,Janice,https://www.facebook.com/photo.php?fbid=189557...,Louie Gohmert,Congress_Republican,0,0.0,0.0
1,91600525,M,351831,Gary,Where is your endorsement of Ted Cruz for pres...,Louie Gohmert,Congress_Republican,2,0.55,0.8627
2,91600525,M,351831,Beverly,You're talk at the Conservative Convention ton...,Louie Gohmert,Congress_Republican,2,1.5,1.6247
3,91600525,M,351831,Mary,AMEN !!!,Louie Gohmert,Congress_Republican,0,0.0,0.0
4,91600525,M,351832,Ginger,God help us................,Louie Gohmert,Congress_Republican,0,0.0,0.6751


In [16]:
# When reusing this cell for another chunk, update the chunk number: it
# appears four times in this section.

# Load 1.4M responses (one tenth of the entire dataset).
start_time = time.time()
resp09smt_df = pd.read_csv("../data/resps09_smt.csv", index_col=0)
end_time = time.time()
print("run time for loading responses:",end_time-start_time)
# fillna('0') puts the string '0' in place of missing values in every column,
# not just response_text -- presumably so the tokenizer never receives NaN.
resps09_smt_df = resp09smt_df.fillna('0') # 1/10 of responses


# Chunk number appears twice in this section.

# Run the Vader analysis on 1/10 of responses.
# The start time is printed in UTC (time.gmtime).
start_time = time.time()
print("analysis started:",time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(start_time)))
resps09_smt_df["V_sentiment"] = compute_vader_sentiment(resps09_smt_df["response_text"])
end_time = time.time()
print("run time (minutes) for analysis:",(end_time-start_time)/60)


# Chunk number appears twice in this section.

# Save chunk to instance disk. This writes to the same path the chunk was
# loaded from, replacing that file. The run time printed is in seconds.
start_time = time.time()
resps09_smt_df.to_csv("../data/resps09_smt.csv", header = True)
end_time = time.time()
print("run time for data save:",end_time-start_time)


# Chunk number appears once in this section.

# Verify vader sentiment is there.
resps09_smt_df.head()

  mask |= (ar1 == a)


run time for loading responses: 9.312560319900513
analysis started: 2019-10-12 22:17:21
completed 50000 22:18:12
completed 100000 22:19:09
completed 150000 22:20:17
completed 200000 22:21:30
completed 250000 22:22:39
completed 300000 22:23:33
completed 350000 22:24:22
completed 400000 22:25:24
completed 450000 22:26:34
completed 500000 22:27:19
completed 550000 22:28:19
completed 600000 22:29:07
completed 650000 22:29:56
completed 700000 22:31:06
completed 750000 22:32:08
completed 800000 22:32:52
completed 850000 22:33:33
completed 900000 22:34:01
completed 950000 22:35:08
completed 1000000 22:36:09
completed 1050000 22:37:03
completed 1100000 22:37:47
completed 1150000 22:38:34
completed 1200000 22:39:15
completed 1250000 22:39:51
completed 1300000 22:40:26
completed 1350000 22:41:16
run time (minutes) for analysis: 24.5812593738238
run time for data save: 14.341899156570435


Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,HL_sentiment,TB_sentiment,V_sentiment
0,86569077,W,390566,Jan,"Oh like the ""political"" slogan that Obama said...",Kirsten Gillibrand,Congress_Democratic,1,0.0,0.3612
1,86569077,W,390566,Carole,affirmitive!,Kirsten Gillibrand,Congress_Democratic,0,0.0,0.0
2,86569077,W,390566,Kevin,I DON'T CARE IF YOU SIT ON YOUR HEADS DURING T...,Kirsten Gillibrand,Congress_Democratic,0,0.0,-0.5385
3,86569077,W,390566,Mark,What - and eliminate the visual for the pundit...,Kirsten Gillibrand,Congress_Democratic,0,0.8,0.4215
4,86569077,W,390566,Nikita,"Let's use our indoor voices, please.",Kirsten Gillibrand,Congress_Democratic,0,0.0,0.3182


In [17]:
# When reusing this cell for another chunk, update the chunk number: it
# appears four times in this section.

# Load 1.4M responses (one tenth of the entire dataset).
start_time = time.time()
resp10smt_df = pd.read_csv("../data/resps10_smt.csv", index_col=0)
end_time = time.time()
print("run time for loading responses:",end_time-start_time)
# fillna('0') puts the string '0' in place of missing values in every column,
# not just response_text -- presumably so the tokenizer never receives NaN.
resps10_smt_df = resp10smt_df.fillna('0') # 1/10 of responses


# Chunk number appears twice in this section.

# Run the Vader analysis on 1/10 of responses.
# The start time is printed in UTC (time.gmtime).
start_time = time.time()
print("analysis started:",time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(start_time)))
resps10_smt_df["V_sentiment"] = compute_vader_sentiment(resps10_smt_df["response_text"])
end_time = time.time()
print("run time (minutes) for analysis:",(end_time-start_time)/60)


# Chunk number appears twice in this section.

# Save chunk to instance disk. This writes to the same path the chunk was
# loaded from, replacing that file. The run time printed is in seconds.
start_time = time.time()
resps10_smt_df.to_csv("../data/resps10_smt.csv", header = True)
end_time = time.time()
print("run time for data save:",end_time-start_time)


# Chunk number appears once in this section.

# Verify vader sentiment is there.
resps10_smt_df.head()

  mask |= (ar1 == a)


run time for loading responses: 8.474327802658081
analysis started: 2019-10-12 22:42:20
completed 50000 22:43:31
completed 100000 22:43:51
completed 150000 22:44:09
completed 200000 22:44:21
completed 250000 22:44:32
completed 300000 22:45:00
completed 350000 22:45:40
completed 400000 22:46:21
completed 450000 22:47:13
completed 500000 22:48:09
completed 550000 22:49:16
completed 600000 22:50:17
completed 650000 22:51:27
completed 700000 22:52:17
completed 750000 22:53:13
completed 800000 22:54:11
completed 850000 22:55:02
completed 900000 22:56:03
completed 950000 22:57:10
completed 1000000 22:58:06
completed 1050000 22:58:41
completed 1100000 22:59:37
completed 1150000 23:00:39
completed 1200000 23:01:42
completed 1250000 23:02:34
completed 1300000 23:03:23
completed 1350000 23:04:12
run time (minutes) for analysis: 22.514630802472432
run time for data save: 13.945488214492798


Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,HL_sentiment,TB_sentiment,V_sentiment
0,40220308,M,467864,Don,"Somehow I can picture Barack, MicHELLe, Joe, H...",Joe Barton,Congress_Republican,2,0.0,-0.8679
1,40220308,M,467864,Julie,Why have these hearings if nobody has to take ...,Joe Barton,Congress_Republican,-4,-0.7,-0.5423
2,40220308,M,467864,Donald,"So Terry, you never did tell how much it pays ...",Joe Barton,Congress_Republican,-3,-0.6,-0.3818
3,40220308,M,467864,Johnny,They will never release these numbers because ...,Joe Barton,Congress_Republican,-1,-0.7,-0.5423
4,40220308,M,467864,Dale,Yes the liberals on this page have absolutely ...,Joe Barton,Congress_Republican,-1,0.2,0.1059


In [18]:
# When reusing this cell for another chunk, update the chunk number: it
# appears four times in this section.

# Load 1.4M responses (one tenth of the entire dataset).
start_time = time.time()
resp01smt_df = pd.read_csv("../data/resps01_smt.csv", index_col=0)
end_time = time.time()
print("run time for loading responses:",end_time-start_time)
# fillna('0') puts the string '0' in place of missing values in every column,
# not just response_text -- presumably so the tokenizer never receives NaN.
resps01_smt_df = resp01smt_df.fillna('0') # 1/10 of responses


# Chunk number appears twice in this section.

# Run the Vader analysis on 1/10 of responses.
# The start time is printed in UTC (time.gmtime).
start_time = time.time()
print("analysis started:",time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(start_time)))
resps01_smt_df["V_sentiment"] = compute_vader_sentiment(resps01_smt_df["response_text"])
end_time = time.time()
print("run time (minutes) for analysis:",(end_time-start_time)/60)


# Chunk number appears twice in this section.

# Save chunk to instance disk. This writes to the same path the chunk was
# loaded from, replacing that file. The run time printed is in seconds.
start_time = time.time()
resps01_smt_df.to_csv("../data/resps01_smt.csv", header = True)
end_time = time.time()
print("run time for data save:",end_time-start_time)


# Chunk number appears once in this section.

# Verify vader sentiment is there.
resps01_smt_df.head()

  mask |= (ar1 == a)


run time for loading responses: 6.109203815460205
analysis started: 2019-10-12 23:05:12
completed 50000 23:05:43
completed 100000 23:06:14
completed 150000 23:07:08
completed 200000 23:07:53
completed 250000 23:08:36
completed 300000 23:09:14
completed 350000 23:10:06
completed 400000 23:10:39
completed 450000 23:11:18
completed 500000 23:11:46
completed 550000 23:12:19
completed 600000 23:12:54
completed 650000 23:13:34
completed 700000 23:14:18
completed 750000 23:14:54
completed 800000 23:15:28
completed 850000 23:16:11
completed 900000 23:16:57
completed 950000 23:17:37
completed 1000000 23:18:16
completed 1050000 23:19:11
completed 1100000 23:19:48
completed 1150000 23:20:30
completed 1200000 23:21:10
completed 1250000 23:21:47
completed 1300000 23:22:33
completed 1350000 23:23:11
run time (minutes) for analysis: 18.429134575525918
run time for data save: 13.265977144241333


Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,HL_sentiment,V_sentiment
0,57265377,M,0,Jerry,Protecting birth is not the same as protecting...,Roger Williams,Congress_Republican,-1,-0.7458
1,57265377,M,0,Andrea,You need to protect children and leave my body...,Roger Williams,Congress_Republican,1,0.3302
2,57265377,M,0,Sherry,Thank you,Roger Williams,Congress_Republican,0,0.3612
3,57265377,M,0,Bob,Thank you Roger,Roger Williams,Congress_Republican,0,0.3612
4,57265377,M,0,Joy,Unwanted pregnancy is a sad and unfortunate si...,Roger Williams,Congress_Republican,0,-0.0821


In [19]:
# When reusing this cell for another chunk, update the chunk number: it
# appears four times in this section.

# Load 1.4M responses (one tenth of the entire dataset).
start_time = time.time()
resp02smt_df = pd.read_csv("../data/resps02_smt.csv", index_col=0)
end_time = time.time()
print("run time for loading responses:",end_time-start_time)
# fillna('0') puts the string '0' in place of missing values in every column,
# not just response_text -- presumably so the tokenizer never receives NaN.
resps02_smt_df = resp02smt_df.fillna('0') # 1/10 of responses


# Chunk number appears twice in this section.

# Run the Vader analysis on 1/10 of responses.
# The start time is printed in UTC (time.gmtime).
start_time = time.time()
print("analysis started:",time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(start_time)))
resps02_smt_df["V_sentiment"] = compute_vader_sentiment(resps02_smt_df["response_text"])
end_time = time.time()
print("run time (minutes) for analysis:",(end_time-start_time)/60)


# Chunk number appears twice in this section.

# Save chunk to instance disk. This writes to the same path the chunk was
# loaded from, replacing that file. The run time printed is in seconds.
start_time = time.time()
resps02_smt_df.to_csv("../data/resps02_smt.csv", header = True)
end_time = time.time()
print("run time for data save:",end_time-start_time)


# Chunk number appears once in this section.

# Verify vader sentiment is there.
resps02_smt_df.head()

  mask |= (ar1 == a)


run time for loading responses: 8.500474452972412
analysis started: 2019-10-12 23:24:01
completed 50000 23:24:46
completed 100000 23:25:33
completed 150000 23:26:20
completed 200000 23:27:12
completed 250000 23:28:10
completed 300000 23:29:11
completed 350000 23:30:12
completed 400000 23:31:23
completed 450000 23:32:19
completed 500000 23:33:26
completed 550000 23:34:25
completed 600000 23:35:29
completed 650000 23:36:28
completed 700000 23:37:26
completed 750000 23:38:29
completed 800000 23:39:23
completed 850000 23:40:35
completed 900000 23:41:29
completed 950000 23:42:26
completed 1000000 23:43:30
completed 1050000 23:44:28
completed 1100000 23:45:21
completed 1150000 23:46:18
completed 1200000 23:47:12
completed 1250000 23:48:04
completed 1300000 23:49:01
completed 1350000 23:49:49
run time (minutes) for analysis: 26.381252626578014
run time for data save: 16.372407913208008


Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,HL_sentiment,V_sentiment
0,29687946,M,14192,Teresa,"The Obama administration can ""justify"" giving ...",Rand Paul,Congress_Republican,1,0.68
1,29687946,M,14192,Robert,wtf we always find money for low class goverme...,Rand Paul,Congress_Republican,0,-0.3652
2,29687946,M,14192,Bill,Kerry another Obama puppet. Egypt is no longer...,Rand Paul,Congress_Republican,-2,0.0636
3,29687946,M,14192,Kathy,"Oh, yes; absolutely! Let's send even more mone...",Rand Paul,Congress_Republican,-1,-0.3345
4,29687946,M,14192,Russ,"Oh gezzzzzzzzzz it just get worse by the day, ...",Rand Paul,Congress_Republican,-1,-0.8167


In [10]:
# Load 1.4M responses (one tenth of the entire dataset) from the chunk-02 file
# that carries only the HL sentiment column, for inspection in the next cell.
# No missing-value filling is applied to this frame.
start_time = time.time()
testdf2 = pd.read_csv("../data/resps02_HL-smt.csv", index_col=0)
end_time = time.time()
print("run time for loading responses:",end_time-start_time)


  mask |= (ar1 == a)


run time for loading responses: 8.550698518753052


In [11]:
# Preview the first rows rather than rendering the whole frame.
testdf2.head(10)

Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,HL_sentiment
0,29687946,M,14192,Teresa,"The Obama administration can ""justify"" giving ...",Rand Paul,Congress_Republican,1
1,29687946,M,14192,Robert,wtf we always find money for low class goverme...,Rand Paul,Congress_Republican,0
2,29687946,M,14192,Bill,Kerry another Obama puppet. Egypt is no longer...,Rand Paul,Congress_Republican,-2
3,29687946,M,14192,Kathy,"Oh, yes; absolutely! Let's send even more mone...",Rand Paul,Congress_Republican,-1
4,29687946,M,14192,Russ,"Oh gezzzzzzzzzz it just get worse by the day, ...",Rand Paul,Congress_Republican,-1
5,29687946,M,14192,Sharon,"Yep it is amazing! People here homeless, dyin...",Rand Paul,Congress_Republican,-2
6,29687946,M,14192,Mark,"We need to stop become involved in European, A...",Rand Paul,Congress_Republican,2
7,29687946,M,14192,Ronald,Good old MUslim Brotherhood send them some foo...,Rand Paul,Congress_Republican,0
8,29687946,M,14192,Glenn,Unless some of those greedy congress-people st...,Rand Paul,Congress_Republican,-1
9,29687946,M,14192,Kevin,Washington is a joke! They are the problem. It...,Rand Paul,Congress_Republican,-1


In [8]:
# Preview the first rows rather than rendering the whole frame.
resp02smt_df.head(10)

Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,HL_sentiment,V_sentiment
0,29687946,M,14192,Teresa,"The Obama administration can ""justify"" giving ...",Rand Paul,Congress_Republican,1,0.6800
1,29687946,M,14192,Robert,wtf we always find money for low class goverme...,Rand Paul,Congress_Republican,0,-0.3652
2,29687946,M,14192,Bill,Kerry another Obama puppet. Egypt is no longer...,Rand Paul,Congress_Republican,-2,0.0636
3,29687946,M,14192,Kathy,"Oh, yes; absolutely! Let's send even more mone...",Rand Paul,Congress_Republican,-1,-0.3345
4,29687946,M,14192,Russ,"Oh gezzzzzzzzzz it just get worse by the day, ...",Rand Paul,Congress_Republican,-1,-0.8167
5,29687946,M,14192,Sharon,"Yep it is amazing! People here homeless, dyin...",Rand Paul,Congress_Republican,-2,0.2673
6,29687946,M,14192,Mark,"We need to stop become involved in European, A...",Rand Paul,Congress_Republican,2,1.7990
7,29687946,M,14192,Ronald,Good old MUslim Brotherhood send them some foo...,Rand Paul,Congress_Republican,0,0.4404
8,29687946,M,14192,Glenn,Unless some of those greedy congress-people st...,Rand Paul,Congress_Republican,-1,-0.3182
9,29687946,M,14192,Kevin,Washington is a joke! They are the problem. It...,Rand Paul,Congress_Republican,-1,0.6527


In [41]:
# Run Vader sentiment and store results in new column.
# The run time printed below is in seconds.
start_time = time.time()
resps_smt_Df['V_sentiment'] = compute_vader_sentiment(resps_smt_Df["response_text"])
end_time = time.time()
print("run time:",end_time-start_time)

# 1.3M responses took 18 minutes.

run time: 1074.4153938293457


In [42]:
# Convert the run time measured in the previous cell from seconds to minutes.
# Uses the timing variables directly so the value cannot go stale on re-run.
(end_time - start_time) / 60

17.906923230489095

In [39]:
# Show the full VADER score dictionary for every token of the first response.
sia = SentimentIntensityAnalyzer()
first_response = resps_smt_Df["response_text"][0]
tokens = word_tokenize(first_response)
for word in tokens:
    print(word)
    ss = sia.polarity_scores(word)
    # Keys in alphabetical order, each rendered as "key: value, " on one line.
    score_line = ''.join('{0}: {1}, '.format(k, ss[k]) for k in sorted(ss))
    print(score_line, end='')
    print("\n---------------")

Protecting
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
---------------
birth
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
---------------
is
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
---------------
not
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
---------------
the
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
---------------
same
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
---------------
as
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
---------------
protecting
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
---------------
life
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
---------------
.
compound: 0.0, neg: 0.0, neu: 0.0, pos: 0.0, 
---------------
You
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
---------------
may
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
---------------
very
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
---------------
well
compound: 0.2732, neg: 0.0, neu: 0.0, pos: 1.0, 
---------------
pledge
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
----