# Twitter Sentiment Analysis

The following code takes the libraries of data scraped from Twitter and converts them into predictors that will be used in the later regression. We are interested in two types of predictors from the Twitter data:

1) The share of tweets a contestant received for one episode.

2) The general positivity of a contestant's tweets for one episode.

In [1]:
%matplotlib inline

import oauth2
import simplejson
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import requests
import datetime
import json
import unittest, time, re
from pattern.en import parse
from pattern.en import pprint
from pattern.vector import stem, PORTER, LEMMA
from sklearn.feature_extraction import text
import nltk


In [75]:
#Read the contestant data saved by the wiki scrape
with open("tempdata/seasonsDict.json") as wiki_file:
    wiki_data = json.load(wiki_file)

#Patch the two entries under key '19' whose 'eliminated' text has known formatting problems
for entry_index, fixed_text in ((19, u'Eliminated in week 2'), (20, u'Eliminated in week 1')):
    wiki_data['19'][entry_index]['eliminated'] = fixed_text

#Keep only the first 29 entries stored under key '19'
w19 = list(wiki_data['19'][0:29])
wiki_data['19'] = w19

#Read the date guide table
date_guide = pd.read_csv("date_guide.csv")

In [157]:
#Collect the 'name' field of every entry, across all keys of wiki_data
cont_nam = [person['name']
            for season_people in wiki_data.values()
            for person in season_people]

#Clean up names that were scraped with leftover markup
def url_strip(r):
    """Return the name r with leftover link / list-literal markup removed.

    If r contains "href", the text between the first and second ">" is taken
    and the trailing "</a" is removed. If the result contains the two
    characters u", every u" and every "[" is removed as well. Anything else
    is returned unchanged.
    """
    name = r
    if "href" in r:
        #Keep what follows the opening tag and cut the start of the closing tag
        name = r.split(">")[1].replace("</a", "")
    if "u\"" not in name:
        return name
    return name.replace("u\"", "").replace("[", "")
    
#Clean every scraped name
full_names = [url_strip(raw_name) for raw_name in cont_nam]

#Keep the text before the first space of each cleaned name, without duplicates
first_names = set(clean_name.split(" ")[0] for clean_name in full_names)

bach_names = ['Jason', 'Jake', 'Brad', 'Ben', 'Sean', 'Juan', 'Chris']

## Create Corpus

Here we create a corpus of adjectives and adverbs from our entire body of tweets. Since Twitter is not copy-edited, we have to filter our results heavily to collect usable words.


In [2]:
#Gather every tweet from the files tweets12.json through tweets19.json
all_tweets = []
for iseason in range(12,20):
    file_name = "tweets" + str(iseason) + ".json"
    with open(file_name) as json_file:
        tdat = json.load(json_file)

    for cont_dat in tdat.values():
        #Entries without data are stored as None and are skipped
        if cont_dat is None:
            continue
        for cc in cont_dat:
            #Only the tweets stored under the first key of each entry are read
            first_key = list(cc)[0]
            all_tweets.extend(cc[first_key])

tweets12.json  is done
tweets13.json  is done
tweets14.json  is done
tweets15.json  is done
tweets16.json  is done
tweets17.json  is done
tweets18.json  is done
tweets19.json  is done


In [311]:
#Next flatten all the tweets into one list - where one sentence = one list entry

#Split every tweet on ".", then every piece on "?", then every piece on "!",
#flattening back into a single list after each split
tweet_periods = [tweet_text.split(".") for tweet_text in all_tweets]
tweet_flat1 = [piece for pieces in tweet_periods for piece in pieces]

tweet_questions = [piece.split("?") for piece in tweet_flat1]
tweet_flat2 = [piece for pieces in tweet_questions for piece in pieces]

tweet_exclaim = [piece.split("!") for piece in tweet_flat2]
tweet_flat3 = [piece for pieces in tweet_exclaim for piece in pieces]

#Discard the pieces that are empty strings
tweet_sentences = [piece for piece in tweet_flat3 if piece != ""]

#Turn hyphens, then equals signs, into spaces
tweet_sentences = [piece.replace("-"," ").replace("="," ") for piece in tweet_sentences]

#Drop non-ASCII characters, then delete the punctuation characters listed below
tweet_encode = [tt.encode("ascii", "ignore") for tt in tweet_sentences]
tweet_process_output = [piece.translate(None,"*@#\/[]()") for piece in tweet_encode]

#Discard sentences that still contain any of these URL-like fragments
bad_patterns = ("\/", "\\\\", "http", "www", "comnode", "utm_", "v=", "lyw")
good_sentences = [sentence for sentence in tweet_process_output
                  if not any(re.search(pattern, sentence) for pattern in bad_patterns)]


In [312]:
%%time
#Tokenize each sentence with NLTK and tag every token with its part of speech.
#Only tokens tagged with one of the adjective/adverb tags below (plus 'RP') are kept.
keep_tags = ['JJ', 'JJS', 'JJR', 'RB', 'RBR', 'RBS', 'RP']
all_adj = [word
           for sentence in good_sentences
           for word, part_of_speech in nltk.pos_tag(nltk.word_tokenize(sentence))
           if part_of_speech in keep_tags]

CPU times: user 14min 7s, sys: 4.39 s, total: 14min 11s
Wall time: 14min 22s


In [315]:
#Remove stop words and words that are contestant / bachelor first names.
#Everything to exclude is gathered once into a single set: evaluating
#nltk.corpus.stopwords.words("english") inside the filter rebuilt the stop-word
#list for every candidate word and scanned it linearly each time.
excluded_words = set(nltk.corpus.stopwords.words("english"))
excluded_words.update(first_names)
excluded_words.update(bach_names)

full_corpus = [word for word in all_adj if word not in excluded_words]

#Get unique values
our_corpus = set(full_corpus)
len(our_corpus)

4240

## Find sentiment of words in our corpus

We use the website http://text-processing.com/, which provides an API that gives the probability that a word is either positive or negative, based on a fit of words to movie and Twitter data.

In [167]:
#To not make the text-processing API mad, we will break up our data into 3 sections
#text-processing.com throttles API requests to 1000 per day
#our_corpus is a set and sets cannot be sliced, so it is turned into a sorted
#list first; sorting also keeps the three sections stable between runs.
#The section boundaries are contiguous and the last section is open-ended so that
#no word is left out (bounds such as 801:1600 and 1601:-1 skip the words at
#positions 800 and 1600 and the final word).
corpus_words = sorted(our_corpus)
corpus1 = corpus_words[0:800]
corpus2 = corpus_words[800:1600]
corpus3 = corpus_words[1600:]


In [168]:
def corpus_prob(which_word, timeout=30):
    """Send one piece of text to the text-processing.com sentiment API.

    Parameters
    ----------
    which_word : text posted as the "text" form field.
    timeout : seconds to wait for the server before giving up, so that one
        stalled request cannot hang the whole run.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, timeouts, or a non-success HTTP status.
    ValueError
        If the response body is not valid JSON.
    """
    api_url = "http://text-processing.com/api/sentiment/"

    data_type = {"text": which_word}
    request_val = requests.post(api_url, data = data_type, timeout = timeout)

    #Report an error status (for example when the API throttles us) as an HTTP
    #error instead of a confusing JSON parse failure on the error page
    request_val.raise_for_status()

    return json.loads(request_val.text)

In [169]:
%%time
#Run on CORPUS1: query the API once per word, pausing one second before each request
json_list1 = list()
for wword in corpus1:
    time.sleep(1)
    word_response = corpus_prob(wword)
    json_list1.append(word_response)
    

CPU times: user 3.28 s, sys: 589 ms, total: 3.87 s
Wall time: 16min 14s


In [174]:
#Remove weird keys: pair every corpus1 word with its API response, keyed by the
#UTF-8 encoded word, and leave out words that cannot be encoded
json_list1_good = []
good_corpus1 = []

for corp, dicts in zip(corpus1, json_list1):
    try:
        encoded_word = corp.encode('utf-8')
    except UnicodeError:
        #Only encoding problems are skipped; any other error is a real bug and should surface
        continue
    good_corpus1.append(encoded_word)
    json_list1_good.append(dicts)
probs1 = dict(zip(good_corpus1, json_list1_good))

#Save the word -> response mapping so the API does not have to be queried again
with open('probs1.json', 'w') as fp:
    json.dump(probs1, fp)

In [None]:
%%time

#Run on CORPUS2: query the API once per word, pausing one second before each request
json_list2 = []
for wword in corpus2:
    time.sleep(1)
    json_list2.append(corpus_prob(wword))

#Remove weird keys: pair every corpus2 word with its API response, keyed by the
#UTF-8 encoded word, and leave out words that cannot be encoded
json_list2_good = []
good_corpus2 = []

for corp, dicts in zip(corpus2, json_list2):
    try:
        encoded_word = corp.encode('utf-8')
    except UnicodeError:
        #Only encoding problems are skipped; any other error is a real bug and should surface
        continue
    good_corpus2.append(encoded_word)
    json_list2_good.append(dicts)
probs2 = dict(zip(good_corpus2, json_list2_good))

#Save the word -> response mapping so the API does not have to be queried again
with open('probs2.json', 'w') as fp:
    json.dump(probs2, fp)
    

In [178]:
%%time

#Run on CORPUS3: query the API once per word, pausing one second before each request
json_list3 = []
for wword in corpus3:
    time.sleep(1)
    json_list3.append(corpus_prob(wword))

#Remove weird keys: pair every corpus3 word with its API response, keyed by the
#UTF-8 encoded word, and leave out words that cannot be encoded
json_list3_good = []
good_corpus3 = []

for corp, dicts in zip(corpus3, json_list3):
    try:
        encoded_word = corp.encode('utf-8')
    except UnicodeError:
        #Only encoding problems are skipped; any other error is a real bug and should surface
        continue
    good_corpus3.append(encoded_word)
    json_list3_good.append(dicts)
probs3 = dict(zip(good_corpus3, json_list3_good))

#Save the word -> response mapping so the API does not have to be queried again
with open('probs3.json', 'w') as fp:
    json.dump(probs3, fp)
    

CPU times: user 2.24 s, sys: 332 ms, total: 2.57 s
Wall time: 12min 8s


## Create dictionaries for each contestant for each episode

We want a dictionary that is keyed by season, contestant, and episode date. The values are then the share of tweets & a positivity index.

In [181]:
#Reload the three saved chunks from disk
with open("probs1.json") as chunk_file:
    probs1 = json.load(chunk_file)
with open("probs2.json") as chunk_file:
    probs2 = json.load(chunk_file)
with open("probs3.json") as chunk_file:
    probs3 = json.load(chunk_file)

#Merge them, in order, into one word -> API response lookup
corpus = {}
for chunk in (probs1, probs2, probs3):
    corpus.update(chunk)

In [317]:
#Single-tweet counterpart of the full-corpus sentence splitting
def tweet_process(tweet):
    """Split one tweet into sentence pieces and normalise separators.

    The tweet is cut at every ".", "?" and "!" (empty pieces are kept), and
    in each piece every "-" and "=" is replaced by a space.

    Returns the list of pieces in their original order.
    """
    #One split on any of the three terminators gives the same pieces as
    #splitting on each of them in turn
    pieces = re.split(r"[.?!]", tweet)
    return [piece.replace("-", " ").replace("=", " ") for piece in pieces]

#Similar to full corpus, use a tweet to find all adjectives + adverbs for that tweet
def tweet_part_of_speech(tweet_process_output):
    """Return the adjective/adverb-tagged words found in one processed tweet.

    Parameters
    ----------
    tweet_process_output : list of sentence strings, as returned by tweet_process.

    Returns
    -------
    A list of the words whose NLTK part-of-speech tag is one of JJ, JJS, JJR,
    RB, RBR, RBS or RP and that contain none of the URL-like fragments
    filtered out below.
    """
    #Drop non-ASCII characters, then delete the punctuation characters listed below
    tweet_encode = [tt.encode("ascii", "ignore") for tt in tweet_process_output]
    cleaned = [sentence.translate(None,"*@#\/[]()") for sentence in tweet_encode]

    #Remove each extra character in turn. Every pass works on the result of the
    #previous one: restarting from tweet_encode on each pass would throw away the
    #translate step above and every replacement except the last.
    #NOTE(review): uni_char is not defined in this function - it must exist at
    #module level before this is called; confirm where it comes from.
    for ch in uni_char:
        cleaned = [sentence.replace(ch,"") for sentence in cleaned]

    #Get all adjectives from tweet
    keep_tags = ('JJ', 'JJS', 'JJR', 'RB', 'RBR', 'RBS', 'RP')
    all_adj = []
    for sentence in cleaned:
        stokens = nltk.word_tokenize(sentence)
        for word, part_of_speech in nltk.pos_tag(stokens):
            if part_of_speech in keep_tags:
                all_adj.append(word)

    #Discard words that contain any of these URL-like fragments
    bad_patterns = ("\/", "\\\\", "http", "www", "comnode", "utm_", "v=", "lyw")
    good_adj = [word for word in all_adj
                if not any(re.search(pattern, word) for pattern in bad_patterns)]
    return good_adj

#Score a tweet from the sentiment labels of its adjectives/adverbs.
#Takes the output of tweet_part_of_speech and averages one score per known word.
def is_tweet_positive(tweet_pos_output):
    """Return the mean sentiment score of the words in tweet_pos_output.

    Each word that has an entry in the module-level ``corpus`` dict contributes
    +1 when its 'label' is "pos", -1 when it is "neg" and 0 for any other
    label. Words without a usable entry are skipped.

    Parameters
    ----------
    tweet_pos_output : iterable of words (the output of tweet_part_of_speech).

    Returns
    -------
    0 when no word could be scored, otherwise the mean of the scores, a value
    between -1 (all negative) and 1 (all positive).
    """
    probs = []
    for word in tweet_pos_output:
        try:
            word_prob = corpus[word]['label']
        except (KeyError, TypeError):
            #Word not in the corpus, or its entry has no 'label': skip it.
            #Other errors (such as a missing corpus) are no longer hidden.
            continue
        if word_prob == "neg":
            probs.append(-1)
        elif word_prob == "pos":
            probs.append(1)
        else:
            probs.append(0)

    if not probs:
        return 0
    return np.mean(probs)
        

In [283]:
#Create function that takes season, gets tweets for each episode/contestant
def tweets_by_season(use_season):
    """Summarise the tweets of one season per contestant and episode.

    Reads the file "tweets<use_season>.json" and, for every contestant entry,
    counts the tweets and averages their is_tweet_positive scores.

    Parameters
    ----------
    use_season : season number used to build the file name.

    Returns
    -------
    dict mapping contestant -> {episode key -> {"ntweet": count,
    "sentiment": mean score}}. A contestant whose data is None maps to an
    empty dict; an episode with no tweets gets a sentiment of 0.
    """
    with open("tweets"+str(use_season)+".json") as json_file:
        tdat = json.load(json_file)

    season_dict = {}
    for contestant, cont_dat in tdat.items():
        contestant_dict = {}
        if cont_dat is not None:
            for cc in cont_dat:
                #Only the first key of each entry is read
                #(presumably each entry holds a single episode - confirm against the scraper)
                episode_date = list(cc)[0]
                scores = [is_tweet_positive(tweet_part_of_speech(tweet_process(tweet)))
                          for tweet in cc[episode_date]]
                number_of_tweets = len(scores)
                if number_of_tweets == 0:
                    sentiment = 0
                else:
                    sentiment = float(sum(scores)) / float(number_of_tweets)

                contestant_dict[episode_date] = {"ntweet": number_of_tweets, "sentiment":sentiment}
        season_dict[contestant] = contestant_dict
    return season_dict
    
            

In [284]:
%%time

#Build one dictionary holding the summary of every season from 13 through 19,
#keyed by season number
season_nums = range(13,20)
tweet_dict = {}
for season_num in season_nums:
    dseason = tweets_by_season(season_num)
    tweet_dict[season_num] = dseason
    print("%s is done" % season_num)

#Save the combined dictionary as JSON
with open('twitter_sentiment.json', 'w') as fp:
    json.dump(tweet_dict, fp)

13 is done
14 is done
15 is done
16 is done
17 is done
18 is done
19 is done
CPU times: user 14min 49s, sys: 4.53 s, total: 14min 54s
Wall time: 14min 56s
