## What is the distribution of sentiments of tweets as grouped by account category?
### Sentiment analysis with VADER

In [1]:
import os
import re
import ipywidgets as widgets
import sqlite3 as sql
import emoji
import datetime
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import warnings
warnings.filterwarnings('ignore')

plt.style.use('fivethirtyeight')
pd.options.display.max_colwidth = 250
%matplotlib notebook

In [2]:
if not os.path.exists('data/fivethirtyeight_tweets.csv'):
    with open('data/fivethirtyeight_tweets.csv', 'w'):
        for ii in range(1,12):
            print('Accessing file '+str(ii)+' of 11...')
            if ii == 1:
                ira_tweets_538 = pd.read_csv('https://raw.githubusercontent.com/fivethirtyeight/russian-troll-tweets/master/IRAhandle_tweets_'+str(ii)+'.csv')
            else:
                more = pd.read_csv('https://raw.githubusercontent.com/fivethirtyeight/russian-troll-tweets/master/IRAhandle_tweets_'+str(ii)+'.csv')
                ira_tweets_538 = pd.concat([ira_tweets, more])
        ira_tweets_538.to_csv('data/fivethirtyeight_tweets.csv')
        print('Done.')
        pass
else:
    print('Opening existing data file...')
    ira_tweets_538 = pd.read_csv('data/fivethirtyeight_tweets.csv')
    print('Done.')

Opening existing data file...
Done.


In [3]:
ira_tweets_538.account_category.value_counts()

NonEnglish      746245
RightTroll      652275
NewsFeed        542640
LeftTroll       389898
HashtagGamer    209279
Commercial      121904
Unknown          13070
Fearmonger       10652
Name: account_category, dtype: int64

In [54]:
# cleaning the tweet content a bit differently for VADER, since it uses capitalization, 
# puncutation, and emojis to extract sentiment
eng_tweets = ira_tweets_538[ira_tweets_538.language == 'English']
eng_tweets = eng_tweets[eng_tweets.account_category != 'NonEnglish']
eng_tweets.content = eng_tweets.content.str.replace('((www\.[^\s]+)|(http?://[^\s]+)|(https?://[^\s]+))','')
eng_tweets.content = eng_tweets.content.apply(lambda x: re.sub('[#@/|\-;,.]','', str(x)))
eng_tweets.content = eng_tweets.content.str.replace("'s",'')
eng_tweets.content = eng_tweets.content.str.replace('amp','')
eng_tweets.content = eng_tweets.content.apply(lambda x: re.sub('([a-z])([A-Z])','\g<1> \g<2>', str(x)))

In [55]:
targets = ['RightTroll', 'LeftTroll', 'NewsFeed', 'Fearmonger']
eng_tweets = eng_tweets.loc[eng_tweets.account_category.isin(targets)]

In [56]:
eng_tweets.account_category.value_counts()

RightTroll    646007
NewsFeed      541260
LeftTroll     385410
Fearmonger     10524
Name: account_category, dtype: int64

In [57]:
analyzer = SentimentIntensityAnalyzer()

In [58]:
def get_polarity(compound_score):
    if compound_score > 0.05:
        return 'Positive'
    if compound_score < -0.05:
        return 'Negative'
    else:
        return 'Neutral'

In [59]:
eng_tweets['sentiment'] = eng_tweets.content.apply(lambda x: analyzer.polarity_scores(x)['compound'])
eng_tweets.sentiment.head()

0    0.0000
1   -0.3182
2   -0.4404
3    0.0000
4    0.6399
Name: sentiment, dtype: float64

In [60]:
eng_tweets['polarity'] = eng_tweets.sentiment.apply(get_polarity)

In [61]:
def plot_func(category):
    plt.figure(figsize=(16,9))
    bins = np.arange(-1,1,0.05)
    colors = ['#f10c45','#069af3','#929591']
    props = dict(boxstyle='round', facecolor='#929591', alpha=0.5)
    if category == 'All':
        plt.hist(eng_tweets.sentiment, bins=bins, 
                 weights = np.ones(len(eng_tweets.sentiment))/len(eng_tweets.sentiment),
                 color='green', alpha=0.75, histtype='stepfilled', linewidth=2, label='All tweets'
                   )
        plt.figtext(0.1,0.7, eng_tweets.
                    sentiment.describe().apply(lambda x: '%.3f' % x).to_string(), fontsize='x-large', bbox=props)

    else:
        if category=='RightTroll': color='#f10c45'
        if category=='LeftTroll': color='#069af3'
        if category=='NewsFeed': color='#029386'
        if category=='Fearmonger': color='#7e1e9c'
        n = len(eng_tweets[eng_tweets.account_category==str(category)])
        plt.hist(eng_tweets.sentiment[eng_tweets.account_category==str(category)], #/n, 
                 bins=bins, weights = np.ones(n)/n, color=color, alpha=0.75, 
                 histtype='stepfilled', linewidth=2, 
                    label=str(category))
        plt.figtext(0.1,0.7, eng_tweets[eng_tweets.account_category==str(category)].
                    sentiment.describe().apply(lambda x: '%.3f' % x).to_string(), fontsize='x-large', bbox=props)

    plt.xlabel('Tweet sentiment', fontsize=18)
    plt.ylabel('Fraction of tweets', fontsize=18)
    plt.legend(fontsize=16)
    plt.tight_layout()

widgets.interact(plot_func, category=widgets.Dropdown(
    options=targets+['All'],
    value='All',
    description='Account category:',
    disabled=False));

interactive(children=(Dropdown(description='Account category:', index=4, options=('RightTroll', 'LeftTroll', '…

### Many tweets are identified as neutral.  What are some examples of neutral tweets in this dataset?

In [64]:
# Are all of these neutral tweets actually neutral?
pd.options.display.max_colwidth = 500
neutral = eng_tweets['content'].loc[eng_tweets.polarity=='Neutral']
for tweet in neutral.sample(10).values:
    print(tweet)
    print('')

A tale of two conventions  politics

Susan Collins wants Mike Flynn to testify in Russia probe and doesn’t rule out subpoenaing POTUS’s tax returns 

Colombia reports more than 2000 Zika Virus cases in pregnant women 

Trump Train MAGA POTUS 

Could by your girlfriend wife sister Mom  

Senate panel calls for women to sign up for military draft  politics

Sneezeo Meter: It happens every spring … sometimes a little sooner  

sports Russell Westbrook tripledouble streak mesmerized while it lasted

Is this a real pride of cops to shoot teens? policebrutality Wear Hoodie For Trayvon

US airforce scrambles its jets to prevent SAA bombing kurds in Hasakah     



### Are neutral tweets shorter than positive or negative tweets?

In [68]:
neutral.str.len().describe()

count    489617.000000
mean         67.227931
std          29.708461
min           0.000000
25%          50.000000
50%          64.000000
75%          82.000000
max         813.000000
Name: content, dtype: float64

In [69]:
eng_tweets['content'].loc[eng_tweets.polarity=='Positive'].str.len().describe()

count    484352.000000
mean         83.128497
std          34.139860
min           2.000000
25%          60.000000
50%          78.000000
75%         108.000000
max         891.000000
Name: content, dtype: float64

In [70]:
eng_tweets['content'].loc[eng_tweets.polarity=='Negative'].str.len().describe()

count    609232.000000
mean         82.182325
std          31.040569
min           2.000000
25%          61.000000
50%          75.000000
75%         104.000000
max         783.000000
Name: content, dtype: float64