In [1]:
import os
import datetime
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import plotly.plotly as py
import plotly.graph_objs as go

plt.style.use('fivethirtyeight')
#%matplotlib inline

In [3]:
if not os.path.exists('data/fivethirtyeight_tweets.csv'):
    with open('data/fivethirtyeight_tweets.csv', 'w'):
        for ii in range(1,12):
            print('Accessing file '+str(ii)+' of 11...')
            if ii == 1:
                ira_tweets = pd.read_csv('https://raw.githubusercontent.com/fivethirtyeight/russian-troll-tweets/master/IRAhandle_tweets_'+str(ii)+'.csv')
            else:
                more = pd.read_csv('https://raw.githubusercontent.com/fivethirtyeight/russian-troll-tweets/master/IRAhandle_tweets_'+str(ii)+'.csv')
                ira_tweets = pd.concat([ira_tweets,more])
        ira_tweets.to_csv('data/fivethirtyeight_tweets.csv')
        print('Done.')
        pass
else:
    print('Opening existing data file...')
    ira_tweets = pd.read_csv('data/fivethirtyeight_tweets.csv')
    print('Done.')

Opening existing data file...



Columns (1,11,16,21) have mixed types. Specify dtype option on import or set low_memory=False.



Done.


In [14]:
ira_tweets.head()

Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,updates,...,account_type,retweet,account_category,new_june_2018,alt_external_id,tweet_id,article_url,tco1_step1,tco2_step1,tco3_step1
0,906000000000000000,10_GOP,"""We have a sitting Democrat US Senator on tria...",Unknown,English,10/1/2017 19:58,10/1/2017 19:59,1052,9636,253,...,Right,0,RightTroll,0,905874659358453760,914580356430536707,http://twitter.com/905874659358453760/statuses...,https://twitter.com/10_gop/status/914580356430...,,
1,906000000000000000,10_GOP,Marshawn Lynch arrives to game in anti-Trump s...,Unknown,English,10/1/2017 22:43,10/1/2017 22:43,1054,9637,254,...,Right,0,RightTroll,0,905874659358453760,914621840496189440,http://twitter.com/905874659358453760/statuses...,https://twitter.com/damienwoody/status/9145685...,,
2,906000000000000000,10_GOP,Daughter of fallen Navy Sailor delivers powerf...,Unknown,English,10/1/2017 22:50,10/1/2017 22:51,1054,9637,255,...,Right,1,RightTroll,0,905874659358453760,914623490375979008,http://twitter.com/905874659358453760/statuses...,https://twitter.com/10_gop/status/913231923715...,,
3,906000000000000000,10_GOP,JUST IN: President Trump dedicates Presidents ...,Unknown,English,10/1/2017 23:52,10/1/2017 23:52,1062,9642,256,...,Right,0,RightTroll,0,905874659358453760,914639143690555392,http://twitter.com/905874659358453760/statuses...,https://twitter.com/10_gop/status/914639143690...,,
4,906000000000000000,10_GOP,"19,000 RESPECTING our National Anthem! #StandF...",Unknown,English,10/1/2017 2:13,10/1/2017 2:13,1050,9645,246,...,Right,1,RightTroll,0,905874659358453760,914312219952861184,http://twitter.com/905874659358453760/statuses...,https://twitter.com/realDonaldTrump/status/914...,,


In [17]:
ira_tweets.columns

Index(['external_author_id', 'author', 'content', 'region', 'language',
       'publish_date', 'harvested_date', 'following', 'followers', 'updates',
       'post_type', 'account_type', 'retweet', 'account_category',
       'new_june_2018', 'alt_external_id', 'tweet_id', 'article_url',
       'tco1_step1', 'tco2_step1', 'tco3_step1'],
      dtype='object')

Add a new column: list of hashtags in tweet

In [None]:
'''
#.*?     -- non-greedy match for words starting with #
(?=\s|$) -- look ahead for the end of the word
'''
ira_tweets['hashtags'] = ira_tweets.content.str.findall(r'#.*?(?=\s|$)')

In [None]:
ira_tweets.hashtags.head()

In [9]:
eng_tweets = ira_tweets[ira_tweets.language == 'English']

In [11]:
eng_tweets.account_category.value_counts()

RightTroll      646007
NewsFeed        541260
LeftTroll       385410
HashtagGamer    204239
Commercial      112580
NonEnglish       22782
Fearmonger       10524
Unknown           6496
Name: account_category, dtype: int64

In [12]:
eng_tweets.account_type.value_counts()

Right         646007
local         405885
Left          385410
Hashtager     204239
news          135375
Commercial    112580
Russian        16949
Koch           10282
?               6496
German          3530
Italian         1125
Arabic          1030
ZAPOROSHIA       172
Spanish           79
Ebola             70
French            69
Name: account_type, dtype: int64

In [10]:
eng_tweets['publish_date'] = pd.to_datetime(eng_tweets['publish_date'])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [None]:
time_series = eng_tweets.groupby(eng_tweets.publish_date.dt.date)['author'].agg('count')

In [None]:
right_trolls = eng_tweets[eng_tweets.account_category=='RightTroll']

In [None]:
right_ts = right_trolls.groupby(right_trolls.publish_date.dt.date)['author'].agg('count')

In [None]:
left_trolls = eng_tweets[eng_tweets.account_category=='LeftTroll']

In [None]:
left_ts = left_trolls.groupby(left_trolls.publish_date.dt.date)['author'].agg('count')

In [None]:
fake_news = eng_tweets[eng_tweets.account_category=='Newsfeed']

In [None]:
news_ts = fake_news.groupby(fake_news.publish_date.dt.date)['author'].agg('count')

In [None]:
fig, ax = plt.subplots(figsize=(15,9))
ax.xaxis.set_tick_params(labelsize=16)
ax.yaxis.set_tick_params(labelsize=16)

ax.plot(time_series.index, time_series.values, label='All')
ax.plot(right_ts.index, right_ts.values,'--', label='Right Troll')
ax.plot(left_ts.index, left_ts.values,'-.', label='Left Troll')
ax.plot(news_ts.index, news_ts.values, label='Newsfeed')

ax.set_xlabel('Date', fontsize=18)
ax.set_ylabel('Number of tweets per day', fontsize=18)
ax.set_xlim(datetime.date(2014, 4, 1), datetime.date(2018, 4, 1))
ax.legend()