In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
plt.rcParams['figure.figsize'] = (20, 12)

## Import data

In [3]:
trump = pd.read_json('data/trump.json')

In [4]:
trump.shape

(38397, 21)

In [5]:
trump.columns

Index(['has_media', 'hashtags', 'img_urls', 'is_replied', 'is_reply_to',
       'likes', 'links', 'parent_tweet_id', 'replies', 'reply_to_users',
       'retweets', 'screen_name', 'text', 'text_html', 'timestamp',
       'timestamp_epochs', 'tweet_id', 'tweet_url', 'user_id', 'username',
       'video_url'],
      dtype='object')

## Data formatting

In [6]:
# dates covered by the dataset
print(trump.timestamp.min())
print(trump.timestamp.max())

2009-05-04 18:54:25
2019-11-11 23:58:14


In [7]:
#Make text for twitter lowercase to search for matching expressions

trump['text_lower'] = trump.text.str.lower()


In [10]:
# Function to find and count all fake news expressions

def fake_news_mentions(data, expressions = []):
    #find patterns
    for i in expressions:
        data[i] = pd.Series(data.text_lower.str.contains(pat = i)) 
    
    # Return value counts for expressions
    for i in expressions:
        print(data[i].value_counts())
    
    # Aggregate tweets by days
    d = {'timestamp':'days'}    


In [11]:
columns = ['fake news', 'false news', 'fake media', 'falsenews', 'fakemedia']

fake_news_mentions(trump, columns)

False    37885
True       512
Name: fake news, dtype: int64
False    38396
True         1
Name: false news, dtype: int64
False    38379
True        18
Name: fake media, dtype: int64
False    38397
Name: falsenews, dtype: int64
False    38397
Name: fakemedia, dtype: int64


In [None]:
#### PENDING

In [None]:
#### SUM ALL COLUMNS VALUES INTO ONE SINGLE COLUMN

In [18]:
trump['all_fake_news'] = trump[columns].count(axis=1)

In [19]:
trump.sample(20)

Unnamed: 0,has_media,hashtags,img_urls,is_replied,is_reply_to,likes,links,parent_tweet_id,replies,reply_to_users,...,user_id,username,video_url,text_lower,fake news,false news,fake media,falsenews,fakemedia,all_fake_news
18066,False,[],[],True,False,10,[],,38,[],...,25073877,Donald J. Trump,,the usc should be ruling any day now on @obama...,False,False,False,False,False,5
8813,False,[],[],True,False,114453,[],,23064,[],...,25073877,Donald J. Trump,,"thank you to all of my great supporters, reall...",False,False,False,False,False,5
18742,False,[],[],True,False,20982,[],,4299,[],...,25073877,Donald J. Trump,,"""in politics, and in life, ignorance is not a ...",False,False,False,False,False,5
27130,False,[],[],True,False,29,[http://www.agriculture.com/news/business/trum...,,3,[],...,25073877,Donald J. Trump,,“trump: 'never give up' on farmland value rall...,False,False,False,False,False,5
18163,False,[],[],True,False,1309,[],,129,[],...,25073877,Donald J. Trump,,"""he who knows when he can fight and when he ca...",False,False,False,False,False,5
38237,False,[],[],True,False,56,[],,45,[],...,25073877,Donald J. Trump,,.@keithurban is excellent on american idol—gre...,False,False,False,False,False,5
815,False,[],[],True,False,179441,[],,54380,[],...,25073877,Donald J. Trump,,courageous patriots have fought and died for o...,False,False,False,False,False,5
19989,False,[],[],True,False,7849,[],,1617,[],...,25073877,Donald J. Trump,,"and finally, cruz strongly told thousands of c...",False,False,False,False,False,5
13873,False,[],[],True,False,319654,[],,65525,[],...,25073877,Donald J. Trump,,boring!,False,False,False,False,False,5
27327,False,[],[],True,False,43,[http://bit.ly/1CV61zL],,25,[],...,25073877,Donald J. Trump,,which national costume do you think should win...,False,False,False,False,False,5


In [None]:
# Add True/False column based on fake_news
# trump_days['label'] = np.where(trump_days['fake_news']>=1, 'yes', 'no')
# fake_news = trump_days[trump_days['label'] == 'yes']

## Data Analysis

In [None]:
# Aggregate tweets by days

d = {'timestamp':'days'}

trump_days = trump.groupby(trump.timestamp.dt.date)./
    agg({'timestamp':'count', 'fake_news':'sum'}).rename(columns=d)

trump_days['fake_news'] = trump_days['fake_news'].astype('int32')

In [None]:
print(trump_days.label.value_counts())
print(trump_days.fake_news.sum())


In [None]:
print('First Trump tweet mentioning fake news: {}'.format(fake_news.index.min()))
print('Last Trump tweet mentioning fake news: {}'.format(fake_news.index.max()))
print('Total tweets from Trump mentioning fake news: {}'.format(fake_news.fake_news.sum()))

In [None]:
# Frequency Trump mentiones 'fake news' in his tweets
res = (pd.Series(fake_news.index[1:]) - pd.Series(fake_news.index[:-1])).value_counts()
print('Average frequency for tweets mentioning fake news is: {} days '.format(res.mean()))

In [None]:
plt.boxplot(res)

In [None]:
plt.plot(trump_days.index, trump_days.label, 'o', alpha = 0.3);

In [None]:
plt.plot(fake_news.index, fake_news.days, 'o', alpha = 0.3);	