In [91]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import math
import datetime
from textblob import TextBlob 

%matplotlib inline

import seaborn as sns
sns.set(style='whitegrid')
# plt.rcParams.update({'font.size': 22})

plt.style.use('seaborn')
pd.set_option('display.width', 1500)
pd.set_option('display.max_columns', 100)

In [61]:
## Data Import
trump_tweet_original_df = pd.read_csv("data/tweets_scraped_11_07_2019.csv")
trump_tweet_original_df['created_at'] = pd.to_datetime(trump_tweet_original_df['created_at'])

In [62]:
pre_iphone_tweets = trump_tweet_original_df[trump_tweet_original_df['created_at'] < pd.to_datetime('03-24-2017 14:41:15')]

pre_iphone_tweets['sender'] = (pre_iphone_tweets['source'] == 'Twitter for Android').astype(int)

# trump_tweet_original_df[trump_tweet_original_df['source'] == 'Twitter for Android']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [65]:
def generate_features(tweet_df):
    
    tweet_df_new = tweet_df.copy()
    
    # Generate Sentiment
    
    for i in range(len(tweet_df_new)):
        
        sentiment = TextBlob(tweet_df.iloc[i, 1]).sentiment
        
        polarity = sentiment[0]
        subjectivity = sentiment[1]
        
        sentiment_df = 
        
    return tweet_df_new

# Cleaning -- With Features Training Set 

In [71]:
##### With Features

trump_tweet_with_features = pd.read_csv("data/full.csv")
trump_tweet_with_features['created_at'] = pd.to_datetime(trump_tweet_with_features['created_at'])

In [72]:
pre_iphone_tweets_features = trump_tweet_with_features[trump_tweet_with_features['created_at'] < pd.to_datetime('03-24-2017 14:41:15')]

pre_iphone_tweets_features['sender'] = (pre_iphone_tweets_features['source'] == 'Twitter for Android').astype(int)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [112]:
training_set = pre_iphone_tweets_features[['created_at', 'retweet_count', 'favorite_count', 'hour', 'min', 'linked',
                                'hash', 'dot', 'polarity', 'subjectivity', 'sender', 'text']]

In [115]:
training_set['year'] = training_set.created_at.dt.year
training_set['month'] = training_set.created_at.dt.month
training_set['day'] = training_set.created_at.dt.day
training_set['hour'] = training_set.created_at.dt.hour
training_set['minute'] = training_set.created_at.dt.minute
training_set = training_set.drop(['created_at'], axis = 1, errors='ignore')

AttributeError: 'DataFrame' object has no attribute 'created_at'

In [116]:
training_set.head()

Unnamed: 0,retweet_count,favorite_count,hour,min,linked,hash,dot,polarity,subjectivity,sender,text,year,month,day,minute
0,220796,633253,11,36,False,0,0,0.45,0.833333,1,such a beautiful and important evening! the fo...,2016,11,9,36
1,45576,169729,19,31,True,0,0,1.0,1.0,0,happy 241st birthday to the u.s. marine corps!...,2016,11,10,31
2,37788,192638,2,10,False,0,0,0.58,0.676667,1,a fantastic day in d.c. met with president oba...,2016,11,11,10
3,69498,231526,2,19,False,0,0,0.009375,0.675,1,just had a very open and successful presidenti...,2016,11,11,19
4,55954,221718,11,14,False,0,0,0.41,0.563333,1,love the fact that the small groups of protest...,2016,11,11,14


In [214]:
training_set.columns.values

array(['retweet_count', 'favorite_count', 'hour', 'min', 'linked', 'hash',
       'dot', 'polarity', 'subjectivity', 'sender', 'text', 'year',
       'month', 'day', 'minute'], dtype=object)

# Pre-iPhone Train-Test Split

In [174]:
pre_iphone_train, pre_iphone_test = train_test_split(training_set.drop('text', axis=1, errors='ignore'),
                                                     test_size=0.2, 
                                                     stratify = training_set.sender)

In [175]:
x_pre_iphone_train = pre_iphone_train.drop('sender', axis=1, errors='ignore')
y_pre_iphone_train = pre_iphone_train.sender

x_pre_iphone_test = pre_iphone_test.drop('sender', axis=1, errors='ignore')
y_pre_iphone_test = pre_iphone_test.sender

# Baseline Random Forest Classifier

In [176]:
model = RandomForestClassifier()

In [177]:
model.fit(x_pre_iphone_train, y_pre_iphone_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [178]:
## TRAINING ACCURACY
model.score(x_pre_iphone_train, y_pre_iphone_train)

0.9962756052141527

In [179]:
## TEST ACCURACY
model.score(x_pre_iphone_test, y_pre_iphone_test)

0.8888888888888888

# Trying Out TFIDF Vector

In [104]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [180]:
tfidf = TfidfVectorizer(max_df=0.95,min_df=0.02)

In [181]:
tfidf.fit(trump_tweet_original_df.text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.95, max_features=None,
                min_df=0.02, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [182]:
n_cols = len(tfidf.get_feature_names())

In [183]:
tfidf_vector = pd.DataFrame(tfidf.transform(training_set.text).toarray(), columns = tfidf.get_feature_names())

In [184]:
training_with_tfidf = pd.concat([training_set, tfidf_vector], axis = 1).drop('text', axis = 1, errors='ignore')

In [185]:
training_with_tfidf.head()

Unnamed: 0,retweet_count,favorite_count,hour,min,linked,hash,dot,polarity,subjectivity,sender,year,month,day,minute,about,after,again,against,all,also,am,america,american,amp,an,and,are,as,at,back,bad,be,because,been,before,being,better,big,border,but,by,can,china,co,collusion,congress,country,crime,day.1,deal,...,state,states,strong,tax,than,thank,that,the,their,them,there,they,this,time,to,today,total,trade,trump,two,united,up,us,very,vote,wall,want,was,way,we,well,were,what,when,which,whitehouse,who,why,will,win,witch,with,work,working,world,would,year.1,years,you,your
0,220796,633253,11,36,False,0,0,0.45,0.833333,1,2016,11,9,36,0.0,0.0,0.314953,0.0,0.230941,0.0,0.0,0.0,0.0,0.0,0.0,0.262353,0.0,0.258873,0.0,0.0,0.0,0.199657,0.0,0.0,0.339969,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.10258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.207097,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.395671,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,45576,169729,19,31,True,0,0,1.0,1.0,0,2016,11,10,31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285876,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.463842,0.0,0.180355,0.0,0.0,0.0,0.0,0.0,0.0,0.219746,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.381256,0.554051
2,37788,192638,2,10,False,0,0,0.58,0.676667,1,2016,11,11,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.364535,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.320658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.216091,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,69498,231526,2,19,False,0,0,0.009375,0.675,1,2016,11,11,19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.147141,0.22142,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.267925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.115065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.553239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,55954,221718,11,14,False,0,0,0.41,0.563333,1,2016,11,11,14,0.0,0.0,0.0,0.0,0.292238,0.0,0.0,0.0,0.0,0.0,0.0,0.165994,0.0,0.0,0.0,0.0,0.0,0.252651,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.331288,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.23959,0.259616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.262065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.250346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [186]:
pre_iphone_train_with_tfidf, pre_iphone_test_with_tfidf = train_test_split(training_with_tfidf,
                                                                           test_size=0.2, 
                                                                           stratify = training_with_tfidf.sender)

In [187]:
x_pre_iphone_train_wtfidf = pre_iphone_train_with_tfidf.drop('sender', axis=1, errors='ignore')
y_pre_iphone_train_wtfidf = pre_iphone_train_with_tfidf.sender

x_pre_iphone_test_wtfidf = pre_iphone_test_with_tfidf.drop('sender', axis=1, errors='ignore')
y_pre_iphone_test_wtfidf = pre_iphone_test_with_tfidf.sender

In [188]:
model_with_tfidf = RandomForestClassifier()

In [189]:
model_with_tfidf.fit(x_pre_iphone_train_wtfidf, y_pre_iphone_train_wtfidf)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [190]:
model_with_tfidf.score(x_pre_iphone_train_wtfidf, y_pre_iphone_train_wtfidf)

0.994413407821229

In [191]:
model_with_tfidf.score(x_pre_iphone_test_wtfidf, y_pre_iphone_test_wtfidf)

0.8888888888888888

# TEST ON MANUALLY LABELLED SET

In [208]:
manually_labelled_set = pd.read_csv("manul100.csv")
manually_labelled_set['created_at'] = pd.to_datetime(manually_labelled_set['created_at'])

In [209]:
manually_labelled_set['sender'] = (manually_labelled_set['AVG'] >= 2.5).astype(int)

In [210]:
manually_labelled_set.sender.value_counts()

1    65
0    35
Name: sender, dtype: int64

In [220]:
manually_labelled_set['year'] = manually_labelled_set.created_at.dt.year
manually_labelled_set['month'] = manually_labelled_set.created_at.dt.month
manually_labelled_set['day'] = manually_labelled_set.created_at.dt.day
manually_labelled_set['minute'] = manually_labelled_set.created_at.dt.minute

In [221]:
manually_labelled_set

Unnamed: 0.2,Unnamed: 0,ID,id_str_x,Rscore,Jscore,AVG,Unnamed: 0.1,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str_y,date,hour,min,time,linked,hash,dot,polarity,subjectivity,sender,year,month,day,minute
0,0,1607,8.964813e+17,1,1.0,1.0,1464,Media Studio,we must remember this truth: no matter our col...,2017-08-12 21:19:23,33786,113556,False,896481262776360960,2017-08-12,21,19,21:19,True,0,0,0.125000,0.216667,0,2017,8,12,19
1,1,11849,1.186970e+18,5,4.0,4.5,9233,Twitter for iPhone,republicans are going to fight harder than eve...,2019-10-23 11:36:27,20898,90017,False,1186969632944611072,2019-10-23,11,36,11:36,False,0,0,0.233333,0.133333,1,2019,10,23,36
2,2,10562,1.171590e+18,4,4.0,4.0,8433,Twitter for iPhone,i am pleased to endorse governor mike parson o...,2019-09-11 01:04:25,13480,61236,False,1171590284544826880,2019-09-11,1,4,1:4,False,0,0,0.384333,0.776667,1,2019,9,11,4
3,3,6852,1.099684e+18,2,1.0,1.5,5906,Twitter for iPhone,poll: suburban women are coming back into the ...,2019-02-24 14:56:08,24312,110799,False,1099684406002925952,2019-02-24,14,56,14:56,False,0,0,0.250000,0.218750,0,2019,2,24,56
4,4,7865,1.124734e+18,1,1.0,1.0,6591,Twitter for iPhone,today may 4th - is international firefighters ...,2019-05-04 17:53:42,16891,65373,False,1124733856526077952,2019-05-04,17,53,17:53,True,1,0,0.500000,0.450000,0,2019,5,4,53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,8283,1.131529e+18,4,2.0,3.0,6852,Twitter for iPhone,the democrats are getting nothing done in cong...,2019-05-23 11:56:03,21730,95999,False,1131529221862502016,2019-05-23,11,56,11:56,False,0,1,0.000000,0.562500,1,2019,5,23,56
96,96,4081,1.007987e+18,4,1.0,2.5,3598,Twitter for iPhone,the ig report totally destroys james comey and...,2018-06-16 14:01:33,30186,119661,False,1007986538985197952,2018-06-16,14,1,14:1,False,0,0,0.111806,0.536111,1,2018,6,16,1
97,97,1439,8.896724e+17,4,4.0,4.0,1332,Twitter for iPhone,the amazon washington post fabricated the fact...,2017-07-25 02:23:18,15482,59829,False,889672374458646528,2017-07-25,2,23,2:23,False,0,1,-0.200000,0.883333,1,2017,7,25,23
98,98,7596,1.120440e+18,1,1.0,1.0,6462,Twitter Media Studio,this should never happen to another president ...,2019-04-22 21:30:57,19203,65922,False,1120439873226400000,2019-04-22,21,30,21:30,True,0,0,0.000000,0.000000,0,2019,4,22,30


In [224]:
manual_test_set = manually_labelled_set[['retweet_count', 'favorite_count', 'hour', 'min', 'linked', 'hash',
       'dot', 'polarity', 'subjectivity', 'sender', 'text', 'year',
       'month', 'day', 'minute']]

In [230]:
x_manual_test_set = manual_test_set.drop(['sender', 'text'], axis = 1, errors='ignore')
y_manual_test_set = manual_test_set.sender

In [231]:
model.score(x_manual_test_set, y_manual_test_set)

0.77

# TEST ON MANUALLY LABELLED SET (WITH TFIDF)

In [248]:
tfidf_vector_manual = pd.DataFrame(tfidf.transform(manual_test_set.text).toarray(), columns = tfidf.get_feature_names())

In [249]:
test_with_tfidf_manual = pd.concat([manual_test_set, tfidf_vector_manual], axis = 1).drop('text', axis = 1, errors='ignore')

In [251]:
x_test_with_tfidf_manual = test_with_tfidf_manual.drop('sender', axis=1, errors='ignore')
y_test_with_tfidf_manual = test_with_tfidf_manual.sender


In [252]:

model_with_tfidf.score(x_test_with_tfidf_manual, y_test_with_tfidf_manual)

0.89

# TWEET STUDIO (GROUND TRUTH:FALSE) GENERATION

In [258]:
tms_tweets = trump_tweet_with_features[trump_tweet_with_features['source'] == 'Twitter Media Studio']

In [263]:
tms_tweets['year'] = tms_tweets.created_at.dt.year
tms_tweets['month'] = tms_tweets.created_at.dt.month
tms_tweets['day'] = tms_tweets.created_at.dt.day
tms_tweets['minute'] = tms_tweets.created_at.dt.minute

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

In [266]:
tms_tweets['sender'] = 0
tms_cleaned  = tms_tweets[['retweet_count', 'favorite_count', 'hour', 'min', 'linked', 'hash',
                           'dot', 'polarity', 'subjectivity', 'sender', 'text', 'year',
                           'month', 'day', 'minute']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [271]:
tms_merged = tms_cleaned.append(manual_test_set, ignore_index=True)

In [275]:
x_tms_merged = tms_merged.drop(['sender', 'text'], axis = 1, errors='ignore')
y_tms_merged = tms_merged.sender


In [276]:
model.score(x_tms_merged, y_tms_merged)

0.8904109589041096

# TWEET STUDIO (GROUND TRUTH:FALSE) GENERATION with TFIDF

In [277]:
tfidf_vector_full = pd.DataFrame(tfidf.transform(tms_merged.text).toarray(), columns = tfidf.get_feature_names())

In [279]:
test_with_tfidf_full = pd.concat([tms_merged, tfidf_vector_full], axis = 1).drop('text', axis = 1, errors='ignore')

In [280]:
x_tfidf_full_test = test_with_tfidf_full.drop('sender', axis=1, errors='ignore')
y_tfidf_full_test = test_with_tfidf_full.sender


In [282]:
model_with_tfidf.score(x_tfidf_full_test, y_tfidf_full_test)

0.9497716894977168