In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import math
import datetime
from textblob import TextBlob 

%matplotlib inline

import seaborn as sns
sns.set(style='whitegrid')
# plt.rcParams.update({'font.size': 22})

plt.style.use('seaborn')
pd.set_option('display.width', 1500)
pd.set_option('display.max_columns', 100)

In [61]:
## Data Import
trump_tweet_original_df = pd.read_csv("data/tweets_scraped_11_07_2019.csv")
trump_tweet_original_df['created_at'] = pd.to_datetime(trump_tweet_original_df['created_at'])

In [62]:
pre_iphone_tweets = trump_tweet_original_df[trump_tweet_original_df['created_at'] < pd.to_datetime('03-24-2017 14:41:15')]

pre_iphone_tweets['sender'] = (pre_iphone_tweets['source'] == 'Twitter for Android').astype(int)

# trump_tweet_original_df[trump_tweet_original_df['source'] == 'Twitter for Android']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [65]:
def generate_features(tweet_df):
    
    tweet_df_new = tweet_df.copy()
    
    # Generate Sentiment
    
    for i in range(len(tweet_df_new)):
        
        sentiment = TextBlob(tweet_df.iloc[i, 1]).sentiment
        
        polarity = sentiment[0]
        subjectivity = sentiment[1]
        
        sentiment_df = 
        
    return tweet_df_new

# Cleaning -- With Features Training Set 

In [297]:
##### With Features

trump_tweet_with_features = pd.read_csv("data/full.csv")
trump_tweet_with_features['created_at'] = pd.to_datetime(trump_tweet_with_features['created_at'])

In [298]:
pre_iphone_tweets_features = trump_tweet_with_features[trump_tweet_with_features['created_at'] < pd.to_datetime('03-24-2017 14:41:15')]

pre_iphone_tweets_features['sender'] = (pre_iphone_tweets_features['source'] == 'Twitter for Android').astype(int)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [299]:
training_set = pre_iphone_tweets_features[['created_at', 'hour', 'min', 'linked',
                                'hash', 'dot', 'polarity', 'subjectivity', 'sender', 'text']]

In [300]:
training_set['year'] = training_set.created_at.dt.year
training_set['month'] = training_set.created_at.dt.month
training_set['day'] = training_set.created_at.dt.day
training_set['hour'] = training_set.created_at.dt.hour
training_set['minute'] = training_set.created_at.dt.minute
training_set = training_set.drop(['created_at'], axis = 1, errors='ignore')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

In [301]:
training_set.head()

Unnamed: 0,hour,min,linked,hash,dot,polarity,subjectivity,sender,text,year,month,day,minute
0,11,36,False,0,0,0.45,0.833333,1,such a beautiful and important evening! the fo...,2016,11,9,36
1,19,31,True,0,0,1.0,1.0,0,happy 241st birthday to the u.s. marine corps!...,2016,11,10,31
2,2,10,False,0,0,0.58,0.676667,1,a fantastic day in d.c. met with president oba...,2016,11,11,10
3,2,19,False,0,0,0.009375,0.675,1,just had a very open and successful presidenti...,2016,11,11,19
4,11,14,False,0,0,0.41,0.563333,1,love the fact that the small groups of protest...,2016,11,11,14


In [302]:
training_set.columns.values

array(['hour', 'min', 'linked', 'hash', 'dot', 'polarity', 'subjectivity',
       'sender', 'text', 'year', 'month', 'day', 'minute'], dtype=object)

# Pre-iPhone Train-Test Split

In [303]:
pre_iphone_train, pre_iphone_test = train_test_split(training_set.drop('text', axis=1, errors='ignore'),
                                                     test_size=0.2, 
                                                     stratify = training_set.sender)

In [304]:
x_pre_iphone_train = pre_iphone_train.drop('sender', axis=1, errors='ignore')
y_pre_iphone_train = pre_iphone_train.sender

x_pre_iphone_test = pre_iphone_test.drop('sender', axis=1, errors='ignore')
y_pre_iphone_test = pre_iphone_test.sender

# Baseline Random Forest Classifier

In [305]:
model = RandomForestClassifier()

In [306]:
model.fit(x_pre_iphone_train, y_pre_iphone_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [307]:
## TRAINING ACCURACY
model.score(x_pre_iphone_train, y_pre_iphone_train)

0.9981378026070763

In [308]:
## TEST ACCURACY
model.score(x_pre_iphone_test, y_pre_iphone_test)

0.9333333333333333

# Trying Out TFIDF Vector

In [309]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [310]:
tfidf = TfidfVectorizer(max_df=0.95,min_df=0.02)

In [311]:
tfidf.fit(trump_tweet_original_df.text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.95, max_features=None,
                min_df=0.02, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [312]:
n_cols = len(tfidf.get_feature_names())

In [313]:
tfidf_vector = pd.DataFrame(tfidf.transform(training_set.text).toarray(), columns = tfidf.get_feature_names())

In [314]:
training_with_tfidf = pd.concat([training_set, tfidf_vector], axis = 1).drop('text', axis = 1, errors='ignore')

In [315]:
training_with_tfidf.head()

Unnamed: 0,hour,min,linked,hash,dot,polarity,subjectivity,sender,year,month,day,minute,about,after,again,against,all,also,am,america,american,amp,an,and,are,as,at,back,bad,be,because,been,before,being,better,big,border,but,by,can,china,co,collusion,congress,country,crime,day.1,deal,democrats,dems,...,state,states,strong,tax,than,thank,that,the,their,them,there,they,this,time,to,today,total,trade,trump,two,united,up,us,very,vote,wall,want,was,way,we,well,were,what,when,which,whitehouse,who,why,will,win,witch,with,work,working,world,would,year.1,years,you,your
0,11,36,False,0,0,0.45,0.833333,1,2016,11,9,36,0.0,0.0,0.314953,0.0,0.230941,0.0,0.0,0.0,0.0,0.0,0.0,0.262353,0.0,0.258873,0.0,0.0,0.0,0.199657,0.0,0.0,0.339969,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.10258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.207097,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.395671,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,19,31,True,0,0,1.0,1.0,0,2016,11,10,31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.463842,0.0,0.180355,0.0,0.0,0.0,0.0,0.0,0.0,0.219746,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.381256,0.554051
2,2,10,False,0,0,0.58,0.676667,1,2016,11,11,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.364535,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.320658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.216091,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,19,False,0,0,0.009375,0.675,1,2016,11,11,19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.147141,0.22142,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.267925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.115065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.553239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,11,14,False,0,0,0.41,0.563333,1,2016,11,11,14,0.0,0.0,0.0,0.0,0.292238,0.0,0.0,0.0,0.0,0.0,0.0,0.165994,0.0,0.0,0.0,0.0,0.0,0.252651,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.331288,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.23959,0.259616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.262065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.250346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [316]:
pre_iphone_train_with_tfidf, pre_iphone_test_with_tfidf = train_test_split(training_with_tfidf,
                                                                           test_size=0.2, 
                                                                           stratify = training_with_tfidf.sender)

In [317]:
x_pre_iphone_train_wtfidf = pre_iphone_train_with_tfidf.drop('sender', axis=1, errors='ignore')
y_pre_iphone_train_wtfidf = pre_iphone_train_with_tfidf.sender

x_pre_iphone_test_wtfidf = pre_iphone_test_with_tfidf.drop('sender', axis=1, errors='ignore')
y_pre_iphone_test_wtfidf = pre_iphone_test_with_tfidf.sender

In [318]:
model_with_tfidf = RandomForestClassifier()

In [319]:
model_with_tfidf.fit(x_pre_iphone_train_wtfidf, y_pre_iphone_train_wtfidf)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [320]:
model_with_tfidf.score(x_pre_iphone_train_wtfidf, y_pre_iphone_train_wtfidf)

1.0

In [321]:
model_with_tfidf.score(x_pre_iphone_test_wtfidf, y_pre_iphone_test_wtfidf)

0.8666666666666667

# TEST ON MANUALLY LABELLED SET

In [322]:
manually_labelled_set = pd.read_csv("manul100.csv")
manually_labelled_set['created_at'] = pd.to_datetime(manually_labelled_set['created_at'])

In [323]:
manually_labelled_set['sender'] = (manually_labelled_set['AVG'] >= 2.5).astype(int)

In [324]:
manually_labelled_set.sender.value_counts()

1    65
0    35
Name: sender, dtype: int64

In [325]:
manually_labelled_set['year'] = manually_labelled_set.created_at.dt.year
manually_labelled_set['month'] = manually_labelled_set.created_at.dt.month
manually_labelled_set['day'] = manually_labelled_set.created_at.dt.day
manually_labelled_set['minute'] = manually_labelled_set.created_at.dt.minute

In [326]:
manually_labelled_set

Unnamed: 0.2,Unnamed: 0,ID,id_str_x,Rscore,Jscore,AVG,Unnamed: 0.1,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str_y,date,hour,min,time,linked,hash,dot,polarity,subjectivity,sender,year,month,day,minute
0,0,1607,8.964813e+17,1,1.0,1.0,1464,Media Studio,we must remember this truth: no matter our col...,2017-08-12 21:19:23,33786,113556,False,896481262776360960,2017-08-12,21,19,21:19,True,0,0,0.125000,0.216667,0,2017,8,12,19
1,1,11849,1.186970e+18,5,4.0,4.5,9233,Twitter for iPhone,republicans are going to fight harder than eve...,2019-10-23 11:36:27,20898,90017,False,1186969632944611072,2019-10-23,11,36,11:36,False,0,0,0.233333,0.133333,1,2019,10,23,36
2,2,10562,1.171590e+18,4,4.0,4.0,8433,Twitter for iPhone,i am pleased to endorse governor mike parson o...,2019-09-11 01:04:25,13480,61236,False,1171590284544826880,2019-09-11,1,4,1:4,False,0,0,0.384333,0.776667,1,2019,9,11,4
3,3,6852,1.099684e+18,2,1.0,1.5,5906,Twitter for iPhone,poll: suburban women are coming back into the ...,2019-02-24 14:56:08,24312,110799,False,1099684406002925952,2019-02-24,14,56,14:56,False,0,0,0.250000,0.218750,0,2019,2,24,56
4,4,7865,1.124734e+18,1,1.0,1.0,6591,Twitter for iPhone,today may 4th - is international firefighters ...,2019-05-04 17:53:42,16891,65373,False,1124733856526077952,2019-05-04,17,53,17:53,True,1,0,0.500000,0.450000,0,2019,5,4,53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,8283,1.131529e+18,4,2.0,3.0,6852,Twitter for iPhone,the democrats are getting nothing done in cong...,2019-05-23 11:56:03,21730,95999,False,1131529221862502016,2019-05-23,11,56,11:56,False,0,1,0.000000,0.562500,1,2019,5,23,56
96,96,4081,1.007987e+18,4,1.0,2.5,3598,Twitter for iPhone,the ig report totally destroys james comey and...,2018-06-16 14:01:33,30186,119661,False,1007986538985197952,2018-06-16,14,1,14:1,False,0,0,0.111806,0.536111,1,2018,6,16,1
97,97,1439,8.896724e+17,4,4.0,4.0,1332,Twitter for iPhone,the amazon washington post fabricated the fact...,2017-07-25 02:23:18,15482,59829,False,889672374458646528,2017-07-25,2,23,2:23,False,0,1,-0.200000,0.883333,1,2017,7,25,23
98,98,7596,1.120440e+18,1,1.0,1.0,6462,Twitter Media Studio,this should never happen to another president ...,2019-04-22 21:30:57,19203,65922,False,1120439873226400000,2019-04-22,21,30,21:30,True,0,0,0.000000,0.000000,0,2019,4,22,30


In [327]:
manual_test_set = manually_labelled_set[['hour', 'min', 'linked', 'hash',
       'dot', 'polarity', 'subjectivity', 'sender', 'text', 'year',
       'month', 'day', 'minute']]

In [328]:
x_manual_test_set = manual_test_set.drop(['sender', 'text'], axis = 1, errors='ignore')
y_manual_test_set = manual_test_set.sender

In [329]:
model.score(x_manual_test_set, y_manual_test_set)

0.57

# TEST ON MANUALLY LABELLED SET (WITH TFIDF)

In [330]:
tfidf_vector_manual = pd.DataFrame(tfidf.transform(manual_test_set.text).toarray(), columns = tfidf.get_feature_names())

In [331]:
test_with_tfidf_manual = pd.concat([manual_test_set, tfidf_vector_manual], axis = 1).drop('text', axis = 1, errors='ignore')

In [332]:
x_test_with_tfidf_manual = test_with_tfidf_manual.drop('sender', axis=1, errors='ignore')
y_test_with_tfidf_manual = test_with_tfidf_manual.sender


In [336]:
x_test_with_tfidf_manual

Unnamed: 0,hour,min,linked,hash,dot,polarity,subjectivity,year,month,day,minute,about,after,again,against,all,also,am,america,american,amp,an,and,are,as,at,back,bad,be,because,been,before,being,better,big,border,but,by,can,china,co,collusion,congress,country,crime,day.1,deal,democrats,dems,did,...,state,states,strong,tax,than,thank,that,the,their,them,there,they,this,time,to,today,total,trade,trump,two,united,up,us,very,vote,wall,want,was,way,we,well,were,what,when,which,whitehouse,who,why,will,win,witch,with,work,working,world,would,year.1,years,you,your
0,21,19,True,0,0,0.125000,0.216667,2017,8,12,19,0.000000,0.0,0.000000,0.000000,0.266008,0.0,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.227369,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.187288,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.255037,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.477087,0.0,0.0,0.000000,0.0,0.00000,0.0,0.000000,0.0,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
1,11,36,False,0,0,0.233333,0.133333,2019,10,23,36,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.153660,0.000000,0.000000,0.242908,0.0,0.0,0.250437,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.203794,0.0,0.000000,0.0,0.201145,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.223507,0.000000,0.000000,0.159705,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.291878,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.222233,0.0,0.00000,0.0,0.000000,0.0,0.00000,0.263123,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
2,1,4,False,0,0,0.384333,0.776667,2019,9,11,4,0.000000,0.0,0.203600,0.000000,0.000000,0.0,0.21013,0.0,0.000000,0.0,0.0,0.169597,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.220741,0.0,0.000000,0.000000,0.122395,0.066313,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.080796,0.000000,0.226106,0.0,0.0,0.0,0.0,0.0,0.0,0.159417,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.184552,0.0,0.00000,0.0,0.000000,0.0,0.12789,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
3,14,56,False,0,0,0.250000,0.218750,2019,2,24,56,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.00000,0.0,0.000000,0.0,0.0,0.177037,0.266407,0.000000,0.000000,0.210570,0.0,0.0,0.217096,0.0,0.0,0.0,0.0,0.0,0.390419,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.276887,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.458491,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.00000,0.0,0.000000,0.0,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
4,17,53,True,1,0,0.500000,0.450000,2019,5,4,53,0.000000,0.0,0.000000,0.000000,0.193483,0.0,0.00000,0.0,0.000000,0.0,0.0,0.109900,0.165379,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.136225,0.0,0.000000,0.000000,0.0,0.273694,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.221029,0.000000,0.085942,0.226570,0.000000,0.0,0.000000,0.000000,0.000000,0.314139,0.234437,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.347013,0.0,0.0,0.000000,0.0,0.00000,0.0,0.435654,0.0,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181675,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,11,56,False,0,1,0.000000,0.562500,2019,5,23,56,0.176658,0.0,0.000000,0.000000,0.144677,0.0,0.00000,0.0,0.000000,0.0,0.0,0.082178,0.123662,0.000000,0.139969,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.214718,0.000000,0.0,0.000000,0.0,0.161877,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.257053,0.338834,0.198978,0.0,0.139007,0.000000,0.180022,0.078299,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.207857,0.000000,0.0,0.0,0.000000,0.0,0.19274,0.0,0.000000,0.0,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
96,14,1,False,0,0,0.111806,0.536111,2018,6,16,1,0.000000,0.0,0.000000,0.266606,0.179233,0.0,0.00000,0.0,0.233976,0.0,0.0,0.305418,0.000000,0.200911,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.238838,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.00000,0.0,0.201783,0.0,0.15354,0.000000,0.261223,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
97,2,23,False,0,1,-0.200000,0.883333,2017,7,25,23,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.00000,0.0,0.000000,0.0,0.0,0.315969,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.494177,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.301055,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.00000,0.0,0.000000,0.0,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
98,21,30,True,0,0,0.000000,0.000000,2019,4,22,30,0.000000,0.0,0.470564,0.000000,0.000000,0.0,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.242934,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.330813,0.000000,0.186737,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.00000,0.0,0.000000,0.0,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0


In [337]:

model_with_tfidf.score(x_test_with_tfidf_manual, y_test_with_tfidf_manual)

0.89

# TWEET STUDIO (GROUND TRUTH:FALSE) GENERATION

In [338]:
tms_tweets = trump_tweet_with_features[trump_tweet_with_features['source'] == 'Twitter Media Studio']

In [339]:
tms_tweets['year'] = tms_tweets.created_at.dt.year
tms_tweets['month'] = tms_tweets.created_at.dt.month
tms_tweets['day'] = tms_tweets.created_at.dt.day
tms_tweets['minute'] = tms_tweets.created_at.dt.minute

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

In [340]:
tms_tweets['sender'] = 0
tms_cleaned  = tms_tweets[['hour', 'min', 'linked', 'hash',
                           'dot', 'polarity', 'subjectivity', 'sender', 'text', 'year',
                           'month', 'day', 'minute']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [341]:
tms_merged = tms_cleaned.append(manual_test_set, ignore_index=True)

In [342]:
x_tms_merged = tms_merged.drop(['sender', 'text'], axis = 1, errors='ignore')
y_tms_merged = tms_merged.sender


In [343]:
model.score(x_tms_merged, y_tms_merged)

0.8036529680365296

# TWEET STUDIO (GROUND TRUTH:FALSE) GENERATION with TFIDF

In [344]:
tfidf_vector_full = pd.DataFrame(tfidf.transform(tms_merged.text).toarray(), columns = tfidf.get_feature_names())

In [345]:
test_with_tfidf_full = pd.concat([tms_merged, tfidf_vector_full], axis = 1).drop('text', axis = 1, errors='ignore')

In [346]:
x_tfidf_full_test = test_with_tfidf_full.drop('sender', axis=1, errors='ignore')
y_tfidf_full_test = test_with_tfidf_full.sender


In [347]:
model_with_tfidf.score(x_tfidf_full_test, y_tfidf_full_test)

0.9497716894977168

# REAL DONALD TRUMP TWEETS GENERATION

In [348]:
trump_tweet_with_features

trump_tweet_with_features['year'] = trump_tweet_with_features.created_at.dt.year
trump_tweet_with_features['month'] = trump_tweet_with_features.created_at.dt.month
trump_tweet_with_features['day'] = trump_tweet_with_features.created_at.dt.day
trump_tweet_with_features['minute'] = trump_tweet_with_features.created_at.dt.minute

In [350]:
trump_tweet_with_features_cleaned  = trump_tweet_with_features[['hour', 'min', 'linked', 'hash',
                           'dot', 'polarity', 'subjectivity', 'text', 'year',
                           'month', 'day', 'minute']]

In [351]:
trump_tweet_with_features_cleaned

Unnamed: 0,hour,min,linked,hash,dot,polarity,subjectivity,text,year,month,day,minute
0,11,36,False,0,0,0.450000,0.833333,such a beautiful and important evening! the fo...,2016,11,9,36
1,19,31,True,0,0,1.000000,1.000000,happy 241st birthday to the u.s. marine corps!...,2016,11,10,31
2,2,10,False,0,0,0.580000,0.676667,a fantastic day in d.c. met with president oba...,2016,11,11,10
3,2,19,False,0,0,0.009375,0.675000,just had a very open and successful presidenti...,2016,11,11,19
4,11,14,False,0,0,0.410000,0.563333,love the fact that the small groups of protest...,2016,11,11,14
...,...,...,...,...,...,...,...,...,...,...,...,...
9489,15,16,False,0,0,-0.140625,0.458333,it was just explained to me that for next week...,2019,11,7,16
9490,15,18,True,1,0,0.000000,0.000000,thank you! #maga https://t.co/e6dzshyfmv,2019,11,7,18
9491,15,27,False,0,0,-0.116071,0.750000,the amazon washington post and three lowlife r...,2019,11,7,27
9492,15,41,False,0,0,0.169697,0.349053,the radical left dems and lamestream media are...,2019,11,7,41


In [352]:
tfidf_vector_all = pd.DataFrame(tfidf.transform(trump_tweet_with_features_cleaned.text).toarray(), columns = tfidf.get_feature_names())

In [353]:
to_generate = pd.concat([trump_tweet_with_features_cleaned, tfidf_vector_all], axis = 1).drop('text', axis = 1, errors='ignore')

In [354]:
to_generate

Unnamed: 0,hour,min,linked,hash,dot,polarity,subjectivity,year,month,day,minute,about,after,again,against,all,also,am,america,american,amp,an,and,are,as,at,back,bad,be,because,been,before,being,better,big,border,but,by,can,china,co,collusion,congress,country,crime,day.1,deal,democrats,dems,did,...,state,states,strong,tax,than,thank,that,the,their,them,there,they,this,time,to,today,total,trade,trump,two,united,up,us,very,vote,wall,want,was,way,we,well,were,what,when,which,whitehouse,who,why,will,win,witch,with,work,working,world,would,year.1,years,you,your
0,11,36,False,0,0,0.450000,0.833333,2016,11,9,36,0.000000,0.0,0.314953,0.000000,0.230941,0.0,0.0,0.0,0.0,0.000000,0.0,0.262353,0.000000,0.258873,0.0,0.0,0.0,0.199657,0.0,0.0,0.339969,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.102580,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.207097,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.395671,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
1,19,31,True,0,0,1.000000,1.000000,2016,11,10,31,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.285876,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.463842,0.000000,0.180355,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.219746,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.381256,0.554051
2,2,10,False,0,0,0.580000,0.676667,2016,11,11,10,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.364535,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.320658,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.216091,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
3,2,19,False,0,0,0.009375,0.675000,2016,11,11,19,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.147141,0.221420,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.267925,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.115065,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.553239,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
4,11,14,False,0,0,0.410000,0.563333,2016,11,11,14,0.000000,0.0,0.000000,0.000000,0.292238,0.0,0.0,0.0,0.0,0.000000,0.0,0.165994,0.000000,0.000000,0.0,0.0,0.0,0.252651,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.331288,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.239590,0.259616,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.262065,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.250346,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9489,15,16,False,0,0,-0.140625,0.458333,2019,11,7,16,0.000000,0.0,0.000000,0.223011,0.000000,0.0,0.0,0.0,0.0,0.145649,0.0,0.170318,0.000000,0.168059,0.0,0.0,0.0,0.129616,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.122915,0.133189,0.0,0.000000,0.0,0.144049,0.143742,0.000000,0.162279,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.14982,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.218509,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
9490,15,18,True,1,0,0.000000,0.000000,2019,11,7,18,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.395065,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.641004,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.526874,0.000000
9491,15,27,False,0,0,-0.116071,0.750000,2019,11,7,27,0.298151,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.237211,0.0,0.277387,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.359451,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.108459,0.0,0.000000,0.0,0.469210,0.234105,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.218964,0.0,0.0,0.0,0.0,0.325292,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
9492,15,41,False,0,0,0.169697,0.349053,2019,11,7,41,0.000000,0.0,0.000000,0.288874,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.220618,0.165994,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.270379,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.172524,0.0,0.267093,0.0,0.000000,0.000000,0.000000,0.210205,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.284245,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000


In [355]:
model_with_tfidf.predict(to_generate).reshape(-1, 1)

array([[1],
       [0],
       [1],
       ...,
       [1],
       [1],
       [0]])

In [356]:
model_with_tfidf.predict_proba(to_generate).reshape(-1, 1)

array([[0. ],
       [1. ],
       [1. ],
       ...,
       [0.7],
       [0.8],
       [0.2]])

# JK, WRONG DATASET

In [295]:
trump_tweet_original_df['text_blob'] = trump_tweet_original_df['text'].apply(lambda text: TextBlob(text))

In [296]:
trump_tweet_original_df

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str,text_blob
0,Twitter for Android,Such a beautiful and important evening! The fo...,2016-11-09 11:36:58,220796,633253,False,796315640307060738,"(S, u, c, h, , a, , b, e, a, u, t, i, f, u, ..."
1,Twitter for iPhone,Happy 241st birthday to the U.S. Marine Corps!...,2016-11-10 19:31:27,45576,169729,False,796797436752707585,"(H, a, p, p, y, , 2, 4, 1, s, t, , b, i, r, ..."
2,Twitter for Android,A fantastic day in D.C. Met with President Oba...,2016-11-11 02:10:46,37788,192638,False,796897928048766976,"(A, , f, a, n, t, a, s, t, i, c, , d, a, y, ..."
3,Twitter for Android,Just had a very open and successful presidenti...,2016-11-11 02:19:44,69498,231526,False,796900183955095552,"(J, u, s, t, , h, a, d, , a, , v, e, r, y, ..."
4,Twitter for Android,Love the fact that the small groups of protest...,2016-11-11 11:14:20,55954,221718,False,797034721075228672,"(L, o, v, e, , t, h, e, , f, a, c, t, , t, ..."
...,...,...,...,...,...,...,...,...
12318,Twitter for iPhone,It was just explained to me that for next week...,2019-11-07 15:16:15,17716,59582,False,1192460764235419654,"(I, t, , w, a, s, , j, u, s, t, , e, x, p, ..."
12319,Twitter for iPhone,THANK YOU! #MAGA https://t.co/e6dZshYFMV,2019-11-07 15:18:53,6548,26575,False,1192461425358385154,"(T, H, A, N, K, , Y, O, U, !, , #, M, A, G, ..."
12320,Twitter for iPhone,The Amazon Washington Post and three lowlife r...,2019-11-07 15:27:57,12082,38697,False,1192463709400117250,"(T, h, e, , A, m, a, z, o, n, , W, a, s, h, ..."
12321,Twitter for iPhone,The Radical Left Dems and LameStream Media are...,2019-11-07 15:41:53,8318,29340,False,1192467215360102401,"(T, h, e, , R, a, d, i, c, a, l, , L, e, f, ..."


In [None]:
from textblob import TextBlob

def find_emotion(i):
    s = trump_tweet_original_df.iloc[i, 2]
    t = TextBlob(s)
    df.iloc[i,15] = t.sentiment[0] #polarity
    df.iloc[i,16] = t.sentiment[1] #subjectivity
    
    
for i in range(9494):
    find_emotion(i)