In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import math
import datetime
from textblob import TextBlob 

%matplotlib inline

import seaborn as sns
sns.set(style='whitegrid')
# plt.rcParams.update({'font.size': 22})

plt.style.use('seaborn')
pd.set_option('display.width', 1500)
pd.set_option('display.max_columns', 100)

In [43]:
## Data Import
trump_tweet_original_df = pd.read_csv("data/tweets_scraped_11_07_2019.csv")
trump_tweet_original_df['created_at'] = pd.to_datetime(trump_tweet_original_df['created_at'])

In [44]:
trump_tweet_original_df['text_cleaned'] = trump_tweet_original_df.text.str.replace("[^\w\s]", "").str.lower()

In [45]:
trump_tweet_original_df['text_cleaned'] = trump_tweet_original_df['text_cleaned'].apply(lambda x: ' '.join([item for item in x.split() if item not in stop]))


In [46]:
trump_tweet_original_df.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str,text_cleaned
0,Twitter for Android,Such a beautiful and important evening! The fo...,2016-11-09 11:36:58,220796,633253,False,796315640307060738,beautiful important evening forgotten man woma...
1,Twitter for iPhone,Happy 241st birthday to the U.S. Marine Corps!...,2016-11-10 19:31:27,45576,169729,False,796797436752707585,happy 241st birthday us marine corps thank ser...
2,Twitter for Android,A fantastic day in D.C. Met with President Oba...,2016-11-11 02:10:46,37788,192638,False,796897928048766976,fantastic day dc met president obama first tim...
3,Twitter for Android,Just had a very open and successful presidenti...,2016-11-11 02:19:44,69498,231526,False,796900183955095552,open successful presidential election professi...
4,Twitter for Android,Love the fact that the small groups of protest...,2016-11-11 11:14:20,55954,221718,False,797034721075228672,love fact small groups protesters last night p...


In [47]:
pre_iphone_tweets = trump_tweet_original_df[trump_tweet_original_df['created_at'] < pd.to_datetime('03-24-2017 14:41:15')]

pre_iphone_tweets['sender'] = (pre_iphone_tweets['source'] == 'Twitter for Android').astype(int)

# trump_tweet_original_df[trump_tweet_original_df['source'] == 'Twitter for Android']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [48]:
def generate_features(df_input):
    
    df = df_input.copy()
    
    df['polarity'] = df['text'].apply(lambda text: TextBlob(text).sentiment[0])
    df['subjectivity'] = df['text'].apply(lambda text: TextBlob(text).sentiment[1])

    df['hash'] = (df['text'].str.contains('#') == True).astype(int)
    df['linked'] = (df['text'].str.contains('https://t.co/') == True).astype(int)
    df['dot'] = (df['text'].str.contains('...') == True).astype(int)
    
    df['year'] = df.created_at.dt.year
    df['month'] = df.created_at.dt.month
    df['day'] = df.created_at.dt.day
    df['hour'] = df.created_at.dt.hour
    df['minute'] = df.created_at.dt.minute
#     df = df.drop(['created_at', 'source'], axis = 1, errors='ignore')
    
    return df


# Cleaning -- With Features Training Set 

In [7]:
##### With Features

# trump_tweet_with_features = pd.read_csv("data/full.csv")
# trump_tweet_with_features['is_retweet'] = 0
# trump_tweet_with_features['created_at'] = pd.to_datetime(trump_tweet_with_features['created_at'])

In [49]:
trump_tweet_with_features = generate_features(trump_tweet_original_df)

In [50]:
trump_tweet_with_features.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str,text_cleaned,polarity,subjectivity,hash,linked,dot,year,month,day,hour,minute
0,Twitter for Android,Such a beautiful and important evening! The fo...,2016-11-09 11:36:58,220796,633253,False,796315640307060738,beautiful important evening forgotten man woma...,0.45,0.833333,0,0,1,2016,11,9,11,36
1,Twitter for iPhone,Happy 241st birthday to the U.S. Marine Corps!...,2016-11-10 19:31:27,45576,169729,False,796797436752707585,happy 241st birthday us marine corps thank ser...,1.0,1.0,0,1,1,2016,11,10,19,31
2,Twitter for Android,A fantastic day in D.C. Met with President Oba...,2016-11-11 02:10:46,37788,192638,False,796897928048766976,fantastic day dc met president obama first tim...,0.58,0.676667,0,0,1,2016,11,11,2,10
3,Twitter for Android,Just had a very open and successful presidenti...,2016-11-11 02:19:44,69498,231526,False,796900183955095552,open successful presidential election professi...,0.009375,0.675,0,0,1,2016,11,11,2,19
4,Twitter for Android,Love the fact that the small groups of protest...,2016-11-11 11:14:20,55954,221718,False,797034721075228672,love fact small groups protesters last night p...,0.41,0.563333,0,0,1,2016,11,11,11,14


In [51]:
pre_iphone_tweets_features = trump_tweet_with_features[trump_tweet_with_features['created_at'] < pd.to_datetime('03-24-2017 14:41:15')]

pre_iphone_tweets_features['sender'] = (pre_iphone_tweets_features['source'] == 'Twitter for Android').astype(int)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [52]:
pre_iphone_tweets_features.is_retweet.value_counts()

False    672
True      32
Name: is_retweet, dtype: int64

In [58]:
training_set = pre_iphone_tweets_features[['created_at', 'hour', 'minute', 'linked',
                                'hash', 'dot', 'polarity', 'subjectivity', 'sender', 'text']]

In [59]:
# training_set[training_set['is_retweet'] == 1]

In [60]:
training_set['year'] = training_set.created_at.dt.year
training_set['month'] = training_set.created_at.dt.month
training_set['day'] = training_set.created_at.dt.day
training_set['hour'] = training_set.created_at.dt.hour
training_set['minute'] = training_set.created_at.dt.minute
training_set = training_set.drop(['created_at'], axis = 1, errors='ignore')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

In [61]:
training_set.head()

Unnamed: 0,hour,minute,linked,hash,dot,polarity,subjectivity,sender,text,year,month,day
0,11,36,0,0,1,0.45,0.833333,1,Such a beautiful and important evening! The fo...,2016,11,9
1,19,31,1,0,1,1.0,1.0,0,Happy 241st birthday to the U.S. Marine Corps!...,2016,11,10
2,2,10,0,0,1,0.58,0.676667,1,A fantastic day in D.C. Met with President Oba...,2016,11,11
3,2,19,0,0,1,0.009375,0.675,1,Just had a very open and successful presidenti...,2016,11,11
4,11,14,0,0,1,0.41,0.563333,1,Love the fact that the small groups of protest...,2016,11,11


In [62]:
training_set.columns.values

array(['hour', 'minute', 'linked', 'hash', 'dot', 'polarity',
       'subjectivity', 'sender', 'text', 'year', 'month', 'day'],
      dtype=object)

# Pre-iPhone Train-Test Split

In [63]:
pre_iphone_train, pre_iphone_test = train_test_split(training_set.drop('text', axis=1, errors='ignore'),
                                                     test_size=0.2, 
                                                     stratify = training_set.sender)

In [64]:
x_pre_iphone_train = pre_iphone_train.drop('sender', axis=1, errors='ignore')
y_pre_iphone_train = pre_iphone_train.sender

x_pre_iphone_test = pre_iphone_test.drop('sender', axis=1, errors='ignore')
y_pre_iphone_test = pre_iphone_test.sender

# Baseline Random Forest Classifier

In [65]:
model = RandomForestClassifier()

In [66]:
model.fit(x_pre_iphone_train, y_pre_iphone_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [67]:
## TRAINING ACCURACY
model.score(x_pre_iphone_train, y_pre_iphone_train)

0.9893428063943162

In [68]:
x_pre_iphone_train.columns.values

array(['hour', 'minute', 'linked', 'hash', 'dot', 'polarity',
       'subjectivity', 'year', 'month', 'day'], dtype=object)

In [69]:
## TEST ACCURACY
model.score(x_pre_iphone_test, y_pre_iphone_test)

0.8794326241134752

# Trying Out TFIDF Vector

In [70]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [72]:
tfidf = TfidfVectorizer(max_df=0.95,min_df=0.02)

In [118]:
tfidf.fit(trump_tweet_original_df.text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.95, max_features=None,
                min_df=0.02, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [119]:
n_cols = len(tfidf.get_feature_names())

In [120]:
tfidf.get_feature_names()

['about',
 'after',
 'again',
 'against',
 'all',
 'also',
 'am',
 'america',
 'american',
 'amp',
 'an',
 'and',
 'are',
 'as',
 'at',
 'back',
 'bad',
 'be',
 'because',
 'been',
 'before',
 'being',
 'better',
 'big',
 'border',
 'but',
 'by',
 'can',
 'china',
 'co',
 'collusion',
 'congress',
 'country',
 'crime',
 'day',
 'deal',
 'democrats',
 'dems',
 'did',
 'do',
 'doing',
 'don',
 'done',
 'down',
 'economy',
 'election',
 'even',
 'ever',
 'fake',
 'far',
 'fbi',
 'first',
 'for',
 'foxandfriends',
 'foxnews',
 'from',
 'get',
 'go',
 'going',
 'good',
 'great',
 'had',
 'hard',
 'has',
 'have',
 'he',
 'hillary',
 'him',
 'his',
 'history',
 'honor',
 'house',
 'how',
 'https',
 'hunt',
 'if',
 'in',
 'into',
 'is',
 'it',
 'job',
 'jobs',
 'just',
 'know',
 'last',
 'like',
 'long',
 'look',
 'made',
 'make',
 'many',
 'me',
 'media',
 'military',
 'more',
 'most',
 'much',
 'mueller',
 'must',
 'my',
 'never',
 'new',
 'news',
 'no',
 'north',
 'not',
 'nothing',
 'now',

In [121]:
tfidf_vector = pd.DataFrame(tfidf.transform(training_set.text).toarray(), columns = tfidf.get_feature_names())

In [122]:
training_with_tfidf = pd.concat([training_set, tfidf_vector], axis = 1).drop('text', axis = 1, errors='ignore')

In [123]:
training_with_tfidf.head()

Unnamed: 0,hour,minute,linked,hash,dot,polarity,subjectivity,sender,year,month,day,about,after,again,against,all,also,am,america,american,amp,an,and,are,as,at,back,bad,be,because,been,before,being,better,big,border,but,by,can,china,co,collusion,congress,country,crime,day.1,deal,democrats,dems,did,...,state,states,strong,tax,than,thank,that,the,their,them,there,they,this,time,to,today,total,trade,trump,two,united,up,us,very,vote,wall,want,was,way,we,well,were,what,when,which,whitehouse,who,why,will,win,witch,with,work,working,world,would,year.1,years,you,your
0,11,36,0,0,1,0.45,0.833333,1,2016,11,9,0.0,0.0,0.314953,0.0,0.230941,0.0,0.0,0.0,0.0,0.0,0.0,0.262353,0.0,0.258873,0.0,0.0,0.0,0.199657,0.0,0.0,0.339969,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.10258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.207097,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.395671,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,19,31,1,0,1,1.0,1.0,0,2016,11,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.463842,0.0,0.180355,0.0,0.0,0.0,0.0,0.0,0.0,0.219746,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.381256,0.554051
2,2,10,0,0,1,0.58,0.676667,1,2016,11,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.364535,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.320658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.216091,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,19,0,0,1,0.009375,0.675,1,2016,11,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.147141,0.22142,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.267925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.115065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.553239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,11,14,0,0,1,0.41,0.563333,1,2016,11,11,0.0,0.0,0.0,0.0,0.292238,0.0,0.0,0.0,0.0,0.0,0.0,0.165994,0.0,0.0,0.0,0.0,0.0,0.252651,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.331288,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.23959,0.259616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.262065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.250346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [124]:
pre_iphone_train_with_tfidf, pre_iphone_test_with_tfidf = train_test_split(training_with_tfidf,
                                                                           test_size=0.2, 
                                                                           stratify = training_with_tfidf.sender)

In [125]:
x_pre_iphone_train_wtfidf = pre_iphone_train_with_tfidf.drop('sender', axis=1, errors='ignore')
y_pre_iphone_train_wtfidf = pre_iphone_train_with_tfidf.sender

x_pre_iphone_test_wtfidf = pre_iphone_test_with_tfidf.drop('sender', axis=1, errors='ignore')
y_pre_iphone_test_wtfidf = pre_iphone_test_with_tfidf.sender

In [126]:
model_with_tfidf = RandomForestClassifier()

In [127]:
model_with_tfidf.fit(x_pre_iphone_train_wtfidf, y_pre_iphone_train_wtfidf)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [128]:
model_with_tfidf.score(x_pre_iphone_train_wtfidf, y_pre_iphone_train_wtfidf)

0.9946714031971581

In [129]:
model_with_tfidf.score(x_pre_iphone_test_wtfidf, y_pre_iphone_test_wtfidf)

0.8368794326241135

# TEST ON MANUALLY LABELLED SET

In [130]:
manually_labelled_set = pd.read_csv("manul100.csv")
manually_labelled_set['created_at'] = pd.to_datetime(manually_labelled_set['created_at'])
manually_labelled_set['is_retweet'] = 0

In [131]:
manually_labelled_set['sender'] = (manually_labelled_set['AVG'] >= 2.5).astype(int)

In [132]:
manually_labelled_set.sender.value_counts()

1    65
0    35
Name: sender, dtype: int64

In [133]:
manually_labelled_set['year'] = manually_labelled_set.created_at.dt.year
manually_labelled_set['month'] = manually_labelled_set.created_at.dt.month
manually_labelled_set['day'] = manually_labelled_set.created_at.dt.day
manually_labelled_set['minute'] = manually_labelled_set.created_at.dt.minute

In [134]:
manually_labelled_set.head()

Unnamed: 0.2,Unnamed: 0,ID,id_str_x,Rscore,Jscore,AVG,Unnamed: 0.1,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str_y,date,hour,min,time,linked,hash,dot,polarity,subjectivity,sender,year,month,day,minute
0,0,1607,8.964813e+17,1,1.0,1.0,1464,Media Studio,we must remember this truth: no matter our col...,2017-08-12 21:19:23,33786,113556,0,896481262776360960,2017-08-12,21,19,21:19,True,0,0,0.125,0.216667,0,2017,8,12,19
1,1,11849,1.18697e+18,5,4.0,4.5,9233,Twitter for iPhone,republicans are going to fight harder than eve...,2019-10-23 11:36:27,20898,90017,0,1186969632944611072,2019-10-23,11,36,11:36,False,0,0,0.233333,0.133333,1,2019,10,23,36
2,2,10562,1.17159e+18,4,4.0,4.0,8433,Twitter for iPhone,i am pleased to endorse governor mike parson o...,2019-09-11 01:04:25,13480,61236,0,1171590284544826880,2019-09-11,1,4,1:4,False,0,0,0.384333,0.776667,1,2019,9,11,4
3,3,6852,1.099684e+18,2,1.0,1.5,5906,Twitter for iPhone,poll: suburban women are coming back into the ...,2019-02-24 14:56:08,24312,110799,0,1099684406002925952,2019-02-24,14,56,14:56,False,0,0,0.25,0.21875,0,2019,2,24,56
4,4,7865,1.124734e+18,1,1.0,1.0,6591,Twitter for iPhone,today may 4th - is international firefighters ...,2019-05-04 17:53:42,16891,65373,0,1124733856526077952,2019-05-04,17,53,17:53,True,1,0,0.5,0.45,0,2019,5,4,53


In [135]:
manual_test_set = manually_labelled_set[['hour', 'minute', 'linked', 'hash',
       'dot', 'polarity', 'subjectivity', 'sender', 'text', 'year',
       'month', 'day']]

In [136]:
# manual_test_set.is_retweet.value_counts()

In [137]:
x_manual_test_set = manual_test_set.drop(['sender', 'text'], axis = 1, errors='ignore')
y_manual_test_set = manual_test_set.sender

In [138]:
x_manual_test_set

Unnamed: 0,hour,minute,linked,hash,dot,polarity,subjectivity,year,month,day
0,21,19,True,0,0,0.125000,0.216667,2017,8,12
1,11,36,False,0,0,0.233333,0.133333,2019,10,23
2,1,4,False,0,0,0.384333,0.776667,2019,9,11
3,14,56,False,0,0,0.250000,0.218750,2019,2,24
4,17,53,True,1,0,0.500000,0.450000,2019,5,4
...,...,...,...,...,...,...,...,...,...,...
95,11,56,False,0,1,0.000000,0.562500,2019,5,23
96,14,1,False,0,0,0.111806,0.536111,2018,6,16
97,2,23,False,0,1,-0.200000,0.883333,2017,7,25
98,21,30,True,0,0,0.000000,0.000000,2019,4,22


In [139]:
# array(['hour', 'minute', 'linked', 'hash', 'dot', 'polarity',
#        'subjectivity', 'is_retweet', 'year', 'month', 'day'], dtype=object)


In [140]:
x_manual_test_set.columns.values

array(['hour', 'minute', 'linked', 'hash', 'dot', 'polarity',
       'subjectivity', 'year', 'month', 'day'], dtype=object)

In [141]:
model.score(x_manual_test_set, y_manual_test_set)

0.63

# TEST ON MANUALLY LABELLED SET (WITH TFIDF)

In [142]:
tfidf_vector_manual = pd.DataFrame(tfidf.transform(manual_test_set.text).toarray(), columns = tfidf.get_feature_names())

In [143]:
test_with_tfidf_manual = pd.concat([manual_test_set, tfidf_vector_manual], axis = 1).drop('text', axis = 1, errors='ignore')

In [144]:
x_test_with_tfidf_manual = test_with_tfidf_manual.drop('sender', axis=1, errors='ignore')
y_test_with_tfidf_manual = test_with_tfidf_manual.sender


In [145]:
x_test_with_tfidf_manual

Unnamed: 0,hour,minute,linked,hash,dot,polarity,subjectivity,year,month,day,about,after,again,against,all,also,am,america,american,amp,an,and,are,as,at,back,bad,be,because,been,before,being,better,big,border,but,by,can,china,co,collusion,congress,country,crime,day.1,deal,democrats,dems,did,do,...,state,states,strong,tax,than,thank,that,the,their,them,there,they,this,time,to,today,total,trade,trump,two,united,up,us,very,vote,wall,want,was,way,we,well,were,what,when,which,whitehouse,who,why,will,win,witch,with,work,working,world,would,year.1,years,you,your
0,21,19,True,0,0,0.125000,0.216667,2017,8,12,0.000000,0.0,0.000000,0.000000,0.266008,0.0,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.227369,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.187288,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.255037,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.477087,0.0,0.0,0.000000,0.0,0.00000,0.0,0.000000,0.0,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
1,11,36,False,0,0,0.233333,0.133333,2019,10,23,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.153660,0.000000,0.000000,0.242908,0.0,0.0,0.250437,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.203794,0.0,0.000000,0.0,0.201145,0.0,0.0,0.212367,...,0.0,0.0,0.000000,0.0,0.223507,0.000000,0.000000,0.159705,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.291878,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.222233,0.0,0.00000,0.0,0.000000,0.0,0.00000,0.263123,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
2,1,4,False,0,0,0.384333,0.776667,2019,9,11,0.000000,0.0,0.203600,0.000000,0.000000,0.0,0.21013,0.0,0.000000,0.0,0.0,0.169597,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.220741,0.0,0.000000,0.000000,0.122395,0.066313,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.080796,0.000000,0.226106,0.0,0.0,0.0,0.0,0.0,0.0,0.159417,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.184552,0.0,0.00000,0.0,0.000000,0.0,0.12789,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
3,14,56,False,0,0,0.250000,0.218750,2019,2,24,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.00000,0.0,0.000000,0.0,0.0,0.177037,0.266407,0.000000,0.000000,0.210570,0.0,0.0,0.217096,0.0,0.0,0.0,0.0,0.0,0.390419,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.276887,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.458491,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.00000,0.0,0.000000,0.0,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
4,17,53,True,1,0,0.500000,0.450000,2019,5,4,0.000000,0.0,0.000000,0.000000,0.193483,0.0,0.00000,0.0,0.000000,0.0,0.0,0.109900,0.165379,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.136225,0.0,0.000000,0.000000,0.0,0.273694,0.0,0.000000,0.0,0.0,0.228563,...,0.0,0.0,0.000000,0.0,0.000000,0.221029,0.000000,0.085942,0.226570,0.000000,0.0,0.000000,0.000000,0.000000,0.314139,0.234437,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.347013,0.0,0.0,0.000000,0.0,0.00000,0.0,0.435654,0.0,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181675,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,11,56,False,0,1,0.000000,0.562500,2019,5,23,0.176658,0.0,0.000000,0.000000,0.144677,0.0,0.00000,0.0,0.000000,0.0,0.0,0.082178,0.123662,0.000000,0.139969,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.214718,0.000000,0.0,0.000000,0.0,0.161877,0.0,0.0,0.170908,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.257053,0.338834,0.198978,0.0,0.139007,0.000000,0.180022,0.078299,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.207857,0.000000,0.0,0.0,0.000000,0.0,0.19274,0.0,0.000000,0.0,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
96,14,1,False,0,0,0.111806,0.536111,2018,6,16,0.000000,0.0,0.000000,0.266606,0.179233,0.0,0.00000,0.0,0.233976,0.0,0.0,0.305418,0.000000,0.200911,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.238838,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.00000,0.0,0.201783,0.0,0.15354,0.000000,0.261223,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
97,2,23,False,0,1,-0.200000,0.883333,2017,7,25,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.00000,0.0,0.000000,0.0,0.0,0.315969,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.494177,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.301055,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.00000,0.0,0.000000,0.0,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
98,21,30,True,0,0,0.000000,0.000000,2019,4,22,0.000000,0.0,0.470564,0.000000,0.000000,0.0,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.242934,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.330813,0.000000,0.186737,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.00000,0.0,0.000000,0.0,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0


In [146]:

model_with_tfidf.score(x_test_with_tfidf_manual, y_test_with_tfidf_manual)

0.82

# TWEET STUDIO (GROUND TRUTH:FALSE) GENERATION

In [147]:
tms_tweets = trump_tweet_with_features[trump_tweet_with_features['source'] == 'Twitter Media Studio']

In [148]:
tms_tweets['year'] = tms_tweets.created_at.dt.year
tms_tweets['month'] = tms_tweets.created_at.dt.month
tms_tweets['day'] = tms_tweets.created_at.dt.day
tms_tweets['minute'] = tms_tweets.created_at.dt.minute

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

In [149]:
tms_tweets['sender'] = 0
tms_cleaned  = tms_tweets[['hour', 'linked', 'hash',
                           'dot', 'polarity', 'subjectivity', 'sender', 'text', 'year',
                           'month', 'day', 'minute']]
# tms_cleaned['is_retweet'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [150]:
tms_merged = tms_cleaned.append(manual_test_set, ignore_index=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [151]:
x_tms_merged = tms_merged.drop(['sender', 'text'], axis = 1, errors='ignore')
y_tms_merged = tms_merged.sender


In [152]:
x_tms_merged

Unnamed: 0,day,dot,hash,hour,linked,minute,month,polarity,subjectivity,year
0,17,1,0,17,1,29,10,0.000000,0.000000,2018
1,17,1,0,17,1,38,10,0.000000,0.000000,2018
2,17,1,0,17,1,41,10,0.000000,0.000000,2018
3,17,1,0,17,1,52,10,0.000000,0.000000,2018
4,26,1,0,17,1,23,10,0.500000,1.000000,2018
...,...,...,...,...,...,...,...,...,...,...
214,23,1,0,11,0,56,5,0.000000,0.562500,2019
215,16,0,0,14,0,1,6,0.111806,0.536111,2018
216,25,1,0,2,0,23,7,-0.200000,0.883333,2017
217,22,0,0,21,1,30,4,0.000000,0.000000,2019


In [153]:
model.score(x_tms_merged, y_tms_merged)

0.5753424657534246

# TWEET STUDIO (GROUND TRUTH:FALSE) GENERATION with TFIDF

In [154]:
tfidf_vector_full = pd.DataFrame(tfidf.transform(tms_merged.text).toarray(), columns = tfidf.get_feature_names())

In [155]:
test_with_tfidf_full = pd.concat([tms_merged, tfidf_vector_full], axis = 1).drop('text', axis = 1, errors='ignore')

In [156]:
x_tfidf_full_test = test_with_tfidf_full.drop('sender', axis=1, errors='ignore')
y_tfidf_full_test = test_with_tfidf_full.sender


In [157]:
model_with_tfidf.score(x_tfidf_full_test, y_tfidf_full_test)

0.8447488584474886

# REAL DONALD TRUMP TWEETS GENERATION

In [158]:
trump_tweet_with_features

trump_tweet_with_features['year'] = trump_tweet_with_features.created_at.dt.year
trump_tweet_with_features['month'] = trump_tweet_with_features.created_at.dt.month
trump_tweet_with_features['day'] = trump_tweet_with_features.created_at.dt.day
trump_tweet_with_features['minute'] = trump_tweet_with_features.created_at.dt.minute

In [187]:
trump_tweet_with_features_cleaned  = trump_tweet_with_features[['hour', 'linked', 'hash',
                           'dot', 'polarity', 'subjectivity', 'text', 'year',
                           'month', 'day', 'minute']]

In [168]:
trump_tweet_with_features_cleaned

Unnamed: 0,hour,linked,hash,dot,polarity,subjectivity,text,year,month,day,minute
0,11,0,0,1,0.450000,0.833333,Such a beautiful and important evening! The fo...,2016,11,9,36
1,19,1,0,1,1.000000,1.000000,Happy 241st birthday to the U.S. Marine Corps!...,2016,11,10,31
2,2,0,0,1,0.580000,0.676667,A fantastic day in D.C. Met with President Oba...,2016,11,11,10
3,2,0,0,1,0.009375,0.675000,Just had a very open and successful presidenti...,2016,11,11,19
4,11,0,0,1,0.410000,0.563333,Love the fact that the small groups of protest...,2016,11,11,14
...,...,...,...,...,...,...,...,...,...,...,...
12318,15,0,0,1,-0.140625,0.458333,It was just explained to me that for next week...,2019,11,7,16
12319,15,1,1,1,0.000000,0.000000,THANK YOU! #MAGA https://t.co/e6dZshYFMV,2019,11,7,18
12320,15,0,0,1,-0.116071,0.750000,The Amazon Washington Post and three lowlife r...,2019,11,7,27
12321,15,0,0,1,0.169697,0.349053,The Radical Left Dems and LameStream Media are...,2019,11,7,41


In [169]:
tfidf_vector_all = pd.DataFrame(tfidf.transform(trump_tweet_with_features_cleaned.text).toarray(), columns = tfidf.get_feature_names())

In [170]:
to_generate = pd.concat([trump_tweet_with_features_cleaned, tfidf_vector_all], axis = 1).drop('text', axis = 1, errors='ignore')

In [171]:
to_generate

Unnamed: 0,hour,linked,hash,dot,polarity,subjectivity,year,month,day,minute,about,after,again,against,all,also,am,america,american,amp,an,and,are,as,at,back,bad,be,because,been,before,being,better,big,border,but,by,can,china,co,collusion,congress,country,crime,day.1,deal,democrats,dems,did,do,...,state,states,strong,tax,than,thank,that,the,their,them,there,they,this,time,to,today,total,trade,trump,two,united,up,us,very,vote,wall,want,was,way,we,well,were,what,when,which,whitehouse,who,why,will,win,witch,with,work,working,world,would,year.1,years,you,your
0,11,0,0,1,0.450000,0.833333,2016,11,9,36,0.000000,0.0,0.314953,0.000000,0.230941,0.0,0.0,0.0,0.0,0.000000,0.0,0.262353,0.000000,0.258873,0.0,0.0,0.0,0.199657,0.0,0.0,0.339969,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.102580,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.207097,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.395671,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
1,19,1,0,1,1.000000,1.000000,2016,11,10,31,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.285876,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.463842,0.000000,0.180355,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.219746,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.381256,0.554051
2,2,0,0,1,0.580000,0.676667,2016,11,11,10,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.364535,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.320658,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.216091,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
3,2,0,0,1,0.009375,0.675000,2016,11,11,19,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.147141,0.221420,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.267925,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.115065,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.553239,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
4,11,0,0,1,0.410000,0.563333,2016,11,11,14,0.000000,0.0,0.000000,0.000000,0.292238,0.0,0.0,0.0,0.0,0.000000,0.0,0.165994,0.000000,0.000000,0.0,0.0,0.0,0.252651,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.331288,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.239590,0.259616,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.262065,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.250346,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12318,15,0,0,1,-0.140625,0.458333,2019,11,7,16,0.000000,0.0,0.000000,0.223011,0.000000,0.0,0.0,0.0,0.0,0.145649,0.0,0.170318,0.000000,0.168059,0.0,0.0,0.0,0.129616,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.122915,0.133189,0.0,0.000000,0.0,0.144049,0.143742,0.000000,0.162279,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.14982,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.218509,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
12319,15,1,1,1,0.000000,0.000000,2019,11,7,18,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.395065,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.641004,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.526874,0.000000
12320,15,0,0,1,-0.116071,0.750000,2019,11,7,27,0.298151,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.237211,0.0,0.277387,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.359451,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.108459,0.0,0.000000,0.0,0.469210,0.234105,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.218964,0.0,0.0,0.0,0.0,0.325292,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
12321,15,0,0,1,0.169697,0.349053,2019,11,7,41,0.000000,0.0,0.000000,0.288874,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.220618,0.165994,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.270379,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.172524,0.0,0.267093,0.0,0.000000,0.000000,0.000000,0.210205,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.284245,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000


In [172]:
to_generate.isnull().values.any()

False

In [236]:
prediction = model_with_tfidf.predict(to_generate).reshape(-1, 1)

In [237]:
prediction

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [238]:
trumpiness = model_with_tfidf.predict_proba(to_generate).reshape(-1, 1)

# JK, WRONG DATASET

In [178]:
trump_tweet_original_df['polarity'] = trump_tweet_original_df['text'].apply(lambda text: TextBlob(text).sentiment[0])
trump_tweet_original_df['subjectivity'] = trump_tweet_original_df['text'].apply(lambda text: TextBlob(text).sentiment[1])

In [179]:
trump_tweet_original_df['year'] = trump_tweet_original_df.created_at.dt.year
trump_tweet_original_df['month'] = trump_tweet_original_df.created_at.dt.month
trump_tweet_original_df['day'] = trump_tweet_original_df.created_at.dt.day
trump_tweet_original_df['hour'] = trump_tweet_original_df.created_at.dt.hour
trump_tweet_original_df['minute'] = trump_tweet_original_df.created_at.dt.minute
trump_tweet_original_df = trump_tweet_original_df.drop(['created_at'], axis = 1, errors='ignore')

In [188]:
trump_tweet_original_df = trump_tweet_original_df.drop(['source', 'retweet_count', 'favorite_count', 'is_retweet'], axis = 1, errors='ignore')


In [189]:
trump_tweet_original_df['hash'] = (trump_tweet_original_df['text'].str.contains('#') == True).astype(int)
trump_tweet_original_df['linked'] = (trump_tweet_original_df['text'].str.contains('https://t.co/') == True).astype(int)
trump_tweet_original_df['dot'] = (trump_tweet_original_df['text'].str.contains('...') == True).astype(int)


In [190]:
trump_tweet_original_df.head()


Unnamed: 0,text,id_str,text_cleaned,polarity,subjectivity,year,month,day,hour,minute,hash,linked,dot
0,Such a beautiful and important evening! The fo...,796315640307060738,beautiful important evening forgotten man woma...,0.45,0.833333,2016,11,9,11,36,0,0,1
1,Happy 241st birthday to the U.S. Marine Corps!...,796797436752707585,happy 241st birthday us marine corps thank ser...,1.0,1.0,2016,11,10,19,31,0,1,1
2,A fantastic day in D.C. Met with President Oba...,796897928048766976,fantastic day dc met president obama first tim...,0.58,0.676667,2016,11,11,2,10,0,0,1
3,Just had a very open and successful presidenti...,796900183955095552,open successful presidential election professi...,0.009375,0.675,2016,11,11,2,19,0,0,1
4,Love the fact that the small groups of protest...,797034721075228672,love fact small groups protesters last night p...,0.41,0.563333,2016,11,11,11,14,0,0,1


In [191]:
tfidf_vector_all = pd.DataFrame(tfidf.transform(trump_tweet_original_df.text_cleaned).toarray(), columns = tfidf.get_feature_names())


In [200]:
to_generate = pd.concat([trump_tweet_original_df, tfidf_vector_all], axis = 1).drop(['text', 'text_cleaned', 'id_str'], axis = 1, errors='ignore')


In [201]:
to_generate


Unnamed: 0,polarity,subjectivity,year,month,day,hour,minute,hash,linked,dot,about,after,again,against,all,also,am,america,american,amp,an,and,are,as,at,back,bad,be,because,been,before,being,better,big,border,but,by,can,china,co,collusion,congress,country,crime,day.1,deal,democrats,dems,did,do,...,state,states,strong,tax,than,thank,that,the,their,them,there,they,this,time,to,today,total,trade,trump,two,united,up,us,very,vote,wall,want,was,way,we,well,were,what,when,which,whitehouse,who,why,will,win,witch,with,work,working,world,would,year.1,years,you,your
0,0.450000,0.833333,2016,11,9,11,36,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.000000,1.000000,2016,11,10,19,31,0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.63273,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.774372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.580000,0.676667,2016,11,11,2,10,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.386724,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.340177,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.009375,0.675000,2016,11,11,2,19,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.410000,0.563333,2016,11,11,11,14,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.55701,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12318,-0.140625,0.458333,2019,11,7,15,16,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.262328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.393556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12319,0.000000,0.000000,2019,11,7,15,18,1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12320,-0.116071,0.750000,2019,11,7,15,27,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.506218,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12321,0.169697,0.349053,2019,11,7,15,41,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.380295,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.399798,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [202]:
to_generate.isnull().values.any()

False

In [232]:
prediction = model_with_tfidf.predict(to_generate).reshape(-1, 1)

In [233]:
len(prediction)

12323

In [234]:
prediction

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [223]:
trumpiness = model_with_tfidf.predict_proba(to_generate)[:,0].reshape(-1, 1)

In [224]:
trumpiness

array([[0.7],
       [0.7],
       [0.8],
       ...,
       [0.7],
       [0.7],
       [0.9]])

In [210]:
len(trump_tweet_original_df)

12323

In [226]:
trump_tweet_original_df['prediction'] = prediction
trump_tweet_original_df['trumpiness'] = trumpiness


In [227]:
trump_tweet_original_df

Unnamed: 0,text,id_str,text_cleaned,polarity,subjectivity,year,month,day,hour,minute,hash,linked,dot,prediction,trumpiness
0,Such a beautiful and important evening! The fo...,796315640307060738,beautiful important evening forgotten man woma...,0.450000,0.833333,2016,11,9,11,36,0,0,1,0,0.7
1,Happy 241st birthday to the U.S. Marine Corps!...,796797436752707585,happy 241st birthday us marine corps thank ser...,1.000000,1.000000,2016,11,10,19,31,0,1,1,0,0.7
2,A fantastic day in D.C. Met with President Oba...,796897928048766976,fantastic day dc met president obama first tim...,0.580000,0.676667,2016,11,11,2,10,0,0,1,0,0.8
3,Just had a very open and successful presidenti...,796900183955095552,open successful presidential election professi...,0.009375,0.675000,2016,11,11,2,19,0,0,1,0,0.6
4,Love the fact that the small groups of protest...,797034721075228672,love fact small groups protesters last night p...,0.410000,0.563333,2016,11,11,11,14,0,0,1,0,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12318,It was just explained to me that for next week...,1192460764235419654,explained next weeks fake hearing trial house ...,-0.140625,0.458333,2019,11,7,15,16,0,0,1,0,0.6
12319,THANK YOU! #MAGA https://t.co/e6dZshYFMV,1192461425358385154,thank maga httpstcoe6dzshyfmv,0.000000,0.000000,2019,11,7,15,18,1,1,1,0,0.7
12320,The Amazon Washington Post and three lowlife r...,1192463709400117250,amazon washington post three lowlife reporters...,-0.116071,0.750000,2019,11,7,15,27,0,0,1,0,0.7
12321,The Radical Left Dems and LameStream Media are...,1192467215360102401,radical left dems lamestream media trying make...,0.169697,0.349053,2019,11,7,15,41,0,0,1,0,0.7


In [231]:
trump_tweet_original_df.to_excel("result.xlsx")