In [1]:
import pandas as pd
import numpy as np

In [2]:
from sqlalchemy import create_engine

In [3]:
import datetime as dt

In [4]:
engine = create_engine("sqlite:///../db/twitter_db.sqlite")
conn = engine.connect()

In [5]:
tweets_df = pd.read_sql("SELECT * FROM tweet_data", conn)

In [6]:
tweets_df.head(2)

Unnamed: 0,id,created_at,created_at_time,created_at_date,created_at_datetime,tweet_id,tweet_id_str,full_text,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,in_reply_to_screen_name,user_id,user_id_str,user_name,user_screen_name,retweet_count,favorite_count
0,1,Wed Jul 31 22:21:23 +0000 2019,22:21:23.000000,2019-07-31,2019-07-31 22:21:23.000000,1156691352983412737,1156691352983412737,"Here's the deal, President Trump inherited the...",,,,,,939091,939091,Joe Biden,JoeBiden,938,4134
1,2,Wed Jul 31 21:28:00 +0000 2019,21:28:00.000000,2019-07-31,2019-07-31 21:28:00.000000,1156677917469896704,1156677917469896704,I’ve got some exciting news: I am now on Snapc...,,,,,,939091,939091,Joe Biden,JoeBiden,173,790


In [7]:
len(tweets_df)

41891

In [8]:
tweets_df["created_at_datetime"] = tweets_df["created_at_datetime"].apply(lambda x: dt.datetime.strptime(x,
                                                                "%Y-%m-%d %H:%M:%S.%f"))

In [9]:
tweets_df = tweets_df.loc[tweets_df["created_at_datetime"] < dt.datetime(2019,8,15), :].reset_index(drop=True)

In [10]:
len(tweets_df)

40678

In [11]:
grouped_df = tweets_df.groupby(tweets_df['user_name']).median()
grouped_df = grouped_df[['retweet_count']].sort_values('retweet_count', ascending = False).iloc[:7]
# grouped_df.sort_values('retweet_count', ascending = False)
grouped_df
top_candidates =[]
for i, r in grouped_df.iterrows():
    top_candidates.append(i)
# Remove Eric Swalwell from list (dropped out)
top_candidates.pop(3)
top_candidates

['Donald J. Trump',
 'Bernie Sanders',
 'Kamala Harris',
 'Elizabeth Warren',
 'Joe Biden',
 'Tulsi Gabbard']

In [12]:
tweets_df = tweets_df.loc[tweets_df['user_name'].isin(top_candidates), :].reset_index(drop=True)

In [13]:
len(tweets_df)

10520

In [14]:
tweets_df["day"] = tweets_df["created_at_datetime"].apply(lambda x: dt.datetime.strftime(x, "%A"))
tweets_df["hour"] = tweets_df["created_at_datetime"].apply(lambda x: dt.datetime.strftime(x, "%H"))
tweets_df["month"] = tweets_df["created_at_datetime"].apply(lambda x: dt.datetime.strftime(x, "%B"))

In [15]:
tweets_df.drop(columns = ["created_at", "created_at_time", "created_at_date","created_at_datetime",
                    "tweet_id", "tweet_id_str", "in_reply_to_status_id",
                    "in_reply_to_status_id_str", "in_reply_to_user_id",
                    "in_reply_to_user_id_str", "in_reply_to_screen_name",
                    "user_id_str", "user_id", "user_screen_name", "id"], inplace = True)

In [16]:
tweets_df = tweets_df[["user_name", "month", "day", "hour", "retweet_count", "favorite_count", "full_text"]]

In [17]:
tweets_df.head(2)

Unnamed: 0,user_name,month,day,hour,retweet_count,favorite_count,full_text
0,Joe Biden,July,Wednesday,22,938,4134,"Here's the deal, President Trump inherited the..."
1,Joe Biden,July,Wednesday,21,173,790,I’ve got some exciting news: I am now on Snapc...


In [18]:
len(tweets_df.groupby(tweets_df["user_name"]).count())

6

In [19]:
X_count_df = tweets_df[["full_text", "month", "day", "hour", "retweet_count", "favorite_count"]]
X_count_df = pd.get_dummies(X_count_df, columns = ["month", "day", "hour"])

In [20]:
X_count_df.head(2)

Unnamed: 0,full_text,retweet_count,favorite_count,month_April,month_August,month_December,month_February,month_January,month_July,month_June,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
0,"Here's the deal, President Trump inherited the...",938,4134,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
1,I’ve got some exciting news: I am now on Snapc...,173,790,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [21]:
import nltk
import re
import string
pd.set_option('display.max_colwidth', 100) # To extend column width

stopwords = nltk.corpus.stopwords.words('english')
wn = nltk.WordNetLemmatizer()

In [22]:
def clean_text(text):
    text = text.replace('&amp;', '&')
    text = text.replace('\n', ' ')
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [wn.lemmatize(word) for word in tokens if word not in stopwords]
    return text

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer
count_vect = CountVectorizer(analyzer=clean_text)
X_count_vect = count_vect.fit_transform(X_count_df['full_text'])

In [24]:
X_count_df.drop(columns = ['full_text'], inplace = True)

In [25]:
X_count_df = pd.concat([X_count_df, pd.DataFrame(X_count_vect.toarray(), columns=count_vect.get_feature_names())], axis=1)

In [26]:
X_count_df.shape

(10520, 20445)

In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [29]:
rf = RandomForestClassifier()
param = {'n_estimators': [10, 150, 300],
        'max_depth': [30, 60, 90, None]}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)# n_jobs=-1 for parallelizing search
gs_fit = gs.fit(X_count_df, tweets_df['user_name'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
7,252.122776,14.219756,9.433148,5.556332,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.626009,0.533967,0.579173,...,0.587357,0.034813,1,0.993344,0.992157,0.992159,0.991683,0.991566,0.992182,0.000629
10,250.422096,17.569658,5.37024,2.062982,,150,"{'max_depth': None, 'n_estimators': 150}",0.60038,0.541093,0.575844,...,0.586312,0.029621,2,1.0,1.0,1.0,1.0,1.0,1.0,0.0
8,352.435807,68.070064,25.718048,15.771828,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.61177,0.533492,0.580124,...,0.586122,0.036637,3,0.9937,0.992157,0.993584,0.99299,0.990972,0.992681,0.001015
11,302.1771,22.202971,2.333361,1.181525,,300,"{'max_depth': None, 'n_estimators': 300}",0.616042,0.527791,0.583928,...,0.585646,0.035177,4,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,198.791914,53.12368,1.699875,0.547993,60.0,150,"{'max_depth': 60, 'n_estimators': 150}",0.610346,0.523515,0.593438,...,0.585266,0.038948,5,0.973137,0.972668,0.970892,0.97315,0.967213,0.971412,0.002257


In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [30]:
X_tdidf_df = tweets_df[["full_text", "month", "day", "hour", "retweet_count", "favorite_count"]]
X_tdidf_df = pd.get_dummies(X_tdidf_df, columns = ["month", "day", "hour"])

In [33]:
# TF-IDF
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf_vect = tfidf_vect.fit_transform(X_tdidf_df['full_text'])

In [34]:
X_tdidf_df.drop(columns = ['full_text'], inplace = True)

In [37]:
X_tdidf_df = pd.concat([X_tdidf_df, pd.DataFrame(X_tfidf_vect.toarray(), columns=tfidf_vect.get_feature_names())], axis=1)

In [38]:
X_tdidf_df.shape

(10520, 20445)

In [39]:
rf = RandomForestClassifier()
param = {'n_estimators': [10, 150, 300],
        'max_depth': [30, 60, 90, None]}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)# n_jobs=-1 for parallelizing search
gs_fit = gs.fit(X_tdidf_df, tweets_df['user_name'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
4,138.122783,4.95895,2.618398,0.639628,60.0,150,"{'max_depth': 60, 'n_estimators': 150}",0.608923,0.549169,0.573942,...,0.587833,0.030188,1,0.973731,0.971361,0.972318,0.970298,0.969589,0.971459,0.001467
7,172.688825,8.645928,3.026706,0.593666,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.60655,0.537292,0.582501,...,0.585646,0.030537,2,0.993225,0.992632,0.993109,0.992872,0.990259,0.992419,0.001099
5,206.100773,9.678352,6.023996,3.633669,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.610346,0.541568,0.575369,...,0.584506,0.029152,3,0.975514,0.974213,0.970773,0.970892,0.968639,0.972006,0.002501
8,244.1013,24.340598,4.21513,1.547126,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.60038,0.529691,0.582977,...,0.580323,0.030414,4,0.993581,0.992513,0.992278,0.992515,0.99014,0.992206,0.001127
11,247.67951,29.237682,2.185556,0.770084,,300,"{'max_depth': None, 'n_estimators': 300}",0.596108,0.52114,0.58155,...,0.576236,0.037537,5,1.0,1.0,1.0,1.0,1.0,1.0,0.0
