In [1]:
import pandas as pd
import numpy as np

In [2]:
from sqlalchemy import create_engine

In [3]:
import datetime as dt

In [4]:
# Open a SQLAlchemy connection to the local SQLite tweet database.
DB_URL = "sqlite:///../db/twitter_db.sqlite"
engine = create_engine(DB_URL)
conn = engine.connect()

In [5]:
# Load the whole tweet_data table and release the connection as soon as the
# read finishes; previously the connection stayed open for the life of the
# kernel. `conn` is rebound here, so nothing keeps an earlier handle alive.
with engine.connect() as conn:
    tweets_df = pd.read_sql("SELECT * FROM tweet_data", conn)

In [6]:
# Preview the first two rows of the freshly loaded table.
tweets_df.head(2)

Unnamed: 0,id,created_at,created_at_time,created_at_date,created_at_datetime,tweet_id,tweet_id_str,full_text,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,in_reply_to_screen_name,user_id,user_id_str,user_name,user_screen_name,retweet_count,favorite_count
0,1,Wed Jul 31 22:21:23 +0000 2019,22:21:23.000000,2019-07-31,2019-07-31 22:21:23.000000,1156691352983412737,1156691352983412737,"Here's the deal, President Trump inherited the...",,,,,,939091,939091,Joe Biden,JoeBiden,938,4134
1,2,Wed Jul 31 21:28:00 +0000 2019,21:28:00.000000,2019-07-31,2019-07-31 21:28:00.000000,1156677917469896704,1156677917469896704,I’ve got some exciting news: I am now on Snapc...,,,,,,939091,939091,Joe Biden,JoeBiden,173,790


In [7]:
# Row count right after loading, before any filtering.
len(tweets_df)

41891

In [8]:
# Parse the stored timestamp strings into real datetimes in one vectorized
# call. Unlike a per-row strptime lambda, this cell is safe to re-run: a column
# that is already datetime passes through pd.to_datetime unchanged, whereas
# strptime raises TypeError when it is handed a Timestamp instead of a string.
tweets_df["created_at_datetime"] = pd.to_datetime(
    tweets_df["created_at_datetime"], format="%Y-%m-%d %H:%M:%S.%f"
)

In [9]:
# Keep only tweets created before 15 August 2019 and renumber the rows.
cutoff = dt.datetime(2019, 8, 15)
is_before_cutoff = tweets_df["created_at_datetime"] < cutoff
tweets_df = tweets_df.loc[is_before_cutoff, :].reset_index(drop=True)

In [10]:
# Row count after applying the date cutoff.
len(tweets_df)

40678

In [11]:
# Rank accounts by their median retweet count. The column is selected before
# aggregating so the text columns never reach .median(): older pandas silently
# dropped them, newer pandas raises instead.
median_retweets = (
    tweets_df.groupby("user_name")["retweet_count"]
    .median()
    .sort_values(ascending=False)
)
# Same shape as before: a one-column frame holding the seven highest medians.
grouped_df = median_retweets.to_frame().iloc[:7]

# Eric Swalwell dropped out of the race, so he is left out of the class list.
# He is excluded by name rather than by list position (previously pop(3)), so a
# change in the ranking can no longer remove a different candidate by accident,
# and the list always ends up with six names.
# NOTE(review): assumes his user_name contains "Swalwell" - confirm against the data.
DROPPED_OUT_MARKERS = ("Swalwell",)
top_candidates = [
    name
    for name in median_retweets.index
    if not any(marker in name for marker in DROPPED_OUT_MARKERS)
][:6]
top_candidates

['Donald J. Trump',
 'Bernie Sanders',
 'Kamala Harris',
 'Elizabeth Warren',
 'Joe Biden',
 'Tulsi Gabbard']

In [12]:
# Restrict the data set to tweets written by the selected candidates.
is_top_candidate = tweets_df["user_name"].isin(top_candidates)
tweets_df = tweets_df.loc[is_top_candidate, :].reset_index(drop=True)

In [13]:
# Row count after keeping only the selected candidates.
len(tweets_df)

10520

In [14]:
# Derive the weekday name, the zero-padded hour and the month name
# from each tweet's timestamp.
timestamps = tweets_df["created_at_datetime"]
for column, fmt in (("day", "%A"), ("hour", "%H"), ("month", "%B")):
    tweets_df[column] = timestamps.dt.strftime(fmt)

In [15]:
# Remove the raw timestamp, identifier and reply-metadata columns in place.
unused_columns = [
    "created_at",
    "created_at_time",
    "created_at_date",
    "created_at_datetime",
    "tweet_id",
    "tweet_id_str",
    "in_reply_to_status_id",
    "in_reply_to_status_id_str",
    "in_reply_to_user_id",
    "in_reply_to_user_id_str",
    "in_reply_to_screen_name",
    "user_id_str",
    "user_id",
    "user_screen_name",
    "id",
]
tweets_df.drop(columns=unused_columns, inplace=True)

In [16]:
# Fix the column order; the label (user_name) comes first.
ordered_columns = [
    "user_name",
    "month",
    "day",
    "hour",
    "retweet_count",
    "favorite_count",
    "full_text",
]
tweets_df = tweets_df[ordered_columns]

In [17]:
# Preview the trimmed and reordered frame.
tweets_df.head(2)

Unnamed: 0,user_name,month,day,hour,retweet_count,favorite_count,full_text
0,Joe Biden,July,Wednesday,22,938,4134,"Here's the deal, President Trump inherited the..."
1,Joe Biden,July,Wednesday,21,173,790,I’ve got some exciting news: I am now on Snapc...


In [18]:
# Number of distinct candidates left in the data set.
tweets_df["user_name"].nunique()

6

In [19]:
# Build the feature frame: tweet text, the two engagement counts and
# one-hot encoded month / day / hour indicators.
feature_columns = ["full_text", "month", "day", "hour", "retweet_count", "favorite_count"]
categorical_columns = ["month", "day", "hour"]
X_df = pd.get_dummies(tweets_df[feature_columns], columns=categorical_columns)

In [20]:
# Preview the feature frame after one-hot encoding.
X_df.head(2)

Unnamed: 0,full_text,retweet_count,favorite_count,month_April,month_August,month_December,month_February,month_January,month_July,month_June,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
0,"Here's the deal, President Trump inherited the...",938,4134,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
1,I’ve got some exciting news: I am now on Snapc...,173,790,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [21]:
# Row count of the feature frame.
len(X_df)

10520

In [22]:
import nltk
import re
import string
# Widen the column display so longer tweet texts stay readable.
pd.set_option('display.max_colwidth', 100)

# Shared NLP resources read by clean_text: a WordNet lemmatizer and the
# English stop-word list.
wn = nltk.WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')

In [23]:
def clean_text(text):
    """Normalize one tweet and return its tokens as a list of strings.

    Processing order:
      1. replace the HTML entity ``&amp;`` with ``&`` and newlines with spaces;
      2. drop every character found in ``string.punctuation`` and lower-case
         the rest;
      3. split on runs of non-word characters;
      4. discard tokens present in the module-level ``stopwords`` and
         lemmatize the remaining ones with the module-level ``wn``.

    Args:
        text: the raw tweet text (``str``).

    Returns:
        list: the lemmatized tokens in their original order.
    """
    text = text.replace('&amp;', '&')
    text = text.replace('\n', ' ')
    # Iterating over a string yields single characters, so punctuation is
    # filtered character by character before the text is re-joined.
    text = "".join([char.lower() for char in text if char not in string.punctuation])
    # Raw string: '\W' inside a plain literal is an invalid escape sequence and
    # triggers a DeprecationWarning / SyntaxWarning on current Python versions.
    tokens = re.split(r'\W+', text)
    # NOTE(review): re.split yields '' when the text starts or ends with a
    # separator; such empty tokens are kept on purpose so the vocabulary (and
    # therefore the feature count) stays exactly what it was.
    text = [wn.lemmatize(word) for word in tokens if word not in stopwords]
    return text

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer over the tokens returned by clean_text.
# ngram_range=(2,2) used to be passed here as well, but scikit-learn ignores
# ngram_range whenever `analyzer` is a callable, so the features were always
# single tokens. The dead argument is removed so the code says what it does;
# the resulting matrix is unchanged. The name `ngram_vect` is kept because
# later cells refer to it.
ngram_vect = CountVectorizer(analyzer=clean_text)
X_count = ngram_vect.fit_transform(X_df['full_text'])

In [25]:
# Shape of the document-term matrix. The sparse matrix reports its shape
# directly, so there is no need to build the full dense array just to read it.
X_count.shape

(10520, 20400)

In [26]:
# Dense view of the document-term counts, displayed as a visual check.
# NOTE(review): .toarray() materializes the whole dense matrix just for display.
X_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [27]:
# Shape of the feature frame while it still contains the raw text column.
X_df.shape

(10520, 46)

In [28]:
# The text is now represented by X_count, so drop the raw string column.
X_df = X_df.drop(columns=["full_text"])

In [29]:
# Preview the feature frame without the text column.
X_df.head(2)

Unnamed: 0,retweet_count,favorite_count,month_April,month_August,month_December,month_February,month_January,month_July,month_June,month_March,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
0,938,4134,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1,173,790,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [30]:
# Append one count column per vocabulary token to the existing features.
# get_feature_names() was removed from newer scikit-learn releases in favour of
# get_feature_names_out(), so use whichever this installation provides.
_get_token_names = (
    getattr(ngram_vect, "get_feature_names_out", None) or ngram_vect.get_feature_names
)
token_counts_df = pd.DataFrame(
    X_count.toarray(),
    columns=_get_token_names(),
    # Align rows explicitly instead of relying on both frames being indexed 0..n-1.
    index=X_df.index,
)
X_df = pd.concat([X_df, token_counts_df], axis=1)

In [31]:
# Shape after appending the token-count columns.
X_df.shape

(10520, 20445)

In [32]:
# Preview the combined feature frame.
X_df.head(2)

Unnamed: 0,retweet_count,favorite_count,month_April,month_August,month_December,month_February,month_January,month_July,month_June,month_March,...,شده,فساد,فقط,مدتهاست,مردم,موجب,هستند,چهلسالشکست,که,۴۰
0,938,4134,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,173,790,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# DataFrame.to_sparse() only exists in older pandas (it was removed in 1.0).
# Convert when the method is available and otherwise keep the dense frame: the
# later cells only read .columns, .shape, .head() and .values from X_df.
if hasattr(X_df, "to_sparse"):
    X_df = X_df.to_sparse(fill_value=0)

In [34]:
# Preview the frame after the sparse conversion.
X_df.head(2)

Unnamed: 0,retweet_count,favorite_count,month_April,month_August,month_December,month_February,month_January,month_July,month_June,month_March,...,شده,فساد,فقط,مدتهاست,مردم,موجب,هستند,چهلسالشکست,که,۴۰
0,938,4134,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,173,790,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
# Inspect the final column index (numeric features first, then one column per token).
X_df.columns

Index(['retweet_count', 'favorite_count', 'month_April', 'month_August',
       'month_December', 'month_February', 'month_January', 'month_July',
       'month_June', 'month_March',
       ...
       'شده', 'فساد', 'فقط', 'مدتهاست', 'مردم', 'موجب', 'هستند', 'چهلسالشکست',
       'که', '۴۰'],
      dtype='object', length=20445)

In [36]:
# Plain Python list of the feature column names, in frame order.
columns_list = X_df.columns.tolist()

In [37]:
import pickle

# Persist the ordered feature-column names to disk.
with open('columns_v2.pkl', 'wb') as columns_file:
    pickle.dump(columns_list, columns_file)

In [38]:
# Number of feature columns that were just saved.
len(X_df.columns)

20445

In [39]:
# Final shape of the feature frame.
X_df.shape

(10520, 20445)

In [40]:
# Raw array views of both frames (kept under their original names).
data_y = tweets_df.values
data_x = X_df.values

# Features: every column of the feature frame.
X = data_x[:, 0:]
# Labels: the candidate name, selected by column name. Taking position 0 of
# tweets_df.values only worked as long as user_name stayed the first column.
y = tweets_df["user_name"].values

In [41]:
# Display the feature matrix as a visual check.
X

array([[  938,  4134,     0, ...,     0,     0,     0],
       [  173,   790,     0, ...,     0,     0,     0],
       [  712,  4376,     0, ...,     0,     0,     0],
       ...,
       [  741, 20234,     0, ...,     0,     0,     0],
       [18289, 68718,     0, ...,     0,     0,     0],
       [   11,    74,     0, ...,     0,     0,     0]], dtype=int64)

In [42]:
from scipy.sparse import csr_matrix

In [43]:
# Compressed-sparse-row copy of the feature matrix.
# NOTE(review): the visible later cells split and train on the dense `X`, not on X_sparse.
X_sparse = csr_matrix(X)

In [44]:
# Shape of the sparse copy.
X_sparse.shape

(10520, 20445)

In [45]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows for testing (the library default, spelled
# out here) with a fixed seed so the split is reproducible.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [46]:
# Confirm which array type the split returned.
type(X_train)

numpy.ndarray

In [47]:
from sklearn.preprocessing import MaxAbsScaler

In [48]:
# Fit the max-abs scaler on the training split only.
X_scaler = MaxAbsScaler()
X_scaler = X_scaler.fit(X_train)

In [49]:
# sklearn.externals.joblib was deprecated and later removed from scikit-learn.
# Prefer the standalone joblib package and fall back to the vendored copy on
# old installations that do not ship it separately.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted scaler to disk.
scaler_filename = "mas_scaler_v2.save"
joblib.dump(X_scaler, scaler_filename)

['mas_scaler_v2.save']

In [50]:
# Apply the scaler fitted on the training split to both splits.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [51]:
from sklearn.preprocessing import LabelEncoder

# Step 1: Label-encode data set
# The encoder learns the class names from the training labels only and is
# then applied to both splits.
label_encoder = LabelEncoder()
encoded_y_train = label_encoder.fit(y_train).transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [52]:
# Save the class labels as a fixed-width unicode array. The labels are derived
# from an object-dtype array, and np.save can only store object arrays by
# pickling them, which np.load then refuses to read unless allow_pickle=True.
np.save('classes_v2.npy', np.asarray(label_encoder.classes_, dtype=str))

In [53]:
from keras.utils import to_categorical

# Step 2: One-hot encoding
# Fix the width to the number of classes the encoder knows. Without it,
# to_categorical infers the width from the largest label present, so a split
# that happens to lack the last class would produce a narrower matrix.
n_label_classes = len(label_encoder.classes_)
y_train_categorical = to_categorical(encoded_y_train, num_classes=n_label_classes)
y_test_categorical = to_categorical(encoded_y_test, num_classes=n_label_classes)

Using TensorFlow backend.


In [54]:
# Shape of the training split (rows, features).
X_train.shape

(7890, 20445)

In [56]:
# Fully connected classifier: two hidden ReLU layers of 1000 units each and a
# softmax output layer with one unit per candidate class.
from keras.models import Sequential
from keras.layers import Dense

# Take the layer sizes from the data instead of hard-coding 20445 and 6, so the
# network keeps matching the inputs if the vocabulary or class list changes.
n_features = X_train.shape[1]
n_classes = y_train_categorical.shape[1]

deep_model = Sequential()
deep_model.add(Dense(units=1000, activation='relu', input_dim=n_features))
deep_model.add(Dense(units=1000, activation='relu'))
deep_model.add(Dense(units=n_classes, activation='softmax'))

In [57]:
# Print the layer-by-layer summary with parameter counts.
deep_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 1000)              20446000  
_________________________________________________________________
dense_5 (Dense)              (None, 1000)              1001000   
_________________________________________________________________
dense_6 (Dense)              (None, 6)                 6006      
Total params: 21,453,006
Trainable params: 21,453,006
Non-trainable params: 0
_________________________________________________________________


In [58]:
# Configure training: Adam optimizer, categorical cross-entropy loss and
# accuracy as the reported metric.
compile_options = {
    'optimizer': 'adam',
    'loss': 'categorical_crossentropy',
    'metrics': ['accuracy'],
}
deep_model.compile(**compile_options)

In [59]:
# Train for five shuffled epochs on the scaled training split, logging one
# line per epoch.
fit_options = dict(epochs=5, shuffle=True, verbose=2)
deep_model.fit(X_train_scaled, y_train_categorical, **fit_options)

Instructions for updating:
Use tf.cast instead.
Epoch 1/5
 - 101s - loss: 0.8719 - acc: 0.6757
Epoch 2/5
 - 97s - loss: 0.0932 - acc: 0.9725
Epoch 3/5
 - 98s - loss: 0.0060 - acc: 0.9989
Epoch 4/5
 - 98s - loss: 9.4685e-04 - acc: 0.9997
Epoch 5/5
 - 98s - loss: 2.0414e-04 - acc: 1.0000


<keras.callbacks.History at 0x21ca34340f0>

In [60]:
# Score the trained network on the held-out split and report both numbers.
evaluation = deep_model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
model_loss, model_accuracy = evaluation
print(f"Deep Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

Deep Neural Network - Loss: 0.9790602894680497, Accuracy: 0.7840304182509505


In [61]:
# Write the trained model to an HDF5 file in the working directory.
deep_model.save("candidate_classifier_v2.h5")