# Trump Tweet Analysis

In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
import string
import gensim
import shutil

from io import StringIO
from keras import layers 
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.layers.core import Activation, Dropout, Dense
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Flatten, Conv1D, GlobalMaxPooling1D, LSTM

# NOTE(review): MAX_WORDS is not referenced anywhere else in the visible notebook;
# the tokenizer cell is built with a hard-coded num_words=5000. Confirm which
# vocabulary cap is intended before wiring this constant in.
MAX_WORDS = 15000

### Merge Embedding Files

In [58]:
# Rebuild "glove.6B.100d.txt" from its eight chunks
# (".embeddings/embeddings_0" ... ".embeddings/embeddings_7") when it is missing.
if not os.path.exists("glove.6B.100d.txt"):
    # Assemble into a temporary file and rename it at the end, so an interrupted
    # run never leaves a truncated "glove.6B.100d.txt" that the existence check
    # above would accept on the next run.
    partial_path = "glove.6B.100d.txt.part"
    # Binary mode concatenates the chunks byte-for-byte, so the result does not
    # depend on the platform's default text encoding or newline translation.
    with open(partial_path, "wb") as output:
        for i in range(0, 8):
            print("Writing embeddings_" + str(i))
            with open(".embeddings/embeddings_" + str(i), "rb") as part:
                # Stream the chunk instead of loading it fully into memory.
                shutil.copyfileobj(part, output)
    os.replace(partial_path, "glove.6B.100d.txt")
    # The chunks are only removed once the merged file is safely in place.
    shutil.rmtree(".embeddings/")
    print("Embedding file created!")
else:
    print("Embedding file already exists!")

print("Done!")

Embedding file already exists!
Done!


### Prepare Labels

In [None]:
# Load the tweets and turn the `source` column into a binary label:
# 1 = "Twitter for iPhone", 0 = "Twitter for Android".
replacement_dict = {"Twitter for iPhone": 1, "Twitter for Android": 0}
tweets_raw = pd.read_json("trump_tweets_11_17.json")  # read json

df = (
    # Keep only tweets sent from the two labelled clients.
    tweets_raw[tweets_raw["source"].isin(list(replacement_dict))]
    # Encode only the `source` column (a frame-wide replace would also rewrite
    # any other cell that happens to equal one of the client names) and store
    # the labels with a real integer dtype instead of object.
    .assign(source=lambda frame: frame["source"].map(replacement_dict).astype("int64"))
    # Seeded shuffle so the row order is the same on every run.
    .sample(frac=1, random_state=0)
)

### Split into training and testing

In [None]:
# Shuffle the labelled tweets and hold out 20% of them for testing.
# Rows are put back into index order before the seeded shuffle: a seeded sample
# only fixes the permutation of positions, so without this the result would
# still depend on whatever row order `df` arrives in.
all_texts = df.sort_index().sample(frac=1, random_state=0)

X_train, X_test, y_train, y_test = train_test_split(
    all_texts['text'],    # features: raw tweet text
    all_texts['source'],  # labels: encoded tweet source
    test_size=0.20,
    random_state=42)

### Tokenize Text for Embedding Layer

In [None]:
# Learn the word-to-index vocabulary from the training tweets only.
# num_words=5000 is the vocabulary cap handed to the Keras Tokenizer.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# Replace the raw text of both splits with lists of word indices.
X_train, X_test = [
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (X_train, X_test)
]

# word -> index mapping learned by the tokenizer
dictionary = tokenizer.word_index

In [None]:
# Every sequence is padded (at the end) or cut to this many tokens.
maxlen = 65

# Number of entries in the tokenizer's word index, plus one extra slot.
vocab_size = 1 + len(tokenizer.word_index)

# Bring both splits to a fixed width of `maxlen`.
X_train, X_test = (
    pad_sequences(sequences, padding='post', maxlen=maxlen)
    for sequences in (X_train, X_test)
)

In [None]:
# Build a word -> vector lookup from the GloVe embedding file.
embeddings_dictionary = dict()

# The file is too large to store on git; it must be downloaded and placed in
# this folder manually.
# `with` guarantees the handle is closed even if a line fails to parse.
with open('./glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        if not records:
            # Skip blank lines instead of failing on records[0].
            continue
        word = records[0]
        # Everything after the first token is the embedding vector.
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

In [None]:
# Row `index` of the matrix receives the GloVe vector of the word that the
# tokenizer maps to `index`; rows without a GloVe match keep their initial zeros.
embedding_matrix = np.zeros((vocab_size, 100))
for token, row in tokenizer.word_index.items():
    glove_vector = embeddings_dictionary.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector

In [None]:
# Sanity check: the embedding matrix should have exactly `vocab_size` rows.
len(embedding_matrix) == vocab_size

# Feed Forward Network

In [None]:
# Baseline model: non-trainable embeddings initialised from `embedding_matrix`,
# flattened and fed straight into a single sigmoid output unit.
embedding_layer = Embedding(
    vocab_size,
    100,
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=False,
)

model = Sequential()
model.add(embedding_layer)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Binary classification set-up: Adam optimizer, binary cross-entropy loss,
# accuracy reported under the key 'acc'.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# summary() prints the table itself and returns None, so wrapping it in print()
# would append a stray "None" line to the cell output.
model.summary()

In [None]:
# Train the feed-forward model; validation_split=0.2 sets aside part of the
# training data for the per-epoch validation metrics.
ffn_history = model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=15,
    verbose=1,
    validation_split=0.2,
)

In [None]:
# Score the feed-forward model on the held-out test split.
ffn_score = model.evaluate(
    X_test,
    y_test,
    verbose=1,
)

In [None]:
# ffn_score holds the evaluate() results: index 0 is reported as the score,
# index 1 as the accuracy.
ffn_test_score = ffn_score[0]
ffn_test_accuracy = ffn_score[1]
print("Test Score:", ffn_test_score)
print("Test Accuracy:", ffn_test_accuracy)

In [None]:
# Training curves for the feed-forward model: accuracy on top, loss below.
plt.figure(figsize=(15,10))

# (subplot position, train key, validation key, title, y-label, legend location)
ffn_panels = (
    (1, 'acc', 'val_acc', 'Feed forward model accuracy', 'Accuracy', 'upper left'),
    (2, 'loss', 'val_loss', 'Feed forward model loss', 'Loss', 'upper right'),
)
for position, train_key, val_key, panel_title, y_label, legend_loc in ffn_panels:
    plt.subplot(2, 1, position)
    plt.plot(ffn_history.history[train_key])
    plt.plot(ffn_history.history[val_key])
    plt.title(panel_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['train','validation'], loc=legend_loc)

plt.show()

# CNN

In [None]:
# Convolutional classifier on top of non-trainable embeddings initialised
# from `embedding_matrix`.
embedding_layer = Embedding(
    vocab_size,
    100,
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=False,
)

cnn = Sequential()
cnn.add(embedding_layer)
# 128 filters of width 5, followed by a max over the sequence axis.
cnn.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
cnn.add(GlobalMaxPooling1D())
cnn.add(Dense(1, activation='sigmoid'))

cnn.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [None]:
# summary() prints the table itself and returns None, so wrapping it in print()
# would append a stray "None" line to the cell output.
cnn.summary()

In [None]:
# Train the CNN with the same settings as the feed-forward model ...
cnn_history = cnn.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=15,
    verbose=1,
    validation_split=0.2,
)

# ... then score it on the held-out test split.
cnn_score = cnn.evaluate(
    X_test,
    y_test,
    verbose=1,
)

In [None]:
# cnn_score holds the evaluate() results: index 0 is reported as the score,
# index 1 as the accuracy.
cnn_test_score = cnn_score[0]
cnn_test_accuracy = cnn_score[1]
print("Test Score:", cnn_test_score)
print("Test Accuracy:", cnn_test_accuracy)

In [None]:
# Training curves for the CNN: accuracy on top, loss below.
plt.figure(figsize=(15,10))

plt.subplot(2,1,1)
plt.plot(cnn_history.history['acc'])
plt.plot(cnn_history.history['val_acc'])
plt.title('CNN model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
# The second curve is the validation split taken during fit(), not the test
# set, so it is labelled 'validation' (matching the feed-forward plot).
plt.legend(['train','validation'], loc = 'upper left')

plt.subplot(2,1,2)
plt.plot(cnn_history.history['loss'])
plt.plot(cnn_history.history['val_loss'])
plt.title('CNN model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['train','validation'], loc = 'upper right')
plt.show()

# LSTM model

In [None]:
# Recurrent classifier: non-trainable embeddings initialised from
# `embedding_matrix`, one LSTM layer with 128 units, and a sigmoid output unit.
embedding_layer = Embedding(
    vocab_size,
    100,
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=False,
)

lstm_model = Sequential()
lstm_model.add(embedding_layer)
lstm_model.add(LSTM(units=128))
lstm_model.add(Dense(1, activation='sigmoid'))

lstm_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [None]:
# summary() prints the table itself and returns None, so wrapping it in print()
# would append a stray "None" line to the cell output.
lstm_model.summary()

In [None]:
# Train the LSTM with the same settings as the other two models ...
lstm_history = lstm_model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=15,
    verbose=1,
    validation_split=0.2,
)

# ... then score it on the held-out test split.
lstm_score = lstm_model.evaluate(
    X_test,
    y_test,
    verbose=1,
)

In [None]:
# lstm_score holds the evaluate() results: index 0 is reported as the score,
# index 1 as the accuracy.
lstm_test_score = lstm_score[0]
lstm_test_accuracy = lstm_score[1]
print("Test Score:", lstm_test_score)
print("Test Accuracy:", lstm_test_accuracy)

In [None]:
# Training curves for the LSTM: accuracy on top, loss below.
plt.figure(figsize=(15,10))
plt.subplot(2,1,1)
plt.plot(lstm_history.history['acc'])
plt.plot(lstm_history.history['val_acc'])
plt.title('LSTM model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
# The second curve is the validation split taken during fit(), not the test
# set, so it is labelled 'validation' (matching the feed-forward plot).
plt.legend(['train','validation'], loc='upper left')

plt.subplot(2,1,2)
plt.plot(lstm_history.history['loss'])
plt.plot(lstm_history.history['val_loss'])
# This panel shows the LSTM's loss; the title previously said "CNN".
plt.title('LSTM model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
# Same legend position as the loss panels of the other two models.
plt.legend(['train','validation'], loc='upper right')
plt.show()

In [None]:
# Show the LSTM's sigmoid output for the test tweet at position 2.
lstm_test_predictions = lstm_model.predict(X_test)
lstm_test_predictions[2]