# Word Embeddings Model for Twitter Sentiment Analysis

Tokens are represented with pre-trained word embeddings from GloVe.

A deep neural network (LSTM followed by dense layers) is used to capture relationships between tokens.

# Imports and Constants

In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, plot_confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import precision_recall_curve, auc
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelBinarizer
import evaluation_functions
from sklearn.feature_extraction.text import TfidfTransformer 
from imblearn.over_sampling import SMOTE
import ast
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras import models
from keras import layers
from keras import metrics
from sklearn.metrics import confusion_matrix
import seaborn as sns
from keras.callbacks import EarlyStopping
from nltk.stem import WordNetLemmatizer
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
import os

ModuleNotFoundError: No module named 'imblearn'

In [None]:
# Directory and file name of the tokenized tweet data.
# NOTE(review): this is an absolute, machine-specific Windows path, unlike the
# relative GloVe/model paths below -- adjust it to your own data location.
DATA_FILE_PATH = 'C:\\Users\\Yostina\\Twitter\\'
TOKENIZED_DATA_FILE_NAME = 'tokenized_Feedback_data.csv'
# show full column contents (no truncation) when DataFrames are displayed
pd.set_option('display.max_colwidth', None)
# seed reused by the train/test and train/validation splits
RANDOM_STATE = 42
# GloVe file originally downloaded from https://nlp.stanford.edu/projects/glove/
GLOVE_FILE_PATH = '../data/glove.twitter.27B/'
GLOVE_FILE_NAME = 'glove.twitter.27B.100d.txt'
# location the best checkpoint is written to during training and reloaded from afterwards
MODEL_FILE_PATH = '../models/emb_model_05/'
MODEL_FILE_NAME = 'best_model.h5'

# Load Data

In [None]:
df = pd.read_csv(DATA_FILE_PATH + TOKENIZED_DATA_FILE_NAME)

In [None]:
df.head()

In [None]:
# Convert each token list stored as a string in the CSV back into a real list of strings.
# Values that are already lists are passed through untouched so this cell can be
# re-run on the same kernel (ast.literal_eval raises when handed a list).
df['tokens'] = df['tokens'].map(
    lambda value: value if isinstance(value, list) else ast.literal_eval(value)
)

In [None]:
# The Keras Tokenizer works on plain strings, so glue every token list back
# together into one space-separated string per row.
df['tokens_text'] = df['tokens'].apply(lambda token_list: ' '.join(token_list))

In [None]:
data = df[['tokens_text', 'Category']]

In [None]:
data.head()

# Get vectors from GloVe file

In [None]:
# Every distinct token appearing in any row; used below to filter the GloVe file.
total_vocabulary = {token for token_list in df['tokens'] for token in token_list}

In [None]:
# Build {token: vector} for the tokens that occur in our corpus only, so the
# whole GloVe file never has to be held in memory.
glove = {}
# The file is read as bytes and split on ASCII whitespace; only the token
# (first field) is decoded, the remaining fields are the vector components.
with open(GLOVE_FILE_PATH+GLOVE_FILE_NAME, 'rb') as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        word = fields[0].decode('utf-8')
        if word not in total_vocabulary:
            continue
        glove[word] = np.array(fields[1:], dtype=np.float32)

# Split Data

In [None]:
# Hold out 20% of the rows for testing; stratifying on Category keeps the
# class proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    data['tokens_text'],
    data['Category'],
    stratify=data['Category'],
    random_state=RANDOM_STATE,
    test_size=0.2,
)

In [None]:
X_train.shape

In [None]:
X_test.shape

# Convert Words into Integers

In [None]:
# Sequence length the Embedding layer is declared with (its input_length)
MAX_TOKENS = 30
# Vocabulary cap: passed to the Tokenizer as num_words and used as the number
# of rows in the embedding weight matrix / Embedding input_dim
NUM_WORDS = 5000

In [None]:
# filters='' and lower=False leave the text exactly as given, and split=' '
# mirrors the single-space join used to build tokens_text.
# NOTE(review): presumably cleaning/lowercasing already happened when the
# tokens were produced -- confirm against the preprocessing notebook.
tokenizer = Tokenizer(num_words=NUM_WORDS, filters='', lower=False, split=' ')

In [None]:
tokenizer.fit_on_texts(X_train)

In [None]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [None]:
# Pad/truncate every sequence to exactly MAX_TOKENS so the train and test arrays
# share the width the Embedding layer is declared with (input_length=MAX_TOKENS).
# Without maxlen each call pads to the longest sequence it happens to receive,
# so the two arrays could end up with different widths.
# Keras defaults apply: both padding and truncation happen at the start ('pre').
X_train_seq_pad = pad_sequences(X_train_seq, maxlen=MAX_TOKENS)
X_test_seq_pad = pad_sequences(X_test_seq, maxlen=MAX_TOKENS)

# Encode Labels

In [None]:
le = LabelEncoder()

In [None]:
y_train_enc = le.fit_transform(y_train)

In [None]:
y_test_enc = le.transform(y_test)

In [None]:
# One-hot encode the integer labels. num_classes is pinned to the number of
# classes the LabelEncoder learned so both arrays always have the same width;
# otherwise to_categorical infers it from the largest label present in each array.
y_train_oh = to_categorical(y_train_enc, num_classes=len(le.classes_))
y_test_oh = to_categorical(y_test_enc, num_classes=len(le.classes_))

# Create Validation Set

In [None]:
# Carve 10% of the training rows off as a validation set for early stopping
# and checkpointing. Note: unlike the train/test split, this one is not stratified.
X_train_emb, X_val_emb, y_train_emb, y_val_emb = train_test_split(
    X_train_seq_pad,
    y_train_oh,
    random_state=RANDOM_STATE,
    test_size=0.1,
)

# Create Matrix of Weights

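Build the embedding weight matrix from the GloVe dictionary: row *i* holds the GloVe vector of the token with tokenizer index *i* (rows stay zero for tokens without a GloVe vector).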

In [None]:
# One 100-dimensional row per tokenizer index below NUM_WORDS; rows stay all-zero
# for tokens that have no GloVe vector.
emb_matrix = np.zeros((NUM_WORDS, 100))
for word, word_id in tokenizer.word_index.items():
    # Stop at the first index outside the vocabulary cap (this assumes word_index
    # is iterated in ascending index order, as the original loop does).
    if word_id >= NUM_WORDS:
        break
    glove_vector = glove.get(word)
    if glove_vector is not None:
        emb_matrix[word_id] = glove_vector

# Setup Model

In [None]:
emb_model = models.Sequential()

In [None]:
emb_model.add(layers.Embedding(input_dim=NUM_WORDS, output_dim=100, input_length=MAX_TOKENS))

In [None]:
emb_model.add(layers.LSTM(25, return_sequences=True))

In [None]:
# Collapse the per-time-step LSTM outputs (return_sequences=True) into a single
# vector by taking the maximum over the time dimension.
emb_model.add(layers.GlobalMaxPool1D())
# Dropout with rate 0.5 before and after the 50-unit ReLU layer for regularisation.
emb_model.add(layers.Dropout(0.5))
emb_model.add(layers.Dense(50, activation='relu'))
emb_model.add(layers.Dropout(0.5))

In [None]:
emb_model.add(layers.Dense(3, activation='softmax'))

Set the weights of the embedding layer to the pre-trained GloVe vectors and set `trainable` to `False` so that they are not updated during training.

In [None]:
# layers[0] is the Embedding layer; load the GloVe-derived matrix as its weights.
emb_model.layers[0].set_weights([emb_matrix])
# Freeze the layer (done before compile) so the pre-trained vectors stay fixed.
emb_model.layers[0].trainable = False

In [2]:
early_stopping = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)

NameError: name 'EarlyStopping' is not defined

In [None]:
# Make sure the checkpoint folder exists before training starts; depending on the
# Keras version the callback may not create missing directories itself.
os.makedirs(MODEL_FILE_PATH, exist_ok=True)

# Keep only the weights of the epoch with the highest validation accuracy.
# Note: early stopping watches val_loss, whereas this checkpoint watches
# val_categorical_accuracy.
save_best = ModelCheckpoint(os.path.join(MODEL_FILE_PATH,MODEL_FILE_NAME), 
                     monitor='val_categorical_accuracy', 
                     mode='max', 
                     verbose=1, 
                     save_best_only=True)

In [3]:
# Multi-class set-up: softmax output + categorical cross-entropy on one-hot labels.
# The metric is what produces the 'categorical_accuracy' / 'val_categorical_accuracy'
# history entries used by the checkpoint callback and the training plots.
emb_model.compile(
    optimizer='nadam',
    loss='categorical_crossentropy',
    metrics=[metrics.categorical_accuracy],
)

NameError: name 'emb_model' is not defined

In [4]:
# Train for at most 20 epochs; early stopping may end training sooner and the
# checkpoint callback writes the best epoch to disk.
emb_model_results = emb_model.fit(
    x=X_train_emb,
    y=y_train_emb,
    epochs=20,
    validation_data=(X_val_emb, y_val_emb),
    callbacks=[early_stopping, save_best],
)

NameError: name 'emb_model' is not defined

In [None]:
def visualize_training_results(results):
    """Plot validation-vs-training curves for loss and for categorical accuracy.

    Parameters
    ----------
    results : object exposing a ``history`` mapping (as returned by ``model.fit``)
        Must contain the keys ``'val_loss'``, ``'loss'``,
        ``'val_categorical_accuracy'`` and ``'categorical_accuracy'``.

    Two separate figures are drawn and shown, one per metric, each with the
    validation curve plotted first.
    """
    history = results.history
    # (title, y-axis label, history keys in plotting order) for each figure
    panels = (
        ('Loss', 'Loss', ['val_loss', 'loss']),
        ('Accuracy', 'Accuracy', ['val_categorical_accuracy', 'categorical_accuracy']),
    )
    for title, y_label, series_names in panels:
        plt.figure()
        for series_name in series_names:
            plt.plot(history[series_name])
        plt.legend(series_names)
        plt.title(title)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.show()

In [5]:
visualize_training_results(emb_model_results)

NameError: name 'visualize_training_results' is not defined

# Load Saved Model

In [6]:
# Reload the best checkpoint from disk. The path is built with os.path.join,
# exactly as where the ModelCheckpoint callback is configured, so both always
# point at the same file even if MODEL_FILE_PATH loses its trailing separator.
saved_model = load_model(os.path.join(MODEL_FILE_PATH, MODEL_FILE_NAME))

NameError: name 'load_model' is not defined

# Evaluate Model on Test Data

In [7]:
y_hat = saved_model.predict(X_test_seq_pad)

NameError: name 'saved_model' is not defined

In [8]:
results = saved_model.evaluate(X_test_seq_pad, y_test_oh)

NameError: name 'saved_model' is not defined

In [9]:
results

NameError: name 'results' is not defined

In [10]:
print(f'Test Accuracy = {round(results[1], 4)}')

NameError: name 'results' is not defined

In [11]:
print(classification_report(y_test_enc, np.argmax(y_hat, axis=1), target_names=le.classes_))

NameError: name 'classification_report' is not defined

In [12]:
cm = confusion_matrix(y_test_enc, np.argmax(y_hat, axis=1))

NameError: name 'confusion_matrix' is not defined

In [13]:
f, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(cm, 
            annot=True, 
            xticklabels=le.classes_, 
            yticklabels=le.classes_,
            fmt='d',
            ax=ax)
# confusion_matrix puts the true classes on the rows and the predictions on the columns
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Confusion Matrix (Test Data)')
# Some matplotlib releases cut the top/bottom heatmap rows in half. Setting the
# y-limits to the full extent (n_rows down to 0) repairs those versions and is a
# no-op elsewhere, whereas shifting the current limits by 0.5 adds blank margins
# on versions that do not have the bug.
ax.set_ylim(len(cm), 0)
plt.show()

NameError: name 'plt' is not defined

# Summary

...

The deep neural network improved the accuracy of the model compared to the baseline word embedding model, emb_model_01, but at the expense of the recall of the Negative class.