# Cluster for EBC Sentiment Analysis

Word Embeddings with 8 nodes in the embedding layer of the Neural Network

Lemmatization is used on the input tokens/words.

# Imports and Constants

In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, plot_confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import precision_recall_curve, auc
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelBinarizer
import evaluation_functions
from sklearn.feature_extraction.text import TfidfTransformer 
from imblearn.over_sampling import SMOTE
import ast
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras import models
from keras import layers
from keras import metrics
from sklearn.metrics import confusion_matrix
import seaborn as sns
from keras.callbacks import EarlyStopping
from nltk.stem import WordNetLemmatizer
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
import os

Using TensorFlow backend.


ImportError: Traceback (most recent call last):
  File "C:\Users\Yostina\anaconda3\lib\site-packages\tensorflow\python\pywrap_tensorflow.py", line 64, in <module>
    from tensorflow.python._pywrap_tensorflow_internal import *
ImportError: DLL load failed while importing _pywrap_tensorflow_internal: A dynamic link library (DLL) initialization routine failed.


Failed to load the native TensorFlow runtime.

See https://www.tensorflow.org/install/errors

for some common reasons and solutions.  Include the entire stack trace
above this error message when asking for help.

In [None]:
# Directory that holds the tokenized data CSV.
# NOTE(review): absolute, machine-specific Windows path -- the notebook will not
# run on another machine without editing it (MODEL_FILE_PATH below is relative).
DATA_FILE_PATH = 'C:\\Users\\Yostina\\SentimentNLP\\'
TOKENIZED_DATA_FILE_NAME = 'tokenized_twitter_data.csv'
# Do not truncate long cell contents when displaying DataFrames.
pd.set_option('display.max_colwidth', None)
# Seed passed to both train_test_split calls so the splits are reproducible.
RANDOM_STATE = 42
# Directory and file name of the best-model checkpoint: written by the
# ModelCheckpoint callback and read back in the "Load Saved Model" section.
MODEL_FILE_PATH = '../models/'
MODEL_FILE_NAME = 'emb_model_01_best_model.h5'
# Flag checked by the training cell before calling emb_model.fit.
RUN_MODEL = False

# Load Data

In [None]:
# Read the tokenized data CSV into a DataFrame.
df = pd.read_csv(DATA_FILE_PATH + TOKENIZED_DATA_FILE_NAME)

In [None]:
df.head()

In [None]:
# Each `tokens` cell holds the string form of a list of strings;
# parse it back into an actual Python list.
df['tokens'] = df['tokens'].map(ast.literal_eval)

In [None]:
def lemmatize(token_list):
    """Lemmatize every token in ``token_list``.

    Each token is passed to ``WordNetLemmatizer.lemmatize`` with its default
    arguments. The lemmas are returned as a new list, in the same order as
    the input tokens.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in token_list:
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

In [None]:
# Add a `tokens_lemma` column holding the lemmatized token list of each row.
df['tokens_lemma'] = df.tokens.map(lemmatize)

In [None]:
# convert list of strings (tokens) to single string for use with Keras Tokenizer
# NOTE(review): this overwrites the list-valued `tokens` column with plain
# strings, so the earlier ast.literal_eval cell cannot be re-run on `df`
# without reloading the CSV first.
df['tokens'] = df.tokens_lemma.map(' '.join)

In [None]:
# Keep only the model input text (`tokens`) and the target label (`sentiment`).
data = df[['tokens', 'sentiment']]

In [None]:
data.head()

# Split Data

In [None]:
# Hold out 20% of the rows as the test set, stratified on the sentiment label
# so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    data['tokens'],
    data['sentiment'],
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=data['sentiment'],
)

In [None]:
X_train.shape

In [None]:
X_test.shape

# Convert Words into Integers

In [None]:
# Max number of tokens per tweet; also passed to the Embedding layer as its
# input_length, so padded sequences must have exactly this length.
MAX_TOKENS = 30
# Max number of words/tokens in the corpus; passed to the Tokenizer as
# num_words and to the Embedding layer as input_dim.
NUM_WORDS = 5000

In [None]:
# filters='' and lower=False switch off the Tokenizer's own character filtering
# and lowercasing; text is split on single spaces only.
# NOTE(review): presumably the tokens were already cleaned upstream -- confirm.
tokenizer = Tokenizer(num_words=NUM_WORDS, filters='', lower=False, split=' ')

In [None]:
# Build the word index from the training text only; the test text is not used here.
tokenizer.fit_on_texts(X_train)

In [None]:
# Convert each text into a sequence of integer word indices using the
# tokenizer fitted on the training text.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [None]:
# Pad (or truncate) every sequence to exactly MAX_TOKENS entries.
# Without an explicit maxlen, pad_sequences pads each call to the longest
# sequence it is given, so the train and test matrices could end up with
# different widths -- and neither is guaranteed to match the Embedding layer's
# input_length=MAX_TOKENS.
X_train_seq_pad = pad_sequences(X_train_seq, maxlen=MAX_TOKENS)
X_test_seq_pad = pad_sequences(X_test_seq, maxlen=MAX_TOKENS)

# Encode Labels

In [None]:
# Encoder that maps the sentiment labels to integer class ids.
le = LabelEncoder()

In [None]:
# Learn the label -> integer mapping from the training labels and encode them.
y_train_enc = le.fit_transform(y_train)

In [None]:
# Encode the test labels with the mapping learned from the training labels.
y_test_enc = le.transform(y_test)

In [None]:
# One-hot encode the integer labels. num_classes is pinned to the number of
# classes the encoder learned, so the train and test matrices always have the
# same width even if a class were missing from one of the splits.
NUM_CLASSES = len(le.classes_)
y_train_oh = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_test_oh = to_categorical(y_test_enc, num_classes=NUM_CLASSES)

# Create Validation Set

In [None]:
# Carve 10% of the training data off as a validation set for the callbacks.
# NOTE(review): unlike the train/test split, this split is not stratified.
X_train_emb, X_val_emb, y_train_emb, y_val_emb = train_test_split(
    X_train_seq_pad,
    y_train_oh,
    test_size=0.1,
    random_state=RANDOM_STATE,
)

# Setup Model

In [None]:
# Start from an empty Sequential model; layers are added in the following cells.
emb_model = models.Sequential()

In [None]:
# Embedding layer: vocabulary of NUM_WORDS indices, 8-dimensional vectors,
# input sequences of length MAX_TOKENS.
emb_model.add(layers.Embedding(input_dim=NUM_WORDS, output_dim=8, input_length=MAX_TOKENS))

In [None]:
# Flatten the per-token embeddings into one vector per sample for the Dense layer.
emb_model.add(layers.Flatten())

In [None]:
# Softmax output layer with one unit per sentiment class (3 classes).
emb_model.add(layers.Dense(3, activation='softmax'))

In [None]:
# Stop training once val_loss has failed to improve for 5 consecutive epochs.
early_stopping = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)

In [None]:
# Save the model to MODEL_FILE_PATH/MODEL_FILE_NAME whenever validation
# accuracy reaches a new maximum (save_best_only=True).
# NOTE(review): the checkpoint tracks val_categorical_accuracy while
# early_stopping tracks val_loss, so "best" is defined differently by the two
# callbacks -- confirm this is intended.
save_best = ModelCheckpoint(os.path.join(MODEL_FILE_PATH,MODEL_FILE_NAME), 
                     monitor='val_categorical_accuracy', 
                     mode='max', 
                     verbose=1, 
                     save_best_only=True)

In [None]:
# categorical_crossentropy pairs with the one-hot encoded labels. The metric is
# reported under the name 'categorical_accuracy', which is the key the
# checkpoint callback and the training plots look up.
emb_model.compile(loss='categorical_crossentropy', 
                  optimizer='nadam', 
                  metrics=[metrics.categorical_accuracy])

In [None]:
# Train only when RUN_MODEL is set; otherwise this cell is a no-op and the
# checkpoint written by an earlier run is loaded further down.
# (The original `if RUN_MODEL = True` was a SyntaxError: assignment instead of
# a comparison, and no trailing colon.)
if RUN_MODEL:
    # The checkpoint callback writes into MODEL_FILE_PATH; make sure it exists.
    os.makedirs(MODEL_FILE_PATH, exist_ok=True)
    emb_model_results = emb_model.fit(X_train_emb,
                                      y_train_emb,
                                      validation_data=(X_val_emb, y_val_emb),
                                      epochs=20,
                                      callbacks=[early_stopping, save_best])

In [None]:
def visualize_training_results(results):
    """Plot validation vs. training curves for loss and accuracy.

    ``results`` is the object returned by ``model.fit``; its ``history`` dict
    must contain the keys ``loss``, ``val_loss``, ``categorical_accuracy`` and
    ``val_categorical_accuracy``. One figure is drawn and shown per metric,
    with the validation curve plotted first.
    """
    history = results.history
    # (figure title / y-axis label, key of the training metric in `history`)
    panels = (
        ('Loss', 'loss'),
        ('Accuracy', 'categorical_accuracy'),
    )
    for title, metric in panels:
        val_metric = 'val_' + metric
        plt.figure()
        plt.plot(history[val_metric])
        plt.plot(history[metric])
        plt.legend([val_metric, metric])
        plt.title(title)
        plt.xlabel('Epochs')
        plt.ylabel(title)
        plt.show()

In [None]:
# emb_model_results only exists when the model was trained in this session;
# skip the plots otherwise instead of failing with a NameError.
if RUN_MODEL:
    visualize_training_results(emb_model_results)

# Load Saved Model

In [None]:
# Load the best checkpoint. The path is built with os.path.join, exactly as in
# the ModelCheckpoint callback, so both always point at the same file even if
# MODEL_FILE_PATH is changed to a value without a trailing separator.
saved_model = load_model(os.path.join(MODEL_FILE_PATH, MODEL_FILE_NAME))

# Evaluate Model on Test Data

In [None]:
# Per-class scores for every test sample; reduced to class ids with argmax below.
y_hat = saved_model.predict(X_test_seq_pad)

In [None]:
# Evaluate on the held-out test set.
# NOTE(review): presumably returns [loss, categorical_accuracy], matching the
# compile() call above -- index 1 is reported as the accuracy below.
results = saved_model.evaluate(X_test_seq_pad, y_test_oh)

In [None]:
results

In [None]:
# results[1] is the metric value that follows the loss in the evaluate() output.
print(f'Test Accuracy = {round(results[1], 4)}')

In [None]:
# argmax over axis 1 turns the per-class scores into predicted class ids, which
# are comparable with the integer-encoded true labels.
print(classification_report(y_test_enc, np.argmax(y_hat, axis=1), target_names=le.classes_))

In [None]:
# Confusion matrix of true (first argument) vs. predicted class ids on the test set.
cm = confusion_matrix(y_test_enc, np.argmax(y_hat, axis=1))

In [None]:
f, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(cm,
            annot=True,
            xticklabels=le.classes_,
            yticklabels=le.classes_,
            fmt='d',
            ax=ax)
# confusion_matrix puts the true classes on rows and the predictions on columns.
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Confusion Matrix')
# matplotlib 3.1.1 cut the top and bottom heatmap rows in half. Shifting the
# current limits by 0.5 repairs that version only and adds blank margins on
# every other version, so set the full (inverted) y-range explicitly instead.
ax.set_ylim(cm.shape[0], 0)
plt.show()

# Summary

The model does a poor job of identifying the minority classes, negative and positive, which have recall scores of 0.25 and 0.51 respectively. It still has a high accuracy of 0.66 because it does such a good job of correctly identifying the majority class, neutral, which has a recall score of 0.79.