# Lemmatized, Back-translated Augmented Train

# **This section shows the results for the back-translation dataset trained using default embeddings.**

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive; every CSV and embedding path below is read from there.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras import layers
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import pandas as pd
import gensim
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import matthews_corrcoef


# Load the back-translation augmented training set together with the test and
# validation CSVs from the mounted Drive folder.
train_df_bt = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/IS460 ML Datasets/latest/lemmatized_bt_augment_train.csv")
test_df_bt = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/IS460 ML Datasets/latest/test.csv")
valid_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/IS460 ML Datasets/latest/validation.csv")


In [None]:
def convert_prediction_1D(predictions):
  """Collapse rows of per-class scores into a flat list of sentiment labels.

  For every row, the position of its largest score is located (the first
  position wins on ties) and translated to a label: position 0 -> -1,
  position 1 -> 0, any other position -> 1.

  Args:
    predictions: iterable of score rows (e.g. the output of model.predict).

  Returns:
    A list with one label (-1, 0 or 1) per row.
  """
  label_by_position = {0: -1, 1: 0}
  labels = []
  for scores in predictions:
    top_score = max(scores)
    winner = list(scores).index(top_score)
    labels.append(label_by_position.get(winner, 1))
  return labels

In [None]:
# Preview the first rows of the raw back-translation training frame.
train_df_bt.head()

Unnamed: 0.1,Unnamed: 0,text,sentiment,lemmatized and stopwords_removed
0,0,should uber use driverless cars to ease safety...,0,uber use driverless car ease safety concern
1,1,oh hai minorityreport is making your driverles...,0,oh hai minorityreport driverless transportatio...
2,2,who is responsible if a self driving car gets ...,0,responsible self drive car accident
3,3,i almost got rear ended by the google car iron...,-1,got rear end google car ironic
4,4,self driving cars will be a hit until the firs...,-1,self drive car hit family hit algorithm sue


In [None]:
# Reduce every split to the cleaned-text column and the label column, and expose
# the cleaned text under the shorter column name "text".
train_df_bt = (
    train_df_bt[['lemmatized and stopwords_removed', 'sentiment']]
    .rename(columns={"lemmatized and stopwords_removed": "text_cleaned"})
    .rename(columns={"text_cleaned": "text"})
)

test_df_bt = (
    test_df_bt[['lemmatized and stopwords_removed', 'sentiment']]
    .rename(columns={"lemmatized and stopwords_removed": "text_cleaned"})
    .rename(columns={"text_cleaned": "text"})
)

valid_df = (
    valid_df[['lemmatized and stopwords_removed', 'sentiment']]
    .rename(columns={"lemmatized and stopwords_removed": "text_cleaned"})
    .rename(columns={"text_cleaned": "text"})
)




In [None]:
# Force both training columns to plain strings. Only the training frame is
# converted here; the test and validation frames keep the dtypes read_csv gave them.
train_df_bt['text'] = train_df_bt['text'].astype("str")
train_df_bt['sentiment'] = train_df_bt['sentiment'].astype("str")

In [None]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 6000

# Length every sequence is padded to. NOTE: this is the word count of the LONGEST
# training sentence (max over whitespace-split lengths), not the average length.
# This particular variable affects the training process a lot.

MAX_SEQUENCE_LENGTH = max([len(s.split()) for s in train_df_bt['text']])

# This is fixed.
EMBEDDING_DIM = 100

# num_words = the maximum number of words to keep, based on word frequency. Only the most common num_words-1 words will be kept.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(train_df_bt['text'].values)


# word_index is the tokenizer's full token -> id mapping; its size is what the
# embedding layers below are dimensioned from.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 6691 unique tokens.


In [None]:
# Encode each split with the tokenizer fitted on the training text, then pad the
# encoded sequences to MAX_SEQUENCE_LENGTH.
X_train, X_test, X_valid = [
    pad_sequences(tokenizer.texts_to_sequences(split['text'].values), maxlen=MAX_SEQUENCE_LENGTH)
    for split in (train_df_bt, test_df_bt, valid_df)
]

print('Shape of data tensor (X_train):', X_train.shape)
print('Shape of data tensor (X_test):', X_test.shape)
print('Shape of data tensor (X_valid):', X_valid.shape)


Shape of data tensor (X_train): (8162, 43)
Shape of data tensor (X_test): (671, 43)
Shape of data tensor (X_valid): (670, 43)


In [None]:
# One-hot encode the sentiment column of every split.
# NOTE(review): convert_prediction_1D maps column 0/1/2 back to -1/0/1, so each
# split presumably has to contain all three labels -- confirm.
Y_train, Y_test, Y_valid = [
    pd.get_dummies(split['sentiment']).values
    for split in (train_df_bt, test_df_bt, valid_df)
]

print('Shape of label tensor (Y_train):', Y_train.shape)
print('Shape of label tensor (Y_test):', Y_test.shape)

# X_train, X_valid, Y_train, Y_valid = train_test_split(X_train, Y_train, test_size = 0.2, random_state = 99)
print(X_train.shape, Y_train.shape)
print(X_valid.shape, Y_valid.shape)

Shape of label tensor (Y_train): (8162, 3)
Shape of label tensor (Y_test): (671, 3)
(8162, 43) (8162, 3)
(670, 43) (670, 3)


In [None]:
# LSTM
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, GlobalMaxPooling1D
from keras.callbacks import EarlyStopping
import tensorflow as tf 
from tensorflow import keras
from keras.regularizers import l2

# Classifier for the back-translation data: a trainable embedding layer, two
# stacked LSTMs and a small dense head ending in a 3-way softmax.
model = Sequential()
adam = tf.optimizers.Adam(learning_rate=0.001)
# One embedding row per entry of word_index plus one extra row
# (presumably for the 0 value used as padding -- confirm).
model.add(layers.Embedding(len(word_index)+1, EMBEDDING_DIM, input_length=X_train.shape[1]))
# return_sequences=True so that the second LSTM receives one output per time step.
model.add(LSTM(128, return_sequences=True, dropout=0.5, recurrent_dropout=0.2))
model.add(LSTM(32))
model.add(Dense(32,activation='relu'))
model.add(layers.Dense(3, activation="softmax"))
model.compile(optimizer=adam, loss="categorical_crossentropy",
     metrics=['accuracy'])


model.summary()

epochs = 25
batch_size = 16


# Stop training when the validation loss has not improved for 3 epochs.
es = EarlyStopping(monitor='val_loss', mode="min", patience=3)
# history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_split=0.3, callbacks=[es])
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_valid, Y_valid), callbacks=[es])
# history = model.fit(X_train, Y_train, validation_data=(X_valid, Y_valid), epochs=epochs, batch_size=batch_size)
# history = model.fit(X_train, Y_train, validation_data=(X_valid, Y_valid), epochs=epochs, batch_size=batch_size)

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 117, 100)          860200    
_________________________________________________________________
lstm_14 (LSTM)               (None, 117, 128)          117248    
_________________________________________________________________
lstm_15 (LSTM)               (None, 32)                20608     
_________________________________________________________________
dense_14 (Dense)             (None, 32)                1056      
_________________________________________________________________
dense_15 (Dense)             (None, 3)                 99        
Total params: 999,211
Trainable params: 999,211
Non-trainable params: 0
_________________________________________________________________
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25


In [None]:
# Loss and accuracy of the back-translation model on the test tensors.
accr = model.evaluate(X_test,Y_test)
print('Test set\n Loss: {:0.3f}\n  Test Accuracy: {:0.3f}'.format(accr[0],accr[1]))

# Decode the softmax rows into -1 / 0 / 1 labels so they can be compared with
# the raw sentiment column.
predictions = model.predict(X_test)
predictions = convert_prediction_1D(predictions)


# Score against test_df_bt: X_test / Y_test of this section were built from it,
# and test_df_lem does not exist yet at this point of a top-to-bottom run.
precision, recall, f1_score, none = precision_recall_fscore_support(test_df_bt['sentiment'], predictions, average='weighted')
m_corr = matthews_corrcoef(test_df_bt['sentiment'], predictions)

print("Precision is: ", precision)
print("Recall is: ", recall)
print("F1 Score is: ", f1_score)
print("Matthew Corr Score is: ", m_corr)

Test set
 Loss: 0.985
  Test Accuracy: 0.663
Precision is:  0.6660694240451904
Recall is:  0.6631892697466468
F1 Score is:  0.6602634488711754
Matthew Corr Score is:  0.3738437698084896


# Lemmatized, Original Data

# **This section shows the results for the original dataset trained using default embeddings (for your reference to compare with back-translation results in this notebook).**

In [None]:
# Original (non-augmented) lemmatized training set; the test and validation CSVs
# are the same files used by the other sections.
train_df_lem = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/IS460 ML Datasets/latest/lemmatized_original_train.csv")
test_df_lem = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/IS460 ML Datasets/latest/test.csv")
valid_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/IS460 ML Datasets/latest/validation.csv")

train_df_lem = train_df_lem[['lemmatized and stopwords_removed', 'sentiment']]
train_df_lem = train_df_lem.rename(columns={"lemmatized and stopwords_removed": "text_cleaned"})
# Rename from train_df_lem itself. Renaming from train_df_bt here would discard the
# original-data frame and silently train this section on the back-translated data.
train_df_lem = train_df_lem.rename(columns={"text_cleaned": "text"})

test_df_lem = test_df_lem[['lemmatized and stopwords_removed', 'sentiment']]
test_df_lem = test_df_lem.rename(columns={"lemmatized and stopwords_removed": "text_cleaned"})
test_df_lem = test_df_lem.rename(columns={"text_cleaned": "text"})

valid_df = valid_df[['lemmatized and stopwords_removed', 'sentiment']]
valid_df = valid_df.rename(columns={"lemmatized and stopwords_removed": "text_cleaned"})
valid_df = valid_df.rename(columns={"text_cleaned": "text"})

train_df_lem['text'] = train_df_lem['text'].astype("str")
train_df_lem['sentiment'] = train_df_lem['sentiment'].astype("str")

# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 6000

# Length every sequence is padded to: the word count of the LONGEST training
# sentence (not the average). This particular variable affects the training process a lot.
MAX_SEQUENCE_LENGTH = max([len(s.split()) for s in train_df_lem['text']])

# This is fixed.
EMBEDDING_DIM = 100

# num_words = the maximum number of words to keep, based on word frequency. Only the most common num_words-1 words will be kept.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(train_df_lem['text'].values)


word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Encode every split with the tokenizer fitted on the original training text.
X_train = tokenizer.texts_to_sequences(train_df_lem['text'].values)
X_test = tokenizer.texts_to_sequences(test_df_lem['text'].values)
X_valid = tokenizer.texts_to_sequences(valid_df['text'].values)


X_train = pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH)
X_test = pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH)
X_valid = pad_sequences(X_valid, maxlen=MAX_SEQUENCE_LENGTH)

print('Shape of data tensor (X_train):', X_train.shape)
print('Shape of data tensor (X_test):', X_test.shape)
print('Shape of data tensor (X_valid):', X_valid.shape)

# One-hot encode the sentiment labels of every split.
Y_train = pd.get_dummies(train_df_lem['sentiment']).values
Y_test = pd.get_dummies(test_df_lem['sentiment']).values
Y_valid = pd.get_dummies(valid_df['sentiment']).values

print('Shape of label tensor (Y_train):', Y_train.shape)
print('Shape of label tensor (Y_test):', Y_test.shape)

# X_train, X_valid, Y_train, Y_valid = train_test_split(X_train, Y_train, test_size = 0.2, random_state = 99)
print(X_train.shape, Y_train.shape)
print(X_valid.shape, Y_valid.shape)


Found 8601 unique tokens.
Shape of data tensor (X_train): (8162, 117)
Shape of data tensor (X_test): (671, 117)
Shape of data tensor (X_valid): (670, 117)
Shape of label tensor (Y_train): (8162, 3)
Shape of label tensor (Y_test): (671, 3)
(8162, 117) (8162, 3)
(670, 117) (670, 3)


In [None]:
# LSTM
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, GlobalMaxPooling1D
from keras.callbacks import EarlyStopping
import tensorflow as tf 
from tensorflow import keras
from keras.regularizers import l2

# Same architecture as the back-translation run, rebuilt from scratch so it is
# fitted on whatever X_train / Y_train the preceding preprocessing cell produced.
model = Sequential()
adam = tf.optimizers.Adam(learning_rate=0.001)
# One embedding row per entry of word_index plus one extra row
# (presumably for the 0 value used as padding -- confirm).
model.add(layers.Embedding(len(word_index)+1, EMBEDDING_DIM, input_length=X_train.shape[1]))
# return_sequences=True so that the second LSTM receives one output per time step.
model.add(LSTM(128, return_sequences=True, dropout=0.5, recurrent_dropout=0.2))
model.add(LSTM(32))
model.add(Dense(32,activation='relu'))
model.add(layers.Dense(3, activation="softmax"))
model.compile(optimizer=adam, loss="categorical_crossentropy",
     metrics=['accuracy'])


model.summary()

epochs = 25
batch_size = 16


# Stop training when the validation loss has not improved for 3 epochs.
es = EarlyStopping(monitor='val_loss', mode="min", patience=3)
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_valid, Y_valid), callbacks=[es])

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 117, 100)          860200    
_________________________________________________________________
lstm_12 (LSTM)               (None, 117, 128)          117248    
_________________________________________________________________
lstm_13 (LSTM)               (None, 32)                20608     
_________________________________________________________________
dense_12 (Dense)             (None, 32)                1056      
_________________________________________________________________
dense_13 (Dense)             (None, 3)                 99        
Total params: 999,211
Trainable params: 999,211
Non-trainable params: 0
_________________________________________________________________
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25


In [None]:
# Test loss / accuracy first, then weighted precision, recall and F1 plus the
# Matthews correlation of the decoded labels against the raw sentiment column.
accr = model.evaluate(X_test,Y_test)
print('Test set\n Loss: {:0.3f}\n  Test Accuracy: {:0.3f}'.format(*accr[:2]))

predictions = convert_prediction_1D(model.predict(X_test))

true_labels = test_df_lem['sentiment']
precision, recall, f1_score, none = precision_recall_fscore_support(true_labels, predictions, average='weighted')
m_corr = matthews_corrcoef(true_labels, predictions)

print("Precision is: ", precision)
print("Recall is: ", recall)
print("F1 Score is: ", f1_score)
print("Matthew Corr Score is: ", m_corr)

Test set
 Loss: 1.086
  Test Accuracy: 0.617
Precision is:  0.6032164042042557
Recall is:  0.61698956780924
F1 Score is:  0.6069937080316189
Matthew Corr Score is:  0.26197648657716915


# Lemmatized, Synonym, Augmented Data


# **This section shows the results for the synonym replacement dataset trained using default embeddings (for your reference to compare with back-translation results).**

In [None]:
# Synonym-replacement augmented training set; test and validation CSVs are the
# same files used by the other sections.
train_df_syn = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/IS460 ML Datasets/latest/lemmatized_synonym_augment_train.csv")
test_df_lem = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/IS460 ML Datasets/latest/test.csv")
valid_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/IS460 ML Datasets/latest/validation.csv")

# Keep the cleaned-text and label columns only; the cleaned text ends up as "text".
train_df_syn = (
    train_df_syn[['lemmatized and stopwords_removed', 'sentiment']]
    .rename(columns={"lemmatized and stopwords_removed": "text_cleaned"})
    .rename(columns={"text_cleaned": "text"})
)

test_df_lem = (
    test_df_lem[['lemmatized and stopwords_removed', 'sentiment']]
    .rename(columns={"lemmatized and stopwords_removed": "text_cleaned"})
    .rename(columns={"text_cleaned": "text"})
)

valid_df = (
    valid_df[['lemmatized and stopwords_removed', 'sentiment']]
    .rename(columns={"lemmatized and stopwords_removed": "text_cleaned"})
    .rename(columns={"text_cleaned": "text"})
)

train_df_syn['text'] = train_df_syn['text'].astype("str")
train_df_syn['sentiment'] = train_df_syn['sentiment'].astype("str")

# Vocabulary cap handed to the Tokenizer (most frequent words).
MAX_NB_WORDS = 6000

# Padding length: the word count of the longest training sentence.
MAX_SEQUENCE_LENGTH = max(len(sentence.split()) for sentence in train_df_syn['text'])

# This is fixed.
EMBEDDING_DIM = 100

# num_words = the maximum number of words to keep, based on word frequency. Only the most common num_words-1 words will be kept.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(train_df_syn['text'].values)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Encode and pad every split with the tokenizer fitted on the synonym training text.
X_train, X_test, X_valid = [
    pad_sequences(tokenizer.texts_to_sequences(split['text'].values), maxlen=MAX_SEQUENCE_LENGTH)
    for split in (train_df_syn, test_df_lem, valid_df)
]

print('Shape of data tensor (X_train):', X_train.shape)
print('Shape of data tensor (X_test):', X_test.shape)
print('Shape of data tensor (X_valid):', X_valid.shape)

# One-hot encode the sentiment labels of every split.
Y_train, Y_test, Y_valid = [
    pd.get_dummies(split['sentiment']).values
    for split in (train_df_syn, test_df_lem, valid_df)
]

print('Shape of label tensor (Y_train):', Y_train.shape)
print('Shape of label tensor (Y_test):', Y_test.shape)

# X_train, X_valid, Y_train, Y_valid = train_test_split(X_train, Y_train, test_size = 0.2, random_state = 99)
print(X_train.shape, Y_train.shape)
print(X_valid.shape, Y_valid.shape)


Found 7454 unique tokens.
Shape of data tensor (X_train): (8162, 25)
Shape of data tensor (X_test): (671, 25)
Shape of data tensor (X_valid): (670, 25)
Shape of label tensor (Y_train): (8162, 3)
Shape of label tensor (Y_test): (671, 3)
(8162, 25) (8162, 3)
(670, 25) (670, 3)


In [None]:
# LSTM
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, GlobalMaxPooling1D
from keras.callbacks import EarlyStopping
import tensorflow as tf 
from tensorflow import keras
from keras.regularizers import l2

# Same architecture as the previous runs, rebuilt from scratch so it is fitted on
# whatever X_train / Y_train the preceding preprocessing cell produced.
model = Sequential()
adam = tf.optimizers.Adam(learning_rate=0.001)
# One embedding row per entry of word_index plus one extra row
# (presumably for the 0 value used as padding -- confirm).
model.add(layers.Embedding(len(word_index)+1, EMBEDDING_DIM, input_length=X_train.shape[1]))
# return_sequences=True so that the second LSTM receives one output per time step.
model.add(LSTM(128, return_sequences=True, dropout=0.5, recurrent_dropout=0.2))
model.add(LSTM(32))
model.add(Dense(32,activation='relu'))
model.add(layers.Dense(3, activation="softmax"))
model.compile(optimizer=adam, loss="categorical_crossentropy",
     metrics=['accuracy'])


model.summary()

epochs = 25
batch_size = 16


# Stop training when the validation loss has not improved for 3 epochs.
es = EarlyStopping(monitor='val_loss', mode="min", patience=3)
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_valid, Y_valid), callbacks=[es])

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 25, 100)           745500    
_________________________________________________________________
lstm_10 (LSTM)               (None, 25, 128)           117248    
_________________________________________________________________
lstm_11 (LSTM)               (None, 32)                20608     
_________________________________________________________________
dense_10 (Dense)             (None, 32)                1056      
_________________________________________________________________
dense_11 (Dense)             (None, 3)                 99        
Total params: 884,511
Trainable params: 884,511
Non-trainable params: 0
_________________________________________________________________
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25


In [None]:
# Test loss / accuracy first, then weighted precision, recall and F1 plus the
# Matthews correlation of the decoded labels against the raw sentiment column.
accr = model.evaluate(X_test,Y_test)
print('Test set\n Loss: {:0.3f}\n  Test Accuracy: {:0.3f}'.format(*accr[:2]))

predictions = convert_prediction_1D(model.predict(X_test))

true_labels = test_df_lem['sentiment']
precision, recall, f1_score, none = precision_recall_fscore_support(true_labels, predictions, average='weighted')
m_corr = matthews_corrcoef(true_labels, predictions)

print("Precision is: ", precision)
print("Recall is: ", recall)
print("F1 Score is: ", f1_score)
print("Matthew Corr Score is: ", m_corr)

Test set
 Loss: 1.019
  Test Accuracy: 0.666
Precision is:  0.6570211455312583
Recall is:  0.6661698956780924
F1 Score is:  0.6596717180857491
Matthew Corr Score is:  0.35635645703901353


# **This section will show the results for the different embeddings, for the back-translation dataset.**

***Modelling with back-translation dataset***

<ul>
  <li> LSTM with default Embeddings (results are in the first section above) </li>
  <li> LSTM with Word2Vec Embeddings </li>
  <li> LSTM with Pre-trained Word2Vec Embeddings </li>
  <li> LSTM with Glove Embeddings </li>
</ul>

# ***CBOW Word2Vec + LSTM (trained on the back-translation dataset, the best-performing dataset above)***

In [None]:
"""
CBOW Model
"""
# The "original data" and "synonym" sections rebind tokenizer, word_index,
# MAX_SEQUENCE_LENGTH and the X_* / Y_* tensors to their own training sets.
# Rebuild all of them from the back-translation frames so that the embedding
# matrices and LSTMs of the following sections really use the back-translation
# data when the notebook is run top to bottom.
MAX_SEQUENCE_LENGTH = max(len(s.split()) for s in train_df_bt['text'])
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(train_df_bt['text'].values)
word_index = tokenizer.word_index

X_train, X_test, X_valid = [
    pad_sequences(tokenizer.texts_to_sequences(split['text'].values), maxlen=MAX_SEQUENCE_LENGTH)
    for split in (train_df_bt, test_df_bt, valid_df)
]
Y_train, Y_test, Y_valid = [
    pd.get_dummies(split['sentiment']).values
    for split in (train_df_bt, test_df_bt, valid_df)
]

# Train 100-dimensional Word2Vec vectors on the whitespace-tokenised training
# sentences (sg is left at its default; the skip-gram section passes sg=1).
# NOTE(review): `size=` and `wv.vocab` are the gensim 3.x spellings (gensim 4
# uses `vector_size` and `wv.key_to_index`) -- confirm the installed version.
text_sentences = train_df_bt['text'].apply(lambda x: x.split())

model = gensim.models.Word2Vec(sentences=text_sentences, size=100, window=5, workers=4, min_count=1)
words = list(model.wv.vocab)
print(len(words))

# Persist the vectors in word2vec text format; the next cell reads this file back.
filename = 'selfdriving_embedding_word2vec.txt'
model.wv.save_word2vec_format(filename, binary=False)

6700


In [None]:
import numpy as np
# Weight table for the embedding layer: one row per tokenizer id (ids come from
# word_index) plus one extra row; rows without a trained vector stay all-zero.
num_words = len(word_index) + 1
word2vec_embedding_matrix = np.zeros((num_words, 100))

# Parse the word2vec text file written by the CBOW training cell.
embeddings_index = {}
with open("/content/selfdriving_embedding_word2vec.txt", encoding="utf-8") as f:
  for line in f:
    values = line.split()
    # Skip the "<vocab size> <dimension>" header line (and any malformed line):
    # a real entry is exactly one word followed by one number per dimension.
    if len(values) != word2vec_embedding_matrix.shape[1] + 1:
      continue
    embeddings_index[values[0]] = np.asarray(values[1:], dtype="float32")

for word, i in word_index.items():
  # Defensive guard: an id outside the matrix would raise an IndexError below.
  if i >= num_words:
    continue

  embedding_vector = embeddings_index.get(word)
  if embedding_vector is not None:
    word2vec_embedding_matrix[i] = embedding_vector
print(num_words)


6692


In [None]:
# CBOW LSTM

from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, GlobalMaxPooling1D
from keras.callbacks import EarlyStopping
import tensorflow as tf 
from tensorflow import keras
from keras.regularizers import l2


# LSTM classifier on top of the frozen CBOW Word2Vec vectors.
# NOTE(review): X_train / Y_train / MAX_SEQUENCE_LENGTH are whatever the most
# recently executed preprocessing cell left behind -- make sure they were built
# from the back-translation data before running this cell.
model = Sequential()
adam = tf.optimizers.Adam(learning_rate=0.001)
# The embedding layer is initialised from word2vec_embedding_matrix and kept
# fixed during training (trainable=False).
model.add(Embedding(num_words, 100, weights=[word2vec_embedding_matrix], input_length=MAX_SEQUENCE_LENGTH,
                    trainable=False))
# return_sequences=True so that the second LSTM receives one output per time step.
model.add(LSTM(256, return_sequences=True, dropout=0.5))
model.add(LSTM(32))
model.add(Dense(32,activation='relu'))
model.add(layers.Dense(3, activation="softmax"))

model.compile(optimizer=adam, loss="categorical_crossentropy",
     metrics=['accuracy'])

epochs = 25
batch_size = 16

model.summary()

# Stop training when the validation loss has not improved for 3 epochs.
es = EarlyStopping(monitor='val_loss', mode="min", patience=3)
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_valid, Y_valid), callbacks=[es])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 43, 100)           669200    
                                                                 
 lstm (LSTM)                 (None, 43, 256)           365568    
                                                                 
 lstm_1 (LSTM)               (None, 32)                36992     
                                                                 
 dense (Dense)               (None, 32)                1056      
                                                                 
 dense_1 (Dense)             (None, 3)                 99        
                                                                 
Total params: 1,072,915
Trainable params: 403,715
Non-trainable params: 669,200
_________________________________________________________________
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/2

In [None]:
# Test loss / accuracy first, then weighted precision, recall and F1 plus the
# Matthews correlation of the decoded labels against the raw sentiment column.
accr = model.evaluate(X_test,Y_test)
print('Test set\n Loss: {:0.3f}\n  Test Accuracy: {:0.3f}'.format(*accr[:2]))

predictions = convert_prediction_1D(model.predict(X_test))

true_labels = test_df_bt['sentiment']
precision, recall, f1_score, none = precision_recall_fscore_support(true_labels, predictions, average='weighted')
m_corr = matthews_corrcoef(true_labels, predictions)

print("Precision is: ", precision)
print("Recall is: ", recall)
print("F1 Score is: ", f1_score)
print("Matthew Corr Score is: ", m_corr)

Test set
 Loss: 1.045
  Test Accuracy: 0.365
Precision is:  0.4877826484956897
Recall is:  0.3651266766020864
F1 Score is:  0.32023744838627877
Matthew Corr Score is:  0.027181000001429895


# ***Skipgram Word2Vec LSTM***

In [None]:
"""
Skip-Gram Model
"""

# Whitespace-tokenise the back-translation training sentences for Word2Vec.
text_sentences = train_df_bt['text'].apply(lambda x: x.split())

# Same settings as the CBOW cell except for sg=1.
# NOTE(review): `size=` and `wv.vocab` are the gensim 3.x spellings (gensim 4
# uses `vector_size` and `wv.key_to_index`) -- confirm the installed version.
# This also rebinds `model`, replacing the Keras model of the previous section.
model = gensim.models.Word2Vec(sentences=text_sentences, size=100, window=5, workers=4, min_count=1, sg=1)
words = list(model.wv.vocab)
print(len(words))

# Persist the vectors in word2vec text format; the next cell reads this file back.
filename = 'selfdriving_embedding_word2vec_skipgram.txt'
model.wv.save_word2vec_format(filename, binary=False)


6700


In [None]:
# SkipGram word2vec
import numpy as np
# Weight table for the embedding layer: one row per tokenizer id (ids come from
# word_index) plus one extra row; rows without a trained vector stay all-zero.
num_words = len(word_index) + 1
word2vec_embedding_matrix = np.zeros((num_words, 100))

# Parse the word2vec text file written by the skip-gram training cell.
embeddings_index = {}
with open("/content/selfdriving_embedding_word2vec_skipgram.txt", encoding="utf-8") as f:
  for line in f:
    values = line.split()
    # Skip the "<vocab size> <dimension>" header line (and any malformed line):
    # a real entry is exactly one word followed by one number per dimension.
    if len(values) != word2vec_embedding_matrix.shape[1] + 1:
      continue
    embeddings_index[values[0]] = np.asarray(values[1:], dtype="float32")

for word, i in word_index.items():
  # Defensive guard: an id outside the matrix would raise an IndexError below.
  if i >= num_words:
    continue

  embedding_vector = embeddings_index.get(word)
  if embedding_vector is not None:
    word2vec_embedding_matrix[i] = embedding_vector
print(num_words)


6692


In [None]:
# Skipgram LSTM

from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, GlobalMaxPooling1D
from keras.callbacks import EarlyStopping
import tensorflow as tf 
from tensorflow import keras
from keras.regularizers import l2


# LSTM classifier on top of the frozen skip-gram Word2Vec vectors.
model = Sequential()
adam = tf.optimizers.Adam(learning_rate=0.001)
# The embedding layer is initialised from word2vec_embedding_matrix and kept
# fixed during training (trainable=False).
model.add(Embedding(num_words, 100, weights=[word2vec_embedding_matrix], input_length=MAX_SEQUENCE_LENGTH,
                    trainable=False))
# return_sequences=True so that the second LSTM receives one output per time step.
model.add(LSTM(256, return_sequences=True, dropout=0.5))
model.add(LSTM(32))
model.add(Dense(32,activation='relu'))
model.add(layers.Dense(3, activation="softmax"))

model.compile(optimizer=adam, loss="categorical_crossentropy",
     metrics=['accuracy'])

epochs = 25
batch_size = 16

model.summary()

# Stop training when the validation loss has not improved for 3 epochs.
es = EarlyStopping(monitor='val_loss', mode="min", patience=3)
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_valid, Y_valid), callbacks=[es])

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 43, 100)           669200    
                                                                 
 lstm_2 (LSTM)               (None, 43, 256)           365568    
                                                                 
 lstm_3 (LSTM)               (None, 32)                36992     
                                                                 
 dense_2 (Dense)             (None, 32)                1056      
                                                                 
 dense_3 (Dense)             (None, 3)                 99        
                                                                 
Total params: 1,072,915
Trainable params: 403,715
Non-trainable params: 669,200
_________________________________________________________________
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4

In [None]:
# Test loss / accuracy first, then weighted precision, recall and F1 plus the
# Matthews correlation of the decoded labels against the raw sentiment column.
accr = model.evaluate(X_test,Y_test)
print('Test set\n Loss: {:0.3f}\n  Test Accuracy: {:0.3f}'.format(*accr[:2]))

predictions = convert_prediction_1D(model.predict(X_test))

true_labels = test_df_bt['sentiment']
precision, recall, f1_score, none = precision_recall_fscore_support(true_labels, predictions, average='weighted')
m_corr = matthews_corrcoef(true_labels, predictions)

print("Precision is: ", precision)
print("Recall is: ", recall)
print("F1 Score is: ", f1_score)
print("Matthew Corr Score is: ", m_corr)

Test set
 Loss: 1.044
  Test Accuracy: 0.492
Precision is:  0.4998327680422084
Recall is:  0.4918032786885246
F1 Score is:  0.4787298883537848
Matthew Corr Score is:  0.10344206873497415


# ***Glove + LSTM***

In [None]:
# Glove
import numpy as np
# Weight table for the embedding layer: one row per tokenizer id (ids come from
# word_index) plus one extra row; rows without a GloVe vector stay all-zero.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, 300))

# Parse the GloVe text file: each line is a word followed by its 300 coefficients.
embeddings_index = {}
with open("/content/drive/MyDrive/Colab Notebooks/IS460 ML Datasets/glove.6B.300d.txt", encoding="utf-8") as f:
  for line in f:
    values = line.split()
    # Ignore any line that is not exactly one word + 300 numbers.
    if len(values) != embedding_matrix.shape[1] + 1:
      continue
    # Store numeric float32 vectors instead of arrays of strings.
    embeddings_index[values[0]] = np.asarray(values[1:], dtype="float32")

for word, i in word_index.items():
  # Defensive guard: an id outside the matrix would raise an IndexError below.
  if i >= num_words:
    continue

  embedding_vector = embeddings_index.get(word)
  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector
print(num_words)


6692


In [None]:
# Glove LSTM

from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, GlobalMaxPooling1D
from keras.callbacks import EarlyStopping
import tensorflow as tf 
from tensorflow import keras
from keras.regularizers import l2


# LSTM classifier on top of the frozen 300-dimensional GloVe vectors.
model = Sequential()
adam = tf.optimizers.Adam(learning_rate=0.001)
# The embedding layer is initialised from embedding_matrix and kept fixed
# during training (trainable=False).
model.add(Embedding(num_words, 300, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH,
                    trainable=False))
# Unlike the Word2Vec models above, the first LSTM here also carries L2
# penalties (0.001) on its kernel, recurrent and bias weights.
model.add(LSTM(256, return_sequences=True, dropout=0.5, kernel_regularizer=l2(0.001), recurrent_regularizer=l2(0.001), bias_regularizer=l2(0.001)))
model.add(LSTM(32))
model.add(Dense(32,activation='relu'))
model.add(layers.Dense(3, activation="softmax"))

model.compile(optimizer=adam, loss="categorical_crossentropy",
     metrics=['accuracy'])

model.summary()

epochs = 25
batch_size = 16

# Stop training when the validation loss has not improved for 3 epochs.
es = EarlyStopping(monitor='val_loss', mode="min", patience=3)
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_valid, Y_valid), callbacks=[es])

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 43, 300)           2007600   
                                                                 
 lstm_4 (LSTM)               (None, 43, 256)           570368    
                                                                 
 lstm_5 (LSTM)               (None, 32)                36992     
                                                                 
 dense_4 (Dense)             (None, 32)                1056      
                                                                 
 dense_5 (Dense)             (None, 3)                 99        
                                                                 
Total params: 2,616,115
Trainable params: 608,515
Non-trainable params: 2,007,600
_________________________________________________________________
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch

In [None]:
# Test loss / accuracy first, then weighted precision, recall and F1 plus the
# Matthews correlation of the decoded labels against the raw sentiment column.
accr = model.evaluate(X_test,Y_test)
print('Test set\n Loss: {:0.3f}\n  Test Accuracy: {:0.3f}'.format(*accr[:2]))

predictions = convert_prediction_1D(model.predict(X_test))

true_labels = test_df_bt['sentiment']
precision, recall, f1_score, none = precision_recall_fscore_support(true_labels, predictions, average='weighted')
m_corr = matthews_corrcoef(true_labels, predictions)

print("Precision is: ", precision)
print("Recall is: ", recall)
print("F1 Score is: ", f1_score)
print("Matthew Corr Score is: ", m_corr)

Test set
 Loss: 0.876
  Test Accuracy: 0.660
Precision is:  0.6707380210997993
Recall is:  0.6602086438152012
F1 Score is:  0.6647920304655108
Matthew Corr Score is:  0.38326362998216223


# ***Word2Vec Pretrained w/ LSTM***

In [None]:
# Reload the back-translation training set and the test / validation CSVs and
# rebuild the tokenizer state for the pretrained-Word2Vec experiment.
train_df_bt = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/IS460 ML Datasets/latest/lemmatized_bt_augment_train.csv")
test_df_bt = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/IS460 ML Datasets/latest/test.csv")
valid_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/IS460 ML Datasets/latest/validation.csv")

# Keep the cleaned-text and label columns only; the cleaned text ends up as "text".
train_df_bt = (
    train_df_bt[['lemmatized and stopwords_removed', 'sentiment']]
    .rename(columns={"lemmatized and stopwords_removed": "text_cleaned"})
    .rename(columns={"text_cleaned": "text"})
)

test_df_bt = (
    test_df_bt[['lemmatized and stopwords_removed', 'sentiment']]
    .rename(columns={"lemmatized and stopwords_removed": "text_cleaned"})
    .rename(columns={"text_cleaned": "text"})
)

valid_df = (
    valid_df[['lemmatized and stopwords_removed', 'sentiment']]
    .rename(columns={"lemmatized and stopwords_removed": "text_cleaned"})
    .rename(columns={"text_cleaned": "text"})
)

train_df_bt['text'] = train_df_bt['text'].astype("str")
train_df_bt['sentiment'] = train_df_bt['sentiment'].astype("str")

# Vocabulary cap handed to the Tokenizer (most frequent words).
MAX_NB_WORDS = 6000

# Padding length: the word count of the longest training sentence.
MAX_SEQUENCE_LENGTH = max(len(sentence.split()) for sentence in train_df_bt['text'])

# This is fixed.
EMBEDDING_DIM = 100

# num_words = the maximum number of words to keep, based on word frequency. Only the most common num_words-1 words will be kept.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(train_df_bt['text'].values)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Encode and pad every split with the tokenizer fitted on the training text.
X_train, X_test, X_valid = [
    pad_sequences(tokenizer.texts_to_sequences(split['text'].values), maxlen=MAX_SEQUENCE_LENGTH)
    for split in (train_df_bt, test_df_bt, valid_df)
]

print('Shape of data tensor (X_train):', X_train.shape)
print('Shape of data tensor (X_test):', X_test.shape)
print('Shape of data tensor (X_valid):', X_valid.shape)

# One-hot encode the sentiment labels of every split.
Y_train, Y_test, Y_valid = [
    pd.get_dummies(split['sentiment']).values
    for split in (train_df_bt, test_df_bt, valid_df)
]

print('Shape of label tensor (Y_train):', Y_train.shape)
print('Shape of label tensor (Y_test):', Y_test.shape)

# X_train, X_valid, Y_train, Y_valid = train_test_split(X_train, Y_train, test_size = 0.2, random_state = 99)
print(X_train.shape, Y_train.shape)
print(X_valid.shape, Y_valid.shape)


Found 6691 unique tokens.
Shape of data tensor (X_train): (8162, 43)
Shape of data tensor (X_test): (671, 43)
Shape of data tensor (X_valid): (670, 43)
Shape of label tensor (Y_train): (8162, 3)
Shape of label tensor (Y_test): (671, 3)
(8162, 43) (8162, 3)
(670, 43) (670, 3)


In [None]:
# Fetch the pre-trained "word2vec-google-news-300" vectors through gensim's downloader API.
# Later cells read `.vectors` and `.index2word` from this object; the vocabulary size
# printed below is 3,000,000 entries.
# NOTE(review): the name `model` is reassigned to the Keras network further down, so this
# cell has to be re-run before re-running any cell that needs the word vectors.
import gensim.downloader as api
model = api.load("word2vec-google-news-300")

In [None]:
"""
Pre-trained Word2Vec Model
"""

# Whitespace-tokenised training texts (one list of words per row).
text_sentences = train_df_bt['text'].apply(lambda x: x.split())

# `model` here is the word-vector object returned by api.load(...). Going through
# `model.wv` on it is deprecated (it just returns the object itself), so the attributes
# are read directly. gensim >= 4 exposes the vocabulary as `key_to_index`; older
# releases expose it as `vocab`.
w2v_vocab = model.key_to_index if hasattr(model, "key_to_index") else model.vocab
words = list(w2v_vocab)
print(len(words))

# Dump every vector in the plain-text word2vec format.
# NOTE(review): with ~3M words x 300 dims this file is very large — confirm it is needed.
filename = 'selfdriving_embedding_word2vec_pretrained.txt'
model.save_word2vec_format(filename, binary=False)

  
  # Remove the CWD from sys.path while we load stuff.


3000000


In [None]:
import numpy as np

# Vocabulary words in vector-row order. gensim >= 4 names this list `index_to_key`;
# earlier releases call it `index2word`.
vocab_words = model.index_to_key if hasattr(model, "index_to_key") else model.index2word

labels = np.asarray(vocab_words)
vectors = np.asarray(model.vectors)

# word -> embedding vector lookup. Keys are taken from the original Python list rather
# than from `labels`, so they are plain `str` objects and no per-element numpy string
# has to be created while building the dict.
word_embeddings = dict(zip(vocab_words, vectors))

In [None]:
# Width of the pre-trained vectors ("word2vec-google-news-300").
W2V_DIM = 300

# One row per tokenizer index, plus row 0: the Keras Tokenizer never assigns index 0 to a
# word, and pad_sequences pads with 0, so row 0 stays all-zero.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, W2V_DIM))

for word, i in word_index.items():
  # Valid rows are 0 .. num_words - 1, so an index equal to num_words must be skipped too
  # (the previous `>` check would have let it through and raised an IndexError).
  if i >= num_words:
    continue

  embedding_vector = word_embeddings.get(word)
  # Words missing from the pre-trained vocabulary keep their all-zero row.
  # NOTE(review): the tokenizer lower-cases its input while the lookup is an exact string
  # match, so capitalised entries of the pre-trained vocabulary will not be hit — confirm
  # this is intended.
  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector
print(num_words)

6692


In [None]:
# Pre-trained Word2Vec LSTM

from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, GlobalMaxPooling1D
from keras.callbacks import EarlyStopping
import tensorflow as tf 
from tensorflow import keras
from keras.regularizers import l2


# NOTE(review): from here on `model` is the Keras network, no longer the word-vector
# object loaded earlier. Re-run the word2vec loading cell before re-running any cell
# that reads the word vectors from `model`.
model = Sequential()
adam = tf.optimizers.Adam(learning_rate=0.001)

# Frozen embedding layer initialised from the pre-trained matrix; its width is taken
# from the matrix itself so the two can never disagree.
model.add(Embedding(num_words, embedding_matrix.shape[1], weights=[embedding_matrix],
                    input_length=MAX_SEQUENCE_LENGTH, trainable=False))
# First LSTM returns the full sequence so the second LSTM can consume it.
model.add(LSTM(256, return_sequences=True, dropout=0.5, kernel_regularizer=l2(0.001),
               recurrent_regularizer=l2(0.001), bias_regularizer=l2(0.001)))
model.add(LSTM(32))
model.add(Dense(32, activation='relu'))
# Three outputs: one probability per one-hot column of the Y_* targets.
model.add(Dense(3, activation="softmax"))

model.compile(optimizer=adam, loss="categorical_crossentropy", 
     metrics=['accuracy'])

model.summary() 

epochs = 25
batch_size = 16

# Stop once val_loss has not improved for 3 epochs and roll the weights back to the best
# epoch; without restore_best_weights the model kept for evaluation would be the one from
# the last (already degraded) epoch.
es = EarlyStopping(monitor='val_loss', mode="min", patience=3, restore_best_weights=True)
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,
                    validation_data=(X_valid, Y_valid), callbacks=[es])


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 43, 300)           2007600   
                                                                 
 lstm (LSTM)                 (None, 43, 256)           570368    
                                                                 
 lstm_1 (LSTM)               (None, 32)                36992     
                                                                 
 dense (Dense)               (None, 32)                1056      
                                                                 
 dense_1 (Dense)             (None, 3)                 99        
                                                                 
Total params: 2,616,115
Trainable params: 608,515
Non-trainable params: 2,007,600
_________________________________________________________________
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4

In [None]:
# Loss and accuracy of the trained network on the held-out test split.
accr = model.evaluate(X_test,Y_test)
_test_loss, _test_accuracy = accr[0], accr[1]
print('Test set\n Loss: {:0.3f}\n  Test Accuracy: {:0.3f}'.format(_test_loss, _test_accuracy))

# Class probabilities -> sentiment labels (-1 / 0 / 1) via convert_prediction_1D.
predictions = convert_prediction_1D(model.predict(X_test))

# Class-weighted precision / recall / F1 and the Matthews correlation coefficient,
# all computed against the true test sentiments.
_true_sentiment = test_df_bt['sentiment']
precision, recall, f1_score, none = precision_recall_fscore_support(
    _true_sentiment, predictions, average='weighted')
m_corr = matthews_corrcoef(_true_sentiment, predictions)

for _metric_name, _metric_value in (
    ("Precision is: ", precision),
    ("Recall is: ", recall),
    ("F1 Score is: ", f1_score),
    ("Matthew Corr Score is: ", m_corr),
):
  print(_metric_name, _metric_value)

Test set
 Loss: 0.871
  Test Accuracy: 0.645
Precision is:  0.6679277370159373
Recall is:  0.6453055141579732
F1 Score is:  0.6535235493623104
Matthew Corr Score is:  0.3754780956171252
