In [1]:
from google.colab import drive

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import gensim
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
import keras


In [4]:
directory_file_path = '/content/drive/MyDrive/comp9444/preprocessed_dataset/'

# Specify the path to your Excel file
# file_path = '/content/drive/MyDrive/comp9444/COVIDSenti-B_original_paper_under.csv'
clean_over_path = directory_file_path + "COVIDSenti-B_cleanest_over.csv"
clean_under_path = directory_file_path + "COVIDSenti-B_cleanest_under.csv"
clean_path = directory_file_path + "COVIDSenti-B_cleanest.csv"

original_over_path = directory_file_path + "COVIDSenti-B_original_paper.csv"
original_under_path = directory_file_path + "COVIDSenti-B_original_paper_under.csv"
original_path = directory_file_path + "COVIDSenti-B_original_paper.csv"

most_features_over_path = directory_file_path + "COVIDSenti-B_most_features_over.csv"
most_features_under_path = directory_file_path + "COVIDSenti-B_most_features_under.csv"
most_features_path = directory_file_path + "COVIDSenti-B_most_features.csv"
# Read the Excel file into a DataFrame
# df = pd.read_csv(file_path, header=0)
# # Encoding the label column
# df['label'] = df['label'].map({'neu':0, 'pos':1, 'neg':2})

# df.dropna(subset=['processed'], inplace=True)

# Display the DataFrame
# df

In [None]:
X_train, X_test, y_train, y_test = train_test_split (df['processed'], df['label'] , test_size=0.2)

In [5]:
glove_path = '/content/drive/MyDrive/comp9444/glove.twitter.27B/glove.twitter.27B.50d.txt'

glove_embeddings = {}
with open(glove_path, "r") as file:
    for line in file:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype="float32")
        glove_embeddings[word] = vector

In [None]:
# df['processed'] = df['processed'].apply(lambda x: x.split()).tolist()
max_length = df.processed.apply(lambda x: len(x.split())).max()

t = Tokenizer()
t.fit_on_texts(df.processed)
vocab_size = len(t.word_index) + 1
encoded_tweets = t.texts_to_sequences(df.processed)
padded_tweets = pad_sequences(encoded_tweets, maxlen=max_length, padding='post')

vocab_size = len(t.word_index) + 1

In [None]:
embedding_matrix = np.zeros((vocab_size, 50))
for word, i in t.word_index.items():
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
x_train, x_test, y_train, y_test = train_test_split(padded_tweets, df.label, test_size=0.2, stratify=df.label)

In [None]:
# To stop the training of the model earlier if 3 consecutive loss stays the same (does not decrease)
callback = keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3, mode='min')

In [None]:
# Build the model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=50, weights=[embedding_matrix], input_length=max_length, trainable=False))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.2))
model.add(Dense(3, activation='softmax'))



In [None]:
# Step 4: Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Step 5: Train the model
# y_train should be your labels (0 for neutral, 1 for positive, 2 for negative)
# 20 - 50 epochs (for a good number of epoch training for bi-lstm)
# With callback, there is a chance that the training will stop before the end of 50 epochs.
model.fit(x_train, y_train, epochs=50, batch_size=32, validation_split=0.2, callbacks=[callback])

Epoch 1/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 102ms/step - accuracy: 0.6481 - loss: 0.7898 - val_accuracy: 0.6233 - val_loss: 0.8961
Epoch 2/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 85ms/step - accuracy: 0.6804 - loss: 0.7460 - val_accuracy: 0.6233 - val_loss: 0.8685
Epoch 3/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 94ms/step - accuracy: 0.7152 - loss: 0.6597 - val_accuracy: 0.5831 - val_loss: 0.8873


<keras.src.callbacks.history.History at 0x789bdcb70190>

In [None]:
# Step 6: Predict on test data
y_pred = model.predict(x_test)
y_pred

[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step


array([[0.41789153, 0.07875817, 0.50335026],
       [0.13629407, 0.8386559 , 0.02504995],
       [0.0754955 , 0.0029609 , 0.9215436 ],
       ...,
       [0.10319408, 0.02320137, 0.8736046 ],
       [0.5266073 , 0.05467797, 0.41871473],
       [0.14649247, 0.81557506, 0.03793248]], dtype=float32)

In [None]:
# Step 7: convert y_pred to class label results

# Convert probabilities to class labels
y_pred_labels = np.argmax(y_pred, axis=1)
y_pred_labels

array([2, 1, 2, ..., 2, 0, 1])

In [None]:
# Convert y_pred_labels to a pandas Series
y_pred_series = pd.Series(y_pred_labels, index=y_test.index)
y_pred_series

Unnamed: 0,0
3488,2
5855,1
3024,2
327,2
4860,1
...,...
2407,0
680,2
1767,2
2566,0


In [None]:
# Understand the class label (y_predict)
class_counts = y_pred_series.value_counts()
print(class_counts)

2    613
0    306
1    292
Name: count, dtype: int64


In [None]:
# step 8: results (word2vec (pretrained-googlenews) + bidirectional LSTM)
# Tesing on original_df dataset

# {'neu':0, 'pos':1, 'neg':2}
target_names = ["neu", "pos", "neg"]

report_dict = classification_report(y_test, y_pred_series, target_names=target_names, output_dict=True)

# Convert the report dictionary into a DataFrame for better readability
report_df = pd.DataFrame(report_dict).transpose()

# Create an empty row with NaN values
empty_row = pd.DataFrame([[" "] * len(report_df.columns)], columns=report_df.columns)

# Insert the empty row after 'neg' to separate the row for readability
report_df = pd.concat([report_df.loc[:'neg'], empty_row, report_df.loc['accuracy':]], ignore_index=False)

report_df.index.values[3] = ' '

# Display the summary
print("Original Normal Sampling Set")
print("")
print(report_df)

Original Normal Sampling Set

             precision    recall  f1-score  support
neu           0.575163  0.442211       0.5    398.0
pos            0.80137  0.576355  0.670487    406.0
neg           0.548124  0.825553  0.658824    407.0
                                                   
accuracy       0.61602   0.61602   0.61602  0.61602
macro avg     0.641552  0.614706   0.60977   1211.0
weighted avg  0.641914   0.61602  0.610536   1211.0


In [6]:
def train_model(df, glove_embeddings):
  df['label'] = df['label'].map({'neu':0, 'pos':1, 'neg':2})

  df.dropna(subset=['processed'], inplace=True)

  X_train, X_test, y_train, y_test = train_test_split (df['processed'], df['label'] , test_size=0.2)

  max_length = df.processed.apply(lambda x: len(x.split())).max()

  t = Tokenizer()
  t.fit_on_texts(df.processed)
  vocab_size = len(t.word_index) + 1
  encoded_tweets = t.texts_to_sequences(df.processed)
  padded_tweets = pad_sequences(encoded_tweets, maxlen=max_length, padding='post')

  vocab_size = len(t.word_index) + 1

  embedding_matrix = np.zeros((vocab_size, 50))
  for word, i in t.word_index.items():
      embedding_vector = glove_embeddings.get(word)
      if embedding_vector is not None:
          embedding_matrix[i] = embedding_vector

  x_train, x_test, y_train, y_test = train_test_split(padded_tweets, df.label, test_size=0.2, stratify=df.label)

  callback = keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3, mode='max')

  # Build the model
  model = Sequential()
  model.add(Embedding(input_dim=vocab_size, output_dim=50, weights=[embedding_matrix], input_length=max_length, trainable=False))
  model.add(Bidirectional(LSTM(128, return_sequences=True)))
  model.add(Dropout(0.2))
  model.add(Bidirectional(LSTM(64)))
  model.add(Dropout(0.2))
  model.add(Dense(3, activation='softmax'))

  model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

  model.fit(x_train, y_train, epochs=50, batch_size=32, validation_split=0.2, callbacks=[callback])

  return x_test, y_test, model


In [7]:
def create_report_df(X_test_padded, y_test, model):
  # Predict on test data
  y_pred = model.predict(X_test_padded)

  # Convert y_pred to class label results
  # Convert probabilities to class labels
  y_pred_labels = np.argmax(y_pred, axis=1)

  # Convert y_pred_labels to a pandas Series
  y_pred_series = pd.Series(y_pred_labels, index=y_test.index)

  # {'neu':0, 'pos':1, 'neg':2}
  target_names = ["neu", "pos", "neg"]

  report_dict = classification_report(y_test, y_pred_series, target_names=target_names, output_dict=True)

  # Convert the report dictionary into a DataFrame for better readability
  report_df = pd.DataFrame(report_dict).transpose()

  # Create an empty row with NaN values
  empty_row = pd.DataFrame([[" "] * len(report_df.columns)], columns=report_df.columns)

  # Insert the empty row after 'neg' to separate the row for readability
  report_df = pd.concat([report_df.loc[:'neg'], empty_row, report_df.loc['accuracy':]], ignore_index=False)

  report_df.index.values[3] = ' '

  return report_df

# Original Undersampling Set


In [17]:
pd.set_option('display.max_colwidth', 100)

df = pd.read_csv(original_under_path, header=0)

x_test, y_test, model = train_model(df, glove_embeddings)
report_df = create_report_df(x_test, y_test, model)

print("Original Undersampling Set")
print("")
print(report_df)

Epoch 1/50




[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 117ms/step - accuracy: 0.4197 - loss: 1.0655 - val_accuracy: 0.5315 - val_loss: 0.9634
Epoch 2/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 166ms/step - accuracy: 0.5811 - loss: 0.9156 - val_accuracy: 0.5552 - val_loss: 0.9294
Epoch 3/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 111ms/step - accuracy: 0.5960 - loss: 0.8717 - val_accuracy: 0.5717 - val_loss: 0.9433
Epoch 4/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 126ms/step - accuracy: 0.6382 - loss: 0.8238 - val_accuracy: 0.5253 - val_loss: 1.0203
Epoch 5/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 149ms/step - accuracy: 0.6381 - loss: 0.7968 - val_accuracy: 0.6017 - val_loss: 0.8808
Epoch 6/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 127ms/step - accuracy: 0.6817 - loss: 0.7139 - val_accuracy: 0.5810 - val_loss: 0.9081
Epoch 7/50
[1m122/12

# Original Oversampling Set

In [18]:
df = pd.read_csv(original_over_path, header=0)

x_test, y_test, model = train_model(df, glove_embeddings)
report_df = create_report_df(x_test, y_test, model)

print("Original Oversampling Set")
print("")
print(report_df)

Epoch 1/50




[1m594/594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 176ms/step - accuracy: 0.7467 - loss: 0.6844 - val_accuracy: 0.7727 - val_loss: 0.5964
Epoch 2/50
[1m594/594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 140ms/step - accuracy: 0.7803 - loss: 0.5770 - val_accuracy: 0.8013 - val_loss: 0.5411
Epoch 3/50
[1m594/594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 138ms/step - accuracy: 0.8054 - loss: 0.5280 - val_accuracy: 0.8097 - val_loss: 0.5186
Epoch 4/50
[1m594/594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 143ms/step - accuracy: 0.8223 - loss: 0.4866 - val_accuracy: 0.8209 - val_loss: 0.4922
Epoch 5/50
[1m594/594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 134ms/step - accuracy: 0.8408 - loss: 0.4432 - val_accuracy: 0.8226 - val_loss: 0.4926
Epoch 6/50
[1m594/594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 139ms/step - accuracy: 0.8527 - loss: 0.4082 - val_accuracy: 0.8282 - val_loss: 0.4749
Epoch 7/50
[1m59

# Original Normal Sampling Set

In [23]:
df = pd.read_csv(original_path, header=0)

x_test, y_test, model = train_model(df, glove_embeddings)
report_df = create_report_df(x_test, y_test, model)

print("Original Normal sampling Set")
print("")
print(report_df)

Epoch 1/50




[1m594/594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 137ms/step - accuracy: 0.7478 - loss: 0.6790 - val_accuracy: 0.7796 - val_loss: 0.5871
Epoch 2/50
[1m594/594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 137ms/step - accuracy: 0.7850 - loss: 0.5762 - val_accuracy: 0.8072 - val_loss: 0.5352
Epoch 3/50
[1m594/594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 140ms/step - accuracy: 0.8090 - loss: 0.5264 - val_accuracy: 0.8146 - val_loss: 0.5097
Epoch 4/50
[1m594/594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 140ms/step - accuracy: 0.8211 - loss: 0.4860 - val_accuracy: 0.8228 - val_loss: 0.4928
Epoch 5/50
[1m594/594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 136ms/step - accuracy: 0.8422 - loss: 0.4352 - val_accuracy: 0.8255 - val_loss: 0.4790
Epoch 6/50
[1m594/594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 144ms/step - accuracy: 0.8611 - loss: 0.3902 - val_accuracy: 0.8236 - val_loss: 0.4913
Epoch 7/50
[1m594/5

# Most features Normal Set

In [8]:
df = pd.read_csv(most_features_path, header=0)

x_test, y_test, model = train_model(df, glove_embeddings)
report_df = create_report_df(x_test, y_test, model)

print("Most features Normal sampling Set")
print("")
print(report_df)



Epoch 1/50
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 15ms/step - accuracy: 0.7422 - loss: 0.6895 - val_accuracy: 0.7744 - val_loss: 0.5918
Epoch 2/50
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - accuracy: 0.7859 - loss: 0.5690 - val_accuracy: 0.8008 - val_loss: 0.5274
Epoch 3/50
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - accuracy: 0.8136 - loss: 0.5115 - val_accuracy: 0.8087 - val_loss: 0.5049
Epoch 4/50
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - accuracy: 0.8283 - loss: 0.4748 - val_accuracy: 0.8221 - val_loss: 0.4848
Epoch 5/50
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 10ms/step - accuracy: 0.8471 - loss: 0.4353 - val_accuracy: 0.8202 - val_loss: 0.4816
Epoch 6/50
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - accuracy: 0.8560 - loss: 0.4017 - val_accuracy: 0.8285 - val_loss: 0.4852
Epoch 7/50
[1m600/6

# Most features Undersampling Set

In [21]:
df = pd.read_csv(most_features_under_path, header=0)

x_test, y_test, model = train_model(df, glove_embeddings)
report_df = create_report_df(x_test, y_test, model)

print("Most features Under sampling Set")
print("")
print(report_df)

Epoch 1/50




[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 113ms/step - accuracy: 0.4396 - loss: 1.0576 - val_accuracy: 0.5697 - val_loss: 0.9464
Epoch 2/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 110ms/step - accuracy: 0.5611 - loss: 0.9489 - val_accuracy: 0.6025 - val_loss: 0.8847
Epoch 3/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 112ms/step - accuracy: 0.6188 - loss: 0.8626 - val_accuracy: 0.6127 - val_loss: 0.8533
Epoch 4/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 110ms/step - accuracy: 0.6310 - loss: 0.8187 - val_accuracy: 0.6383 - val_loss: 0.8000
Epoch 5/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 96ms/step - accuracy: 0.6771 - loss: 0.7611 - val_accuracy: 0.6475 - val_loss: 0.7890
Epoch 6/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 129ms/step - accuracy: 0.7004 - loss: 0.6990 - val_accuracy: 0.6557 - val_loss: 0.8259
Epoch 7/50
[1m122/122

# Most features Oversampling Set

In [22]:
df = pd.read_csv(most_features_over_path, header=0)

x_test, y_test, model = train_model(df, glove_embeddings)
report_df = create_report_df(x_test, y_test, model)

print("Most features Over sampling Set")
print("")
print(report_df)

Epoch 1/50




[1m1350/1350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 136ms/step - accuracy: 0.5731 - loss: 0.8998 - val_accuracy: 0.7354 - val_loss: 0.6213
Epoch 2/50
[1m1350/1350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m183s[0m 135ms/step - accuracy: 0.7472 - loss: 0.5956 - val_accuracy: 0.8177 - val_loss: 0.4592
Epoch 3/50
[1m1350/1350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m213s[0m 143ms/step - accuracy: 0.8396 - loss: 0.4032 - val_accuracy: 0.8674 - val_loss: 0.3405
Epoch 4/50
[1m1350/1350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 138ms/step - accuracy: 0.8977 - loss: 0.2722 - val_accuracy: 0.8945 - val_loss: 0.2928
Epoch 5/50
[1m1350/1350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m183s[0m 135ms/step - accuracy: 0.9299 - loss: 0.1938 - val_accuracy: 0.9139 - val_loss: 0.2546
Epoch 6/50
[1m1350/1350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m186s[0m 138ms/step - accuracy: 0.9519 - loss: 0.1437 - val_accuracy: 0.9244 - val_loss: 0.2436
Epo

# Cleanest features Normal Set

In [9]:
df = pd.read_csv(clean_path, header=0)

x_test, y_test, model = train_model(df, glove_embeddings)
report_df = create_report_df(x_test, y_test, model)

print("Cleanest Normal sampling Set")
print("")
print(report_df)

Epoch 1/50




[1m594/594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 12ms/step - accuracy: 0.7400 - loss: 0.6892 - val_accuracy: 0.7695 - val_loss: 0.5996
Epoch 2/50
[1m594/594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step - accuracy: 0.7897 - loss: 0.5658 - val_accuracy: 0.8057 - val_loss: 0.5327
Epoch 3/50
[1m594/594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 12ms/step - accuracy: 0.8137 - loss: 0.5132 - val_accuracy: 0.8167 - val_loss: 0.5001
Epoch 4/50
[1m594/594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - accuracy: 0.8312 - loss: 0.4654 - val_accuracy: 0.8261 - val_loss: 0.4944
Epoch 5/50
[1m594/594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - accuracy: 0.8400 - loss: 0.4413 - val_accuracy: 0.8362 - val_loss: 0.4752
Epoch 6/50
[1m594/594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 13ms/step - accuracy: 0.8593 - loss: 0.3902 - val_accuracy: 0.8379 - val_loss: 0.4779
Epoch 7/50
[1m594/594[0m [3

# Cleanest features UnderSampling Set

In [10]:
df = pd.read_csv(clean_under_path, header=0)

x_test, y_test, model = train_model(df, glove_embeddings)
report_df = create_report_df(x_test, y_test, model)

print("Cleanest undersampling Set")
print("")
print(report_df)

Epoch 1/50




[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - accuracy: 0.4344 - loss: 1.0592 - val_accuracy: 0.5741 - val_loss: 0.9480
Epoch 2/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.5916 - loss: 0.9041 - val_accuracy: 0.6029 - val_loss: 0.8969
Epoch 3/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.6034 - loss: 0.8666 - val_accuracy: 0.5874 - val_loss: 0.8875
Epoch 4/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.6464 - loss: 0.8018 - val_accuracy: 0.6183 - val_loss: 0.8538
Epoch 5/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.6643 - loss: 0.7568 - val_accuracy: 0.6183 - val_loss: 0.8334
Epoch 6/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.7037 - loss: 0.6991 - val_accuracy: 0.6399 - val_loss: 0.8195
Epoch 7/50
[1m122/122[0m [32m━

# Cleanest features OverSampling Set

In [11]:
df = pd.read_csv(clean_over_path, header=0)

x_test, y_test, model = train_model(df, glove_embeddings)
report_df = create_report_df(x_test, y_test, model)

print("Cleanest oversampling Set")
print("")
print(report_df)

Epoch 1/50




[1m1344/1344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 12ms/step - accuracy: 0.5717 - loss: 0.9045 - val_accuracy: 0.6937 - val_loss: 0.6931
Epoch 2/50
[1m1344/1344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 12ms/step - accuracy: 0.7543 - loss: 0.5879 - val_accuracy: 0.7936 - val_loss: 0.4994
Epoch 3/50
[1m1344/1344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 12ms/step - accuracy: 0.8360 - loss: 0.4056 - val_accuracy: 0.8543 - val_loss: 0.3664
Epoch 4/50
[1m1344/1344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 12ms/step - accuracy: 0.8959 - loss: 0.2765 - val_accuracy: 0.8968 - val_loss: 0.2848
Epoch 5/50
[1m1344/1344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 12ms/step - accuracy: 0.9293 - loss: 0.1972 - val_accuracy: 0.9121 - val_loss: 0.2447
Epoch 6/50
[1m1344/1344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 12ms/step - accuracy: 0.9467 - loss: 0.1509 - val_accuracy: 0.9205 - val_loss: 0.2369
Epoch 7/50
[1m