In [1]:
from google.colab import drive
drive.flush_and_unmount()
drive.mount('/content/drive')


import pandas as pd
import gensim
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report
import keras
from gensim.models import KeyedVectors
import os



Drive not mounted, so nothing to flush and unmount.
Mounted at /content/drive


In [None]:
# directory_file_path = '/content/drive/My Drive/final_dataset/resampling'

# # Loop through the files in the directory to check file names
# for filename in os.listdir(directory_file_path):
#     if filename.endswith('.csv'):  # Check if the file ends with .csv
#         name = os.path.join(directory_file_path, filename)
#         print(name)

In [3]:

word2vec_path = r'/content/drive/My Drive/GoogleNews-vectors-negative300.bin'
w2v_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)


def w2v_bilstm(df):
    df = df.dropna(subset=['processed'])

    df = df[['processed', 'label']]
    df['label'] = df['label'].map({'neu':0, 'pos':1, 'neg':2})

    df['processed'] =  [x.split() for x in df['processed']]
    X_train, X_test, y_train, y_test = train_test_split (df['processed'], df['label'] , test_size=0.2)

    words = set(w2v_model.index_to_key)
    word_to_idx = {word: i+1 for i, word in enumerate(words)}  # 0 is reserved for padding

    # Convert text to sequences of indices
    def text_to_sequence(text):
        return [word_to_idx.get(word, 0) for word in text if word in words]

    X_train_seq = [text_to_sequence(text) for text in X_train]
    X_test_seq = [text_to_sequence(text) for text in X_test]

    max_length = max(len(seq) for seq in X_train_seq)

    # Pad sequences
    X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
    X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

    # Create embedding matrix
    vocab_size = len(word_to_idx) + 1  # +1 for padding token
    print(f'vocab size: {vocab_size}')
    embedding_matrix = np.zeros((vocab_size, 300))
    for word, i in word_to_idx.items():
        embedding_matrix[i] = w2v_model[word]
    # print(embedding_matrix)

    # To stop the training of the model earlier if 3 consecutive loss stays the same (does not decrease)
    callback = keras.callbacks.EarlyStopping(monitor='loss', patience=3, mode='min')

    # Build the model
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=300, weights=[embedding_matrix], input_length=max_length, trainable=False))
    model.add(Bidirectional(LSTM(128, return_sequences=True)))
    model.add(Dropout(0.2))
    model.add(Bidirectional(LSTM(64)))
    model.add(Dropout(0.2))
    model.add(Dense(3, activation='softmax'))

    # Step 4: Compile the model
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    model.fit(X_train_padded, y_train, epochs=50, batch_size=32, validation_split=0.2, callbacks=[callback])

    # Step 6: Predict on test data
    y_pred = model.predict(X_test_padded)
    y_pred

    # Convert probabilities to class labels
    y_pred_labels = np.argmax(y_pred, axis=1)
    y_pred_labels

    # Convert y_pred_labels to a pandas Series
    y_pred_series = pd.Series(y_pred_labels, index=y_test.index)
    y_pred_series

    # step 8: results (word2vec (pretrained-googlenews) + bidirectional LSTM)
    # Tesing on original_df dataset

    # {'neu':0, 'pos':1, 'neg':2}
    target_names = ["neu", "pos", "neg"]

    report_dict = classification_report(y_test, y_pred_series, target_names=target_names, output_dict=True)

    # Convert the report dictionary into a DataFrame for better readability
    report_df = pd.DataFrame(report_dict).transpose()

    # Create an empty row with NaN values
    empty_row = pd.DataFrame([[" "] * len(report_df.columns)], columns=report_df.columns)

    # Insert the empty row after 'neg' to separate the row for readability
    report_df = pd.concat([report_df.loc[:'neg'], empty_row, report_df.loc['accuracy':]], ignore_index=False)

    report_df.index.values[3] = ' '

    # Display the summary
    print("")
    print(report_df)

In [None]:
# path_1 = os.path.join('Data/final_dataset/', 'COVIDSenti-A_cleanest.csv')
# path_under = '/content/drive/My Drive/final_dataset/resampling/COVIDSenti_cleanest_over.csv'
# df = pd.read_csv(path_under, encoding='utf-8')

# w2v_bilstm(df)


# directory_file_path = '/content/drive/My Drive/final_dataset/'
# file_l = []
# # Loop through the files in the directory to check file names
# for filename in os.listdir(directory_file_path):
#   if filename.endswith('.csv'):
#     file_l.append(filename)  # Check if the file ends with .csv
#     print(filename)

file_l = ['/content/drive/My Drive/final_dataset/COVIDSenti-C_mostfeature.csv',
          '/content/drive/My Drive/final_dataset/resampling/COVIDSenti-B_origin_over.csv',
          '/content/drive/My Drive/final_dataset/resampling/COVIDSenti-B_origin_under.csv',
          '/content/drive/My Drive/final_dataset/resampling/COVIDSenti-C_mostfeature_over.csv',
          '/content/drive/My Drive/final_dataset/resampling/COVIDSenti-C_mostfeature_under.csv',
          '/content/drive/My Drive/final_dataset/resampling/COVIDSenti-C_cleanest_over.csv',
          '/content/drive/My Drive/final_dataset/resampling/COVIDSenti-C_cleanest_under.csv']

for i in file_l:
  path = i
  print('#'*30, '\n', i,'\n start')
  df = pd.read_csv(path, encoding='utf-8')
  w2v_bilstm(df)


############################## 
 /content/drive/My Drive/final_dataset/COVIDSenti-C_mostfeature.csv 
 start
vocab size: 3000001
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Original Normal Sampling Set

             precision    recall  f1-score   support
neu           0.882554  0.885178  0.883864    4372.0
pos           0.562201  0.525727  0.543353     447.0
neg           0.705096  0.714649   0.70984    1181.0
                                                    
accuracy      0.824833  0.824833  0.824833  0.824833
macro avg     0.716617  0.708518  0.712352    6000.0
weighted avg  0.823758  0.824833  0.824242    6000.0
############################## 
 /content/drive/My Drive/final_dataset/resampling/COVIDSent

In [4]:
# path_1 = os.path.join('Data/final_dataset/', 'COVIDSenti-A_cleanest.csv')
# path_under = '/content/drive/My Drive/final_dataset/resampling/COVIDSenti_cleanest_over.csv'
# df = pd.read_csv(path_under, encoding='utf-8')

# w2v_bilstm(df)


# directory_file_path = '/content/drive/My Drive/final_dataset/'
# file_l = []
# # Loop through the files in the directory to check file names
# for filename in os.listdir(directory_file_path):
#   if filename.endswith('.csv'):
#     file_l.append(filename)  # Check if the file ends with .csv
#     print(filename)

file_l = ['/content/drive/My Drive/final_dataset/COVIDSenti-C_cleanest.csv']

for i in file_l:
  path = i
  print('#'*30, '\n', i,'\n start')
  df = pd.read_csv(path, encoding='utf-8')
  w2v_bilstm(df)


############################## 
 /content/drive/My Drive/final_dataset/COVIDSenti-C_cleanest.csv 
 start
vocab size: 3000001
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50

             precision    recall  f1-score   support
neu           0.879947  0.910304  0.894868    4348.0
pos           0.641566   0.47651  0.546855     447.0
neg           0.734417  0.711909  0.722988    1142.0
                                                    
accuracy      0.839481  0.839481  0.839481  0.839481
macro avg     0.751977  0.699574   0.72157    5937.0
weighted avg  0.834006  0.839481  0.835604    5937.0


In [None]:
directory_file_path = 'Data/final_dataset/'

# Loop through the files in the directory to check file names
for filename in os.listdir(directory_file_path):
    if filename.endswith('.csv'):  # Check if the file ends with .csv
        print(filename)

COVIDSenti-A_cleanest.csv
COVIDSenti-A_mostfeature.csv
COVIDSenti-A_origin.csv
COVIDSenti-B_cleanest.csv
COVIDSenti-B_mostfeature.csv
COVIDSenti-B_origin.csv
COVIDSenti-C_cleanest.csv
COVIDSenti-C_mostfeature.csv
COVIDSenti-C_origin.csv
COVIDSenti_cleanest.csv
COVIDSenti_mostfeature.csv
COVIDSenti_origin.csv
