In [1]:
import pandas as pd
import numpy as np
import re
from preprocessing import *
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from keras.utils import pad_sequences
from sklearn.metrics import classification_report


# Sequence Model Classification with Skip-Gram Embeddings

### Read the data

In [2]:
##############For Training Data#####################
# Read the data from the csv file named 'preprocessedData.csv' arabic data
train = pd.read_csv('../Dataset/cleaned_train.csv', encoding='utf-8')
# Unpack the data into text and stance
Train_X = train['text']
Train_X = [x.split(" ") for x in Train_X]
stance_Train_Y = train['stance']
cat_Train_Y = train['category']

##############For Testing Data#####################
test = pd.read_csv('../Dataset/cleaned_dev.csv', encoding='utf-8')
# Perform the data preprocessing
test = clean_data(test)
# Unpack the data into text, and stance
Test_X = test['text']
Test_X = [x.split(" ") for x in Test_X]
stance_Test_Y = test['stance']
cat_Test_Y = test['category']


### Download the pre-trained skip-gram model

In [3]:
# Here we want to apply Skip-Gram model to the data
import gensim
from gensim.models import KeyedVectors

# Load the model
sg_model = KeyedVectors.load_word2vec_format('../model/Word2VecSkipGram300D.bin', binary=True)

### Store the embeddings as features

In [4]:
# Loop over the training data and replace each word with its embedding
# Store the embedding in a different array


max_len = 0
for i in range(len(Train_X)):
    if len(Train_X[i]) > max_len:
        max_len = len(Train_X[i])

counter = 0
Train_X_sg = np.zeros((len(Train_X), max_len, 300))
for i in range(len(Train_X)):
    for j in range(len(Train_X[i])):
        if Train_X[i][j] in sg_model:
            Train_X_sg[i][j] = sg_model[Train_X[i][j]]
        else:
            counter += 1
print("Number of training words not in the model: ", counter)
counter = 0
Test_X_sg = np.zeros((len(Test_X), max_len, 300))
for i in range(len(Test_X)):
    for j in range(len(Test_X[i])):
        if Test_X[i][j] in sg_model:
            Test_X_sg[i][j] = sg_model[Test_X[i][j]]
        else:
            counter += 1
print("Number of testing words not in the model: ", counter)

Number of training words not in the model:  36311
Number of testing words not in the model:  5008


## For Stance Classification

### Apply SMOTE to the training data to balance the classes

In [5]:
Train_X_sg_shaped = np.reshape(Train_X_sg, (len(Train_X_sg), -1))
# Here we want to apply SMOTE to the data to balance the data against 3 classes
# Count the number of each class
from collections import Counter
from imblearn.over_sampling import SMOTE
print(Counter(stance_Train_Y))
# transform the dataset
oversample = SMOTE()
SMOTE_Train_X_sg, SMOTE_stance_Train_Y = oversample.fit_resample(Train_X_sg_shaped, stance_Train_Y)
print(Counter(SMOTE_stance_Train_Y))
# Reshape the data to be 3D
SMOTE_Train_X_sg = np.reshape(SMOTE_Train_X_sg, (len(SMOTE_Train_X_sg), max_len, 300))


Counter({1: 5538, 0: 1012, -1: 438})
Counter({1: 5538, 0: 5538, -1: 5538})


### Classify without SMOTE

In [6]:
# Build the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
lstm_model = Sequential()
lstm_model.add(LSTM(128, input_shape=(max_len, 300), return_sequences=True))
lstm_model.add(Dropout(0.2))
lstm_model.add(LSTM(64))
lstm_model.add(Dropout(0.2))
lstm_model.add(Dense(3, activation='softmax'))
lstm_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
stance_Train_Y = np.array(stance_Train_Y) + 1
# Fit the lstm_model
history = lstm_model.fit(Train_X_sg, stance_Train_Y, epochs=12, batch_size=32, verbose=1)
predections_lstm = lstm_model.predict(Test_X_sg)
print(classification_report(np.argmax(predections_lstm, axis=1), stance_Test_Y+1))

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       1.00      0.80      0.89      1000

    accuracy                           0.80      1000
   macro avg       0.33      0.27      0.30      1000
weighted avg       1.00      0.80      0.89      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Classify with SMOTE

In [7]:
# Build the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
SMOTE_lstm_model = Sequential()
SMOTE_lstm_model.add(LSTM(128, input_shape=(max_len, 300), return_sequences=True))
SMOTE_lstm_model.add(Dropout(0.2))
SMOTE_lstm_model.add(LSTM(64))
SMOTE_lstm_model.add(Dropout(0.2))
SMOTE_lstm_model.add(Dense(3, activation='softmax'))
SMOTE_lstm_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
SMOTE_stance_Train_Y = np.array(SMOTE_stance_Train_Y) + 1
# Fit the SMOTE_lstm_model
history = SMOTE_lstm_model.fit(SMOTE_Train_X_sg, SMOTE_stance_Train_Y, epochs=2, batch_size=32, verbose=1)
predections_SMOTE_lstm = SMOTE_lstm_model.predict(Test_X_sg)
print(classification_report(np.argmax(predections_SMOTE_lstm, axis=1), stance_Test_Y+1))

Epoch 1/2
Epoch 2/2
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.13      0.22      1000
           2       0.00      0.00      0.00         0

    accuracy                           0.13      1000
   macro avg       0.33      0.04      0.07      1000
weighted avg       1.00      0.13      0.22      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### For Category Classification

In [9]:
# Map the categories to numbers
cat_Train_Y = cat_Train_Y.map({'info_news': 0, 'personal': 1, 'celebrity': 2, 'plan': 3, 'unrelated': 4, 'others': 5, 'requests': 6, 'rumors': 7, 'advice': 8, 'restrictions': 9})
cat_Test_Y = cat_Test_Y.map({'info_news': 0, 'personal': 1, 'celebrity': 2, 'plan': 3, 'unrelated': 4, 'others': 5, 'requests': 6, 'rumors': 7, 'advice': 8, 'restrictions': 9})

In [10]:
# Build the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
cat_lstm_model = Sequential()
cat_lstm_model.add(LSTM(128, input_shape=(max_len, 300), return_sequences=True))
cat_lstm_model.add(Dropout(0.2))
cat_lstm_model.add(LSTM(64))
cat_lstm_model.add(Dropout(0.2))
cat_lstm_model.add(Dense(3, activation='softmax'))
cat_lstm_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fit the cat_lstm_model
history = cat_lstm_model.fit(Train_X_sg, cat_Train_Y, epochs=2, batch_size=32, verbose=1)
predections_lstm = cat_lstm_model.predict(Test_X_sg)
print(classification_report(np.argmax(predections_lstm, axis=1), cat_Test_Y))

Epoch 1/2
Epoch 2/2
              precision    recall  f1-score   support

           0       1.00      0.55      0.71      1000
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0

    accuracy                           0.55      1000
   macro avg       0.10      0.05      0.07      1000
weighted avg       1.00      0.55      0.71      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
