In [1]:
import numpy as np 
import pandas as pd 

import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

from gensim.models import Word2Vec
from keras.models import Sequential
from keras.layers import LSTM, Dense, Bidirectional
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score
import contractions

import tensorflow as tf
from keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# from multiprocessing import Pool
import time
import torch

from pathlib import Path

# Get project path
project_path = Path.cwd().resolve().parent.parent
print(project_path)

/Users/elizavetachefanova/Documents/Courses/NLP course/Final project


Using GPU for studying

In [None]:
# config = tf.compat.v1.ConfigProto()
# config.gpu_options.per_process_gpu_memory_fraction = 0.4
# session = tf.compat.v1.Session(config=config)
# K.set_session(session)

In [None]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora

---
# <font color=green>Preprocess the data</font>

In [2]:
# DataFrame with Train Data
train_df = pd.read_csv(f'{project_path}/Output/Cleaned_train_data.csv', index_col=0)

---
# <font color=green>LSTM based on Word2Vec</font>

In [3]:
#  The Function for text preprocessing
def tokenize_string(text):

    # Before lemmatizing replace all constructions with normal words
    text_upd = contractions.fix(text)

    # Tokenize the data and use only lower letters
    words = word_tokenize(text_upd.lower())
    
    # Create a lemmatizer object
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(word, pos = "v") for word in words] 
    
    # # Get rid of punctuation
    words = [word for word in lemmas if word not in string.punctuation]
    
    # Remove stop words
    # Stop words corpus (179 in total)
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    return words

In [4]:
# Tokenizing the series for train data
print('I have just started updatind train_df')
start = time.time()
train_df['Preprocessed_text'] = train_df.question_text.apply(tokenize_string)
finish = time.time()

print('Time spent:', int((finish - start)//60), 'min, ', round((finish - start)%60), 'sec')

I have just started updatind train_df
Time spent: 5 min,  13 sec


In [5]:
train_df.to_csv(f'{project_path}/Output/Cleaned_test_data.csv')

In [6]:
y_train = train_df.target.values

In [7]:
word2vec_model = Word2Vec(train_df['Preprocessed_text'], vector_size=700, window=5, min_count=5)

In [8]:
# Function for preprocessing the whole phrase instead of separate phrases
def phrase_vector(word2vec_model, phrase):
    phrase = [word for word in phrase if word in word2vec_model.wv.key_to_index]
    
    if len(phrase) == 0:
        return np.zeros(word2vec_model.vector_size)
    
    return np.mean(word2vec_model.wv[phrase], axis=0)

In [9]:
X_train = np.expand_dims(np.array([phrase_vector(word2vec_model, phrase) for phrase in train_df['Preprocessed_text']]), axis=1)

In [9]:
# del train_df

## <font color = purple>Split data before studying the model</font>

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, stratify=y_train, random_state=42)

In [12]:
# LSTM model
model = Sequential()
model.add(LSTM(128, return_sequences=True, input_shape=(1, X_train.shape[-1])))   
model.add(LSTM(64, return_sequences=True))  
model.add(LSTM(32)) 
model.add(Dense(1, activation='sigmoid'))

checkpoint_filepath = 'tmp/checkpoint/model_best.h5'

model_callbacks = [
    ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_auc',
        mode='max',
        save_best_only=True,
        verbose=1)
]

In [14]:
# BiDirectionalLSTM model
model = Sequential()
model.add(Bidirectional(LSTM(256, return_sequences=True), input_shape=(1, X_train.shape[-1])))   
model.add(Bidirectional(LSTM(128, return_sequences=True)))    
model.add(Bidirectional(LSTM(64, return_sequences=True))) 
model.add(Bidirectional(LSTM(32))) 
model.add(Dense(1, activation='sigmoid'))

checkpoint_filepath = 'tmp/checkpoint/model_best.h5'

model_callbacks = [
    ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_auc',
        mode='max',
        save_best_only=True,
        verbose=1)
]

In [15]:
# Compile and train the model
model.compile(loss='BinaryFocalCrossentropy', optimizer='adam', metrics=['AUC'])
class_weights = compute_sample_weight(class_weight='balanced', y=y_train)
unique_classes = np.unique(y_train)
class_weights = {cls: 1.0/np.mean(class_weights[y_train==cls]) for cls in unique_classes}

In [16]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, 
          class_weight=class_weights, callbacks = model_callbacks)

Epoch 1/10
Epoch 1: val_auc improved from -inf to 0.94543, saving model to tmp/checkpoint/model_best.h5
Epoch 2/10
Epoch 2: val_auc improved from 0.94543 to 0.94668, saving model to tmp/checkpoint/model_best.h5
Epoch 3/10
Epoch 3: val_auc improved from 0.94668 to 0.94764, saving model to tmp/checkpoint/model_best.h5
Epoch 4/10
Epoch 4: val_auc improved from 0.94764 to 0.94795, saving model to tmp/checkpoint/model_best.h5
Epoch 5/10
Epoch 5: val_auc did not improve from 0.94795
Epoch 6/10
Epoch 6: val_auc improved from 0.94795 to 0.94807, saving model to tmp/checkpoint/model_best.h5
Epoch 7/10
Epoch 7: val_auc did not improve from 0.94807
Epoch 8/10
Epoch 8: val_auc did not improve from 0.94807
Epoch 9/10
Epoch 9: val_auc did not improve from 0.94807
Epoch 10/10
Epoch 10: val_auc did not improve from 0.94807


<keras.src.callbacks.History at 0x296b452b0>

In [17]:
# The model weights (that are considered the best) are loaded into the model.
model.load_weights(checkpoint_filepath)

Prepare the tests dataset for predicition

In [None]:
# DataFrame with Test data
# test_df = pd.read_csv(f'/kaggle/input/quora-insincere-questions-classification/test.csv')
# print('I have just started updatind test_df')
# start = time.time()
# test_df['Preprocessed_text'] = test_df.question_text.apply(tokenize_string)
# finish = time.time()

# print('Time spent:', int((finish - start)//60), 'min, ', round((finish - start)%60), 'sec')

In [None]:
# X_test = np.expand_dims(np.array([phrase_vector(word2vec_model, phrase) 
#                                    for phrase in test_df['Preprocessed_text']]), axis=1)

In [None]:
# del word2vec_model

In [18]:
predictions = model.predict(X_test)



In [19]:
np.argmax(predictions, axis=1).sum()

0

In [20]:
# flatten the array
preds = predictions.flatten()

# find the 95th percentile value
threshold = np.percentile(preds, 93)

# create a new array where values higher than the threshold are 1 and others are 0
y_pred = np.where(preds > threshold, 1, 0)

In [21]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.97      0.97    245063
           1       0.56      0.64      0.60     16162

    accuracy                           0.95    261225
   macro avg       0.77      0.80      0.79    261225
weighted avg       0.95      0.95      0.95    261225



In [22]:
print(balanced_accuracy_score(y_test, y_pred))

0.8029814768182395


---
# <font color=green>Making the final file</font>

In [None]:
# pd.DataFrame({'qid': test_df.qid, 'prediction': y_pred}).set_index('qid').to_csv('submission.csv')