In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, ShuffleSplit
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.metrics import confusion_matrix, log_loss
from sklearn.calibration import CalibratedClassifierCV
import xgboost as xgb
import time

In [2]:
# Read the raw train and test splits from disk.
traindata, testdata = (
    pd.read_csv(csv_path) for csv_path in ("data/train.csv", "data/test.csv")
)

In [3]:
# Discard every row that contains at least one missing value, then report
# the size of what remains.
traindata = traindata.dropna(axis=0, how="any")
print(traindata.shape)

(404287, 6)


In [4]:
# Read the pre-computed NLP feature tables for the train and test splits.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/extended_nlp_features_train.csv",
        "data/extended_nlp_features_test.csv",
    )
)

In [5]:
# Defining cosine similarity and euclidean distance between two vectors
def cosine_euclidean(u, v):
    """Return the cosine similarity and Euclidean distance of two vectors.

    Parameters
    ----------
    u, v : array-like
        One-dimensional numeric vectors of equal length.

    Returns
    -------
    numpy.ndarray
        Two-element array ``[cosine_similarity, euclidean_distance]``.

    Notes
    -----
    Cosine similarity is undefined when either vector has zero norm. The
    plain ratio would evaluate 0/0, emit a RuntimeWarning and put NaN into
    the feature matrix; 0.0 is returned instead in that case.
    """
    u = np.asarray(u)
    v = np.asarray(v)
    dot = np.dot(u, v)
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    if norm_product == 0:
        # Keep the scalar type of the dot product so the result dtype does
        # not change for this edge case.
        cosine = type(dot)(0)
    else:
        cosine = dot / norm_product
    return np.array([cosine, np.linalg.norm(u - v)])

In [6]:
# Stream the sentence embeddings of both training questions and compute the
# [cosine similarity, euclidean distance] pair for every row.
# np.load is called repeatedly on the same open handle, yielding one batch of
# embeddings per call until the file is exhausted.
# NOTE(security): allow_pickle=True can execute arbitrary code while loading;
# only use it on embedding files produced by a trusted pipeline.
with open('temp_train_question1_sentenceBERT.npy', 'rb') as q1_vec, open('temp_train_question2_sentenceBERT.npy', 'rb') as q2_vec:
    distances = []
    # peek() returns b'' at end-of-file. Testing for EOF explicitly avoids
    # relying on the exception np.load raises there (EOFError on current
    # NumPy, which `except IOError` does not catch) and lets genuine read
    # errors surface instead of silently ending the loop.
    while q1_vec.peek(1) and q2_vec.peek(1):
        q1_20k = np.load(q1_vec, allow_pickle=True)
        q2_20k = np.load(q2_vec, allow_pickle=True)
        if len(q1_20k) != len(q2_20k):
            # zip() would silently truncate and misalign all later rows.
            raise ValueError(
                "question1/question2 embedding batches differ in length: "
                f"{len(q1_20k)} != {len(q2_20k)}"
            )
        for q1, q2 in zip(q1_20k, q2_20k):
            dists = cosine_euclidean(q1, q2)
            distances.append(dists)
    if q1_vec.peek(1) or q2_vec.peek(1):
        raise ValueError("question1/question2 embedding files hold a different number of batches")
distances = np.array(distances)

In [7]:
# Wrap the two-column distance array in a DataFrame with named columns.
# NOTE(review): 'cosine_simlarity_bert' is misspelled ("similarity"); it is
# left as-is because the column name ends up in the exported feature CSVs.
distances = pd.DataFrame(distances, columns=['cosine_simlarity_bert', 'euclidean_distance_bert'])

In [8]:
# Sanity check: one row per question pair and two distance columns.
distances.shape

(404287, 2)

In [9]:
# Append the two BERT distance columns to the training features. `distances`
# is already a DataFrame, and rows are matched on the shared default index.
train_data = pd.concat((train_data, distances), axis="columns")

In [10]:
# Confirm the row count is unchanged and two distance columns were added.
train_data.shape

(404287, 30)

In [11]:
# Stream the sentence embeddings of both test questions, compute the
# [cosine similarity, euclidean distance] pair for every row, and append the
# two resulting columns to the test features.
# np.load is called repeatedly on the same open handle, yielding one batch of
# embeddings per call until the file is exhausted.
# NOTE(security): allow_pickle=True can execute arbitrary code while loading;
# only use it on embedding files produced by a trusted pipeline.
with open('temp_test_question1_sentenceBERT.npy', 'rb') as q1_vec, open('temp_test_question2_sentenceBERT.npy', 'rb') as q2_vec:
    distances = []
    # peek() returns b'' at end-of-file. Testing for EOF explicitly avoids
    # relying on the exception np.load raises there (EOFError on current
    # NumPy, which `except IOError` does not catch) and lets genuine read
    # errors surface instead of silently ending the loop.
    while q1_vec.peek(1) and q2_vec.peek(1):
        q1_20k = np.load(q1_vec, allow_pickle=True)
        q2_20k = np.load(q2_vec, allow_pickle=True)
        if len(q1_20k) != len(q2_20k):
            # zip() would silently truncate and misalign all later rows.
            raise ValueError(
                "question1/question2 embedding batches differ in length: "
                f"{len(q1_20k)} != {len(q2_20k)}"
            )
        for q1, q2 in zip(q1_20k, q2_20k):
            dists = cosine_euclidean(q1, q2)
            distances.append(dists)
    if q1_vec.peek(1) or q2_vec.peek(1):
        raise ValueError("question1/question2 embedding files hold a different number of batches")
distances = np.array(distances)
# Column names (including the 'simlarity' spelling) match the training frame.
distances = pd.DataFrame(distances, columns=['cosine_simlarity_bert', 'euclidean_distance_bert'])
# Rows are matched on the shared default index.
test_data = pd.concat([test_data, distances], axis=1)

In [12]:
# Remove the two question-text columns from the training features.
train_data = train_data.drop(columns=['question1_final', 'question2_final'])

In [13]:
# Confirm the two text columns are gone (column count drops by two).
train_data.shape

(404287, 28)

In [14]:
# Same clean-up for the test features: remove the two question-text columns.
test_data = test_data.drop(columns=['question1_final', 'question2_final'])

In [15]:
# Persist both feature tables as CSV so later steps can reload them without
# recomputing; index=False keeps the row index out of the files.
train_data.to_csv("data/train_features.csv", index=False)
test_data.to_csv("data/test_features.csv", index=False)

## Data Preparation for Modelling

In [16]:
# Convert the training feature DataFrame to a plain NumPy array.
# NOTE: this rebinds `train_data`, so re-running the cell without re-running
# the cells above fails (an ndarray has no .to_numpy()).
train_data = train_data.to_numpy()

In [17]:
# The array keeps the (rows, columns) shape of the DataFrame it came from.
train_data.shape

(404287, 28)

In [18]:
# Min-max scaler with default settings; it is fitted on the training features
# and then reused to transform the test features.
scaler = MinMaxScaler()

In [19]:
# Fit the scaler on the training features and keep the scaled result.
# fit_transform returns a new array rather than modifying its input, so the
# return value must be assigned back. Without the assignment the training
# features stay unscaled while the test features are scaled by
# scaler.transform, leaving the two sets on different scales.
# NOTE: re-running this cell alone would re-fit the scaler on already-scaled
# data; restart and run all cells instead.
train_data = scaler.fit_transform(train_data)
train_data.shape

(404287, 28)

In [20]:
# Convert the test features to a NumPy array and rescale them with the scaler
# that was fitted on the training features, then display the result.
test_data = scaler.transform(test_data.to_numpy())
test_data

array([[0.09401709, 0.06041479, 0.0859375 , ..., 0.25124378, 0.62265475,
        0.59195244],
       [0.10940171, 0.03697024, 0.109375  , ..., 0.40677565, 0.79336938,
        0.4548728 ],
       [0.1008547 , 0.02524797, 0.109375  , ..., 0.41585178, 0.83977757,
        0.40547467],
       ...,
       [0.1042735 , 0.04328224, 0.078125  , ..., 0.28713575, 0.52086247,
        0.69004651],
       [0.18974359, 0.11000902, 0.15625   , ..., 0.40378465, 0.93764583,
        0.23590894],
       [0.0974359 , 0.03967538, 0.0625    , ..., 0.46898839, 0.75761155,
        0.48530176]])

In [21]:
# Write the scaled test features to one file as a sequence of arrays, one
# np.save call per batch of `batch` rows. The file must therefore be read
# back with repeated np.load calls on a single open handle.
# The batches are taken by offset so `test_data` itself is left intact;
# consuming it in the loop would leave it empty afterwards, and re-running
# the cell would then overwrite the file with nothing.
with open('temp_testdata.npy', 'wb') as f:
    batch = 20000
    for start in range(0, len(test_data), batch):
        tempdata = test_data[start:start + batch]
        np.save(f, tempdata, allow_pickle=True)

In [22]:
def loadVectors(filename):
    """Load every array stored back-to-back in ``filename`` and stack the rows.

    The file is expected to contain one or more arrays written consecutively
    with ``np.save`` on a single handle. Each array is read with ``np.load``
    and its rows are collected in order.

    Parameters
    ----------
    filename : str or path-like
        Path of the ``.npy`` file to read.

    Returns
    -------
    numpy.ndarray
        All rows of all stored arrays, in file order. An empty file yields an
        empty array of shape ``(0,)``.

    Notes
    -----
    ``allow_pickle=True`` can execute arbitrary code while loading; only use
    this on files produced by a trusted pipeline.
    """
    q_vectors = []
    with open(filename, 'rb') as f:
        # peek() returns b'' at end-of-file. Testing for EOF explicitly avoids
        # relying on the exception np.load raises there (EOFError on current
        # NumPy, which `except IOError` does not catch) and lets genuine read
        # errors propagate instead of silently truncating the result.
        while f.peek(1):
            q_vec = np.load(f, allow_pickle=True)
            q_vectors.extend(q_vec)
    return np.array(q_vectors)

In [23]:
# Read the full sentence-embedding matrices of both training questions.
train_question1_vec, train_question2_vec = map(
    loadVectors,
    ('temp_train_question1_sentenceBERT.npy', 'temp_train_question2_sentenceBERT.npy'),
)

In [24]:
# Place the feature columns and both embedding matrices side by side
# (column-wise) to form the final training matrix.
train_data = np.concatenate(
    [train_data, train_question1_vec, train_question2_vec],
    axis=1,
)

In [25]:
# Expect 28 + 768 + 768 = 1564 columns after stacking.
train_data.shape

(404287, 1564)

The final training matrix has 1564 features (28 + 768 + 768):
- 28 are the extracted features.
- 768 + 768 are the sentence-embedding dimensions of question 1 and question 2.

In [26]:
# Persist the final training matrix as a single .npy file.
np.save('train_data.npy', train_data, allow_pickle=True)