# **Initializing And Loading Data**
https://www.kaggle.com/datasets/quora/question-pairs-dataset

In [None]:
# Colab-only: mount Google Drive so the dump folder under /content/drive
# (see DUMP_PATH below) becomes readable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from the plotting / ML libraries used below.
warnings.filterwarnings("ignore")

In [None]:
# Folder on the mounted Drive that holds the preprocessed CSV dumps.
DUMP_PATH = "/content/drive/MyDrive/Colab Notebooks/dumps/quora_question_pairs/"
# Identifier of this notebook (not referenced by the cells visible here).
NOTEBOOK_NAME = "02_1"
# Number of features / components requested from every selector and
# projection below. scatter_plot_3d reads columns 0, 1 and 2 of whatever it
# is given, so the projections need this to be at least 3.
N_SELECTIONS = 3
# Total number of rows to sample, split evenly between the two classes.
# None means "use the largest balanced sample the data allows".
SAMPLE_SIZE = 30_000
# Fraction of rows intended for the test split.
TEST_SIZE = 0.2
# Seed shared by the shuffles, the split and the seeded estimators.
RANDOM_STATE = 42

# Sanity check: show which files are available in the dump folder.
os.listdir(DUMP_PATH)

In [None]:
df = pd.read_csv(os.path.join(DUMP_PATH, 'final_preprocessed_data_allowStopwords1.csv'))

# Shuffle first so that the per-class slices taken below are random draws
# rather than the first rows of the file.
df = df.sample(frac=1, random_state=RANDOM_STATE)

# Balanced sample: SAMPLE_SIZE // 2 rows from each class.
# With SAMPLE_SIZE unset, fall back to twice the size of the smaller class.
if SAMPLE_SIZE is None:
    SAMPLE_SIZE = int(df.is_duplicate.value_counts().min()) * 2
df = pd.concat([
    df[df.is_duplicate == 0].iloc[:SAMPLE_SIZE // 2],
    df[df.is_duplicate == 1].iloc[:SAMPLE_SIZE // 2]
])

# Shuffle again: concat left all non-duplicates ahead of all duplicates.
df = df.sample(frac=1, random_state=RANDOM_STATE)

# Give the frame a positional 0..n-1 index (later cells use index labels as
# row positions into NumPy arrays). reset_index is used instead of assigning
# range(SAMPLE_SIZE) because the number of rows actually drawn can differ from
# SAMPLE_SIZE (odd SAMPLE_SIZE, or a class with fewer than SAMPLE_SIZE // 2
# rows), which would make the assignment raise a length-mismatch error.
df = df.reset_index(drop=True)

print(df.shape)
df.head()

In [None]:
# Feature matrix: every column from position 6 onward; target: is_duplicate.
# NOTE(review): assumes the first six columns are ids / text / label rather
# than features - confirm against the CSV layout.
X = df.iloc[:, 6:]
y = df.is_duplicate

# Keep the feature names; after scaling X is indexed positionally (X[:, i])
# and no longer carries column labels.
cols = X.columns

# SCALING
# NOTE(review): the scaler is fitted on the whole sample, i.e. before the
# train/test split done inside model_accuracy, so held-out rows contribute to
# the scaling statistics.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# Index labels of the rows in each class. dist_plot uses these lists to pick
# rows out of the scaled array, which relies on df having a 0..n-1 index.
i_zeros = df.loc[df['is_duplicate'] == 0].index.tolist()
i_ones = df.loc[df['is_duplicate'] == 1].index.tolist()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

def model_accuracy(X, y):
    """Fit a random forest on a stratified split and return its test accuracy.

    Args:
        X: feature matrix, one row per sample.
        y: class labels aligned with the rows of X.

    Returns:
        Accuracy of the fitted model on the held-out TEST_SIZE fraction.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=TEST_SIZE,          # notebook-level constant, not a literal
        random_state=RANDOM_STATE,
        stratify=y                    # keep the class ratio in both parts
    )

    # Seed the forest as well; otherwise two calls on the same data give
    # different scores and feature subsets cannot be compared fairly.
    model = RandomForestClassifier(random_state=RANDOM_STATE)
    model.fit(X_train, y_train)

    pred_test = model.predict(X_test)

    # scikit-learn metric convention: (y_true, y_pred).
    return accuracy_score(y_test, pred_test)

# **Visualizing**

In [None]:
def dist_plot(X, n_cols=4, cols=None):
    """Plot, per feature, the distribution of each class on a grid of axes.

    Rows are split into the two classes with the notebook-level index lists
    ``i_zeros`` (non duplicate) and ``i_ones`` (duplicate).

    Args:
        X: 2-D array indexed as ``X[:, i]``; one column per feature.
        n_cols: number of subplot columns in the grid.
        cols: optional sequence of titles, one per column of X (a list or a
            pandas Index). ``None`` or an empty sequence leaves plots untitled.
    """
    n_features = X.shape[1]
    n_rows = max(int(np.ceil(n_features / n_cols)), 1)
    fig_size = 5

    # squeeze=False keeps `axes` two-dimensional for every grid shape, so a
    # single row or a single column no longer needs a padding row.
    fig, axes = plt.subplots(
        n_rows,    # no of rows
        n_cols,    # no of cols
        figsize=(fig_size * n_cols, fig_size * n_rows),
        squeeze=False,
    )

    # len() rather than truthiness: a pandas Index has no unambiguous bool().
    has_titles = cols is not None and len(cols) > 0

    # Row-major flattening: position i is the same cell as [i // n_cols, i % n_cols].
    for i, ax in enumerate(axes.ravel()):
        if i >= n_features:
            # Grid cells beyond the last feature stay blank.
            ax.set_visible(False)
            continue

        feature = X[:, i]
        # NOTE(review): seaborn marks distplot as deprecated; kept here so the
        # rendered figures do not change.
        sns.distplot(feature[i_zeros], label='non duplicate', ax=ax)
        sns.distplot(feature[i_ones], label='duplicate', ax=ax)
        if has_titles:
            ax.set_title(cols[i])
        # Legend on each subplot; a single plt.legend() call only reaches the
        # most recently created axes.
        ax.legend()

    plt.show()

def scatter_plot_3d(X, s=2):
    """Draw a 3-D scatter of the first three columns of X, coloured by class.

    Colours are looked up from the notebook-level labels ``y`` (label 0 maps
    to blue, label 1 to orange), so X must have one row per entry of ``y``.

    Args:
        X: 2-D array with at least three columns.
        s: marker size passed to the scatter call.
    """
    figure = plt.figure(figsize=(8, 8))
    axis_3d = figure.add_subplot(111, projection='3d')

    palette = ['tab:blue', 'tab:orange']
    point_colors = [palette[label] for label in y]

    xs, ys, zs = X[:, 0], X[:, 1], X[:, 2]
    axis_3d.scatter(xs, ys, zs, c=point_colors, s=s)
    plt.show()

In [None]:
# Per-class distribution of every scaled feature, titled with its column name.
dist_plot(X, n_cols=4, cols=cols)

In [None]:
# 3-D view of the first three scaled features, coloured by class.
scatter_plot_3d(X)

# **Feature Selection**

In [None]:
def scores_to_index(scores):
    """Return the positions of ``scores`` ordered from highest to lowest score.

    Equal scores keep their original relative order (the sort is stable).
    """
    indexed = list(enumerate(scores))
    indexed.sort(key=lambda pair: pair[1], reverse=True)
    return [position for position, _ in indexed]

In [None]:
def plot_on_priority(scores):
    """Plot the features ordered from highest to lowest score.

    Prints the reordered column names, draws the per-class distributions with
    titles of the form ``"<column>: <score>"`` and a 3-D scatter of the three
    top-scoring features. Reads the notebook-level ``X`` and ``cols``.

    Args:
        scores: one score per column of X (higher is better); any sequence
            that NumPy can convert to an array, e.g. a list or an ndarray.

    Returns:
        The column names reordered by descending score.
    """
    # Positional fancy-indexing below needs an array; a plain list would raise.
    scores = np.asarray(scores)
    priority_i = scores_to_index(scores)

    # Reorder once and reuse, instead of re-indexing for every consumer.
    ranked_cols = cols[priority_i]
    ranked_scores = scores[priority_i]
    ranked_X = X[:, priority_i]

    print(ranked_cols)

    dist_plot(
        ranked_X,
        cols=[f"{i}: {j}" for i, j in zip(ranked_cols, ranked_scores)]
    )
    scatter_plot_3d(ranked_X)

    return ranked_cols

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestClassifier


# BACKWARD SELECTION
# Recursive feature elimination: refit the logistic regression and drop one
# feature per round (step=1) until N_SELECTIONS features remain.
backward_selector = RFE(
    LogisticRegression(max_iter=150),
    n_features_to_select=N_SELECTIONS,
    step=1
)
backward_selector.fit(X, y)
# ranking_ is 1 for the kept features and larger for features dropped earlier,
# so negating it yields a "higher is better" score for scores_to_index.
# The kept features all tie at rank 1; the stable sort leaves them in column order.
backward_score = 0 - backward_selector.ranking_
backward_cols = scores_to_index(backward_score)


# FORWARD SELECTION
# NOTE(review): despite the heading, SelectKBest with f_classif scores each
# feature on its own (ANOVA F-statistic); it is a univariate filter, not a
# sequential forward search. The ranking below uses scores_ for all features,
# so the value of k has no effect on forward_cols.
forward_selector = SelectKBest(f_classif, k=N_SELECTIONS)
forward_selector.fit(X, y)
forward_score = forward_selector.scores_
forward_cols = scores_to_index(forward_score)


# RANDOM FOREST SELECTION
# Rank features by the importances of a seeded forest of shallow trees.
forest_selector = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    bootstrap=True,
    n_jobs=-1,
    random_state=RANDOM_STATE
)
forest_selector.fit(X, y)
forest_score = forest_selector.feature_importances_
forest_cols = scores_to_index(forest_score)

In [None]:
# FINDING BEST COLUMNS
# Merge the three rankings round-robin: at each rank take the forward, then
# backward, then forest candidate, keeping only the first occurrence of each
# column index. The result is a duplicate-free ordering of column positions.
best = []
for i, j, k in zip(forward_cols, backward_cols, forest_cols):
    for candidate in (i, j, k):
        # Membership must be tested on the candidate itself. Testing the
        # unrelated `i_ones` list here is always true and lets duplicates in.
        if candidate not in best:
            best.append(candidate)
BEST_COLS_I = best
BEST_COLS = list(cols[best])
BEST_COLS

In [None]:
# PLOTTING COLUMNS BASED ON THEIR PRIORITY
# Same distribution grid as before, with columns reordered by the merged
# ranking in `best` and titled with the matching column names.
dist_plot(
    X[:, best],
    cols = cols[best]
)
# scatter_plot_3d only reads the first three columns, i.e. the three
# top-ranked features.
scatter_plot_3d(X[:, best])

# **Combination Using Linear Methods**

In [None]:
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import KernelPCA
from sklearn.manifold import TSNE
from sklearn.manifold import MDS

# Project the scaled features onto N_SELECTIONS principal components.
pca = PCA(n_components=N_SELECTIONS)
X_pca = pca.fit_transform(X)


# Parked experiment: supervised projection with LDA (uses the labels y).
# lda = LinearDiscriminantAnalysis(n_components=2, solver='svd')
# X_lda = lda.fit_transform(X, y)


# Truncated SVD with the seeded randomized solver.
# NOTE(review): X was standardized above, so this is presumably very close to
# the PCA projection - confirm by comparing the two scatter plots.
svd = TruncatedSVD(
    n_components=N_SELECTIONS,
    algorithm='randomized',
    random_state=RANDOM_STATE
)
X_svd = svd.fit_transform(X)


# Parked experiment: kernel PCA with an RBF kernel.
# kpca = KernelPCA(
#     n_components=N_SELECTIONS,
#     kernel='rbf',
#     gamma=15,
#     random_state=RANDOM_STATE
# )
# X_kpca = kpca.fit_transform(X)


# Parked experiment: t-SNE embedding.
# tsne = TSNE(
#     n_components = N_SELECTIONS
# )
# X_tsne = tsne.fit_transform(X)


# Parked experiment: metric MDS embedding.
# mds = MDS(n_components=N_SELECTIONS, metric=True, random_state=RANDOM_STATE)
# X_mds = mds.fit_transform(X)

In [None]:
# 3-D scatter of the first three components of each projection, coloured by class.
scatter_plot_3d(X_pca)
scatter_plot_3d(X_svd)