In [4]:
cd ..

C:\Prasad\MSIITM\Research\Dataset\train\pre_req_dataset


In [6]:
import pandas as pd

data = pd.read_csv("KaggleDataset/train.csv") # Read the main file
metad = pd.read_csv("KaggleDataset/metadata.csv")    # Read the meta data file

#add suffixes to the column of meta data so that we can distinguish between pre and conept columns
metad.columns = [f'{col}_pre' for col in metad.columns]

# Merge both the files based on the concept names
new_data = pd.merge(data, metad, left_on='pre requisite', right_on='video name_pre', how="left")

# Remove the existing _pre from the columns
metad.columns = [f'{col}'[:-4] for col in metad.columns]

#Attach the suffixes to the existing columns
metad.columns = [f'{col}_concept' for col in metad.columns]

# Merge both the columns together based on the video name
new_data = pd.merge(new_data, metad, left_on='concept', right_on='video name_concept', how="left")

In [7]:
data = new_data[['label','transcript_pre','transcript_concept']]

In [8]:
from sklearn.model_selection import train_test_split

#Train Test Split
X = data
y = data[['label']]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import LoggingHandler, util
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator
from sentence_transformers.readers import InputExample
import logging
from datetime import datetime
import os
import gzip
import csv


#Check if dataset exsist. If not, download and extract  it
# dataset_path = 'quora-dataset/'

train_samples = []
for row in X_train.iterrows():
    train_samples.append(InputExample(texts=[row[1]['transcript_pre'], row[1]['transcript_concept']], label=int(row[1]['label'])))
    train_samples.append(InputExample(texts=[row[1]['transcript_pre'], row[1]['transcript_concept']], label=int(row[1]['label'])))


# logger.info("Read dev dataset")
dev_samples = []
for row in X_test.iterrows():
    dev_samples.append(InputExample(texts=[row[1]['transcript_pre'], row[1]['transcript_concept']], label=int(row[1]['label'])))


#Configuration
train_batch_size = 16
num_epochs = 4
model_save_path = 'model'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")


#We use distilroberta-base with a single label, i.e., it will output a value between 0 and 1 indicating the similarity of the two questions
model = CrossEncoder('sentence-transformers/paraphrase-MiniLM-L3-v2', num_labels=1)

# We wrap train_samples (which is a List[InputExample]) into a pytorch DataLoader
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)


# We add an evaluator, which evaluates the performance during training
evaluator = CEBinaryClassificationEvaluator.from_input_examples(dev_samples, name='prereq')


# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up
# logger.info("Warmup-steps: {}".format(warmup_steps))


# Train the model
model.fit(train_dataloader=train_dataloader,
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=500,
          warmup_steps=warmup_steps,
          output_path=model_save_path)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/paraphrase-MiniLM-L3-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/280 [00:00<?, ?it/s]

Iteration:   0%|          | 0/280 [00:00<?, ?it/s]

Iteration:   0%|          | 0/280 [00:00<?, ?it/s]

Iteration:   0%|          | 0/280 [00:00<?, ?it/s]

In [16]:
# logger.info("Read dev dataset")
import numpy as np
test_data = []
for i in range(len(X_test)):
    test_data.append([X_test.iloc[i]['transcript_pre'],X_test.iloc[i]['transcript_concept']])

pred = model.predict(test_data)

y_pred_norm = np.round(pred)

In [18]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
accuracy = accuracy_score(y_test, y_pred_norm)
precision = precision_score(y_test, y_pred_norm)
recall = recall_score(y_test, y_pred_norm)
f1 = f1_score(y_test, y_pred_norm)

print(f"Test Results for :")
print(f"Accuracy: ", accuracy)
print(f"Precision: ", precision)
print(f"Recall: ",recall)
print(f"F1-score: ", f1)
print("\n")

Test Results for :
Accuracy:  0.7357142857142858
Precision:  0.6586538461538461
Recall:  0.6401869158878505
F1-score:  0.6492890995260663


