In [1]:
import numpy as np
import sys
import transformers
import pandas as pd
import importlib

# Put the parent directory on the import path so the project's `src` package resolves
# (relative entries are resolved against the working directory the notebook runs in).
sys.path.append("../")
from src.data import datasets
# Reload so that edits to the module's source take effect without restarting the kernel.
importlib.reload(datasets)

<module 'src.data.datasets' from '/home/mas-server/etu/nn/paraphrase_detection/notebooks/../src/data/datasets.py'>

In [2]:
# Load the interim Quora question-pair table and preview its first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like an index that
# was saved into the CSV — confirm whether downstream readers rely on it before dropping it.
quora_csv_path = "../data/interim/quora.csv"
df = pd.read_csv(quora_csv_path)
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [3]:
# Report the table size, discard every row that has a missing value in any column,
# renumber the remaining rows from zero, and report the size again.
print(df.shape)
complete_rows = df.dropna()
df = complete_rows.reset_index(drop=True)
print(df.shape)

(404290, 6)
(404287, 6)


In [4]:
# Summarise the target column: which label values occur and how often each one appears.
labels = df.is_duplicate
distinct_labels = labels.unique()
label_counts = labels.value_counts()
print(f"Possible labels: {distinct_labels}")
print(f"Label counts:\n {label_counts}")

Possible labels: [0 1]
Label counts:
 0    255024
1    149263
Name: is_duplicate, dtype: int64


In [5]:
# Load the fast tokenizer published with the DeBERTa-v3-large checkpoint.
tokenizer = transformers.AutoTokenizer.from_pretrained("microsoft/deberta-v3-large", use_fast=True)

# Illustrate the encoding on the first question pair: both questions are passed together
# so the tokenizer emits a single sequence, which is then decoded back to text for display.
first_question1 = df.loc[0, "question1"]
first_question2 = df.loc[0, "question2"]
tokenizer_out = tokenizer(first_question1, first_question2)
decoded_pair = tokenizer.decode(tokenizer_out['input_ids'])
print(f"Original sentences:\nQ1:{first_question1}\nQ2:{first_question2}")
print(f"Tokenizer out:\n{decoded_pair}")

Original sentences:
Q1:What is the step by step guide to invest in share market in india?
Q2:What is the step by step guide to invest in share market?
Tokenizer out:
[CLS] What is the step by step guide to invest in share market in india?[SEP] What is the step by step guide to invest in share market?[SEP]




In [6]:
# Show the vocabulary size reported by the loaded tokenizer.
vocab_size = tokenizer.vocab_size
print(f"Vocabulary size {vocab_size}")

Vocabulary size 128000


In [7]:
# Shuffle the whole table once with a fixed seed, then cut it by position into
# 70% train / 15% validation / 15% test.
# Positional .iloc slicing is used instead of np.split(DataFrame, ...): np.split goes
# through DataFrame.swapaxes, which is deprecated and raises a FutureWarning on recent
# pandas. The cut points, and therefore the three resulting frames, are unchanged.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(.7 * len(df))
val_end = int(.85 * len(df))

train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:val_end]
test = shuffled.iloc[val_end:]
print(f"Train samples: {train.shape[0]}, val samples: {validate.shape[0]}, test samples: {test.shape[0]} ")

Train samples: 283000, val samples: 60643, test samples: 60644 


In [8]:
# Write each split to its own CSV under the processed-data directory.
# index=False keeps the shuffled row labels out of the files.
split_targets = {
    "../data/processed/train.csv": train,
    "../data/processed/val.csv": validate,
    "../data/processed/test.csv": test,
}
for out_path, split_frame in split_targets.items():
    split_frame.to_csv(out_path, index=False)