In [7]:
from datasets import load_dataset
import pandas as pd

# Pandas display configuration (set through the options attribute API)
pd.options.display.max_colwidth = None  # show full cell contents, never truncate a column value
pd.options.display.width = None  # no fixed character width for the rendered DataFrame

In [2]:
# Fetch the QQP configuration of the GLUE benchmark (all of its splits)
qqp_dataset = load_dataset(path="glue", name="qqp")

train-00000-of-00001.parquet:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/3.73M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/36.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/363846 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/40430 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/390965 [00:00<?, ? examples/s]

In [3]:
# Convert the train split to a Pandas DataFrame for analysis.
# Dataset.to_pandas() converts the split directly, instead of having
# pd.DataFrame() iterate over every example one row at a time in Python.
train_df = qqp_dataset['train'].to_pandas()

In [4]:
# Convert the validation split to a Pandas DataFrame for analysis.
# Dataset.to_pandas() converts the split directly, instead of having
# pd.DataFrame() iterate over every example one row at a time in Python.
valid_df = qqp_dataset['validation'].to_pandas()

In [5]:
# (rows, columns) of the train and validation frames, shown side by side
(train_df.shape, valid_df.shape)

((363846, 4), (40430, 4))

In [21]:
# Peek at 5 random training rows; the fixed seed makes the same rows
# come back on every re-run, so the displayed output is reproducible.
train_df.sample(n=5, random_state=42)

Unnamed: 0,question1,question2,label,idx,question1_length,question2_length
311084,Is Magical Girl anime genre underrated in America?,Why are most Magical Girl anime underrated in America?,1,311084,50,54
129991,"What were the major effects of the cambodia earthquake, and how do these effects compare to the Cascadia earthquake in 1700?","What were the major effects of the cambodia earthquake, and how do these effects compare to the Arica earthquake in 1868?",1,129991,124,121
327758,How difficult is aiims question paper?,How difficult is it to get selected at AIIMS New Delhi?,1,327758,38,55
211711,Is really possible to earn 10k-15k from home based job?,How can I get a home based job and earn????,0,211711,55,43
226455,Which movies are the best examples of the Hero's Journey?,What great films do not follow the classic '3-Act Structure' or 'Hero's Journey'?,0,226455,57,81


In [9]:
# Row count of the training frame
total_samples = train_df.shape[0]
print(f"Total samples in training set: {total_samples}")

Total samples in training set: 363846


In [10]:
# Number of training rows per label value (duplicate vs. non-duplicate)
label_counts = train_df["label"].value_counts()
print("\nLabel Distribution:", label_counts, sep="\n")



Label Distribution:
0    229468
1    134378
Name: label, dtype: int64


In [11]:
# Each label's share of the training rows, expressed in percent
label_percentages = label_counts.div(total_samples).mul(100)
print("\nLabel Percentage Distribution:", label_percentages, sep="\n")


Label Percentage Distribution:
0    63.067342
1    36.932658
Name: label, dtype: float64


In [12]:
# Character length of each question (every value is stringified before measuring)
for question_col in ("question1", "question2"):
    train_df[f"{question_col}_length"] = train_df[question_col].map(lambda text: len(str(text)))

# Mean character length of each question column
avg_length_q1, avg_length_q2 = (
    train_df[length_col].mean()
    for length_col in ("question1_length", "question2_length")
)
print(f"\nAverage Length of Question 1: {avg_length_q1:.2f}")
print(f"Average Length of Question 2: {avg_length_q2:.2f}")


Average Length of Question 1: 59.63
Average Length of Question 2: 60.19


In [13]:
# How strongly the lengths of the two questions in a pair move together
q1_lengths = train_df['question1_length']
q2_lengths = train_df['question2_length']
correlation_length = q1_lengths.corr(q2_lengths)
print(f"Correlation between Question 1 and Question 2 lengths: {correlation_length:.2f}")

Correlation between Question 1 and Question 2 lengths: 0.48


In [14]:
from transformers import pipeline

In [16]:
# Fetch only the validation split of GLUE / QQP
qqp_dataset_val = load_dataset(path="glue", name="qqp", split="validation")

In [17]:
# Build a text-classification pipeline from a checkpoint fine-tuned on QQP;
# the same checkpoint name supplies both the model and its tokenizer.
model_name = "textattack/bert-base-uncased-QQP"
classifier = pipeline(
    task="text-classification",
    model=model_name,
    tokenizer=model_name,
)

config.json:   0%|          | 0.00/475 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [24]:
# Classify one question pair; the pair is passed as a dict with "text" / "text_pair" keys
question_pair = {
    "text": "How difficult is AIIMS question paper?",
    "text_pair": "How difficult is it to get selected at AIIMS New Delhi?",
}
result = classifier(question_pair)

print(result)


{'label': 'LABEL_1', 'score': 0.9737574458122253}


In [18]:
# Define a function to test validation samples
def test_validation_samples(dataset, classifier, num_samples=10):
    print("Testing first", num_samples, "samples from validation set...")
    for i in range(num_samples):
        sentence1 = dataset[i]["question1"]
        sentence2 = dataset[i]["question2"]
        label     = dataset[i]["label"]  # Ground truth label
        
        prediction = classifier({"text_a": sentence1, "text_b": sentence2})[0]
        
        predicted_label = prediction["label"]
        confidence      = prediction["score"]
        
        print(f"Sample {i+1}:")
        print(f"  Question 1: {sentence1}")
        print(f"  Question 2: {sentence2}")
        print(f"  Ground Truth: {label}")
        print(f"  Predicted: {predicted_label} (Confidence: {confidence:.4f})\n")

In [20]:
# Run the classifier on the first 10 validation pairs and print each comparison
test_validation_samples(
    dataset=qqp_dataset_val,
    classifier=classifier,
    num_samples=10,
)

Testing first 10 samples from validation set...


ValueError: You need to specify either `text` or `text_target`.