In [7]:
from datasets import load_dataset
import pandas as pd

# Pandas display configuration for this notebook.
# max_colwidth=None: show the complete text of every cell (no "..." truncation).
pd.options.display.max_colwidth = None
# width=None: let pandas auto-detect the display width instead of a fixed
# character count (the pandas docs note detection only works in a terminal).
pd.options.display.width = None

In [2]:
# Fetch the "qqp" configuration of the "glue" dataset; the result is indexed
# by split name in later cells (e.g. 'train', 'validation').
qqp_dataset = load_dataset(path="glue", name="qqp")

train-00000-of-00001.parquet:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/3.73M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/36.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/363846 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/40430 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/390965 [00:00<?, ? examples/s]

In [3]:
# Convert the train split to a Pandas DataFrame for analysis.
# Dataset.to_pandas() is the library's own conversion; it avoids feeding the
# examples one by one through the DataFrame constructor.
train_df = qqp_dataset['train'].to_pandas()

In [4]:
# Convert the validation split to a Pandas DataFrame for analysis.
# Dataset.to_pandas() is the library's own conversion; it avoids feeding the
# examples one by one through the DataFrame constructor.
valid_df = qqp_dataset['validation'].to_pandas()

In [5]:
# (rows, columns) of the train frame followed by the validation frame.
tuple(frame.shape for frame in (train_df, valid_df))

((363846, 4), (40430, 4))

In [8]:
# Preview five randomly chosen training rows. The fixed random_state makes the
# same rows come back on every run (e.g. after Restart Kernel -> Run All).
train_df.sample(n=5, random_state=42)

Unnamed: 0,question1,question2,label,idx
49694,Is there any system in our governance to know that whether the law made by legislature are effective to deal the situation for which it was enacted?,Are there any effective models (theoretical or real world) that can serve as a template by which the United States can reform its health care system?,0,49694
218095,What are the strongest majors in terms of job prospects and what are the weakest majors at Jackson State?,What are the strongest majors in terms of job prospects and what are the weakest majors at Columbus State?,0,218095
103723,What career advice would you give someone interested in becoming a Market Research Analyst?,What advice can you give someone who wants to trade Currencies?,0,103723
11027,What is the difference between reference frame and reference point?,Is our frame of reference in space time at the speed of light?,0,11027
226668,"How do the personalities of Harvard students compare with students at other Ivies (especially Yale, Princeton, and Brown)?","What is the typical student at a top university like Harvard, Yale and Princeton like in terms of personality, character and interests?",0,226668


In [9]:
# Number of rows in the training frame; reused below to turn label counts
# into percentages.
total_samples = train_df.shape[0]
print(f"Total samples in training set: {total_samples}")

Total samples in training set: 363846


In [10]:
# How many training rows carry each value of the 'label' column.
label_counts = train_df['label'].value_counts()
print("\nLabel Distribution:", label_counts, sep="\n")



Label Distribution:
0    229468
1    134378
Name: label, dtype: int64


In [11]:
# Share of each label as a percentage of all training rows
# (count / total, then scaled by 100).
label_percentages = label_counts.div(total_samples).mul(100)
print("\nLabel Percentage Distribution:", label_percentages, sep="\n")


Label Percentage Distribution:
0    63.067342
1    36.932658
Name: label, dtype: float64


In [12]:
# Character length of each question. Values are passed through str() first, so
# a non-string entry is measured by the length of its string representation.
# NOTE: this adds two columns to train_df in place.
train_df['question1_length'] = train_df['question1'].map(lambda text: len(str(text)))
train_df['question2_length'] = train_df['question2'].map(lambda text: len(str(text)))

# Mean character length per question column, reported to two decimals.
avg_length_q1 = train_df['question1_length'].mean()
avg_length_q2 = train_df['question2_length'].mean()
print(
    f"\nAverage Length of Question 1: {avg_length_q1:.2f}",
    f"Average Length of Question 2: {avg_length_q2:.2f}",
    sep="\n",
)


Average Length of Question 1: 59.63
Average Length of Question 2: 60.19


In [13]:
# Correlation between question lengths
correlation_length = train_df['question1_length'].corr(train_df['question2_length'])
print(f"Correlation between Question 1 and Question 2 lengths: {correlation_length:.2f}")

Correlation between Question 1 and Question 2 lengths: 0.48
