## Split the iSign data into Training, Validation and Testing sets, then take a random sample of each

In [38]:
import pandas as pd
import numpy as np

In [45]:
# Read the iSign v1.1 table from the working directory, print how many rows
# it holds, and preview the first few rows.
isign = pd.read_csv(filepath_or_buffer="iSign_v1.1.csv")
print(isign.shape[0])
isign.head()

127237


Unnamed: 0,uid,text
0,1782bea75c7d-1,Page 111
1,1782bea75c7d-2,Make it shorter.
2,1782bea75c7d-3,"One day, Akbar drew a line on the floor and or..."
3,1782bea75c7d-4,Make this line shorter.
4,1782bea75c7d-5,but don't rub out any part of it.


In [47]:
# Randomly permute all rows (frac=1 keeps every row) with a fixed seed so the
# order is reproducible; ignore_index=True relabels the rows 0..n-1.
isign = isign.sample(frac=1, random_state=42, ignore_index=True)
print(isign.shape[0])
isign.head()

127237


Unnamed: 0,uid,text
0,2PD6WQLA63s--23,"He thought that once it shutdown, the girl wou..."
1,QEU6biu1a4w--29,"This story is not recent, but it repeatedly be..."
2,006faf1f7c93-33,Here you can see a picture of kids riding vari...
3,zpPORpaGXKs--11,Also many airlines were suffering losses. This...
4,V9j19eiXyo4--34,"Justice Abdul Quddhose of Madras High Court, r..."


<h3> <font color = 'yellow'> Data Preprocessing </font></h3>

### Remove rows with NaN text

In [48]:
# Tally how many entries of the 'text' column are missing
nan_count = isign['text'].isnull().sum()

# Keep only the rows whose 'text' value is present
non_nan_isign = isign.loc[isign['text'].notnull()]

print("\nNumber of rows with NaN values in 'text':", nan_count)


Number of rows with NaN values in 'text': 391


In [49]:
# Continue with the NaN-free rows only.
# reset_index(drop=True) returns a new DataFrame with a contiguous 0..n-1
# index, so `isign` is a separate object rather than a second name for
# `non_nan_isign` (columns added to `isign` will not appear on
# `non_nan_isign`), and row labels match row positions again after the
# NaN rows were dropped.
isign = non_nan_isign.reset_index(drop=True)
print(len(isign))
isign.head()

126846


Unnamed: 0,uid,text
0,2PD6WQLA63s--23,"He thought that once it shutdown, the girl wou..."
1,QEU6biu1a4w--29,"This story is not recent, but it repeatedly be..."
2,006faf1f7c93-33,Here you can see a picture of kids riding vari...
3,zpPORpaGXKs--11,Also many airlines were suffering losses. This...
4,V9j19eiXyo4--34,"Justice Abdul Quddhose of Madras High Court, r..."


<h3> <font color = 'yellow'> Split the data into Train, Validation and Test sets (80% / 10% / 10%) </font></h3>

In [50]:
# Define the split ratios
train_ratio = 0.8
validation_ratio = 0.1
test_ratio = 0.1

# The Test part is whatever remains after Train and Validation, so the three
# ratios must add up to 1 for test_ratio to describe the actual Test share.
if not np.isclose(train_ratio + validation_ratio + test_ratio, 1.0):
    raise ValueError("train_ratio + validation_ratio + test_ratio must equal 1")

# Calculate the positional boundaries of the splits.
# int() truncates, so rows lost to rounding end up in the Test part.
n_rows = len(isign)
train_end = int(train_ratio * n_rows)
validation_end = train_end + int(validation_ratio * n_rows)

# One label per row, in row order: first the Train block, then Validation,
# then Test.
part_labels = np.repeat(
    ['Train', 'Validation', 'Test'],
    [train_end, validation_end - train_end, n_rows - validation_end],
)

# assign() returns a new frame with the 'data_part' column instead of writing
# into the existing `isign` object. The labelled rows are then shuffled with a
# fixed seed so the three parts are interleaved rather than stored as
# contiguous blocks, and the index is renumbered 0..n-1.
isign = (
    isign
    .assign(data_part=part_labels)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [51]:
# Show how many rows carry each split label, then preview the labelled frame.
print(isign.loc[:, 'data_part'].value_counts())
isign.head()

data_part
Train         101476
Test           12686
Validation     12684
Name: count, dtype: int64


Unnamed: 0,uid,text,data_part
0,VqlfNumk0cg--220,colour 2,Train
1,3ab94767fe39-38,and talk about the picture.,Train
2,4ea55d87345a-172,Who am I?,Train
3,hM959_pXlrE--160,These ends are the poles of the magnet.,Validation
4,CO-cLZl36nI--101,2,Validation


<h3> <font color = 'yellow'> Take a 20% random sample of each of the Train, Validation and Test sets </font></h3>

In [52]:
# Sample 20% from each subset (Train, Validation, Test)
train_sample = isign[isign['data_part'] == 'Train'].sample(frac=0.2, random_state=42)
validation_sample = isign[isign['data_part'] == 'Validation'].sample(frac=0.2, random_state=42)
test_sample = isign[isign['data_part'] == 'Test'].sample(frac=0.2, random_state=42)

# Concatenate the sampled dataframes
sampled_data = pd.concat([train_sample, validation_sample, test_sample])

# Shuffle the concatenated DataFrame
sampled_data = sampled_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the new DataFrame
print(sampled_data['data_part'].value_counts())
sampled_data.head()

data_part
Train         20295
Test           2537
Validation     2537
Name: count, dtype: int64


Unnamed: 0,uid,text,data_part
0,nLG-B3e-pP0--48,"See, to remove impurities or harmful components.",Train
1,OS0q5XwHSys--18,It says sexual act performed by a man on his o...,Train
2,s9bWguOVijc--23,"Therefore, it wants the government officers to...",Train
3,xAZMH5dLB4k--16,He has to appear before the court & come for q...,Test
4,u8XxXbHuYPM--9,The TADA court then sent him to jail. He remai...,Train


In [53]:
# Write the sampled subset to a CSV file in the working directory;
# index=False leaves the row index out of the file.
sampled_data.to_csv(path_or_buf="isign_revised.csv", index=False)

print("Data saved as isign_revised.csv")

Data saved as isign_revised.csv
