In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('../data/utterances.csv')  

We will initiate data processing by primarily saving only the text and labels of the comments. For the preliminary training phase, our strategy is to deploy elementary models that utilize solely the text to evaluate their performance. Consequently, we will extract just the text and label fields and partition them into separate datasets for training, testing, and validation purposes.

In [7]:
# Step 1 & 2: Filter the DataFrame to include only the necessary columns ('text' and 'meta.success') and clean the text if necessary
# For simplicity, we're directly using the 'text' column; additional cleaning might be needed depending on the model requirements
data = df[['text', 'meta.success']].dropna(subset=['meta.success'])
data['label'] = data['meta.success'].apply(lambda x: 1 if x == 1.0 else 0)

# Step 3: Split the dataset into training, validation, and ttype(train, temp = train_test_split(data, test_size=0.2, random_stype(train, test = train_test_split(temp, test_size=0.5, random_state=42)

# Display the sizes of each dataset to confirm successful split
len(train), len(validation), len(test)

(15771, 1971, 1972)

In [6]:
# Calculate and display the distribution of the label for each subset of the data
label_distribution_train = train['label'].value_counts(normalize=True)
label_distribution_validation = validation['label'].value_counts(normalize=True)
label_distribution_test = test['label'].value_counts(normalize=True)

  
print("Label Distribution:")  
print("TRAINING SET:")  
for label, count in label_distribution_train.items():  
    print(f"Label {label}: {count} instances")  
  
print("\nVALIDATION SET:")  
for label, count in label_distribution_validation.items():  
    print(f"Label {label}: {count} instances")  
  
print("\nTEST SET:")  
for label, count in label_distribution_test.items():  
    print(f"Label {label}: {count} instances")  

Label Distribution:
TRAINING SET:
Label 1: 0.6283685245070065 instances
Label 0: 0.37163147549299347 instances

VALIDATION SET:
Label 1: 0.639269406392694 instances
Label 0: 0.3607305936073059 instances

TEST SET:
Label 1: 0.6338742393509128 instances
Label 0: 0.3661257606490872 instances


In [9]:
# Save the training, validation, and test datasets to disk
train.to_csv('../data/train.csv', index=False)
validation.to_csv('../data/validation.csv', index=False)
test.to_csv('../data/test.csv', index=False)

## Further processing if we need more stuff like processing for BERT models etc..