In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Split the raw dataset into three disjoint CSV files:
#   * "increment": a class-balanced sample (same number of rows per label),
#   * "train":     a stratified 80% share of whatever is left,
#   * "test":      the stratified 20% remainder.
SEED = 42               # one seed for every random step so the split is reproducible
INCREMENT_SIZE = 3000   # target size of the balanced increment set
TEST_FRACTION = 0.2     # share of the non-increment rows that goes to the test set

df = pd.read_csv("sentio-data.csv")

print(f"Total rows: {len(df)}")
print(f"Label distribution:\n{df['label'].value_counts()}")
print(f"Unique labels: {df['label'].unique()}")

labels = df['label'].unique().tolist()
# Integer division: the increment set may end up slightly smaller than
# INCREMENT_SIZE when it is not divisible by the number of labels.
samples_per_label = INCREMENT_SIZE // len(labels)

# Fail early with a readable message instead of the generic error raised by
# DataFrame.sample() when a label has fewer rows than requested.
label_counts = df['label'].value_counts()
too_small = label_counts[label_counts < samples_per_label]
if not too_small.empty:
    raise ValueError(
        f"Cannot draw {samples_per_label} rows per label; "
        f"these labels have too few rows: {too_small.to_dict()}"
    )

# Draw the same number of rows from every label, without replacement.
increment_samples = [
    df[df['label'] == label].sample(n=samples_per_label, random_state=SEED)
    for label in labels
]

increment_df = pd.concat(increment_samples)
# Remove the sampled rows by index so the increment set never overlaps train/test.
remaining_df = df.drop(increment_df.index)

train_df, test_df = train_test_split(
    remaining_df,
    test_size=TEST_FRACTION,
    stratify=remaining_df['label'],
    random_state=SEED
)


def _finalize(frame):
    """Shuffle the rows, keep only the exported columns, and renumber the index."""
    return (
        frame.sample(frac=1, random_state=SEED)[['text', 'label']]
        .reset_index(drop=True)
    )


# Each frame is reassigned explicitly. Rebinding a loop variable
# (`for dataset in [...]: dataset = dataset.sample(...)`) would leave the
# original frames untouched, and increment_df would be written out still
# grouped by label.
increment_df = _finalize(increment_df)
train_df = _finalize(train_df)
test_df = _finalize(test_df)

# Sanity check: the three splits together account for every input row.
assert len(increment_df) + len(train_df) + len(test_df) == len(df), \
    "splits do not add up to the input row count"

increment_df.to_csv("sentio-data-increment.csv", index=False)
train_df.to_csv("sentio-data-train.csv", index=False)
test_df.to_csv("sentio-data-test.csv", index=False)

for split_name, split_df in [
    ("Increment", increment_df),
    ("Train", train_df),
    ("Test", test_df),
]:
    print(f"\n{split_name} set: {len(split_df)} rows")
    print(split_df['label'].value_counts())

Total rows: 57042
Label distribution:
label
Normal                  16351
Depression              15404
Suicidal                10653
Stress                   6668
Anxiety                  3888
Bipolar                  2877
Personality disorder     1201
Name: count, dtype: int64
Unique labels: ['Anxiety' 'Normal' 'Depression' 'Suicidal' 'Stress' 'Bipolar'
 'Personality disorder']

Increment set: 2996 rows
label
Anxiety                 428
Normal                  428
Depression              428
Suicidal                428
Stress                  428
Bipolar                 428
Personality disorder    428
Name: count, dtype: int64

Train set: 43236 rows
label
Normal                  12738
Depression              11981
Suicidal                 8180
Stress                   4992
Anxiety                  2768
Bipolar                  1959
Personality disorder      618
Name: count, dtype: int64

Test set: 10810 rows
label
Normal                  3185
Depression              2995
Suicidal    