In [2]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [4]:
# Load dataset
df = pd.read_csv("data/feature_extracted_dataset.csv")

In [5]:
# Split features and label
X = df.drop(columns=['binary_label'])
y = df['binary_label'].map({'NO': 0, 'YES': 1})  # Convert to binary 0/1

In [6]:
# Optional: train-test split before balancing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train) # type: ignore



In [8]:
# Save or continue with resampled X_resampled, y_resampled
print("Before SMOTE:", y_train.value_counts().to_dict())
print("After SMOTE:", pd.Series(y_resampled, dtype='int').value_counts().to_dict()) # type: ignore

Before SMOTE: {1: 25048, 0: 8439}
After SMOTE: {1: 25048, 0: 25048}


In [9]:
# Combine features and labels
balanced_df = pd.DataFrame(X_resampled)
balanced_df['label'] = y_resampled

  balanced_df['label'] = y_resampled


In [10]:
balanced_df

Unnamed: 0,polarity,subjectivity,pronoun_count,negation_count,temporal_count,word_count,avg_word_length,able,actually,advice,...,work,working,world,worse,worst,worth,wrong,year,years,label
0,-0.263095,0.682738,6.0,3.0,0.000000,45.0,4.822222,0.286232,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,1
1,0.000000,0.000000,0.0,1.0,1.000000,7.0,5.428571,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0
2,-0.086429,0.615238,14.0,2.0,1.000000,87.0,4.505747,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,1
3,0.078611,0.370556,26.0,2.0,0.000000,224.0,5.486607,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.608540,1
4,0.120833,0.424405,16.0,1.0,1.000000,102.0,4.509804,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.142886,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50091,-0.034195,0.290662,0.0,0.0,0.000000,5.0,7.947128,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0
50092,0.084192,0.548230,12.0,1.0,1.194002,103.0,5.275184,0.000000,0.0,0.0,...,0.265545,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.029025,0
50093,0.346781,0.726803,1.0,1.0,0.900960,24.0,4.554047,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0
50094,-0.009217,0.466035,9.0,0.0,0.000000,84.0,5.130952,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0


In [12]:
# Save to CSV
balanced_df.to_csv("data/balanced_dataset.csv", index=False)

In [13]:
# Save untouched test set
test_df = pd.DataFrame(X_test)
test_df['label'] = y_test.values  # ensure consistent format

In [14]:
test_df

Unnamed: 0,polarity,subjectivity,pronoun_count,negation_count,temporal_count,word_count,avg_word_length,able,actually,advice,...,work,working,world,worse,worst,worth,wrong,year,years,label
4094,0.100000,0.831481,6.0,1.0,0.0,32.0,5.312500,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,1
41693,0.333333,0.666667,1.0,0.0,0.0,11.0,5.909091,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0
25926,-0.490909,0.825000,6.0,0.0,0.0,55.0,5.181818,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,1
25872,0.063095,0.705952,6.0,0.0,0.0,99.0,5.434343,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,1
32295,-0.114881,0.635913,11.0,4.0,0.0,74.0,4.905405,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12856,0.212500,0.438770,5.0,3.0,0.0,104.0,5.230769,0.0,0.0,0.225752,...,0.0,0.000000,0.000000,0.0,0.0,0.224196,0.0,0.000000,0.000000,1
39031,0.118182,0.327273,0.0,0.0,0.0,12.0,6.333333,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0
5145,-0.283333,0.766667,6.0,0.0,0.0,37.0,5.054054,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,1
22759,0.168627,0.476471,31.0,3.0,4.0,237.0,5.037975,0.0,0.0,0.139812,...,0.0,0.132089,0.115364,0.0,0.0,0.000000,0.0,0.187563,0.174986,1


In [15]:
test_df.to_csv("data/test_set.csv", index=False)