In [9]:
import pandas as pd

df = pd.read_csv('data/train.csv', index_col='id')
df

Unnamed: 0_level_0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,1.0,No,4.0,4.0,No,13.0,,Extrovert
...,...,...,...,...,...,...,...,...
18519,3.0,No,7.0,3.0,No,9.0,7.0,Extrovert
18520,1.0,,6.0,7.0,No,6.0,5.0,Extrovert
18521,7.0,Yes,1.0,1.0,Yes,1.0,,Introvert
18522,,Yes,1.0,0.0,Yes,5.0,2.0,Introvert


In [10]:
from sklearn.model_selection import train_test_split

train, val = train_test_split(df, test_size=0.2, random_state=42)
train

Unnamed: 0_level_0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1799,1.0,,7.0,4.0,No,10.0,5.0,Extrovert
11931,2.0,No,4.0,6.0,No,6.0,8.0,Extrovert
14307,4.0,No,5.0,5.0,No,7.0,6.0,Extrovert
12157,3.0,No,6.0,,No,8.0,8.0,Extrovert
18124,2.0,No,7.0,7.0,No,15.0,4.0,Extrovert
...,...,...,...,...,...,...,...,...
11284,9.0,,1.0,3.0,Yes,5.0,3.0,Introvert
11964,3.0,No,6.0,6.0,No,,3.0,Extrovert
5390,3.0,,7.0,3.0,No,14.0,8.0,Extrovert
860,3.0,No,4.0,,No,9.0,9.0,Extrovert


In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier

categorical = df.drop(columns=['Personality']).select_dtypes(include=['object']).columns.tolist()
numerical = df.select_dtypes(exclude=['object']).columns.tolist()

pipeline = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical),
        ('scaler', RobustScaler(), numerical)
    ])),
    ('model', LGBMClassifier(
        objective='binary',
        metric='binary_logloss',
        class_weight={'Introvert': 3, 'Extrovert': 1},
        num_iterations=2000,
        subsample=0.8,
        num_leaves=150,
        n_jobs=10,
    ))
])

pipeline.fit(
    train.drop(columns=['Personality']),
    train['Personality'],
)

[LightGBM] [Info] Number of positive: 3873, number of negative: 10946
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000417 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 75
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.514912 -> initscore=0.059668
[LightGBM] [Info] Start training from score 0.059668


In [12]:
from sklearn.metrics import f1_score, accuracy_score

val_predictions = pipeline.predict(val.drop(columns=['Personality']))

f1 = f1_score(val['Personality'], val_predictions, pos_label='Introvert')
accuracy = accuracy_score(val['Personality'], val_predictions)

print(f"F1 Score: {f1}")
print(f"Accuracy: {accuracy}")

F1 Score: 0.9049630411826821
Accuracy: 0.951417004048583


