In [1]:
import pandas as pd

# Read the labelled training data; ending the cell on the bare name renders
# the (truncated) frame as the cell output.
df = pd.read_csv(filepath_or_buffer='data/train.csv')
df

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,4,1.0,No,4.0,4.0,No,13.0,,Extrovert
...,...,...,...,...,...,...,...,...,...
18519,18519,3.0,No,7.0,3.0,No,9.0,7.0,Extrovert
18520,18520,1.0,,6.0,7.0,No,6.0,5.0,Extrovert
18521,18521,7.0,Yes,1.0,1.0,Yes,1.0,,Introvert
18522,18522,,Yes,1.0,0.0,Yes,5.0,2.0,Introvert


In [2]:
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularDataset, TabularPredictor

# Hold out 20% of the labelled rows for validation (seeded so the split is
# reproducible), then wrap each part as an AutoGluon TabularDataset.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)
train = TabularDataset(train_df)
val = TabularDataset(val_df)

In [3]:
# Configure and fit the AutoGluon predictor in one chained expression, so
# `predictor` is only bound once fitting has finished.
predictor = TabularPredictor(
    # Target column and task type.
    label='Personality',
    problem_type='binary',
    # Optimise F1, with 'Introvert' declared as the positive class.
    eval_metric='f1',
    positive_class='Introvert',
    # Directory where the fitted models are stored.
    path='autogluon_models',
    verbosity=2,
).fit(
    # Fit on a reproducible 80% sample of the training split.
    # NOTE(review): the remaining 20% of `train` is not used by any visible
    # cell — confirm that this subsampling is intentional.
    train.sample(frac=0.8, random_state=42),
    presets='best_quality',
    # Let AutoGluon decide whether stacking is safe for this data.
    dynamic_stacking=True,
    calibrate_decision_threshold='auto',
    # Overall fitting budget in seconds.
    time_limit=1200,
    # NOTE(review): 10 matches the CPU count reported in the run log and is
    # machine-specific — adjust when running on different hardware.
    ag_args_fit={'num_cpus': 10},
)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.3.1
Python Version:     3.11.11
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.5.0: Tue Apr 22 19:54:43 PDT 2025; root:xnu-11417.121.6~2/RELEASE_ARM64_T8132
CPU Count:          10
Memory Avail:       20.65 GB / 32.00 GB (64.5%)
Disk Space Avail:   116.98 GB / 926.35 GB (12.6%)
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overfitting.
	Running DyStack for up to 300s of the 1200s of remaining time (25%).
	Running DyStack sub-fit in a ray process to avoi

In [4]:
# Score every fitted model on the held-out validation split and report
# F1, ROC AUC and accuracy alongside the default leaderboard columns.
predictor.leaderboard(
    data=val,
    extra_metrics=['f1', 'roc_auc', 'accuracy'],
)

Unnamed: 0,model,score_test,f1,roc_auc,accuracy,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,NeuralNetFastAI_r102_BAG_L1,0.938559,0.938559,0.965194,0.968691,0.94016,f1,0.094344,0.047871,23.410725,0.094344,0.047871,23.410725,1,True,24
1,NeuralNetTorch_r30_BAG_L1,0.938559,0.938559,0.9646,0.968691,0.940235,f1,0.164645,0.070922,73.91948,0.164645,0.070922,73.91948,1,True,30
2,NeuralNetFastAI_r191_BAG_L1,0.938559,0.938559,0.965574,0.968691,0.940754,f1,0.296213,0.100539,76.086188,0.296213,0.100539,76.086188,1,True,17
3,NeuralNetFastAI_BAG_L1,0.938559,0.938559,0.968143,0.968691,0.940754,f1,0.317661,0.079839,37.639325,0.317661,0.079839,37.639325,1,True,10
4,WeightedEnsemble_L2,0.938559,0.938559,0.968143,0.968691,0.940754,f1,0.31847,0.082048,38.436004,0.000809,0.002209,0.796679,2,True,35
5,RandomForestGini_BAG_L1,0.938494,0.938494,0.963232,0.968691,0.937163,f1,0.053875,0.154715,0.39124,0.053875,0.154715,0.39124,1,True,5
6,RandomForestEntr_BAG_L1,0.938494,0.938494,0.963347,0.968691,0.93751,f1,0.056112,0.155004,0.339966,0.056112,0.155004,0.339966,1,True,6
7,CatBoost_r177_BAG_L1,0.937997,0.937997,0.965845,0.968421,0.939389,f1,0.009529,0.007858,12.367815,0.009529,0.007858,12.367815,1,True,14
8,CatBoost_r13_BAG_L1,0.937997,0.937997,0.966252,0.968421,0.938929,f1,0.015597,0.010357,19.314137,0.015597,0.010357,19.314137,1,True,25
9,CatBoost_BAG_L1,0.937997,0.937997,0.966523,0.968421,0.939082,f1,0.036533,0.009837,21.368592,0.036533,0.009837,21.368592,1,True,7


In [5]:
# Load the test rows from disk and predict a class label for each of them;
# the trailing bare name displays the resulting Series.
test = TabularDataset('data/test.csv')
test_predictions = predictor.predict(data=test)
test_predictions

Loaded data from: data/test.csv | Columns = 8 / 8 | Rows = 6175 -> 6175


0       Extrovert
1       Introvert
2       Extrovert
3       Extrovert
4       Introvert
          ...    
6170    Extrovert
6171    Introvert
6172    Extrovert
6173    Extrovert
6174    Introvert
Name: Personality, Length: 6175, dtype: object

In [6]:
# Build the submission file from the provided sample template.
sub = pd.read_csv('data/sample_submission.csv')

# Assigning a Series to a DataFrame column aligns on index labels, so a
# template whose length or index differs from the predictions would silently
# produce NaN entries or drop predictions. Fail loudly on a length mismatch
# and assign positionally instead.
# NOTE(review): this assumes the template rows are in the same order as
# data/test.csv — TODO confirm.
if len(sub) != len(test_predictions):
    raise ValueError(
        f"sample_submission.csv has {len(sub)} rows but "
        f"{len(test_predictions)} predictions were produced"
    )
sub['Personality'] = test_predictions.to_numpy()
sub.to_csv('submission.csv', index=False)