In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/clear-data/train.csv
/kaggle/input/clear-data/test.csv
/kaggle/input/playground-series-s5e7/sample_submission.csv
/kaggle/input/playground-series-s5e7/train.csv
/kaggle/input/playground-series-s5e7/test.csv


In [2]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.simplefilter("ignore", UserWarning)

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [4]:
train=pd.read_csv("/kaggle/input/clear-data/train.csv")
test=pd.read_csv("/kaggle/input/clear-data/test.csv")

In [5]:
numerical_features = train.select_dtypes(include=['number']).columns
categorical_cols = train.select_dtypes(exclude=['number']).columns

train[numerical_features] = train[numerical_features].fillna(train[numerical_features].mean())

for col in categorical_cols:
    if train[col].isnull().any():
        train[col] = train[col].fillna(train[col].mode()[0])

In [6]:
numerical_features = test.select_dtypes(include=['number']).columns
categorical_cols = test.select_dtypes(exclude=['number']).columns

test[numerical_features] = test[numerical_features].fillna(test[numerical_features].mean())

for col in categorical_cols:
    if test[col].isnull().any():
        test[col] = test[col].fillna(test[col].mode()[0])

In [7]:
for feature in ['Stage_fear','Drained_after_socializing']:
    train[feature]=le.fit_transform(train[feature])
    test[feature]=le.fit_transform(test[feature])

In [8]:
train.head()

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,0,6.0,4.0,0,15.0,5.0,Extrovert
1,1,1.0,0,7.0,3.0,0,10.0,8.0,Extrovert
2,2,6.0,1,1.0,0.0,1,3.0,0.0,Introvert
3,3,3.0,0,7.0,3.0,0,11.0,5.0,Extrovert
4,4,1.0,0,4.0,4.0,0,13.0,7.0,Extrovert


In [9]:
train['Time_spent_Alone']=train['Time_spent_Alone'].astype(int)
test['Time_spent_Alone']=test['Time_spent_Alone'].astype(int)

In [10]:
# Social Engagement Score (interaction term)
train['Social_score'] = (train['Social_event_attendance'] + train['Going_outside'] + train['Friends_circle_size'])
# Introvert-Tendency Proxy
train['Introvert_score'] = (train['Time_spent_Alone'] - train['Social_score'])
train['Inp']=train['Introvert_score']-train['Post_frequency']
train['set']=train['Social_event_attendance']-train['Time_spent_Alone']
train['In_ex']=train['Stage_fear']+train['Drained_after_socializing']

# Social Engagement Score (interaction term)
test['Social_score'] = (test['Social_event_attendance'] + test['Going_outside'] + test['Friends_circle_size'])
# Introvert-Tendency Proxy
test['Introvert_score'] = (test['Time_spent_Alone'] - test['Social_score'])
test['Inp']=test['Introvert_score']-test['Post_frequency']
test['set']=test['Social_event_attendance']-test['Time_spent_Alone']
test['In_ex']=test['Stage_fear']+test['Drained_after_socializing']

In [11]:
train.head()

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality,Social_score,Introvert_score,Inp,set,In_ex
0,0,0,0,6.0,4.0,0,15.0,5.0,Extrovert,25.0,-25.0,-30.0,6.0,0
1,1,1,0,7.0,3.0,0,10.0,8.0,Extrovert,20.0,-19.0,-27.0,6.0,0
2,2,6,1,1.0,0.0,1,3.0,0.0,Introvert,4.0,2.0,2.0,-5.0,2
3,3,3,0,7.0,3.0,0,11.0,5.0,Extrovert,21.0,-18.0,-23.0,4.0,0
4,4,1,0,4.0,4.0,0,13.0,7.0,Extrovert,21.0,-20.0,-27.0,3.0,0


In [12]:
def team(a):
    if a<-1:
        return 0
    else:
        return 1
train['team']=train['set'].apply(team)
test['team']=test['set'].apply(team)
        

In [13]:
X=train.drop(['Personality'],axis=1)

y_t=train['Personality']

In [14]:
y=le.fit_transform(y_t)
y

array([0, 0, 1, ..., 1, 1, 0])

In [15]:
scaler = StandardScaler()
X_scale = scaler.fit_transform(X)

x_scale = scaler.transform(test)

In [16]:




# Define objective function for Optuna
def objective(trial):
    # Hyperparameters to tune
    n_estimators = trial.suggest_int('n_estimators', 50, 600)
    max_depth = trial.suggest_int('max_depth', 2, 62)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    max_features = trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2'])

    # Create model
    clf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42
    )

    # Cross-validation
    score = cross_val_score(clf, X_scale, y, cv=5, scoring='accuracy').mean()
    return score

# Create Optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Show best result
print("Best trial:")
trial = study.best_trial

print(f"  Accuracy: {trial.value}")
print("  Best hyperparameters: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")


[I 2025-07-15 11:05:01,428] A new study created in memory with name: no-name-90ffb76c-8ff6-4596-a0f9-559e457af623
[I 2025-07-15 11:05:03,782] Trial 0 finished with value: 0.9690134748734271 and parameters: {'n_estimators': 65, 'max_depth': 4, 'min_samples_split': 15, 'min_samples_leaf': 13, 'max_features': 'log2'}. Best is trial 0 with value: 0.9690134748734271.
[I 2025-07-15 11:05:46,302] Trial 1 finished with value: 0.968905483512736 and parameters: {'n_estimators': 557, 'max_depth': 38, 'min_samples_split': 11, 'min_samples_leaf': 7, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.9690134748734271.
[I 2025-07-15 11:06:29,330] Trial 2 finished with value: 0.9690134894471599 and parameters: {'n_estimators': 512, 'max_depth': 57, 'min_samples_split': 15, 'min_samples_leaf': 1, 'max_features': 'auto'}. Best is trial 2 with value: 0.9690134894471599.
[I 2025-07-15 11:06:47,360] Trial 3 finished with value: 0.968905483512736 and parameters: {'n_estimators': 516, 'max_depth': 4, 'mi

Best trial:
  Accuracy: 0.9691754327669981
  Best hyperparameters: 
    n_estimators: 280
    max_depth: 50
    min_samples_split: 13
    min_samples_leaf: 20
    max_features: sqrt


In [17]:
model=RandomForestClassifier(n_estimators=126,
    max_depth=11,
    min_samples_split=15,
    min_samples_leaf=16,
    max_features='log2')

In [18]:
model.fit(X_scale,y)

In [19]:
pred=model.predict(x_scale)

In [20]:
predictions=le.inverse_transform(pred)

In [21]:
output = pd.DataFrame({'id': test.id, 'Personalities': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [22]:
sub=pd.read_csv("submission.csv")
sub

Unnamed: 0,id,Personalities
0,18524,Extrovert
1,18525,Introvert
2,18526,Extrovert
3,18527,Extrovert
4,18528,Introvert
...,...,...
6170,24694,Extrovert
6171,24695,Introvert
6172,24696,Extrovert
6173,24697,Extrovert
