In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e7/sample_submission.csv
/kaggle/input/playground-series-s5e7/train.csv
/kaggle/input/playground-series-s5e7/test.csv


# Dataset Preparation

In [2]:
# Load the competition training set into a DataFrame
train_data = pd.read_csv("/kaggle/input/playground-series-s5e7/train.csv")

In [3]:
# Column dtypes and non-null counts — shows which columns contain missing values
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         18524 non-null  int64  
 1   Time_spent_Alone           17334 non-null  float64
 2   Stage_fear                 16631 non-null  object 
 3   Social_event_attendance    17344 non-null  float64
 4   Going_outside              17058 non-null  float64
 5   Drained_after_socializing  17375 non-null  object 
 6   Friends_circle_size        17470 non-null  float64
 7   Post_frequency             17260 non-null  float64
 8   Personality                18524 non-null  object 
dtypes: float64(5), int64(1), object(3)
memory usage: 1.3+ MB


In [4]:
# Number of distinct values per column (missing values are not counted by default)
train_data.nunique()

id                           18524
Time_spent_Alone                12
Stage_fear                       2
Social_event_attendance         11
Going_outside                    8
Drained_after_socializing        2
Friends_circle_size             16
Post_frequency                  11
Personality                      2
dtype: int64

## Converting text data to binary data

In [5]:
# Inspect the text columns before encoding them: frequency of each label and the
# raw set of values (including NaN) for every column that will be mapped to 0/1
text_columns = ['Stage_fear', 'Drained_after_socializing', 'Personality']

for position, column in enumerate(text_columns):
    if position > 0:
        # Blank separator between the reports of consecutive columns
        print("\n")
    print(train_data[column].value_counts())
    print(train_data[column].unique())

Stage_fear
No     12609
Yes     4022
Name: count, dtype: int64
['No' 'Yes' nan]


Drained_after_socializing
No     13313
Yes     4062
Name: count, dtype: int64
['No' nan 'Yes']


Personality
Extrovert    13699
Introvert     4825
Name: count, dtype: int64
['Extrovert' 'Introvert']


In [6]:
# Convert the text labels of the training data to binary codes.
# Series.map returns NaN for every value that is not a key of the mapping, so mapping a
# column that has already been encoded would wipe it out. Columns that are already
# numeric are therefore skipped, which makes this cell safe to re-run.
label_codes = {
    'Stage_fear': {'Yes': 1, 'No': 0},
    'Drained_after_socializing': {'Yes': 1, 'No': 0},
    'Personality': {'Extrovert': 1, 'Introvert': 0},
}

for column, codes in label_codes.items():
    if not pd.api.types.is_numeric_dtype(train_data[column]):
        train_data[column] = train_data[column].map(codes)



In [7]:
# Display the frame after encoding to check the 0/1 values
train_data

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,0.0,6.0,4.0,0.0,15.0,5.0,1
1,1,1.0,0.0,7.0,3.0,0.0,10.0,8.0,1
2,2,6.0,1.0,1.0,0.0,,3.0,0.0,0
3,3,3.0,0.0,7.0,3.0,0.0,11.0,5.0,1
4,4,1.0,0.0,4.0,4.0,0.0,13.0,,1
...,...,...,...,...,...,...,...,...,...
18519,18519,3.0,0.0,7.0,3.0,0.0,9.0,7.0,1
18520,18520,1.0,,6.0,7.0,0.0,6.0,5.0,1
18521,18521,7.0,1.0,1.0,1.0,1.0,1.0,,0
18522,18522,,1.0,1.0,0.0,1.0,5.0,2.0,0


In [8]:
# Remove the row identifier so it is not used as a feature.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
train_data = train_data.drop(columns='id', errors='ignore')

# Training Dataset Pipeline

In [9]:
# Preview the feature columns: every column except the last one
train_data.iloc[:, :-1]

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
0,0.0,0.0,6.0,4.0,0.0,15.0,5.0
1,1.0,0.0,7.0,3.0,0.0,10.0,8.0
2,6.0,1.0,1.0,0.0,,3.0,0.0
3,3.0,0.0,7.0,3.0,0.0,11.0,5.0
4,1.0,0.0,4.0,4.0,0.0,13.0,
...,...,...,...,...,...,...,...
18519,3.0,0.0,7.0,3.0,0.0,9.0,7.0
18520,1.0,,6.0,7.0,0.0,6.0,5.0
18521,7.0,1.0,1.0,1.0,1.0,1.0,
18522,,1.0,1.0,0.0,1.0,5.0,2.0


In [10]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


# Features: every column except the target. The target is removed by name so the
# selection does not depend on 'Personality' being the last column of the frame.
X = train_data.drop(columns='Personality')

# Target labels (1 = Extrovert, 0 = Introvert after the encoding step)
y = train_data['Personality'].copy()

# Split into 80% train, 20% validation
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,      # 20% for validation
    random_state=42,    # For reproducibility
    stratify=y          # Maintains class distribution (for classification)
)


# AutoML Approach using the Optuna Library

In [11]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, make_scorer, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier, XGBRFClassifier
import optuna
import numpy as np
from time import time

# AutoML search class built on Optuna
class AutoMLOptuna:
    """
    Optuna-driven model search for binary classification.

    Every trial picks one classifier family together with its hyperparameters, wraps it
    in a preprocessing pipeline (SimpleImputer + StandardScaler) and is scored by the
    mean 5-fold cross-validated accuracy on the training data.

    Supported model names: 'LogisticRegression', 'NaiveBayes', 'DecisionTree',
    'RandomForest', 'SVM', 'SVC-Bagging', 'KNN-Bagging', 'KNN', 'GBM', 'MLP',
    'NeuralNetwork', 'XGBoost', 'AdaBoost', 'XGB-RF'.
    """
    def __init__(self, X_train, X_test, y_train, y_test, numeric_features=None, categorical_features=None, models=None):
        """
        X_train, X_test: Input training and test data
        y_train, y_test: Input binary classification labels (0 and 1)
        numeric_features: List of numeric column names
        categorical_features: List of categorical column names
        models: List of specific models to use (optional)
        """
        self.X_train, self.X_test, self.y_train, self.y_test = X_train, X_test, y_train, y_test
        self.numeric_features = numeric_features
        self.categorical_features = categorical_features
        self.models = models

    def create_model(self, trial, models=None):
        """
        Build the unfitted preprocessing + classifier pipeline described by a trial.

        trial: Optuna trial (or a finished trial such as study.best_trial) that supplies
               the model type and its hyperparameters
        models: optional list of model names to choose from for this call; when omitted,
                self.models is used (and set to the default list if it is None)

        Returns a sklearn Pipeline ending in a step named 'model'.
        Raises optuna.TrialPruned when the trial reports that it should be pruned, and
        ValueError when the selected model name is not supported.

        Hyperparameter names whose categorical choices differ between model families
        (the AdaBoost SVM kernel, the MLP activation and layer size) carry a model-specific
        prefix: Optuna rejects a parameter name that is reused with different categorical
        choices or a different distribution kind within one study.
        """
        # Preprocessing steps placed in front of the classifier
        if self.numeric_features is not None and self.categorical_features is not None:
            numeric_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),  # Handle missing values
                ('scaler', StandardScaler())  # Standard scaling only
            ])
            categorical_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='most_frequent'))  # Handle missing categorical values
            ])
            transformer = ColumnTransformer([
                ('numeric', numeric_transformer, self.numeric_features),
                ('categorical', categorical_transformer, self.categorical_features)
            ], remainder='passthrough')
            preprocessing_steps = [('transformer', transformer)]
        else:
            # If features not specified, treat every column as numeric
            preprocessing_steps = [
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())
            ]

        # Candidate model names: explicit argument first, then the instance setting
        if models is not None:
            candidates = models
        else:
            if self.models is None:
                self.models = ['LogisticRegression', 'NaiveBayes', 'DecisionTree', 'RandomForest',
                               'SVM', 'KNN', 'GBM', 'XGBoost', 'NeuralNetwork', 'AdaBoost', 'XGB-RF']
            candidates = self.models

        # Select classifier
        model_type = trial.suggest_categorical('model_type', candidates)

        if model_type == 'LogisticRegression':
            penalty = trial.suggest_categorical('penalty', ['l2', 'l1'])
            # lbfgs does not support the l1 penalty, saga does
            solver = 'saga' if penalty == 'l1' else 'lbfgs'
            regularization = trial.suggest_float('Logistic-regularization', 0.01, 500, log=True)
            model = LogisticRegression(penalty=penalty, C=regularization, solver=solver, random_state=42)

        elif model_type == 'NaiveBayes':
            model = GaussianNB()

        elif model_type == 'DecisionTree':
            max_depth = trial.suggest_int('max_depth', 1, 32)
            # Floats are interpreted by sklearn as fractions of the number of samples
            min_samples_split = trial.suggest_float('min_samples_split', 0.1, 1.0)
            min_samples_leaf = trial.suggest_float('min_samples_leaf', 0.1, 0.5)
            model = DecisionTreeClassifier(max_depth=max_depth,
                                           min_samples_split=min_samples_split,
                                           min_samples_leaf=min_samples_leaf,
                                           random_state=42)

        elif model_type == 'RandomForest':
            n_estimators = trial.suggest_int('n_estimators', 50, 1000)
            max_depth = trial.suggest_int('max_depth', 1, 10)
            model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                           n_jobs=-1, random_state=42)

        elif model_type == 'SVM':
            C = trial.suggest_float('C', 1e-4, 1e4, log=True)
            gamma = trial.suggest_float('gamma', 1e-4, 1e4, log=True)
            kernel = trial.suggest_categorical('kernel', ['linear', 'rbf'])
            degree = trial.suggest_int('degree', 1, 5)
            coef0 = trial.suggest_float('coef0', -1.0, 1.0)
            model = SVC(C=C, gamma=gamma, kernel=kernel, degree=degree, coef0=coef0, random_state=42)

        elif model_type == 'SVC-Bagging':
            bagging_fraction = trial.suggest_float('bagging_fraction', 0.1, 1.0)
            estimators = trial.suggest_int('n_estimators', 1, 20)
            model = BaggingClassifier(estimator=SVC(random_state=42), n_estimators=estimators,
                                      max_samples=bagging_fraction, random_state=42)

        elif model_type == 'KNN-Bagging':
            bagging_fraction = trial.suggest_float('bagging_fraction', 0.1, 1.0)
            estimators = trial.suggest_int('n_estimators', 1, 20)
            model = BaggingClassifier(estimator=KNeighborsClassifier(), n_estimators=estimators,
                                      max_samples=bagging_fraction, random_state=42)

        elif model_type == 'KNN':
            n_neighbors = trial.suggest_int('n_neighbors', 1, 20)
            weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
            p = trial.suggest_categorical('p', [1, 2])
            leaf_size = trial.suggest_int('leaf_size', 10, 50)
            algorithm = trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute'])
            model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, p=p,
                                         leaf_size=leaf_size, algorithm=algorithm)

        elif model_type == 'GBM':
            learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True)
            n_estimators = trial.suggest_int('n_estimators', 50, 1000)
            max_depth = trial.suggest_int('max_depth', 1, 20)
            model = GradientBoostingClassifier(learning_rate=learning_rate, n_estimators=n_estimators,
                                               max_depth=max_depth, random_state=42)

        elif model_type == 'MLP':
            # 'mlp_' prefixes: 'NeuralNetwork' uses 'hidden_layer_sizes' as a categorical
            # and 'activation' with a different set of choices
            hidden_layer_sizes = trial.suggest_int('mlp_hidden_layer_sizes', 1, 10, step=1)
            activation = trial.suggest_categorical('mlp_activation', ['relu', 'tanh', 'logistic'])
            solver = trial.suggest_categorical('solver', ['adam', 'sgd'])
            alpha = trial.suggest_float('alpha', 1e-5, 1e-2, log=True)
            model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation,
                                  solver=solver, alpha=alpha, random_state=42)

        elif model_type == 'NeuralNetwork':
            layer_layouts = {'1x100': (100,), '2x100': (100, 100)}
            layout_name = trial.suggest_categorical('hidden_layer_sizes', ['1x100', '2x100'])
            hidden_layer_sizes = layer_layouts[layout_name]
            activation = trial.suggest_categorical('activation', ['relu', 'tanh'])
            solver = trial.suggest_categorical('solver', ['adam', 'sgd'])
            model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation,
                                  solver=solver, random_state=42)

        elif model_type == 'XGBoost':
            learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True)
            n_estimators = trial.suggest_int('n_estimators', 50, 1000)
            max_depth = trial.suggest_int('max_depth', 1, 10)
            model = XGBClassifier(learning_rate=learning_rate, n_estimators=n_estimators,
                                  max_depth=max_depth, n_jobs=-1, random_state=42)

        elif model_type == 'AdaBoost':
            n_estimators = trial.suggest_int('n_estimators', 50, 1000)
            learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True)
            base_estimator = trial.suggest_categorical('base_estimator', ['decision_tree', 'svm', 'random_forest'])

            if base_estimator == 'decision_tree':
                max_depth = trial.suggest_int('max_depth', 1, 10)
                base_model = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
            elif base_estimator == 'svm':
                C = trial.suggest_float('C', 1e-4, 1e+3, log=True)
                # Prefixed name: the 'SVM' branch uses 'kernel' with a different choice list
                kernel = trial.suggest_categorical('adaboost_svm_kernel', ['linear', 'poly', 'rbf'])
                # NOTE(review): scikit-learn releases that still default to the SAMME.R
                # algorithm need predict_proba from the base estimator, which SVC only
                # provides with probability=True — confirm against the installed version.
                base_model = SVC(C=C, kernel=kernel, random_state=42)
            else:
                n_estimators_rf = trial.suggest_int('n_estimators_rf', 50, 500)
                max_depth_rf = trial.suggest_int('max_depth_rf', 1, 10)
                base_model = RandomForestClassifier(n_estimators=n_estimators_rf, max_depth=max_depth_rf,
                                                    random_state=42)

            # 'estimator' is the keyword already used for BaggingClassifier in this class;
            # the older 'base_estimator' keyword is no longer accepted by current scikit-learn
            model = AdaBoostClassifier(estimator=base_model, n_estimators=n_estimators,
                                       learning_rate=learning_rate, random_state=42)

        elif model_type == 'XGB-RF':
            params = {
                'objective': 'binary:logistic',
                'eval_metric': 'logloss',
                'booster': 'gbtree',
                'verbosity': 0,
                'n_jobs': -1,
                'random_state': 42,
                'eta': trial.suggest_float('eta', 0.001, 0.1, log=True),
                'max_depth': trial.suggest_int('max_depth', 3, 10),
                'subsample': trial.suggest_float('subsample', 0.5, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
                'lambda': trial.suggest_float('lambda', 0.001, 10.0),
                'alpha': trial.suggest_float('alpha', 0.001, 10.0, log=True),
                'gamma': trial.suggest_float('gamma', 0.001, 10.0, log=True),
                'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            }
            model = XGBRFClassifier(**params)

        else:
            # Fail with a clear message instead of an unbound-variable error further down
            raise ValueError("Unknown model type: %r" % (model_type,))

        if trial.should_prune():
            raise optuna.TrialPruned()

        # Create pipeline with preprocessing and model
        return Pipeline(preprocessing_steps + [('model', model)])

    def objective(self, trial):
        """
        Objective function for Optuna: mean 5-fold cross-validation accuracy
        trial: Optuna trial object used to sample the model and its hyperparameters
        """
        model = self.create_model(trial)
        scorer = make_scorer(accuracy_score)
        scores = cross_val_score(model, self.X_train, self.y_train, cv=5, scoring=scorer)
        return np.mean(scores)

    def AutoML_test(self, n_trials=50):
        """
        AutoML test function to find the best optimized model
        n_trials: number of optimization trials (default: 50)

        Prints the best parameters, the cross-validation score of the best model, its
        accuracy / precision / recall / F1 on the test data and the total runtime.
        Returns (best_model, best_params): the pipeline fitted on the training data and
        the parameter dict of the best trial.
        """
        start = time()
        study = optuna.create_study(direction='maximize')
        study.optimize(self.objective, n_trials=n_trials)
        best_params = study.best_params
        print("\n\nBest parameters: ", best_params)

        # Rebuild the winning pipeline from the stored trial and fit it on the training data
        best_model = self.create_model(study.best_trial)
        best_model.fit(self.X_train, self.y_train)

        scores = cross_val_score(best_model, self.X_train, self.y_train, cv=5)
        print("Best cross-validation score: %.5f" % np.mean(scores))

        y_pred = best_model.predict(self.X_test)
        print("Accuracy on test data: %.5f" % accuracy_score(self.y_test, y_pred))
        print("Precision: %.5f" % precision_score(self.y_test, y_pred))
        print("Recall: %.5f" % recall_score(self.y_test, y_pred))
        print("F1-score: %.5f" % f1_score(self.y_test, y_pred))

        total_time = time() - start
        print("Runtime: %.4f sec" % total_time)

        return best_model, best_params


In [12]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Hyperparameters of the KNN classifier evaluated on the hold-out split
params = {'n_neighbors': 8,
          'weights': 'uniform',
          'p': 1,
          'leaf_size': 10,
          'algorithm': 'auto'}

# knn model
model = KNeighborsClassifier(**params)

# pipeline: median imputation -> standard scaling -> KNN
pipeline = make_pipeline(SimpleImputer(strategy='median'), StandardScaler(), model)

# Fit the knn on the training split
pipeline.fit(X_train, y_train)

# Predict the labels for the validation split
predictions = pipeline.predict(X_val)

# Calculate the evaluation metrics
accuracy = accuracy_score(y_val, predictions)
recall = recall_score(y_val, predictions)

print(f"accuracy: {accuracy}")
print(f"recall: {recall}")

accuracy: 0.9651821862348178
recall: 0.9795620437956204


# Test Submission

In [13]:
# Read the competition test set
test_data = pd.read_csv("/kaggle/input/playground-series-s5e7/test.csv")

# Encode the two Yes/No columns as 1/0
yes_no_codes = {'Yes': 1, 'No': 0}
for column in ('Stage_fear', 'Drained_after_socializing'):
    test_data[column] = test_data[column].map(yes_no_codes)

# Feature matrix for prediction: everything except the identifier column
X_test = test_data.drop(columns='id')

In [14]:
# Display the test features that are passed to the pipeline for prediction
X_test

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
0,3.0,0.0,7.0,4.0,0.0,6.0,
1,,1.0,0.0,0.0,1.0,5.0,1.0
2,3.0,0.0,5.0,6.0,0.0,15.0,9.0
3,3.0,0.0,4.0,4.0,0.0,5.0,6.0
4,9.0,1.0,1.0,2.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...
6170,3.0,0.0,5.0,5.0,0.0,9.0,6.0
6171,8.0,1.0,2.0,1.0,1.0,0.0,0.0
6172,2.0,0.0,4.0,3.0,0.0,9.0,7.0
6173,3.0,0.0,4.0,4.0,0.0,11.0,9.0


In [15]:
# KNN hyperparameters (same values as in the validation run)
params = {'n_neighbors': 8,
          'weights': 'uniform',
          'p': 1,
          'leaf_size': 10,
          'algorithm': 'auto'}

# knn model
model = KNeighborsClassifier(**params)

# pipeline: median imputation -> standard scaling -> KNN
pipeline = make_pipeline(SimpleImputer(strategy='median'), StandardScaler(), model)

# Fit the knn on the training data
# NOTE(review): X_train is the 80% split only; refitting on all labelled rows (X, y)
# before predicting the competition test set may be worth considering — confirm intent.
pipeline.fit(X_train, y_train)

# Predict the labels for the test data
predictions = pipeline.predict(X_test)

In [16]:
# Numeric class -> label used in the submission file
label_map = {0: "Introvert", 1: "Extrovert"}

# Identifier of every test row (first column of test_data)
ids = test_data.iloc[:, 0]

# Translate each predicted class back to its text label
mapped_predictions = [label_map.get(label) for label in predictions]

# Two-column submission frame: id and predicted personality
result_df = pd.DataFrame({'id': ids, 'Personality': mapped_predictions})

print(result_df.head())

      id Personality
0  18524   Extrovert
1  18525   Introvert
2  18526   Extrovert
3  18527   Extrovert
4  18528   Introvert


In [17]:
# Write the submission to a CSV file in the working directory, without the index column
result_df.to_csv('submission_v1.csv', index=False)