In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/playground-series-s5e7/sample_submission.csv
/kaggle/input/playground-series-s5e7/train.csv
/kaggle/input/playground-series-s5e7/test.csv


# Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# DATA

In [3]:
train = pd.read_csv('/kaggle/input/playground-series-s5e7/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e7/test.csv')
sample_submission = pd.read_csv('/kaggle/input/playground-series-s5e7/sample_submission.csv')


In [4]:
print("Train shape:", train.shape)
print("Test shape:", test.shape)

print("\nTrain Head:")
print(train.head())

print("\nTarget Distribution:")
print(train['Personality'].value_counts(normalize=True))


Train shape: (18524, 9)
Test shape: (6175, 8)

Train Head:
   id  Time_spent_Alone Stage_fear  Social_event_attendance  Going_outside  \
0   0               0.0         No                      6.0            4.0   
1   1               1.0         No                      7.0            3.0   
2   2               6.0        Yes                      1.0            0.0   
3   3               3.0         No                      7.0            3.0   
4   4               1.0         No                      4.0            4.0   

  Drained_after_socializing  Friends_circle_size  Post_frequency Personality  
0                        No                 15.0             5.0   Extrovert  
1                        No                 10.0             8.0   Extrovert  
2                       NaN                  3.0             0.0   Introvert  
3                        No                 11.0             5.0   Extrovert  
4                        No                 13.0             NaN   Extrovert 

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


# Label Encoding

In [5]:
le = LabelEncoder()
train['Personality'] = le.fit_transform(train['Personality'])  # Extrovert = 0, Introvert = 1


# Model 

In [6]:
non_numeric_cols = train.select_dtypes(include='object').columns
print("Kategorik sütunlar:", non_numeric_cols.tolist())

Kategorik sütunlar: ['Stage_fear', 'Drained_after_socializing']


In [7]:
from sklearn.preprocessing import LabelEncoder

for col in ['Stage_fear', 'Drained_after_socializing']:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col].astype(str))
    test[col] = le.transform(test[col].astype(str))

In [8]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')  
X = train.drop(columns=['id', 'Personality'])
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

X_test = test.drop(columns=['id'])
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

y = train['Personality']


In [9]:
X = train.drop(columns=['id', 'Personality'])
X_test = test.drop(columns=['id'])
y = train['Personality']

X = X.dropna()
y = y[X.index]  


In [10]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

model = RandomForestClassifier(random_state=42)
scores = cross_val_score(model, X_scaled, y, cv=5, scoring='accuracy')
print("Mean CV Accuracy:", scores.mean())


Mean CV Accuracy: 0.9648507755215562


In [11]:
y = train['Personality']
X = train.drop(columns=['id', 'Personality'])
X_test = test.drop(columns=['id'])
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)
X_test_imputed = imputer.transform(X_test)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

In [12]:
model.fit(X_scaled, y)
preds = model.predict(X_test_scaled)

In [13]:
label_map = {0: 'Extrovert', 1: 'Introvert'}
preds_text = [label_map[p] for p in preds]


In [14]:
submission = pd.DataFrame({
    'id': test['id'],
    'Personality': preds_text
})
submission.to_csv('submission.csv', index=False)