In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

/kaggle/input/playground-series-s5e7/sample_submission.csv
/kaggle/input/playground-series-s5e7/train.csv
/kaggle/input/playground-series-s5e7/test.csv


In [2]:
# Read the three competition CSV files in one pass: training data, test data,
# and the sample submission file.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s5e7/train.csv',
        '/kaggle/input/playground-series-s5e7/test.csv',
        '/kaggle/input/playground-series-s5e7/sample_submission.csv',
    )
)


In [3]:
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.pipeline import Pipeline


In [4]:
# Gradient-boosted trees (LightGBM) with a fixed seed.
lgbm = LGBMClassifier(n_estimators=300, learning_rate=0.03, max_depth=8, random_state=42)

# Random forest with a fixed seed.
rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)

# Soft-voting ensemble: the two models' predicted probabilities are averaged.
ensemble = VotingClassifier(
    voting='soft',
    estimators=[
        ('rf', rf),
        ('lgbm', lgbm),
    ],
)


In [5]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Replace the target column with integer codes
# (Extrovert = 0, Introvert = 1).
le = LabelEncoder()
le.fit(train['Personality'])
train['Personality'] = le.transform(train['Personality'])

In [6]:
# Collect the names of the object-dtype (non-numeric) columns in the training
# frame and print them.
object_frame = train.select_dtypes(include='object')
non_numeric_cols = object_frame.columns
print("Kategorik sütunlar:", list(non_numeric_cols))

Kategorik sütunlar: ['Stage_fear', 'Drained_after_socializing']


In [7]:
from sklearn.preprocessing import LabelEncoder

# Encode the two categorical feature columns as numeric codes, in place, on
# both `train` and `test`.
#
# Missing values are kept as NaN instead of being encoded: casting the whole
# column with `.astype(str)` would turn them into the literal string 'nan',
# which the encoder would then treat as one more category, so the imputation
# steps later in the notebook would never see those entries as missing.
#
# The encoder is fitted on the non-missing training values only and then
# applied to both frames; a test value that never occurs in training still
# raises ValueError from `transform`.
#
# A separate name (`feature_le`) is used so that `le`, the encoder fitted on
# the target column, is not overwritten by this loop.
for col in ['Stage_fear', 'Drained_after_socializing']:
    feature_le = LabelEncoder()
    feature_le.fit(train[col].dropna().astype(str))

    for frame in (train, test):
        present = frame[col].notna()
        # Start from an all-NaN float column and fill in codes only where a
        # value is actually present.
        codes = pd.Series(np.nan, index=frame.index, dtype='float64')
        codes.loc[present] = feature_le.transform(frame.loc[present, col].astype(str))
        frame[col] = codes

In [8]:
from sklearn.impute import SimpleImputer

# Build the feature matrices and target, filling missing feature values with
# the per-column mean learned from the training data.
# NOTE(review): X, X_test and y are rebuilt from `train`/`test` by later cells,
# so these results appear to be superseded — confirm whether this cell is
# still needed.
y = train['Personality']
imputer = SimpleImputer(strategy='mean')

X = train.drop(['id', 'Personality'], axis=1)
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

X_test = test.drop('id', axis=1)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

In [9]:
# Rebuild the feature matrices from the raw frames and drop every training
# row that has a missing feature value; the target is restricted to the rows
# that remain. Only training rows are dropped — X_test is left as-is.
# NOTE(review): X, X_test and y are rebuilt again from `train`/`test` in a
# later cell — confirm whether this cell is still needed.
X_test = test.drop('id', axis=1)
X = train.drop(['id', 'Personality'], axis=1).dropna()
y = train['Personality'].loc[X.index]

In [10]:
# Standardize the features: the scaler is fitted on the training features
# only and then applied to both the training and test features.
# NOTE(review): scaler, X_scaled and X_test_scaled are assigned again in a
# later cell — confirm whether this cell is still needed.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Final preprocessing used for model fitting:
#   1. split the raw frames into features and target,
#   2. mean-impute missing feature values (statistics learned on train only),
#   3. standardize the imputed features (statistics learned on train only).
X = train.drop(['id', 'Personality'], axis=1)
y = train['Personality']
X_test = test.drop('id', axis=1)

imputer = SimpleImputer(strategy='mean').fit(X)
X_imputed = imputer.transform(X)
X_test_imputed = imputer.transform(X_test)

scaler = StandardScaler().fit(X_imputed)
X_scaled = scaler.transform(X_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

In [12]:
# Fit the voting ensemble on the scaled training features, then predict a
# class label for every row of the scaled test features.
preds = ensemble.fit(X_scaled, y).predict(X_test_scaled)

[LightGBM] [Info] Number of positive: 4825, number of negative: 13699
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001618 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 73
[LightGBM] [Info] Number of data points in the train set: 18524, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260473 -> initscore=-1.043512
[LightGBM] [Info] Start training from score -1.043512


In [13]:
# Translate the integer predictions back to the class names, mirroring the
# encoding applied to the target earlier (Extrovert = 0, Introvert = 1).
# A prediction outside the mapping raises KeyError.
label_map = dict(enumerate(['Extrovert', 'Introvert']))
preds_text = list(map(label_map.__getitem__, preds))


In [14]:
# Assemble the submission table (test ids plus predicted class names) and
# write it to submission.csv without the index column.
submission = test[['id']].assign(Personality=preds_text)
submission.to_csv('submission.csv', index=False)
