# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)

# Data

In [2]:
train = pd.read_csv('/kaggle/input/playground-series-s5e7/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e7/test.csv')
original = pd.read_csv('/kaggle/input/extrovert-vs-introvert-behavior-data/personality_datasert.csv')

sample_submission = pd.read_csv('/kaggle/input/playground-series-s5e7/sample_submission.csv')

In [3]:
original_copy = original.copy()
for k in range(7):
    original = pd.concat([original,original_copy],axis=0)

In [4]:
# Separate features and target
X_train = train.drop(['id', 'Personality'], axis=1, errors='ignore')
y_train = train['Personality']
X_test = test.drop(['id'], axis=1, errors='ignore')

X_original = original.drop(['id', 'Personality'], axis=1, errors='ignore')
y_original = original['Personality']

print(f"Features shape: {X_train.shape}")
print(f"Target shape: {y_train.shape if y_train is not None else 'None'}")

# Encode categorical variables
label_encoders = {}
categorical_columns = X_train.select_dtypes(include=['object']).columns

for col in categorical_columns:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))
    X_original[col] = le.transform(X_original[col].astype(str))
    label_encoders[col] = le
    print(f"Encoded {col}: {le.classes_}")

# Encode target variable
target_encoder = LabelEncoder()
y_train_encoded = target_encoder.fit_transform(y_train)
y_original_encoded = target_encoder.fit_transform(y_original)
print(f"Target classes: {target_encoder.classes_}")

# Feature scaling (optional for XGBoost, but can help)
scaler = StandardScaler()
feature_names = X_train.columns.tolist()

print(f"Feature columns: {feature_names}")

Features shape: (18524, 7)
Target shape: (18524,)
Encoded Stage_fear: ['No' 'Yes' 'nan']
Encoded Drained_after_socializing: ['No' 'Yes' 'nan']
Target classes: ['Extrovert' 'Introvert']
Feature columns: ['Time_spent_Alone', 'Stage_fear', 'Social_event_attendance', 'Going_outside', 'Drained_after_socializing', 'Friends_circle_size', 'Post_frequency']


# Training

In [5]:
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_leaves': 25,
    'min_child_weight': np.float64(0.003440906647223279),
    'learning_rate': np.float64(0.09470087254583547),
    'n_estimators': 10000,
    'subsample': np.float64(0.8025291728808135),
    'colsample_bylevel': np.float64(0.8360122952647302),
    'colsample_bytree': np.float64(0.87329448975438),
    'reg_alpha': np.float64(0.002926163798802797),
    'reg_lambda': np.float64(27.126259438996986),
    'random_state': 42,
    'tree_method': 'hist',
    'device': "cuda"
}

In [6]:
y_train_encoded

array([0, 0, 1, ..., 1, 1, 0])

In [7]:
xgb_model = xgb.XGBClassifier(**xgb_params)

# Stratified K-Fold Cross Validation with Early Stopping
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

print(f"Performing {n_splits}-fold Stratified Cross Validation with Early Stopping...")

# Custom cross-validation with early stopping
cv_scores = []
fold_num = 1

for train_idx, val_idx in skf.split(X_train, y_train_encoded):
    print(f"\nTraining Fold {fold_num}/{n_splits}...")
    
    # Split data
    X_fold_train, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_fold_train, y_fold_val = y_train_encoded[train_idx], y_train_encoded[val_idx]

    X_fold_train = pd.concat([X_fold_train,X_original], axis=0, ignore_index=True)
    y_fold_train = np.concatenate((y_fold_train,y_original_encoded))
    
    # Create model for this fold
    fold_model = xgb.XGBClassifier(**xgb_params)
    
    # Train with early stopping
    fold_model.fit(
        X_fold_train, y_fold_train,
        eval_set=[(X_fold_val, y_fold_val)],
        early_stopping_rounds=50,
        verbose=False
    )
    
    # Predict and calculate accuracy
    fold_predictions = fold_model.predict(X_fold_val)
    fold_accuracy = accuracy_score(y_fold_val, fold_predictions)
    cv_scores.append(fold_accuracy)
    
    print(f"Fold {fold_num} Accuracy: {fold_accuracy:.4f}")
    print(f"Best iteration: {fold_model.best_iteration}")
    
    fold_num += 1

cv_scores = np.array(cv_scores)

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV Score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

Performing 5-fold Stratified Cross Validation with Early Stopping...

Training Fold 1/5...
Fold 1 Accuracy: 0.9695
Best iteration: 70

Training Fold 2/5...
Fold 2 Accuracy: 0.9668
Best iteration: 58

Training Fold 3/5...
Fold 3 Accuracy: 0.9660
Best iteration: 69

Training Fold 4/5...
Fold 4 Accuracy: 0.9703
Best iteration: 124

Training Fold 5/5...
Fold 5 Accuracy: 0.9717
Best iteration: 72
Cross-validation scores: [0.96950067 0.96680162 0.9659919  0.97031039 0.97165227]
Mean CV Score: 0.9689 (+/- 0.0043)


In [8]:
best_iterations = []
fold_num = 1

print("Extracting best iterations from each CV fold...")
for train_idx, val_idx in skf.split(X_train, y_train_encoded):
    X_fold_train, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_fold_train, y_fold_val = y_train_encoded[train_idx], y_train_encoded[val_idx]
    
    # Create temporary model to find best iteration
    temp_model = xgb.XGBClassifier(**xgb_params)
    temp_model.fit(
        X_fold_train, y_fold_train,
        eval_set=[(X_fold_val, y_fold_val)],
        early_stopping_rounds=50,
        verbose=False
    )
    
    best_iterations.append(temp_model.best_iteration)
    print(f"Fold {fold_num} best iteration: {temp_model.best_iteration}")
    fold_num += 1

# Use average best iteration for final model
optimal_n_estimators = int(np.mean(best_iterations))
print(f"\nOptimal n_estimators (average): {optimal_n_estimators}")
print(f"Range: {min(best_iterations)} - {max(best_iterations)}")

# Train final model on full dataset with optimal n_estimators
print(f"\nTraining final model on full dataset with {optimal_n_estimators} estimators...")
xgb_params_final = xgb_params.copy()
xgb_params_final['n_estimators'] = optimal_n_estimators

xgb_model_final = xgb.XGBClassifier(**xgb_params_final)
xgb_model_final.fit(X_train, y_train_encoded)

print("Final model trained on 100% of training data!")

Extracting best iterations from each CV fold...
Fold 1 best iteration: 136
Fold 2 best iteration: 54
Fold 3 best iteration: 58
Fold 4 best iteration: 103
Fold 5 best iteration: 62

Optimal n_estimators (average): 82
Range: 54 - 136

Training final model on full dataset with 82 estimators...
Final model trained on 100% of training data!


# Feature importance

In [9]:
feature_importance = pd.DataFrame({
    'feature': feature_names,
    'importance': xgb_model_final.feature_importances_
}).sort_values('importance', ascending=False)

print("Top 10 Most Important Features:")
print(feature_importance.head(10))

Top 10 Most Important Features:
                     feature  importance
4  Drained_after_socializing    0.837219
1                 Stage_fear    0.123142
0           Time_spent_Alone    0.022384
2    Social_event_attendance    0.011043
3              Going_outside    0.003273
5        Friends_circle_size    0.001740
6             Post_frequency    0.001200


# Submission

In [10]:
test_predictions = xgb_model_final.predict(X_test)
test_pred_proba = xgb_model_final.predict_proba(X_test)

if y_train is not None:
    test_pred_labels = target_encoder.inverse_transform(test_predictions)
else:
    test_pred_labels = ['Introvert' if pred == 0 else 'Extrovert' for pred in test_predictions]

# Create Submission File
print("\n=== CREATING SUBMISSION FILE ===")
submission_df = pd.DataFrame({
    'id': test['id'],
    'Personality': test_pred_labels
})
submission_df.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv'")


=== CREATING SUBMISSION FILE ===
Submission file saved as 'submission.csv'
