In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.feature_selection import SelectFromModel
import warnings

# Install a blanket "ignore" filter: every Python warning raised after this
# point is suppressed for the rest of the session, including deprecation
# notices. NOTE(review): consider narrowing this to specific warning
# categories so genuine problems are not hidden.
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os


In [4]:
# Recursively enumerate the Kaggle input directory and print the full path of
# every file found, one per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# EDA

In [7]:

# Read the hackathon training CSV into a DataFrame.
print("Loading training data...")
train_csv_path = '/kaggle/input/credit-card-defaulter-hackathon/train_data.csv'
train_data = pd.read_csv(train_csv_path)
print("Training data loaded.")

# Trim leading/trailing whitespace from every header so later column lookups
# can use the clean names.
print("Removing white spaces from column names...")
stripped_column_names = train_data.columns.str.strip()
train_data.columns = stripped_column_names
print("White spaces removed.")



Loading training data...


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/credit-card-defaulter-hackathon/train_data.csv'

In [None]:
# Display summary statistics of the training DataFrame as the cell's output.
# Requires `train_data` to have been loaded by the preceding cell.
train_data.describe()

# Data preparation

In [None]:
# Collect the names of all object-typed (non-numerical) columns; the same
# Index is reused later when the test set is encoded.
print("Identifying non-numerical columns...")
object_typed_frame = train_data.select_dtypes(include=['object'])
non_numerical_cols = object_typed_frame.columns
print(f"Non-numerical columns: {non_numerical_cols}")

# One-hot encode those columns on the training data, dropping the first level
# of each so the dummy columns are not redundant.
print("Applying one-hot encoding...")
train_data = pd.get_dummies(
    train_data,
    columns=non_numerical_cols,
    drop_first=True,
)
print("One-hot encoding applied.")

# Target is the TARGET column; features are everything except ID and TARGET.
print("Defining features and target variable...")
y = train_data['TARGET']
X = train_data.drop(columns=['ID', 'TARGET'])

# Stratified 70/30 hold-out split with a fixed seed for reproducibility.
print("Splitting data into training and testing sets...")
split_options = {'test_size': 0.3, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
print("Data split completed.")



# Feature selection and modeling

In [None]:
# Fit a baseline XGBoost classifier on every feature. The positive class is
# re-weighted by the negative/positive ratio observed in the training split.
print("Training initial XGBoost model...")
negative_count = int((y_train == 0).sum())
positive_count = int((y_train == 1).sum())
xgb = XGBClassifier(
    scale_pos_weight=negative_count / positive_count,
    random_state=42,
)
xgb.fit(X_train, y_train)
print("Initial XGBoost model trained.")

# Keep only the features whose importance in the fitted model reaches the
# mean importance, and apply that same selection to both splits.
print("Applying feature selection using XGBoost...")
selector = SelectFromModel(xgb, threshold='mean', prefit=True)
X_train_selected, X_test_selected = (
    selector.transform(split_features) for split_features in (X_train, X_test)
)
print("Feature selection applied.")

# Hyper-parameter candidates explored by GridSearchCV
# (2 * 3 * 2 * 2 * 2 = 48 combinations).
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)


# Evaluation and tuning

In [None]:
# Tune XGBoost on the selected features with 3-fold cross-validation,
# ranking parameter combinations by ROC AUC.
print("Starting grid search for parameter tuning...")
negatives_in_train = len(y_train[y_train == 0])
positives_in_train = len(y_train[y_train == 1])
tuning_estimator = XGBClassifier(
    random_state=42,
    scale_pos_weight=negatives_in_train / positives_in_train,
)
grid_search = GridSearchCV(
    estimator=tuning_estimator,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    verbose=1,
)
grid_search.fit(X_train_selected, y_train)
print("Grid search completed.")

# Keep the winning estimator and the parameter combination that produced it.
best_xgb = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best parameters found:")
print(best_params)

# Score the hold-out split: hard labels plus the probability assigned to the
# positive class (second column of predict_proba).
print("Making predictions on the test set...")
y_pred = best_xgb.predict(X_test_selected)
holdout_class_probabilities = best_xgb.predict_proba(X_test_selected)
y_pred_proba = holdout_class_probabilities[:, 1]
print("Predictions made.")

# Report per-class metrics, the confusion matrix and ROC AUC on the hold-out.
print("Evaluating the model...")
print("XGBoost Classification Report:")
holdout_report = classification_report(y_test, y_pred)
print(holdout_report)

print("XGBoost Confusion Matrix:")
holdout_confusion = confusion_matrix(y_test, y_pred)
print(holdout_confusion)

print("XGBoost ROC AUC Score:")
holdout_auc = roc_auc_score(y_test, y_pred_proba)
print(holdout_auc)

# Using the model

In [None]:



# Score the competition test set and write the submission file.
# Relies on objects created by earlier cells: `non_numerical_cols` and `X`
# (training encoding and feature layout), `selector` (fitted feature selector)
# and `best_xgb` (tuned classifier).

# Load your test dataset
print("Loading test data...")
test_data = pd.read_csv('/kaggle/input/credit-card-defaulter-hackathon/test_data.csv')
print("Test data loaded.")

# Remove white spaces from column names
print("Removing white spaces from column names in test data...")
test_data.columns = test_data.columns.str.strip()
print("White spaces removed.")

# Keep the IDs now, while the column is still present and its header has just
# been stripped. Re-reading the CSV later would cost a second file read and
# would look up 'ID' on un-stripped headers.
test_ids = test_data['ID']

# Apply one-hot encoding to non-numerical columns on the test data.
# drop_first is deliberately NOT used here: which level is "first" depends on
# the values present in this file, so it could drop a different level than the
# training encoding did. All levels are encoded instead, and the alignment
# step below keeps exactly the dummy columns the model was trained on.
print("Applying one-hot encoding to test data...")
test_data = pd.get_dummies(test_data, columns=non_numerical_cols)
print("One-hot encoding applied to test data.")

# Ensure the test dataset has the same columns as the training dataset:
# columns unknown to the model are discarded, columns absent from the test
# file are created and filled with 0, and the order follows X.columns.
print("Aligning test data columns with training data...")
test_data = test_data.reindex(columns=X.columns, fill_value=0)
print("Test data columns aligned.")

# Transform the test data using the same feature selection
print("Applying feature selection to test data...")
test_data_selected = selector.transform(test_data)
print("Feature selection applied to test data.")

# Make predictions on the test dataset
print("Making predictions on the test dataset...")
test_predictions = best_xgb.predict(test_data_selected)
test_probabilities = best_xgb.predict_proba(test_data_selected)[:, 1]
print("Predictions on test dataset made.")

# Create a DataFrame with IDs and predictions
print("Creating results DataFrame...")
results_df = pd.DataFrame({
    'ID': test_ids,
    'Prediction': test_predictions
})
print("Results DataFrame created.")

# Save the results to a CSV file
print("Saving results to CSV...")
results_df.to_csv('sample_submission.csv', index=False)
