<a href="https://colab.research.google.com/github/cod3astro/kaggle_ML_competition/blob/main/loan_payback.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m99.2/99.2 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the labelled training rows and split the target off into its own Series.
train_df = pd.read_csv('train.csv', index_col='id')
target = train_df.pop('loan_paid_back')  # pop removes the column from train_df and returns it
train_df['is_train'] = 1  # flag: row came from train.csv

# Load the test rows and give them the complementary flag.
test_df = pd.read_csv('test.csv', index_col='id')
test_df['is_train'] = 0  # flag: row came from test.csv

# Stack both sets so the exploration below looks at train and test together.
df = pd.concat([train_df, test_df])
df.head()

In [None]:
# (rows, columns) of the combined train + test frame
df.shape

In [None]:
# Single boolean: True if any cell of the combined frame is missing
df.isnull().any().any()

In [None]:
# Summary statistics for every column; include='all' also covers the non-numeric ones
df.describe(include='all')

In [None]:
# Rank the numeric feature columns by sample skewness, highest first.
# 'is_train' is the bookkeeping flag added when train and test were stacked,
# not a feature, so it is masked out before ranking (otherwise it can show up
# among the "most skewed" columns that get plotted).
numeric_columns = df.select_dtypes(include=np.number).columns
numerical_cols = numeric_columns[numeric_columns != 'is_train']
skewness = df[numerical_cols].skew().sort_values(ascending=False)

print("Skewness of numerical features:\n")
print(skewness)

Let's visualize the distributions of the two numerical features with the highest skewness and the two with the lowest skewness to better understand how skewed they are.

In [None]:
# `skewness` is sorted from highest to lowest, so the first two labels are the
# highest-skew columns and the last two are the lowest-skew columns.
top_skewed_features = skewness.index[:2]
bottom_skewed_features = skewness.index[-2:]

def plot_skewness(columns):
    """Draw a histogram with a KDE overlay for each requested column.

    Reads the notebook-level ``df`` for the data and ``skewness`` for the value
    shown in each panel title.

    Panels are laid out three per row. The grid keeps at least two rows (the
    original fixed 2x3 layout, so up to six columns render exactly as before)
    and adds rows as needed, so more than six columns no longer raise from
    ``plt.subplot``.

    Parameters
    ----------
    columns : iterable of str
        Column labels present in both ``df`` and ``skewness``.
    """
    columns = list(columns)
    grid_cols = 3
    # Ceiling division for the number of rows, never fewer than the original 2.
    grid_rows = max(2, -(-len(columns) // grid_cols))

    plt.figure(figsize=(20, 5 * grid_rows))
    for i, col in enumerate(columns):
        plt.subplot(grid_rows, grid_cols, i + 1)
        sns.histplot(df[col], kde=True)
        plt.title(f'Distribution of {col} (Skewness: {skewness[col]:.2f})')
        plt.xlabel(col)
        plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()


# One figure for the highest-skew columns, one for the lowest-skew columns.
plot_skewness(columns=top_skewed_features)
plot_skewness(columns=bottom_skewed_features)

In [None]:
# Replace each of the two columns with its log1p transform (in place on df)
# and report the skewness of the transformed values.
for col in ('annual_income', 'debt_to_income_ratio'):
    logged = np.log1p(df[col])
    df[col] = logged
    print(f'Transformed skewness of {col}: {logged.skew():.2f}')

In [None]:
transformed_features = ['annual_income', 'debt_to_income_ratio']

# Side-by-side histograms (with KDE) of the two transformed columns.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
for ax, col in zip(axes, transformed_features):
    sns.histplot(df[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of Transformed {col} (Skewness: {df[col].skew():.2f})')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()

In [None]:
# Rebuild df from the raw CSVs so the earlier in-place transforms are discarded.
train = (
    pd.read_csv('train.csv', index_col='id')
    .drop(columns=['loan_paid_back'])
    .assign(is_train=1)
)
test = pd.read_csv('test.csv', index_col='id').assign(is_train=0)

df = pd.concat([train, test])

# annual_income: log1p transform
income_log = np.log1p(df['annual_income'])
df['annual_income'] = income_log
print(f"Transformed skewness of annual_income (log1p): {income_log.skew():.2f}")

# debt_to_income_ratio: square-root transform this time
dti_sqrt = np.sqrt(df['debt_to_income_ratio'])
df['debt_to_income_ratio'] = dti_sqrt
print(f"Transformed skewness of debt_to_income_ratio (sqrt): {dti_sqrt.skew():.2f}")

In [None]:
from scipy.stats import boxcox

# Rebuild df from the raw CSVs so the transforms applied in earlier cells are discarded.
train = pd.read_csv('train.csv', index_col='id')
train.drop(columns=['loan_paid_back'], inplace=True)
train['is_train'] = 1

test = pd.read_csv('test.csv', index_col='id')
test['is_train'] = 0

df = pd.concat([train, test])

# Apply a Box-Cox transform to each column in turn.
# Box-Cox requires strictly positive data, so each column is checked first.
for col in ['debt_to_income_ratio', 'annual_income']:
    if (df[col] <= 0).any():
        # The column is left untransformed in this case. A common workaround is
        # to shift the data by a small positive constant (e.g. 1e-6) before
        # calling boxcox.
        print("Warning:", col, "contains non-positive values. Box-Cox might not be suitable or requires adjustment.")
    else:
        df[col], lambda_val = boxcox(df[col])
        # Report the column that was just transformed. The label and the skew
        # were previously hard-coded to 'debt_to_income_ratio', so the
        # annual_income iteration printed the wrong column's numbers.
        print(f"Transformed skewness of {col} (Box-Cox, lambda={lambda_val:.2f}): {df[col].skew():.2f}")


In [None]:
transformed_features = ['annual_income', 'debt_to_income_ratio']

# Re-draw the two histograms (with KDE) for the current contents of df.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
for panel, col in zip(axes, transformed_features):
    column_skew = df[col].skew()
    sns.histplot(df[col], kde=True, ax=panel)
    panel.set_title(f'Distribution of Transformed {col} (Skewness: {column_skew:.2f})')
    panel.set_xlabel(col)
    panel.set_ylabel('Frequency')
fig.tight_layout()
plt.show()

In [None]:
# For every object-dtype column, print how many distinct values it has,
# followed by the values themselves.
categorical_col = df.select_dtypes(include='object').columns
for col in categorical_col:
    unique_values = df[col].unique()
    header = f'{col} ({len(unique_values)} unique)'
    print(header)
    print(unique_values)

In [None]:
# Keep only the rows flagged as coming from train.csv, then discard the flag.
is_train_row = df['is_train'] == 1
train = df.loc[is_train_row].drop(columns=['is_train'])

display(train.head())

In [None]:
# Sanity check: compare the rebuilt training frame with the one loaded at the
# start of the notebook. train_df still carries the 'is_train' flag that was
# dropped from train, so a one-column difference is expected.
print(train.shape)
print(train_df.shape)

In [None]:
# Attach the target Series to the training frame by matching on the 'id' index.
# Guarded so that re-running the cell is a no-op: merging a second time would
# produce 'loan_paid_back_x' / 'loan_paid_back_y' columns and break every later
# train['loan_paid_back'] lookup.
if 'loan_paid_back' not in train.columns:
    train = train.merge(target, left_index=True, right_index=True)

In [None]:
# Preview the first rows of the training frame
train.head()

In [None]:
# Share of each target class (normalize=True returns proportions instead of counts)
train['loan_paid_back'].value_counts(normalize=True)

In [None]:
# Importing libraries
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, f1_score, confusion_matrix, roc_auc_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from scipy.stats import pointbiserialr, chi2_contingency
from imblearn.pipeline import Pipeline as ImbPipeline

In [None]:
# Separate the target Series from the predictor columns.
y = train['loan_paid_back']
X = train.drop(columns=y.name)

In [None]:
# Labels of the numeric predictor columns (X no longer contains the target)
numerical_col = X.select_dtypes(include=np.number).columns

def cramers_v(x, y):
    """Bias-corrected Cramer's V between two categorical variables.

    Builds the contingency table of ``x`` against ``y``, takes the chi-squared
    statistic from ``chi2_contingency`` (called with its defaults) and applies
    the bias correction to phi^2 and to the effective row/column counts before
    normalising.

    Parameters
    ----------
    x, y : array-like
        Equal-length categorical observations, as accepted by ``pd.crosstab``.

    Returns
    -------
    float
        The association strength. ``0.0`` is returned when the statistic is
        undefined: fewer than two observations, a variable with a single
        level, or a non-positive corrected denominator. Previously these
        cases evaluated 0/0 and returned NaN.
    """
    # Named `contingency` so the local does not shadow sklearn's confusion_matrix import.
    contingency = pd.crosstab(x, y)
    n = contingency.sum().sum()
    r, k = contingency.shape

    # No association can be measured without at least two levels on each side.
    if n <= 1 or r < 2 or k < 2:
        return 0.0

    # NOTE(review): with SciPy's defaults a continuity correction is applied to
    # 2x2 tables; confirm that is intended here.
    chi2 = chi2_contingency(contingency)[0]
    phi2 = chi2 / n
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denominator = min((kcorr - 1), (rcorr - 1))
    if denominator <= 0:
        # e.g. every observation is its own level, so the correction removes everything
        return 0.0
    return np.sqrt(phi2corr / denominator)

# Categorical columns: Cramer's V against the target.
cramers_scores = {}
for col in categorical_col:
    cramers_scores[col] = cramers_v(train[col], train['loan_paid_back'])

# Numeric columns: mutual information with the target.
X_num = train[numerical_col]
y = train['loan_paid_back']
# The estimator uses random number generation, so it is seeded here; without a
# fixed random_state the scores (and the ranking) change from run to run.
mi_scores = mutual_info_classif(X_num, y, discrete_features=False, random_state=42)
mi_scores_dict = dict(zip(numerical_col, mi_scores))

# Both sets of scores in one Series, highest first.
# NOTE(review): Cramer's V and mutual information are different measures, so
# the combined ordering is only indicative across the two groups.
all_scores = {**cramers_scores, **mi_scores_dict}
all_scores_series = pd.Series(all_scores).sort_values(ascending=False)

print("\nüìä Combined Feature Correlation Report with Target:")
print(all_scores_series)

In [None]:
# Hold out a validation set. stratify=y keeps the class proportions the same in
# both parts; the notebook treats the target as imbalanced (SMOTE is applied
# later), and an unstratified split can shift the minority share between parts.
train_X, val_X, train_y, val_y = train_test_split(X, y, stratify=y, random_state=42)

In [None]:
# Columns routed to the one-hot encoder in the preprocessing ColumnTransformer
onehot_col = ['gender', 'marital_status', 'employment_status', 'loan_purpose']
# Columns routed to the ordinal encoder; the order here must match the order of
# the category lists passed to OrdinalEncoder
ordinal_col = ['education_level', 'grade_subgrade']

In [None]:
# Define ordinal categories based on their intrinsic order
education_categories = ['Other', 'High School', "Bachelor's", "Master's", 'PhD']

# Generate grade_subgrade categories in the correct order (A1-F5)
grades = ['A', 'B', 'C', 'D', 'E', 'F']
subgrades = [str(i) for i in range(1, 6)]
grade_subgrade_categories = [g + s for g in grades for s in subgrades]

# Preprocessing steps
numerical_transformer = StandardScaler()
onehot_transformer = OneHotEncoder(handle_unknown='ignore')
ordinal_transformer = OrdinalEncoder(categories=[education_categories, grade_subgrade_categories], handle_unknown='use_encoded_value', unknown_value=-1) # handle_unknown='use_encoded_value' and unknown_value=-1 for unseen categories

# Create a column transformer to apply different transformations to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_col),
        ('onehot', onehot_transformer, onehot_col),
        ('ordinal', ordinal_transformer, ordinal_col)
    ],
    remainder='passthrough' # Keep other columns (like 'is_train' if present, though it's dropped from X)
)

# Create an imblearn pipeline that includes preprocessing, SMOTE, and a classifier
# SMOTE will be applied only on training data within each CV fold
pipeline_ = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', LogisticRegression(solver='liblinear', random_state=42)) # Placeholder classifier
])

print("Preprocessing pipeline_ with SMOTE defined successfully.")
print(pipeline_)

In [None]:
# Ordered education levels (same values and order as in the preprocessing cell)
education_categories = ['Other', 'High School', "Bachelor's", "Master's", 'PhD']

def sort_grade_subgrade(grades):
    """Return the labels sorted by grade letter, then by numeric sub-grade.

    Each label is expected to be one grade character followed by an integer
    (for example ``'B3'``). The sub-grade is compared as a number, so ``'A10'``
    sorts after ``'A2'``.

    Parameters
    ----------
    grades : iterable of str
        Labels to order.

    Returns
    -------
    list of str
        A new sorted list; the input is not modified.
    """
    return sorted(grades, key=lambda label: (label[0], int(label[1:])))

# Rebuild the ordered grade_subgrade list from the labels actually present in df.
# NOTE(review): this only rebinds the name. The OrdinalEncoder created in the
# preprocessing cell was given the earlier list object, so it is unaffected —
# confirm whether the preprocessor should be rebuilt after this cell.
grade_subgrade_categories = sort_grade_subgrade(df['grade_subgrade'].unique())

# Print both ends of the list to eyeball the ordering
print("Education Categories:", education_categories)
print("Grade Subgrade Categories (first 10):", grade_subgrade_categories[:10])
print("Grade Subgrade Categories (last 10):", grade_subgrade_categories[-10:])

In [None]:
# Base estimators, seeded with random_state=42 wherever the estimator accepts one.
logreg_model = LogisticRegression(solver='liblinear', random_state=42)
decision_model = DecisionTreeClassifier(random_state=42)
random_model = RandomForestClassifier(random_state=42)
gradient_model = GradientBoostingClassifier(random_state=42)
svc_model = SVC(random_state=42, probability=True)  # probability=True enables predict_proba
knn_model = KNeighborsClassifier()
# use_label_encoder was dropped: current XGBoost releases no longer use it and
# only warn that the parameter is not used.
xgb_model = XGBClassifier(random_state=42, eval_metric='logloss')
# subsample_freq=1 turns row bagging on. With LightGBM's default of 0 the
# 'subsample' values in the search grid are ignored, so half of that grid would
# refit identical models. verbosity=-1 silences LightGBM's log output.
lgbm_model = LGBMClassifier(random_state=42, verbosity=-1, subsample_freq=1)
catboost_model = CatBoostClassifier(random_state=42, verbose=0)

# Model name -> estimator instance; the names are the keys used by param_grids.
models = {
    'LogisticRegression': logreg_model,
    'DecisionTree': decision_model,
    'RandomForest': random_model,
    'XGBoost': xgb_model,
    'CatBoost': catboost_model,
    'LightGBM': lgbm_model,
    'GradientBoosting': gradient_model,
    'SVC': svc_model,
    'KNN': knn_model
}
# Search spaces keyed by model name. Parameter names are written bare here and
# receive the 'classifier__' prefix below, which addresses the pipeline step
# named 'classifier'.
_bare_param_grids = {
    'LogisticRegression': {
        'C': [0.01, 0.1, 1],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear'],
    },
    'DecisionTree': {
        'criterion': ['gini', 'entropy'],
        'max_depth': [5, 10, 15, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    },
    'RandomForest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [5, 10, 15],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2'],
    },
    'XGBoost': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5],
        'subsample': [0.7, 1.0],
        'colsample_bytree': [0.7, 1.0],
    },
    'CatBoost': {
        'iterations': [100, 200],
        'learning_rate': [0.05, 0.1],
        'depth': [4, 6, 8],
        'l2_leaf_reg': [1, 3, 5],
    },
    'LightGBM': {
        'n_estimators': [100, 200, 400],
        'learning_rate': [0.01, 0.05, 0.1],
        'num_leaves': [20, 31, 40],
        'subsample': [0.7, 1.0],
        'colsample_bytree': [0.7, 1.0],
    },
    'GradientBoosting': {
        'n_estimators': [100, 200, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.7, 1.0],
    },
    'SVC': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto', 0.1, 1],
    },
    'KNN': {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
    },
}

# Same grids with every parameter name prefixed for use inside the pipeline.
param_grids = {
    name: {f'classifier__{bare_name}': candidates for bare_name, candidates in grid.items()}
    for name, grid in _bare_param_grids.items()
}

# Echo the search spaces so the run log records what was searched.
print("Defined models and their parameter grids:")
for model_name in param_grids:
    print(f"\n{model_name}:")
    params = param_grids[model_name]
    for param_name in params:
        values = params[param_name]
        print(f"  {param_name}: {values}")

In [None]:
# Best score / parameters / fitted pipeline per model name.
results = {}

# Run one grid search per entry in `models`.
for model_name, model_object in models.items():

    print(f"--- Starting GridSearchCV for: {model_name} ---")

    # A fresh pipeline per model. This must be imblearn's Pipeline:
    # scikit-learn's Pipeline requires every intermediate step to implement
    # transform, which SMOTE (a sampler) does not, so fitting would fail.
    pipeline = ImbPipeline([
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('classifier', model_object)  # the estimator under search
    ])

    # Search space for this model (keys already carry the 'classifier__' prefix).
    current_param_grid = param_grids[model_name]

    model_grid_search = GridSearchCV(
        pipeline,
        current_param_grid,
        scoring='roc_auc',
        cv=5,
        n_jobs=-1,          # use all available cores
        verbose=1
    )

    # Fit on the training part only; the validation part stays untouched.
    model_grid_search.fit(train_X, train_y)

    print(f"\nBest ROC AUC Score for {model_name}: {model_grid_search.best_score_:.4f}")
    print(f"Best Parameters for {model_name}: {model_grid_search.best_params_}")
    print("-" * 50)

    # Keep what is needed to compare models and to reuse the winning pipeline.
    results[model_name] = {
        'best_score': model_grid_search.best_score_,
        'best_params': model_grid_search.best_params_,
        'best_estimator': model_grid_search.best_estimator_
    }

print("\n=== All Grid Searches Complete ===")
# Summary: best cross-validated score per model.
for model_name, result in results.items():
    print(f"{model_name} Best Score: {result['best_score']:.4f}")