In [None]:
# Final Rough

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split

# Read the training and test CSVs, using each file's first column as the row index.
loan_full = pd.read_csv('data/train.csv', index_col=0)
test = pd.read_csv('data/test.csv', index_col=0)
# submission = pd.read_csv('data/sample_submission.csv', index_col=0)

# The last column of the training frame is the target; every other column is a feature.
cols = loan_full.columns
feats, target = cols[:-1], cols[-1]

# Work on a stratified 15% sample (`loan`); the remaining rows go to `loan_test`.
loan, loan_test = train_test_split(
    loan_full,
    train_size=0.15,
    stratify=loan_full[target],
    random_state=42,
)
loan.shape

(53459, 12)

In [142]:
# grade_subgrade labeling: 'A1', 'A2', ..., 'G5' are numbered 1..35 in that order
grades = 'ABCDEFG'
subgrades = [f"{letter}{level}" for letter in grades for level in range(1, 6)]
grade_mapping = dict(zip(subgrades, range(1, len(subgrades) + 1)))

# education_level labeling: position in the tuple (starting at 1) is the code
education_mapping = {
    level: rank
    for rank, level in enumerate(
        ('High School', 'Other', "Bachelor's", "Master's", 'PhD'),
        start=1,
    )
}

# one hot encoding pipeline: dense output, the first level of every feature is
# dropped, and categories not seen during fit are ignored at transform time
ohe_pipeline = Pipeline([
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")),
])

# col lists, grouped by how each feature is encoded
ORDINAL_COLS = [
    'grade_subgrade',
    'education_level',
]

NOMINAL_COLS = [
    'gender',
    'marital_status',
    'employment_status',
    'loan_purpose',
]

NUMERICAL_COLS = [
    'annual_income',
    'debt_to_income_ratio',
    'credit_score',
    'loan_amount',
    'interest_rate',
]

In [192]:
# data preprocessing
def data_preprocess(data, fit=None):
    """Turn a raw loan frame into a numeric feature frame and an integer target.

    Steps:
      * select the feature columns (``feats``);
      * replace ``education_level`` and ``grade_subgrade`` with their ordinal
        codes (values missing from the mapping become NaN);
      * one-hot encode ``NOMINAL_COLS`` with the shared ``ohe_pipeline``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column in ``feats`` and, optionally, ``target``.
    fit : bool or None, default None
        ``True``  -> (re)fit the one-hot encoder on ``data`` before encoding.
        ``False`` -> encode with the already-fitted encoder.
        ``None``  -> fit only if the encoder has not been fitted yet.
        Fitting once on the training data keeps the encoded columns identical
        for every later frame, instead of re-learning the categories from
        evaluation or test data.

    Returns
    -------
    X : pandas.DataFrame
        Encoded features, indexed like ``data``.
    y : pandas.Series or None
        Target cast to ``int``; ``None`` when ``data`` has no target column or
        the target column is entirely null (unlabelled data).
    """
    X = data[feats].copy()

    # Unlabelled frames have no usable target; do not try to cast it.
    if target in data.columns and not data[target].isna().all():
        y = data[target].astype(int)
    else:
        y = None

    X['education_level'] = X['education_level'].map(education_mapping)
    X['grade_subgrade'] = X['grade_subgrade'].map(grade_mapping)

    if fit is None:
        # A fitted OneHotEncoder exposes `categories_`.
        fit = not hasattr(ohe_pipeline.named_steps["ohe"], "categories_")

    if fit:
        X_ohe = ohe_pipeline.fit_transform(X[NOMINAL_COLS])
    else:
        X_ohe = ohe_pipeline.transform(X[NOMINAL_COLS])
    X_ohe_df = pd.DataFrame(X_ohe, columns=ohe_pipeline.get_feature_names_out(), index=X.index)

    X = pd.concat([X.drop(columns=NOMINAL_COLS), X_ohe_df], axis=1)
    return X, y

# Fit the encoder explicitly on the training sample so that re-running this
# cell with a different `loan` never reuses a stale fit.
X, y = data_preprocess(loan, fit=True)

In [197]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier


# --- distance-, tree- and kernel-based candidates ---
knn_clf = KNeighborsClassifier(n_neighbors=5)
dt_clf = DecisionTreeClassifier(max_depth=None, random_state=42)
rf_clf = RandomForestClassifier(n_estimators=200, max_depth=8, random_state=42)
et_clf = ExtraTreesClassifier(n_estimators=100, max_depth=10, random_state=42)
svm_clf = SVC(kernel='rbf', probability=True, random_state=42)
lgbm_clf = LGBMClassifier(n_estimators=1000, max_depth=-1, random_state=42)
ada_clf = AdaBoostClassifier(n_estimators=50, random_state=42)

# --- generative models (library defaults) ---
qda_clf = QuadraticDiscriminantAnalysis()
gnb_clf = GaussianNB()

# --- linear and gradient-boosted candidates ---
lr_clf = LogisticRegression(max_iter=1000, random_state=42)
gbc_clf = GradientBoostingClassifier(
    learning_rate=0.05, n_estimators=500, max_depth=2, random_state=42
)
xgb_clf = XGBClassifier(
    n_estimators=100, max_depth=3, learning_rate=0.1, random_state=42,
    use_label_encoder=False, eval_metric='logloss',
)
cat_clf = CatBoostClassifier(iterations=100, depth=3, learning_rate=0.1, verbose=0)

# Stratified train/validation split of the preprocessed sample.
train_X, test_X, train_y, test_y = train_test_split(X, y, stratify=y, random_state=1)

In [198]:
# Train the selected model, then print accuracy on the training split followed
# by accuracy on the validation split.
clf = lgbm_clf
clf.fit(train_X, train_y)

train_y_pred, test_y_pred = (clf.predict(split) for split in (train_X, test_X))

print(
    accuracy_score(train_y, train_y_pred),
    accuracy_score(test_y, test_y_pred),
    sep="\n",
)

[LightGBM] [Info] Number of positive: 32058, number of negative: 8036
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000879 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1343
[LightGBM] [Info] Number of data points in the train set: 40094, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.799571 -> initscore=1.383615
[LightGBM] [Info] Start training from score 1.383615
0.9794482965032174
0.9009352787130565


In [None]:
# Scratch record of accuracy scores noted from earlier runs with other
# candidate models.
# NOTE(review): the values are unlabelled -- presumably training-split then
# validation-split accuracy, with rf_clf's third value being the hold-out
# score; confirm before relying on them.
"""
gbc_clf
    0.9048486057764255
    0.904900860456416

rf_clf
    0.9027784705941039
    0.903179947624392
    0.9007255973692185

"""


In [199]:
# Score the fitted model on the rows that were left out of the 15% sample.
X_full, y_full = data_preprocess(loan_test)
y_full_pred = clf.predict(X_full)
print(accuracy_score(y_true=y_full, y_pred=y_full_pred))

0.8995203623195582


In [None]:
# submission file 

# data_preprocess() casts the target column to int, so a None-filled target
# raises TypeError. Attach a throw-away integer placeholder instead, on a copy
# so the raw `test` frame is not modified when this cell is re-run.
test_with_target = test.copy()
test_with_target[target] = 0

X_test_full, _ = data_preprocess(test_with_target)
y_test_full_pred = clf.predict(X_test_full)

# One prediction per test row, keyed by the test frame's index.
submission_df = pd.DataFrame(y_test_full_pred, columns=[target], index=X_test_full.index)
submission_df.to_csv("test_submission.csv")