In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV
import xgboost as xgb, lightgbm as lgbm, catboost as cb

In [2]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    """Print classification reports for both splits and a test confusion table.

    Args:
        y_train_true: True labels of the training split.
        y_train_pred: Predicted labels for the training split.
        y_test_true: True labels of the test split.
        y_test_pred: Predicted labels for the test split.

    Returns:
        None. Everything is written to stdout.
    """
    sections = (
        ('TRAIN', y_train_true, y_train_pred),
        ('TEST', y_test_true, y_test_pred),
    )
    for title, actual, predicted in sections:
        print(title + '\n\n' + classification_report(actual, predicted))

    # Rows are the true test labels, columns the predicted ones.
    print('CONFUSION MATRIX\n')
    confusion = pd.crosstab(y_test_true, y_test_pred)
    print(confusion)

In [3]:
def balance_df_by_target(df, target_name, random_state=None):
    """Oversample the rarest class of ``target_name`` by duplicating its rows.

    The rarest class is appended ``floor(major_count / minor_count) - 1``
    extra times, so its size approaches (without exceeding) the size of the
    most frequent class. Only the single rarest class is duplicated; any
    other classes are left as they are. The input frame is not modified.

    Args:
        df: DataFrame that contains the target column.
        target_name: Name of the target (label) column.
        random_state: Optional seed for the final shuffle. ``None`` (the
            default) gives a different row order on every call.

    Returns:
        A new, shuffled DataFrame. When rows were added, the index is reset
        before shuffling; otherwise the original index labels are kept.
    """
    target_counts = df[target_name].value_counts()

    # With fewer than two classes there is nothing to balance.
    if len(target_counts) < 2:
        return df.sample(frac=1, random_state=random_state)

    # idxmax/idxmin return the class *labels*. argmax/argmin return integer
    # positions in current pandas, which only matches the labels by accident
    # (e.g. labels 0/1 with 0 being the majority class).
    major_class_name = target_counts.idxmax()
    minor_class_name = target_counts.idxmin()

    disbalance_coeff = int(target_counts[major_class_name] / target_counts[minor_class_name]) - 1

    if disbalance_coeff > 0:
        minor_rows = df[df[target_name] == minor_class_name]
        # One concat instead of repeated DataFrame.append (removed in pandas 2.0).
        df = pd.concat([df] + [minor_rows] * disbalance_coeff, ignore_index=True)

    return df.sample(frac=1, random_state=random_state)

In [4]:
def reduce_mem_usage(df):
    """Shrink a DataFrame's memory footprint by downcasting column dtypes.

    * Signed integer columns are cast to the smallest of int8/int16/int32/int64
      whose range contains the column's min and max (bounds inclusive).
    * Float columns are cast to float32 when their range fits, otherwise to
      float64. Note that float32 keeps only about 7 significant digits.
    * Object columns are converted to ``category``.
    * Every other dtype (bool, unsigned int, datetime, category, pandas
      extension dtypes, ...) is left untouched.

    The frame is modified in place and also returned. Memory usage before and
    after is printed.

    Args:
        df: DataFrame to optimise.

    Returns:
        The same DataFrame object with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_int_types = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_type = df[col].dtype

        if pd.api.types.is_object_dtype(col_type):
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy signed-int / float columns are downcast. Routing
        # bool, unsigned or datetime columns through the float branch would
        # grow them, lose precision or raise.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            for candidate in signed_int_types:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column holding exactly -128..127 fits int8.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            limits = np.finfo(np.float32)
            # An all-NaN or infinite column fails this check and becomes float64.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [5]:
# Path to the training CSV, relative to the notebook's working directory.
DATASET_PATH = './course_project_train.csv'

In [6]:
# Load the raw training data and preview the first rows.
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,debt consolidation,Short Term,99999999.0,47386.0,7914.0,749.0,0
1,Own Home,1025487.0,10+ years,0.0,15.0,15.3,1181730.0,0.0,,0.0,debt consolidation,Long Term,264968.0,394972.0,18373.0,737.0,1
2,Home Mortgage,751412.0,8 years,0.0,11.0,35.0,1182434.0,0.0,,0.0,debt consolidation,Short Term,99999999.0,308389.0,13651.0,742.0,0
3,Own Home,805068.0,6 years,0.0,8.0,22.5,147400.0,1.0,,1.0,debt consolidation,Short Term,121396.0,95855.0,11338.0,694.0,0
4,Rent,776264.0,8 years,0.0,13.0,13.6,385836.0,1.0,,0.0,debt consolidation,Short Term,125840.0,93309.0,7180.0,719.0,0


In [7]:
# Column dtypes and non-null counts (shows which columns have missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Home Ownership                7500 non-null   object 
 1   Annual Income                 5943 non-null   float64
 2   Years in current job          7129 non-null   object 
 3   Tax Liens                     7500 non-null   float64
 4   Number of Open Accounts       7500 non-null   float64
 5   Years of Credit History       7500 non-null   float64
 6   Maximum Open Credit           7500 non-null   float64
 7   Number of Credit Problems     7500 non-null   float64
 8   Months since last delinquent  3419 non-null   float64
 9   Bankruptcies                  7486 non-null   float64
 10  Purpose                       7500 non-null   object 
 11  Term                          7500 non-null   object 
 12  Current Loan Amount           7500 non-null   float64
 13  Cur

In [8]:
# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,Annual Income,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
count,5943.0,7500.0,7500.0,7500.0,7500.0,7500.0,3419.0,7486.0,7500.0,7500.0,7500.0,5943.0,7500.0
mean,1366392.0,0.030133,11.130933,18.317467,945153.7,0.17,34.6926,0.117152,11873180.0,289833.2,18314.454133,1151.087498,0.281733
std,845339.2,0.271604,4.908924,7.041946,16026220.0,0.498598,21.688806,0.347192,31926120.0,317871.4,11926.764673,1604.451418,0.449874
min,164597.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,11242.0,0.0,0.0,585.0,0.0
25%,844341.0,0.0,8.0,13.5,279229.5,0.0,16.0,0.0,180169.0,114256.5,10067.5,711.0,0.0
50%,1168386.0,0.0,10.0,17.0,478159.0,0.0,32.0,0.0,309573.0,209323.0,16076.5,731.0,0.0
75%,1640137.0,0.0,14.0,21.8,793501.5,0.0,50.0,0.0,519882.0,360406.2,23818.0,743.0,1.0
max,10149340.0,7.0,43.0,57.7,1304726000.0,7.0,118.0,4.0,100000000.0,6506797.0,136679.0,7510.0,1.0


In [9]:
# Print the value frequencies of every object-typed column, separated by a rule.
categorical_columns = df.select_dtypes(include='object').columns
for cat_colname in categorical_columns:
    frequencies = df[cat_colname].value_counts()
    print(f"{cat_colname}\n\n{frequencies}\n{'*' * 100}\n")

Home Ownership

Home Mortgage    3637
Rent             3204
Own Home          647
Have Mortgage      12
Name: Home Ownership, dtype: int64
****************************************************************************************************

Years in current job

10+ years    2332
2 years       705
3 years       620
< 1 year      563
5 years       516
1 year        504
4 years       469
6 years       426
7 years       396
8 years       339
9 years       259
Name: Years in current job, dtype: int64
****************************************************************************************************

Purpose

debt consolidation      5944
other                    665
home improvements        412
business loan            129
buy a car                 96
medical bills             71
major purchase            40
take a trip               37
buy house                 34
small business            26
wedding                   15
moving                    11
educational expenses      10
vacation  

In [10]:
# --- Clean and encode the training frame ---

# "Years in current job" holds text such as "10+ years", "< 1 year" or NaN.
# Extract the digits with a regex instead of chaining rstrip/lstrip calls:
# str.rstrip(' years') strips any of those *characters*, not the suffix.
# "< 1 year" and "1 year" both become 1; missing values become 0.
df['Years in current job'] = (
    df['Years in current job']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype('float64')
    .fillna(0)
)

# Fill missing values. Assign the result back to the column: calling
# fillna(..., inplace=True) on a column selection is chained assignment,
# which is deprecated and does nothing under pandas copy-on-write.
df['Annual Income'] = df['Annual Income'].fillna(df['Annual Income'].mean())
df['Months since last delinquent'] = df['Months since last delinquent'].fillna(0)
df['Bankruptcies'] = df['Bankruptcies'].fillna(0)
df['Credit Score'] = df['Credit Score'].fillna(df['Credit Score'].mean())

# These two text columns are not used as features.
df = df.drop(columns=["Home Ownership", "Purpose"])

# Encode the loan term as a number: Short Term -> 1, Long Term -> 0.
# Any other value would become NaN.
df['Term'] = df['Term'].map({'Short Term': 1, 'Long Term': 0}).astype('float64')

# Loan amounts of 99999999 or more are replaced by the column median
# (computed before the replacement, i.e. including those rows).
# NOTE(review): 99999999 looks like a placeholder for "unknown" - confirm.
loan_amount_median = df['Current Loan Amount'].median()
df.loc[df['Current Loan Amount'] >= 99999999, 'Current Loan Amount'] = loan_amount_median



In [11]:
# Downcast column dtypes to reduce memory; prints usage before and after.
df = reduce_mem_usage(df)

Memory usage of dataframe is 0.86 MB
Memory usage after optimization is: 0.41 MB
Decreased by 52.5%


In [12]:
#df_num_features = df.select_dtypes(include=['float64','float32','float16','int64','int32','int16','int8'])
#df_num_features = df_num_features.drop('Price', axis=1)
#df_num_features.hist(figsize=(16,16), bins=20, grid=False, log=True)

In [13]:
# Monetary / score columns that are rescaled to the [0, 1] range.
columns = ["Maximum Open Credit","Annual Income","Credit Score","Monthly Debt","Current Credit Balance","Current Loan Amount"]

# df[columns] is already a 2-D (n_rows, 6) table, so it can be passed to the
# scaler directly. The previous reshape(-6, 6) relied on numpy accepting a
# negative size other than the documented -1 and changed nothing.
# The fitted scaler is kept so the same ranges can be applied to other data.
minmax_scaler = MinMaxScaler()
df[columns] = minmax_scaler.fit_transform(df[columns])

In [14]:
#df.loc[df['Number of Open Accounts'] > 30, 'Number of Open Accounts'] = df['Number of Open Accounts'].median()
#df.loc[df['Years of Credit History'] > 40, 'Years of Credit History'] = df['Years of Credit History'].median()


In [15]:
# Separate the label column from the predictors.
y = df['Credit Default']
X = df.drop(columns=["Credit Default"])

In [16]:
# Hold out 30% of the rows for evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [17]:
# Re-attach the labels to the training features and oversample the rarer class.
# Only the training split is balanced; the hold-out split is left untouched.
df_for_balancing = pd.concat([X_train, y_train], axis=1)
df_balanced = balance_df_by_target(df_for_balancing, "Credit Default")
    
# Class sizes after balancing.
df_balanced["Credit Default"].value_counts()

0    3795
1    2910
Name: Credit Default, dtype: int64

In [18]:
# Replace the training split with its balanced version (features / labels).
X_train = df_balanced.drop(columns="Credit Default")
y_train = df_balanced["Credit Default"]

In [19]:
#gb = GradientBoostingClassifier(n_estimators = 200, max_depth = 15, learning_rate = 0.05, random_state = 42)

In [20]:
#gb.fit(X_train, y_train)

In [21]:
#y_pred_gb = gb.predict(X_test)

In [22]:
#model_xgb = xgb.XGBClassifier(random_state=21)
#model_xgb.fit(X_train, y_train)

#y_train_pred_xgb = model_xgb.predict(X_train)
#y_test_pred_xgb = model_xgb.predict(X_test)

#get_classification_report(y_train, y_train_pred_xgb, y_test, y_test_pred_xgb)

In [23]:
#model_lgbm = lgbm.LGBMClassifier(random_state=21)
#model_lgbm.fit(X_train, y_train)

#y_train_pred_lgbm = model_lgbm.predict(X_train)
#y_test_pred_lgbm = model_lgbm.predict(X_test)

#get_classification_report(y_train, y_train_pred_lgbm, y_test, y_test_pred_lgbm)

In [24]:
# Fit a CatBoost classifier on the balanced training split.
# NOTE(review): class_weights=[1, 2.5] presumably weights class 1 by 2.5 on
# top of the oversampling done earlier - confirm this double emphasis is intended.
model_catb = cb.CatBoostClassifier(n_estimators=125, depth=7,
                                      class_weights=[1, 2.5],grow_policy='SymmetricTree',
                                      silent=True, random_state=21)
model_catb.fit(X_train, y_train)

# Predict on the balanced training rows and on the hold-out rows.
y_train_pred_cat = model_catb.predict(X_train)
y_test_pred_cat = model_catb.predict(X_test)

# Print train/test classification reports and the hold-out confusion table.
get_classification_report(y_train, y_train_pred_cat, y_test, y_test_pred_cat)

TRAIN

              precision    recall  f1-score   support

           0       0.96      0.58      0.72      3795
           1       0.64      0.97      0.77      2910

    accuracy                           0.75      6705
   macro avg       0.80      0.78      0.75      6705
weighted avg       0.82      0.75      0.74      6705

TEST

              precision    recall  f1-score   support

           0       0.87      0.50      0.64      1592
           1       0.40      0.82      0.54       658

    accuracy                           0.59      2250
   macro avg       0.64      0.66      0.59      2250
weighted avg       0.73      0.59      0.61      2250

CONFUSION MATRIX

col_0             0    1
Credit Default          
0               797  795
1               119  539


In [25]:
#f1_score(y_test, y_pred_gb)

In [26]:
#f1_score(y_test, y_test_pred_xgb)

In [27]:
#f1_score(y_test, y_test_pred_lgbm)

In [28]:
# F1 score of the CatBoost predictions on the hold-out split.
f1_score(y_test, y_test_pred_cat)

0.5411646586345381

In [29]:
# Load the unlabeled test CSV (no "Credit Default" column) and preview it.
df1 = pd.read_csv('./course_project_test.csv')
df1.head()

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score
0,Rent,,4 years,0.0,9.0,12.5,220968.0,0.0,70.0,0.0,debt consolidation,Short Term,162470.0,105906.0,6813.0,
1,Rent,231838.0,1 year,0.0,6.0,32.7,55946.0,0.0,8.0,0.0,educational expenses,Short Term,78298.0,46037.0,2318.0,699.0
2,Home Mortgage,1152540.0,3 years,0.0,10.0,13.7,204600.0,0.0,,0.0,debt consolidation,Short Term,200178.0,146490.0,18729.0,7260.0
3,Home Mortgage,1220313.0,10+ years,0.0,16.0,17.0,456302.0,0.0,70.0,0.0,debt consolidation,Short Term,217382.0,213199.0,27559.0,739.0
4,Home Mortgage,2340952.0,6 years,0.0,11.0,23.6,1207272.0,0.0,,0.0,debt consolidation,Long Term,777634.0,425391.0,42605.0,706.0


In [30]:
# --- Clean and encode the test frame (same steps as for the training frame) ---

# "Years in current job" holds text such as "10+ years", "< 1 year" or NaN.
# Extract the digits with a regex instead of chaining rstrip/lstrip calls:
# str.rstrip(' years') strips any of those *characters*, not the suffix.
# "< 1 year" and "1 year" both become 1; missing values become 0.
df1['Years in current job'] = (
    df1['Years in current job']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype('float64')
    .fillna(0)
)

# Fill missing values. Assign the result back to the column: calling
# fillna(..., inplace=True) on a column selection is chained assignment,
# which is deprecated and does nothing under pandas copy-on-write.
# NOTE(review): the means below are computed from this file itself, so the
# fill values differ from the ones used when the model was trained.
df1['Annual Income'] = df1['Annual Income'].fillna(df1['Annual Income'].mean())
df1['Months since last delinquent'] = df1['Months since last delinquent'].fillna(0)
df1['Bankruptcies'] = df1['Bankruptcies'].fillna(0)
df1['Credit Score'] = df1['Credit Score'].fillna(df1['Credit Score'].mean())

# These two text columns are not used as features.
df1 = df1.drop(columns=["Home Ownership", "Purpose"])

# Encode the loan term as a number: Short Term -> 1, Long Term -> 0.
# Any other value would become NaN.
df1['Term'] = df1['Term'].map({'Short Term': 1, 'Long Term': 0}).astype('float64')

# Loan amounts of 99999999 or more are replaced by the column median
# (computed before the replacement, i.e. including those rows).
# NOTE(review): 99999999 looks like a placeholder for "unknown" - confirm.
loan_amount_median = df1['Current Loan Amount'].median()
df1.loc[df1['Current Loan Amount'] >= 99999999, 'Current Loan Amount'] = loan_amount_median

In [31]:
# Downcast column dtypes of the test frame; prints usage before and after.
df1 = reduce_mem_usage(df1)

Memory usage of dataframe is 0.27 MB
Memory usage after optimization is: 0.13 MB
Decreased by 50.0%


In [32]:
# Monetary / score columns that are rescaled before prediction.
columns = ["Maximum Open Credit","Annual Income","Credit Score","Monthly Debt","Current Credit Balance","Current Loan Amount"]

# The test features must be scaled with the ranges of the TRAINING data.
# Fitting a fresh MinMaxScaler on the test file maps the same raw value to a
# different number than the model saw during training. The training frame has
# already been overwritten with scaled values, so the raw ranges are rebuilt
# from the training CSV with the cleaning that affects min/max: the loan
# amount placeholder is replaced by the median, and values are cast to
# float32 as reduce_mem_usage does. Mean-filling NaNs does not move min/max,
# and MinMaxScaler ignores NaNs when fitting.
train_reference = pd.read_csv(DATASET_PATH, usecols=columns)
train_loan_median = train_reference['Current Loan Amount'].median()
train_reference.loc[train_reference['Current Loan Amount'] >= 99999999, 'Current Loan Amount'] = train_loan_median
train_reference = train_reference[columns].astype(np.float32)

# df1[columns] is already 2-D, so no reshape is needed. Test values outside
# the training range end up outside [0, 1].
df1[columns] = MinMaxScaler().fit(train_reference).transform(df1[columns])

In [33]:
# Predict the class for every row of the prepared test frame and show the result.
y_pred_cat = model_catb.predict(df1)
y_pred_cat

array([0, 0, 1, ..., 1, 0, 1], dtype=int64)

In [37]:
# Write the predictions to CSV together with the row index.
# NOTE(review): the value column is written with the default header "0" -
# confirm that this matches the expected submission format.
pd.DataFrame(y_pred_cat).to_csv("gortunov_predictions.csv", index=True)