In [2]:
import pandas as pd
import numpy as np
import random
from sklearn import metrics
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error, mean_absolute_error, r2_score

In [4]:
data = pd.read_csv('loan_data_exam.csv')
data

Unnamed: 0.1,Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,40997,35.0,female,High School,157123.0,9,MORTGAGE,6657.0,HOMEIMPROVEMENT,17.37,0.04,5.0,659.0,Yes,0.0
1,44208,26.0,male,Master,42906.0,3,RENT,12958.0,MEDICAL,11.59,0.30,3.0,645.0,No,1.0
2,9915,22.0,,Associate,58958.0,3,RENT,9000.0,DEBTCONSOLIDATION,7.88,0.15,2.0,652.0,Yes,0.0
3,17597,22.0,male,High School,82024.0,0,RENT,7500.0,VENTURE,9.76,0.09,3.0,566.0,Yes,0.0
4,21842,31.0,female,Master,78532.0,8,RENT,6000.0,VENTURE,6.99,0.08,10.0,703.0,No,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,3451,26.0,male,Associate,44825.0,0,MORTGAGE,5000.0,MEDICAL,6.03,0.11,2.0,599.0,No,0.0
9996,24181,30.0,female,Associate,55089.0,5,RENT,10000.0,PERSONAL,7.51,0.18,7.0,,No,0.0
9997,3909,23.0,male,High School,50147.0,1,RENT,4000.0,MEDICAL,7.68,0.08,4.0,672.0,No,0.0
9998,2030,,,Associate,27735.0,0,RENT,2200.0,MEDICAL,11.01,0.08,3.0,588.0,Yes,0.0


In [5]:
def drop_data(data:pd.DataFrame, columns:list):
  data_copy = data.copy()
  data_copy.drop(columns, axis=1, inplace=True)
  return data_copy

In [6]:
def drop_missing_data(data:pd.DataFrame, columns:list):
  data_copy = data.copy()
  data_copy.dropna(subset=columns, inplace=True)
  return data_copy

In [7]:
def simple_impute_data(data:pd.DataFrame, columns:list, strategy:str):
  imputer = SimpleImputer(strategy=strategy)
  data_copy = data.copy()

  for column in columns:
    data_copy[column] = imputer.fit_transform(data_copy[[column]])
  return data_copy

In [8]:
def label_data(data:pd.DataFrame, columns:list):
  encoder = LabelEncoder()
  data_copy = data.copy()

  for column in columns:
    data_copy[column] = encoder.fit_transform(data_copy[[column]].astype(str).values.ravel())

    if 'nan' in encoder.classes_:
      data_copy.loc[data_copy[column] == -1, column] = np.nan
  return data_copy

In [9]:
def knn_impute_data(data:pd.DataFrame, columns:list, n_neighbors:int, weights:str):
  imputer = KNNImputer(n_neighbors=n_neighbors, weights=weights)
  data_copy = data.copy()

  for column in columns:
    data_copy[column] = imputer.fit_transform(data_copy[column].to_numpy().reshape(-1, 1))
  return data_copy

In [10]:
data.sample(6)


Unnamed: 0.1,Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
5006,15332,24.0,male,Master,74877.0,1,RENT,15000.0,HOMEIMPROVEMENT,18.25,0.2,4.0,693.0,Yes,
9736,27995,27.0,male,,38607.0,6,RENT,6000.0,DEBTCONSOLIDATION,12.84,0.16,8.0,,No,1.0
165,1544,22.0,female,Bachelor,122176.0,0,RENT,1475.0,EDUCATION,5.79,0.01,3.0,699.0,No,0.0
1992,42086,23.0,,Associate,72997.0,0,RENT,,HOMEIMPROVEMENT,8.57,0.08,2.0,644.0,Yes,0.0
7610,33041,29.0,male,Master,152213.0,6,MORTGAGE,6226.0,VENTURE,9.69,0.04,8.0,669.0,Yes,0.0
8362,36118,34.0,male,Bachelor,118644.0,10,MORTGAGE,20000.0,VENTURE,9.62,0.17,9.0,664.0,No,0.0


In [11]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Unnamed: 0                      10000 non-null  int64  
 1   person_age                      8991 non-null   float64
 2   person_gender                   9184 non-null   object 
 3   person_education                9091 non-null   object 
 4   person_income                   10000 non-null  float64
 5   person_emp_exp                  10000 non-null  int64  
 6   person_home_ownership           10000 non-null  object 
 7   loan_amnt                       9553 non-null   float64
 8   loan_intent                     10000 non-null  object 
 9   loan_int_rate                   10000 non-null  float64
 10  loan_percent_income             10000 non-null  float64
 11  cb_person_cred_hist_length      10000 non-null  float64
 12  credit_score                    8

In [12]:
data.isna().sum()


Unnamed: 0                           0
person_age                        1009
person_gender                      816
person_education                   909
person_income                        0
person_emp_exp                       0
person_home_ownership                0
loan_amnt                          447
loan_intent                          0
loan_int_rate                        0
loan_percent_income                  0
cb_person_cred_hist_length           0
credit_score                      1042
previous_loan_defaults_on_file       0
loan_status                        837
dtype: int64

In [13]:
data = drop_data(data=data, columns=['Unnamed: 0', 'previous_loan_defaults_on_file'])
data = drop_missing_data(data=data, columns=['loan_amnt'])

data = simple_impute_data(data=data, columns=['person_age', 'credit_score'], strategy='median')
data = label_data(data=data, columns=['person_gender', 'person_education', 'loan_intent', 'person_home_ownership', 'loan_status'])
data = knn_impute_data(data=data, columns=['person_gender', 'person_education', 'loan_status'], n_neighbors=3, weights='uniform')


In [14]:
data.sample()


Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,loan_status
5722,25.0,1.0,3.0,68368.0,4,3,7000.0,3,14.22,0.1,4.0,589.0,0.0


In [15]:
data.isna().sum()


person_age                    0
person_gender                 0
person_education              0
person_income                 0
person_emp_exp                0
person_home_ownership         0
loan_amnt                     0
loan_intent                   0
loan_int_rate                 0
loan_percent_income           0
cb_person_cred_hist_length    0
credit_score                  0
loan_status                   0
dtype: int64

In [16]:
input_data = data.copy()
input_data = drop_data(data=data, columns=['loan_status'])

target_data = data.copy()['loan_status']

In [17]:
X_train, X_test, Y_train, Y_test = train_test_split(input_data, target_data, test_size=0.2)


In [20]:
#klasifikacija

In [18]:
model = XGBClassifier(max_depth=5, min_child_weight=1, n_estimators=500, n_jobs=-1, learning_rate=0.1)
model.fit(X_train, Y_train)

y_pred = model.predict(X_test)
print("Accuracy:", metrics.accuracy_score(Y_test, y_pred))
print(confusion_matrix(Y_test, y_pred))
print(classification_report(Y_test, y_pred))

Accuracy: 0.8100470957613815
[[1277   39   12]
 [ 148  271    1]
 [ 140   23    0]]
              precision    recall  f1-score   support

         0.0       0.82      0.96      0.88      1328
         1.0       0.81      0.65      0.72       420
         2.0       0.00      0.00      0.00       163

    accuracy                           0.81      1911
   macro avg       0.54      0.54      0.53      1911
weighted avg       0.75      0.81      0.77      1911



In [19]:
classifier = DecisionTreeClassifier(max_depth=3)
classifier = classifier.fit(X_train,Y_train)

y_pred = classifier.predict(X_test)
print("Accuracy:", metrics.accuracy_score(Y_test, y_pred))
print(confusion_matrix(Y_test, y_pred))
print(classification_report(Y_test, y_pred))

Accuracy: 0.7912087912087912
[[1304   24    0]
 [ 212  208    0]
 [ 143   20    0]]
              precision    recall  f1-score   support

         0.0       0.79      0.98      0.87      1328
         1.0       0.83      0.50      0.62       420
         2.0       0.00      0.00      0.00       163

    accuracy                           0.79      1911
   macro avg       0.54      0.49      0.50      1911
weighted avg       0.73      0.79      0.74      1911



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [21]:
#regredija

In [22]:
input_data = data.copy()
input_data = drop_data(data=data, columns=['loan_amnt'])

target_data = data.copy()['loan_amnt']
X_train, X_test, Y_train, Y_test = train_test_split(input_data, target_data, test_size=0.2)
model = XGBRegressor(objective ='reg:linear', colsample_bytree=0.3, learning_rate=0.1, max_depth=7, n_estimators=100)
model.fit(X_train, Y_train)

y_pred = model.predict(X_test)

  bst.update(dtrain, iteration=i, fobj=obj)


In [23]:
print("Mean Absolute Error : " + str(mean_absolute_error(y_pred, Y_test)))
print("R2 Score : " + str(r2_score(y_pred, Y_test)))
print("Mean Squared Error : " + str(mean_squared_error(y_pred, Y_test)))
print("Root Mean Squared Error : " + str(np.sqrt(mean_squared_error(y_pred, Y_test))))

Mean Absolute Error : 1355.1002708247418
R2 Score : 0.8361492426853079
Mean Squared Error : 4179187.2081298786
Root Mean Squared Error : 2044.3060456130042


In [24]:
LGBM = LGBMRegressor(n_estimators = 50)
LGBM.fit(X_train,Y_train)

y_pred = LGBM.predict(X_test)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000884 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 921
[LightGBM] [Info] Number of data points in the train set: 7642, number of used features: 12
[LightGBM] [Info] Start training from score 9628.730306


In [25]:
print("Mean Absolute Error : " + str(mean_absolute_error(y_pred, Y_test)))
print("R2 Score : " + str(r2_score(y_pred, Y_test)))
print("Mean Squared Error : " + str(mean_squared_error(y_pred, Y_test)))
print("Root Mean Squared Error : " + str(np.sqrt(mean_squared_error(y_pred, Y_test))))

Mean Absolute Error : 288.72183152639366
R2 Score : 0.9916076036491591
Mean Squared Error : 332346.683316839
Root Mean Squared Error : 576.495171980511
