<a href="https://colab.research.google.com/github/dkurbatovv/Python/blob/main/Bank_Loan_(LogRegression_and_RandomForest_)_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#%pip install https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds/master/xgboost-1.6.0.dev0%2B1d468e20a4fff83f3149e99371b67e6b31f64152-py3-none-manylinux2014_x86_64.whl


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score, confusion_matrix, accuracy_score, log_loss, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression

In [None]:
# Load the loan records; the relative path is resolved against the
# notebook's working directory.
csv_path = 'lending_club_loan_dataset.csv'
df = pd.read_csv(csv_path)

In [None]:
# Remove the 'id' column (presumably a row identifier with no predictive
# value — confirm). errors='ignore' keeps the cell safe to re-run after the
# column is already gone.
df = df.drop(columns = 'id', errors = 'ignore')

# The preview must be the final expression of the cell: Jupyter only renders
# the last expression, so a head() call placed before an assignment is lost.
df.head()


In [None]:
# Discard every row that is missing either 'dti' or 'home_ownership',
# then print the column summary of what remains.
required_columns = ['dti', 'home_ownership']
df = df.dropna(subset = required_columns)
df.info()

In [None]:
# Correlation heatmap over the numeric/boolean columns only.
# The frame is restricted explicitly because pandas >= 2.0 no longer skips
# non-numeric columns in DataFrame.corr() and raises instead; columns such as
# 'grade' or 'purpose' appear to hold text (they are one-hot encoded later).
plt.figure(figsize=(12,10))
numeric_df = df.select_dtypes(include = ['number', 'bool'])
sns.heatmap(numeric_df.corr(), annot = True, cmap = 'seismic')

In [None]:
# Remove the 'last_major_derog_none' column altogether. The alternative of
# keeping the column and dropping only the rows where it is missing was
# considered and left disabled.
df = df.drop(columns = ['last_major_derog_none'])

In [None]:
# Bar chart of how many rows fall into each value of the target 'bad_loan'.
fig, ax = plt.subplots(figsize = (12, 10))
sns.countplot(data = df, x = 'bad_loan', ax = ax)

In [None]:
# Columns whose value counts are plotted, split by the 'bad_loan' target.
features = [
    'grade',
    'short_emp',
    'emp_length_num',
    'home_ownership',
    'purpose',
    'term',
    'last_delinq_none',
]

# One figure per column; plt.show() flushes each figure before the next
# one is created so they render separately.
for feature in features:
    plt.figure(figsize = (12, 6))
    sns.countplot(data = df, x = feature, hue = 'bad_loan', palette = 'Set3')
    plt.show()

In [None]:
# Report the number of distinct values per column; the values themselves are
# listed only when there are fewer than 12 of them.
for column in df:
    distinct = np.unique(df[column])
    distinct_count = len(distinct)
    if distinct_count >= 12:
        print('The number of values for feature {} : {}'.format(column, distinct_count))
        continue
    print('The number of values for feature {} : {} -- :{}'.format(column, distinct_count, distinct))

In [None]:
# One-hot encode the listed columns; every other column is carried over as is.
# Note that 'emp_length_num' is expanded into indicator columns as well.
dummy_columns = ['grade', 'home_ownership', 'term', 'purpose', 'emp_length_num']
new_df = pd.get_dummies(df, columns = dummy_columns)

In [None]:
# Feature matrix: every column except the target, converted to a float array.
# The explicit dtype matters on pandas >= 2.0, where get_dummies produces bool
# indicator columns; mixing those with numeric columns makes a plain `.values`
# return an object-dtype array, which downstream estimators may reject.
X = new_df.drop(columns = 'bad_loan').to_numpy(dtype = float)

# Target vector: the 'bad_loan' column.
y = new_df['bad_loan'].to_numpy()

In [None]:
# 80/20 train/test partition; the fixed random_state makes the split repeatable.
split_parts = train_test_split(
    X, y,
    train_size = 0.8,
    test_size = 0.2,
    random_state = 15,
)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Logistic-regression baseline with scikit-learn's default settings.
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Training-set predictions, computed once (the earlier duplicate call whose
# result was thrown away has been removed). y_pred is also what the
# confusion-matrix cell for this model reads.
y_pred = lr.predict(X_train)

# Class-membership probabilities for the training rows.
pred_proba = lr.predict_proba(X_train)


print('The training accuracy is ', lr.score(X_train, y_train))
print('The testing accuracy is ', lr.score(X_test, y_test))

# Per-class precision / recall / F1 — computed on the training split.
print(classification_report(y_train, y_pred))

In [None]:
def plot_confusion_matrix(cm, classes=None, title='Confusion matrix'):
    """Draw ``cm`` as a seaborn heatmap on the current matplotlib axes.

    Parameters
    ----------
    cm : array-like
        Matrix handed directly to ``sns.heatmap``. The colour scale is pinned
        to the range [0, 1], so the callers pass a normalised matrix.
    classes : sequence, optional
        Labels used for both the x and y ticks. When provided, the heatmap
        uses the "YlGnBu" colormap and each cell is annotated with its value
        (font size 50). When ``None``, seaborn's default colormap and tick
        labels are used and cells are not annotated.
    title : str
        Title placed above the plot.

    Returns
    -------
    None
        The function only draws; it does not create or show a figure.
    """
    # Options shared by both variants: fixed colour range.
    heatmap_options = {'vmin': 0., 'vmax': 1.}
    if classes is not None:
        # Labelled variant: named ticks plus large in-cell annotations.
        heatmap_options.update(
            cmap="YlGnBu",
            xticklabels=classes,
            yticklabels=classes,
            annot=True,
            annot_kws={'size':50},
        )
    sns.heatmap(cm, **heatmap_options)

    plt.title(title)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Confusion matrix of y_pred against the training labels, with each row
# divided by its own total so that every row sums to 1.
cm = confusion_matrix(y_train, y_pred)
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = cm / row_totals

plot_confusion_matrix(cm_norm, title='Confusion matrix', classes = lr.classes_)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Forest of 20 trees split on entropy. random_state pins the bootstrap
# sampling and per-split feature selection, so the accuracies reported for
# this model are reproducible across kernel restarts.
rf = RandomForestClassifier(criterion='entropy', n_estimators = 20, random_state = 15)

In [None]:
# Train the forest on the training split and label the held-out rows;
# fit() returns the estimator itself, so the two steps can be chained.
rf_prediction_test = rf.fit(X_train, y_train).predict(X_test)

In [None]:
# Mean accuracy of the random forest on each split.
rf_train_accuracy = rf.score(X_train, y_train)
rf_test_accuracy = rf.score(X_test, y_test)
print('The training accuracy is ', rf_train_accuracy)
print('The testing accuracy is ', rf_test_accuracy)

In [None]:
# Confusion matrix of the forest's test-set predictions, normalised so that
# each row sums to 1, drawn on a fresh figure.
cm = confusion_matrix(y_test, rf_prediction_test)
row_totals = cm.sum(axis = 1, keepdims = True)
cm_norm = cm / row_totals
plt.figure()
plot_confusion_matrix(cm_norm, classes = rf.classes_)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost

In [None]:
# Base estimator for the hyper-parameter search.
# NOTE(review): 'gpu_hist' presumably requires a CUDA-enabled XGBoost build and
# a GPU runtime; newer XGBoost releases appear to favour device='cuda' with
# tree_method='hist' — confirm against the installed version.
classifier = xgboost.XGBClassifier(tree_method='gpu_hist')

# Candidate values for each hyper-parameter explored by the search.
params = dict(
    learning_rate=[0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    max_depth=[2, 3, 4, 5, 6, 8, 10, 12, 15],
    min_child_weight=[1, 3, 5, 7],
    gamma=[0.0, 0.1, 0.2, 0.3, 0.4],
    colsample_bytree=[0.3, 0.4, 0.5, 0.7],
)

In [None]:
# Randomized search: 5 sampled parameter combinations, each scored by
# 5-fold cross-validated ROC AUC. random_state fixes which combinations are
# drawn, so the selected parameters are reproducible between runs.
clf = RandomizedSearchCV(
    classifier,
    param_distributions = params,
    n_iter = 5,
    cv = 5,
    verbose = 2,
    scoring = 'roc_auc',
    random_state = 15,
)

In [None]:
# Run the search on the training split only, so the held-out test rows play
# no part in choosing hyper-parameters.
clf.fit(X_train, y_train)

In [None]:
# Display the estimator refitted with the best parameter combination found.
clf.best_estimator_

In [None]:
# Display the parameter combination with the best cross-validated score.
clf.best_params_

In [None]:
# Build the final model from whatever the search actually selected instead of
# values copied by hand from an earlier run, which go stale whenever the
# search samples different candidates.
final_model = xgboost.XGBClassifier(tree_method='gpu_hist', **clf.best_params_)

In [None]:
final_model.fit(X, y)

In [None]:
pred_xgboost = final_model.predict(X)

In [None]:
cm = confusion_matrix(y, pred_xgboost)
cm_norm = cm/cm.sum(axis=1)[:, np.newaxis]
plt.figure()
plot_confusion_matrix(cm_norm, classes=rf.classes_)

In [None]:
print('The testing accuracy is ', final_model.score(X, y))