<a href="https://colab.research.google.com/github/dkurbatovv/Python/blob/main/Random_Forest_Credit_Risk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Packages / libraries
import os #provides functions for interacting with the operating system
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline

# To install scikit-learn, run "pip install numpy scipy scikit-learn" in the Anaconda terminal

# Print NumPy floats in fixed-point notation ('{:f}') instead of scientific notation
np.set_printoptions(formatter={'float_kind':'{:f}'.format})

# Set the default figure size for seaborn/matplotlib plots to 8x6 (matplotlib figsize units)
sns.set(rc={'figure.figsize':(8,6)})

# Datetime lib
from pandas import to_datetime
import itertools
import warnings
import datetime
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation/future warnings raised by the
# libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score

In [None]:
# Load the customer data; the relative path is resolved against the current working directory.
df = pd.read_csv('customer_data.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Impute missing `fea_2` values with the column mean.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `df['fea_2']`: that form mutates an
# intermediate object, is deprecated in pandas 2.x, and leaves `df`
# unchanged once Copy-on-Write is active.
df['fea_2'] = df['fea_2'].fillna(df['fea_2'].mean())

In [None]:
# Preview the first rows again, now that `fea_2` has been imputed.
df.head()

In [None]:
# Re-count missing values per column to check the `fea_2` imputation.
df.isna().sum()

In [None]:
# Copy of the data without the `id` column
# (presumably a row identifier with no predictive value — TODO confirm).
new_df = df.drop(columns='id')

In [None]:
# Separate the target column (`label`) from the predictor columns.
# NOTE(review): the predictors are taken from `df`, not `new_df`, so the
# `id` column is still part of X — confirm whether that is intended.
y = df['label']
X = df.drop(columns='label')

print(X.shape, y.shape)

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Randomly duplicate minority-class rows until the minority/majority ratio
# reaches 0.7 (float `sampling_strategy`, binary targets only).
#  * `sampling_strategy` is passed by keyword: current imbalanced-learn
#    releases reject it as a positional argument.
#  * The sampler has its own name so it no longer shadows the `os` module
#    imported in the first cell.
#  * `random_state` makes the resampled rows reproducible across runs.
oversampler = RandomOverSampler(sampling_strategy=0.7, random_state=10)
X_train_res, y_train_res = oversampler.fit_resample(X, y)

In [None]:
# Hold out 10% of the ORIGINAL rows as the test set before any resampling.
# `stratify=y` keeps the original class ratio in both partitions.
# `X_train_res` / `y_train_res` from the previous cell are deliberately not
# used: splitting already-oversampled data puts copies of the same minority
# rows in both train and test, which inflates the reported test accuracy.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=10, stratify=y
)

# Oversample the training portion only (minority/majority ratio -> 0.7);
# the test set keeps its natural class balance.
X_train, y_train = RandomOverSampler(
    sampling_strategy=0.7, random_state=10
).fit_resample(X_train, y_train)

In [None]:
# Shallow decision tree (at most two levels of splits) using entropy as the
# split criterion. `random_state` is fixed because scikit-learn permutes the
# candidate features at every split, so ties between equally good splits
# would otherwise be broken differently from run to run.
dt = DecisionTreeClassifier(criterion='entropy', max_depth=2, random_state=10)

In [None]:
# Fit the decision tree on the training split; as the cell's last expression
# the fitted estimator is echoed as output.
dt.fit(X_train, y_train)

In [None]:
import graphviz

# Render the fitted decision tree as a Graphviz diagram.
#  * feature_names is read from X_train, the data `dt` was fitted on, so the
#    names always line up with the tree's feature indices (assumes X_train is
#    still a DataFrame at this point). A plain list is passed because some
#    scikit-learn releases reject a pandas Index here.
#  * class_names must follow ascending class order, i.e. `dt.classes_`.
#    `df['label'].unique()` returns labels in order of first appearance and
#    can therefore swap the class names shown in the nodes.
dot_data = tree.export_graphviz(
    dt,
    out_file=None,
    feature_names=list(X_train.columns),
    class_names=[str(cls) for cls in dt.classes_],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph

In [None]:
# Feature importances of the fitted decision tree, highest first.
#  * Names are read from X_train, the data `dt` was fitted on, so that
#    feature_importances_[i] is paired with the column it belongs to. Taking
#    the names from a frame with a different column set shifts every label.
#  * The table is built in one step. The earlier try/except + concat loop
#    relied on `final_fi` being undefined and appended duplicate rows every
#    time the cell was re-run.
feature_names = list(X_train.columns)
importances = dt.feature_importances_

for column, score in zip(feature_names, importances):
    print('Importance of feature {}:, {:.3f}'.format(column, score))

final_fi = pd.DataFrame({'Variable': feature_names,
                         'Feature Importance Score': importances})

# Ordering the data; reset_index() keeps the original feature position in an
# 'index' column, as before.
final_fi = final_fi.sort_values('Feature Importance Score', ascending=False).reset_index()
final_fi

In [None]:
# Accuracy of the decision tree on the training and the held-out partition.
dt_train_accuracy = dt.score(X_train, y_train)
dt_test_accuracy = dt.score(X_test, y_test)

print('The training accuracy is ', dt_train_accuracy)
print('The testing accuracy is ', dt_test_accuracy)

In [None]:
# Confusion Matrix function

def plot_confusion_matrix(cm, classes=None, title='Confusion matrix'):
    """Draw a confusion matrix as a seaborn heatmap on the current figure.

    The colour scale is pinned to the range 0-1, so `cm` is presumably
    expected to be normalised (e.g. row-wise rates) — TODO confirm.

    Args:
        cm: Matrix of values handed to ``sns.heatmap``.
        classes: Optional class labels. When given they are used as tick
            labels on both axes and every cell is annotated with its value;
            when omitted the heatmap is drawn without annotations.
        title: Text shown above the plot.
    """
    heatmap_options = {'vmin': 0., 'vmax': 1.}
    if classes is not None:
        # Labelled variant: tick labels plus large in-cell annotations.
        heatmap_options.update(
            xticklabels=classes,
            yticklabels=classes,
            annot=True,
            annot_kws={'size': 50},
        )
    sns.heatmap(cm, **heatmap_options)

    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.title(title)

In [None]:
# Decision-tree predictions on the training data.
y_pred = dt.predict(X_train)

# Confusion matrix, then each row divided by its total so every row holds
# the per-true-class rates.
cm = confusion_matrix(y_train, y_pred)
cm_norm = cm / cm.sum(axis=1, keepdims=True)

# Plotting Confusion Matrix
plt.figure()
plot_confusion_matrix(cm_norm, title='Training confusion', classes=dt.classes_)

In [None]:
# Raw (un-normalised) confusion-matrix counts for the decision tree on the
# training data. The stray mid-cell `y_pred` expression was removed: only a
# cell's last expression is echoed, so it never displayed anything.
y_pred = dt.predict(X_train)
confusion_matrix(y_train, y_pred)

In [None]:
# Random forest of 100 entropy-based trees, fitted on the training split.
# `random_state` is fixed because the bootstrap samples and per-split feature
# subsets are drawn at random; without a seed the scores reported below
# change on every run.
rf = RandomForestClassifier(criterion='entropy', n_estimators=100, random_state=10)
rf.fit(X_train, y_train)




In [None]:
# Accuracy of the random forest on the training and the held-out partition.
rf_train_accuracy = rf.score(X_train, y_train)
rf_test_accuracy = rf.score(X_test, y_test)

print('The training accuracy is ', rf_train_accuracy)
print('The testing accuracy is ', rf_test_accuracy)

In [None]:
# Random-forest predictions on the held-out data.
rf_pred = rf.predict(X_test)

# Confusion matrix with each row divided by its total (per-true-class rates),
# drawn with the default title.
cm = confusion_matrix(y_test, rf_pred)
cm_norm = cm / cm.sum(axis=1, keepdims=True)
plot_confusion_matrix(cm_norm, classes=rf.classes_)