<a href="https://colab.research.google.com/github/dkurbatovv/Python/blob/main/Random_Forest_and_Decision_Tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Packages / libraries
import os #provides functions for interacting with the operating system
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline

# To install scikit-learn, run "pip install numpy scipy scikit-learn" in the Anaconda terminal.

# Print NumPy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(formatter={'float_kind':'{:f}'.format})

# Default (width, height) of figures drawn through seaborn/matplotlib.
sns.set(rc={'figure.figsize':(8,6)})

# Date/time helpers, iteration utilities and warning control.
from pandas import to_datetime
import itertools
import warnings
import datetime
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score

In [None]:
# Load the raw churn dataset; the path is relative to the current working directory.
raw_data = pd.read_csv('churn raw data.csv')

In [None]:
# (rows, columns) of the raw dataset.
raw_data.shape

In [None]:
# Preview the first rows of the raw dataset.
raw_data.head()

In [None]:
# Report how many distinct values each column holds; for low-cardinality
# columns (fewer than 12 distinct values) also list the values themselves.
for col_name in raw_data.columns:
    distinct = np.unique(raw_data[col_name])
    n_distinct = len(distinct)
    if n_distinct >= 12:
        print('The number of values for feature {} :{}'.format(col_name, n_distinct))
        continue
    print('The number of values for feature {} :{} -- {}'.format(col_name, n_distinct, distinct))

In [None]:
# List every column label available in the raw dataset.
raw_data.columns

In [None]:
# Keep only the modelling columns: the candidate predictors plus `Exited`.
selected_columns = [
    'CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
    'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
    'Exited',
]
raw_data2 = raw_data[selected_columns]

In [None]:
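# Disabled exploratory pairplot of the selected columns, coloured by `Exited`; uncomment to render it.
#g = sns.pairplot(raw_data2, hue='Exited', diag_kws={'bw': 0.2})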

In [None]:
# Discrete columns whose distribution is plotted against `Exited`.
features = [
    'Geography',
    'Gender',
    'Age',
    'Tenure',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
]


In [None]:
# One count plot per feature, with the bars split by the `Exited` flag.
for feature_name in features:
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.countplot(data=raw_data2, x=feature_name, hue='Exited', palette='Set1', ax=ax)

In [None]:
# One-hot encode the categorical columns; all other columns pass through unchanged.
categorical_columns = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']
new_raw_data = pd.get_dummies(raw_data2, columns=categorical_columns)
new_raw_data.head()

In [None]:
# Continuous columns to rescale, and the scaler that will rescale them.
scaler_val = [
    'CreditScore',
    'Age',
    'Balance',
    'EstimatedSalary',
]
scaler = MinMaxScaler()

In [None]:
# Min-max scale the selected numeric columns and write them back into the frame.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split further down, so test rows influence the scaling.
# Consider fitting it on the training rows only.
new_raw_data[scaler_val] = scaler.fit_transform(new_raw_data[scaler_val])

In [None]:
# Preview the frame after encoding and scaling.
new_raw_data.head()

In [None]:
# Separate the predictors from the target and convert both to NumPy arrays.
feature_frame = new_raw_data.drop(columns='Exited')
X = feature_frame.values  # input features (attributes)
y = new_raw_data['Exited'].values  # target vector

# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)

In [None]:
# (rows, features) of the training matrix.
X_train.shape

In [None]:
# Shallow (depth-2), entropy-based decision tree fitted on the training split.
dt = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=2,
    random_state=1,
)
dt.fit(X_train, y_train)

In [None]:
import graphviz

# Render the fitted decision tree with Graphviz.
# Plain lists are passed because some scikit-learn releases reject a pandas
# Index / NumPy array for these arguments.
feature_names = new_raw_data.drop('Exited', axis=1).columns.tolist()

# export_graphviz expects the class names in ascending class order, which is
# the order of `dt.classes_`. `Series.unique()` returns labels in order of
# first appearance instead, which can swap the labels shown in the nodes.
class_names = [str(label) for label in dt.classes_]

dot_data = tree.export_graphviz(
    dt,
    out_file=None,  # return the DOT source as a string instead of writing a file
    feature_names=feature_names,
    class_names=class_names,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph

In [None]:
# Feature importances of the fitted decision tree, one entry per input column.
feature_columns = list(new_raw_data.drop('Exited', axis=1).columns)
importances = dt.feature_importances_

for column, importance in zip(feature_columns, importances):
    print('Importance of feature {}:, {:.3f}'.format(column, importance))

# Build the table in one step. The previous version grew `final_fi` with
# pd.concat inside a bare try/except, which hid real errors and appended
# duplicate rows every time the cell was re-run in the same kernel.
final_fi = pd.DataFrame({
    'Variable': feature_columns,
    'Feature Importance Score': importances,
})

# Ordering the data (reset_index() keeps each feature's original position
# in an `index` column)
final_fi = final_fi.sort_values('Feature Importance Score', ascending=False).reset_index()
final_fi

In [None]:
# Mean accuracy of the decision tree on each split.
dt_train_accuracy = dt.score(X_train, y_train)
dt_test_accuracy = dt.score(X_test, y_test)
print('The Training Accuracy is:', dt_train_accuracy)
print('The Testing Accuracy is:', dt_test_accuracy)

In [None]:
def plot_confusion_matrix(cm, classes=None, title='Confusion matrix'):
    """Draw a confusion matrix as a seaborn heatmap.

    The colour scale is fixed to the range 0..1. When `classes` is given it
    labels both axes and every cell is annotated with its value; otherwise a
    plain, unannotated heatmap is drawn.

    Parameters
    ----------
    cm : 2-D array-like
        Matrix to display.
    classes : sequence, optional
        Tick labels used on both axes.
    title : str
        Title placed above the heatmap.
    """
    heatmap_options = {'vmin': 0., 'vmax': 1.}
    if classes is not None:
        heatmap_options.update(
            xticklabels=classes,
            yticklabels=classes,
            annot=True,
            annot_kws={'size': 50},
        )
    sns.heatmap(cm, **heatmap_options)
    plt.title(title)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# Decision-tree predictions on the training split.
y_pred = dt.predict(X_train)

# Plotting Confusion Matrix: divide each row by its total so every row
# (true class) sums to 1.
cm = confusion_matrix(y_train, y_pred)
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = cm / row_totals
plt.figure()
plot_confusion_matrix(cm_norm, title='Training confusion', classes=dt.classes_)

In [None]:
# Baseline random forest: 100 entropy-based trees, other settings left at
# their defaults. The fixed seed makes the fitted forest and its scores
# reproducible across kernel restarts, like the other seeded models in this
# notebook.
rf = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=1337)
rf.fit(X_train, y_train)
prediction_test = rf.predict(X_test)

In [None]:
# Mean accuracy of the random forest on each split.
rf_train_accuracy = rf.score(X_train, y_train)
rf_test_accuracy = rf.score(X_test, y_test)
print('The Training Accuracy is:', rf_train_accuracy)
print('The Testing Accuracy is:', rf_test_accuracy)

In [None]:
# Row-normalised confusion matrix of the random forest on the test split.
cm = confusion_matrix(y_test, prediction_test)
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = cm / row_totals

plt.figure()
plot_confusion_matrix(cm_norm, classes=rf.classes_)

In [None]:
# Try to improve the model by sweeping the Cartesian product of a few
# hyper-parameter values.
# NOTE(review): the combinations are compared on the test split, so the best
# score reported here is an optimistic estimate.


from itertools import product

n_estimators = 100
max_depth = [None, 2, 3, 4, 5]
max_features = [1, 'sqrt', 'log2']


for f, d in product(max_features, max_depth):
    rf = RandomForestClassifier(n_estimators=n_estimators,
                                criterion='entropy',
                                max_features=f,
                                max_depth=d,
                                random_state=1337,
                                n_jobs=2)
    rf.fit(X_train, y_train)
    prediction_test = rf.predict(X=X_test)

    # Compute the accuracy once and reuse it for both the log line and the title.
    accuracy = accuracy_score(y_test, prediction_test)
    print('Classification accuracy on test set with max features = {} and max depth = {} : {:.3f}'.format(f, d, accuracy))

    cm = confusion_matrix(y_test, prediction_test)
    cm_norm = cm / cm.sum(axis=1)[:, np.newaxis]
    plt.figure(figsize=(12,10))
    # The title is now passed to the plot; it used to be built after plotting
    # and never used, so every figure carried the default title.
    title = 'Confusion matrix accuracy on test set with max features = {} and max_depth = {}: {:.3f}'.format(f, d, accuracy)
    plot_confusion_matrix(cm_norm, classes=rf.classes_, title=title)

In [None]:
#%pip install https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds/master/xgboost-1.6.0.dev0%2B1d468e20a4fff83f3149e99371b67e6b31f64152-py3-none-manylinux2014_x86_64.whl

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost

# 'hist' runs on any machine. The previous 'gpu_hist' value requires a
# CUDA-enabled XGBoost build and is deprecated in recent releases; to train
# on a GPU with XGBoost >= 2.0, additionally pass device='cuda'.
classifier = xgboost.XGBClassifier(tree_method='hist')

# Candidate values sampled by the randomized search.
params = {
    "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "max_depth": [2, 3, 4, 5, 6, 8, 10, 12, 15],
    "min_child_weight": [1, 3, 5, 7],
    "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
    "colsample_bytree": [0.3, 0.4, 0.5, 0.7]}

# 5 sampled configurations, each scored by 5-fold cross-validated ROC AUC.
# random_state fixes which configurations are sampled so reruns are comparable.
clf = RandomizedSearchCV(classifier, param_distributions=params, n_iter=5,
                         scoring='roc_auc', cv=5, verbose=3, random_state=1337)
# source: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html

# Fit on the training split only, so the held-out test rows never influence
# hyper-parameter selection.
clf.fit(X_train, y_train)

# best parameters
# clf.best_params_

In [None]:
# Hyper-parameter combination with the best cross-validated score.
clf.best_params_

In [None]:
# Show the full configuration of the best estimator found by the search.
print(clf.best_estimator_)

In [None]:
# Keep the best estimator from the search as the final model.
final_model = clf.best_estimator_

In [None]:
# Refit the chosen model on the training split only. Fitting on all of
# X / y would include the test rows, so no held-out data would remain
# for an unbiased evaluation.
final_model.fit(X_train, y_train)

In [None]:
# Predict labels for every row of X (training and test rows alike).
pred_xgboost = final_model.predict(X)

In [None]:
# Evaluate the final XGBoost model on the held-out split, like the
# random-forest cells above, instead of on all rows.
# NOTE(review): this is only an out-of-sample estimate if the model was not
# fitted on the test rows.
pred_xgboost_test = final_model.predict(X_test)
cm = confusion_matrix(y_test, pred_xgboost_test)
cm_norm = cm / cm.sum(axis=1)[:, np.newaxis]
plt.figure()  # must be called; a bare `plt.figure` opens no new figure
# Label the axes with the classes of the model being evaluated, not those of
# the unrelated random forest.
plot_confusion_matrix(cm_norm, classes=final_model.classes_, title='XGBoost test confusion')