<a href="https://colab.research.google.com/github/dkurbatovv/Python/blob/main/Random_Forest_Drug.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Packages / libraries
import os #provides functions for interacting with the operating system
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline

# To install sklearn type "pip install numpy scipy scikit-learn" to the anaconda terminal

# To change scientific numbers to float
np.set_printoptions(formatter={'float_kind':'{:f}'.format})

# Increases the size of sns plots
sns.set(rc={'figure.figsize':(8,6)})

# Datetime lib
from pandas import to_datetime
import itertools
import warnings
import datetime
warnings.filterwarnings('ignore')

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score

# pip install graphviz
# conda install python-graphviz


In [None]:
# Load the drug dataset; the relative path is resolved against the kernel's
# current working directory.
df = pd.read_csv('drug200.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Print the column dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Report how many distinct values each column holds; the values themselves are
# listed only for low-cardinality columns (fewer than 12 distinct values).
for column in df.columns:
    distinct = np.unique(df[column])
    count = len(distinct)

    if count >= 12:
        print('The number of values for feature {} :{}'.format(column, count))
    else:
        print('The number of values for feature {} :{} -- {}'.format(column, count, distinct))

In [None]:
# Pairwise plots of the numeric columns, coloured by the prescribed drug.
# `bw_method` is the current name of the KDE bandwidth keyword; the old `bw`
# spelling is deprecated (seaborn >= 0.11 only warns and forwards it to
# `bw_method`, and announces that it will become an error).
sns.pairplot(data=df, hue='Drug', diag_kws={'bw_method': 0.2})

In [None]:
# Show the first rows again before the categorical columns are encoded.
df.head()

In [None]:
# Encode Sex as 0 (F) / 1 (M).
# The whole column is reassigned instead of using chained indexing
# (`df['Sex'][mask] = ...`), which raises SettingWithCopyWarning and leaves
# `df` unmodified under pandas Copy-on-Write.
# Values that are not in the mapping pass through unchanged, so re-running the
# cell after the column is already encoded is a no-op.
sex_codes = {'F': 0, 'M': 1}
df['Sex'] = df['Sex'].map(lambda value: sex_codes.get(value, value))

In [None]:
# Encode Cholesterol as 0 (NORMAL) / 1 (HIGH).
# The whole column is reassigned instead of using chained indexing
# (`df['Cholesterol'][mask] = ...`), which raises SettingWithCopyWarning and
# leaves `df` unmodified under pandas Copy-on-Write.
# Values that are not in the mapping pass through unchanged, so re-running the
# cell after the column is already encoded is a no-op.
cholesterol_codes = {'NORMAL': 0, 'HIGH': 1}
df['Cholesterol'] = df['Cholesterol'].map(
    lambda value: cholesterol_codes.get(value, value)
)

In [None]:
# One-hot encode BP: the column is replaced by one indicator column per level.
# NOTE(review): this rebinds `df`, so running the cell a second time fails
# because the BP column no longer exists.
df = pd.get_dummies(df, columns = ['BP'])

In [None]:
# Inspect the frame after one-hot encoding BP.
df.head()

In [None]:
# Encode the target label as an integer class id.
# The whole column is reassigned instead of using chained indexing
# (`df['Drug'][mask] = ...`), which raises SettingWithCopyWarning and leaves
# `df` unmodified under pandas Copy-on-Write.
# Keys are matched exactly as written here (note the capital D in 'DrugY').
# Values that are not in the mapping pass through unchanged, so re-running the
# cell after the column is already encoded is a no-op.
drug_codes = {'drugA': 0, 'drugB': 1, 'drugC': 2, 'drugX': 3, 'DrugY': 4}
df['Drug'] = df['Drug'].map(lambda value: drug_codes.get(value, value))


In [None]:
# Inspect the frame after encoding the Drug label.
df.head()

In [None]:
# Cast every column to float, then put the class label back on an integer
# dtype, and print the resulting schema.
df = df.astype(float).astype({'Drug': int})
df.info()

In [None]:
# Inspect the fully numeric frame.
df.head()


In [None]:
# Separate the target column (Drug) from the feature matrix.
y = df['Drug']
X = df.drop(columns='Drug')

print(X.shape, y.shape)

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

# Baseline model: a single Gini decision tree capped at depth 7.
# The tree is seeded too: scikit-learn permutes the candidate features at each
# split, so an unseeded tree can break ties differently from run to run.
dt = DecisionTreeClassifier(criterion='gini', max_depth=7, random_state=10)
dt.fit(X_train, y_train)

# In-sample predictions; the training confusion matrix cell reads `y_pred`.
y_pred = dt.predict(X_train)

print('The training accuracy is ', dt.score(X_train, y_train))
print('The testing accuracy is ', dt.score(X_test, y_test))

In [None]:
import graphviz

# Render the fitted decision tree with Graphviz.
# `class_names` must be given in ascending class order, i.e. the order of
# `dt.classes_`. `df['Drug'].unique()` returns labels in order of first
# appearance in the data, which can attach the wrong label to the leaves.
# Feature names are taken from the frame the tree was actually fitted on.
dot_data = tree.export_graphviz(
    dt,
    out_file=None,
    feature_names=list(X_train.columns),
    class_names=[str(label) for label in dt.classes_],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph

In [None]:
# Feature importances of the fitted decision tree, largest first.
# The table is built in a single step, so re-running the cell never appends to
# a `final_fi` left over in the kernel, and no error is swallowed by a bare
# `except`.
feature_names = df.drop('Drug', axis=1).columns
importances = dt.feature_importances_

for column, importance in zip(feature_names, importances):
    print('Importance of feature {}:, {:.3f}'.format(column, importance))

final_fi = pd.DataFrame({'Variable': feature_names,
                         'Feature Importance Score': importances})

# reset_index() (without drop=True) keeps each feature's pre-sort position in
# an "index" column.
final_fi = final_fi.sort_values('Feature Importance Score', ascending=False).reset_index()
final_fi

In [None]:
def plot_confusion_matrix(cm, classes=None, title='Confusion Matrix'):
    """Draw a confusion matrix as a seaborn heatmap.

    Args:
        cm: Matrix to plot. The colour scale is fixed to the range [0, 1].
        classes: Only compared against None: when it is given, each cell is
            annotated with its value. The labels themselves are not drawn.
        title: Title placed above the heatmap.
    """
    heatmap_options = {'vmin': 0., 'vmax': 1.0}
    if classes is not None:
        heatmap_options['annot'] = True
    sns.heatmap(cm, **heatmap_options)

    plt.title(title)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Row-normalised confusion matrix of the decision tree on the training split
# (`y_pred` holds the tree's predictions for X_train).
cm = confusion_matrix(y_train, y_pred)
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = cm / row_totals
plt.figure()
plot_confusion_matrix(cm_norm, classes=dt.classes_)

In [None]:
# Random forest of 200 Gini trees. Bootstrapping and feature sub-sampling are
# random, so the forest is seeded to make the reported scores reproducible.
rf = RandomForestClassifier(criterion='gini', n_estimators=200, random_state=10)
rf.fit(X_train, y_train)

In [None]:
# Mean accuracy of the random forest on both splits (label typo "accyracy" fixed).
print('The training accuracy is ', rf.score(X_train, y_train))
print('The testing accuracy is ', rf.score(X_test, y_test))

In [None]:
# Row-normalised confusion matrix of the random forest on the held-out split.
rf_pred = rf.predict(X_test)

cm = confusion_matrix(y_test, rf_pred)
cm_norm = cm / cm.sum(axis=1)[:, np.newaxis]
plt.figure()
# Pass the classes of the model being evaluated (rf), not the decision tree's.
plot_confusion_matrix(cm_norm, classes=rf.classes_)



In [None]:
from itertools import product



In [None]:
# Settings for the random-forest sweep: a fixed forest size, plus the
# max_depth and max_features candidates that are combined with each other.
n_estimators = 100
# None leaves the depth unrestricted.
max_depth = [None, 2, 3, 4, 5]
# An integer is an absolute feature count; the strings select a scikit-learn heuristic.
max_features = [1, 'sqrt', 'log2']

In [None]:
# Fit one entropy-based forest per (max_features, max_depth) combination,
# print its test accuracy and plot its row-normalised confusion matrix.
# `rf` is rebound on every iteration, so after the loop it refers to the last
# combination that was fitted.
for max_feat, depth in product(max_features, max_depth):
    rf = RandomForestClassifier(
        criterion='entropy',
        max_depth=depth,
        max_features=max_feat,
        n_estimators=n_estimators,
        n_jobs=2,
        random_state=20,
    )
    rf.fit(X_train, y_train)
    rf_predict = rf.predict(X_test)

    # Computed once and reused for both the log line and the plot title.
    accuracy = accuracy_score(y_test, rf_predict)
    print('Classification accuracy on test set with max features = {} and max_depth = {}: {:.3f}'.format(max_feat, depth, accuracy))

    cm = confusion_matrix(y_test, rf_predict)
    cm_norm = cm / cm.sum(axis=1, keepdims=True)
    plt.figure()
    plot_confusion_matrix(
        cm_norm,
        classes=rf.classes_,
        title='Confusion matrix accuracy on test set with max features = {} and max_depth = {}: {:.3f}'.format(max_feat, depth, accuracy),
    )

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost



In [None]:
# Candidate values sampled by the randomized hyper-parameter search.
params = {
    'learning_rate': [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    'max_depth': [2, 3, 4, 5, 6, 8, 10, 12, 15],
    'min_child_weight': [1, 3, 5, 7],
    'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
    'colsample_bytree': [0.3, 0.4, 0.5, 0.7],
}

# Base estimator handed to the search.
# NOTE(review): 'gpu_hist' presumably needs a GPU-enabled XGBoost build and a
# CUDA device — confirm against the runtime this notebook targets.
classifier = xgboost.XGBClassifier(tree_method='gpu_hist')



In [None]:
# Randomized search over `params`: 5 sampled candidates, 5-fold CV.
# The target has five classes, so the binary-only 'roc_auc' scorer cannot score
# it; every candidate would come back as NaN and the "best" parameters would be
# arbitrary. 'roc_auc_ovr' is the one-vs-rest multiclass variant.
# random_state fixes which candidates are sampled.
clf = RandomizedSearchCV(
    classifier,
    param_distributions=params,
    n_iter=5,
    scoring='roc_auc_ovr',
    cv=5,
    verbose=3,
    random_state=10,
)

In [None]:
#%pip install https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds/master/xgboost-1.6.0.dev0%2B1d468e20a4fff83f3149e99371b67e6b31f64152-py3-none-manylinux2014_x86_64.whl

In [None]:
# Run the search on the full dataset (X, y) rather than on the training split only.
clf.fit(X, y)

In [None]:
# Parameter combination selected by the search.
clf.best_params_

In [None]:
# Keep a handle on the estimator selected by the search.
best = clf.best_estimator_

In [None]:
# Final XGBoost configuration. The values are written out literally here and
# are not read from `clf.best_params_`.
final_params = {
    'colsample_bytree': 0.4,
    'gamma': 0.3,
    'learning_rate': 0.3,
    'max_depth': 12,
    'min_child_weight': 3,
    'objective': 'multi:softprob',
    'tree_method': 'gpu_hist',
}
final_model = xgboost.XGBClassifier(**final_params)

In [None]:
# Fit the final model on the full dataset (X, y); no rows are held out here.
final_model.fit(X, y)

In [None]:
# Predictions on the same rows the model was fitted on, so this matrix is an
# in-sample view and not an estimate of generalisation.
pred_xgboost = final_model.predict(X)

# Confusion Matrix
cm = confusion_matrix(y, pred_xgboost)
cm_norm = cm/cm.sum(axis=1)[:, np.newaxis]
plt.figure()
# Use the classes of the model being evaluated; `rf` is whatever forest the
# earlier sweep happened to fit last.
plot_confusion_matrix(cm_norm, classes=final_model.classes_)