### Reading the csv data file and creating a data-frame called drug

In [17]:
import boto3
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm

# Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'gabriel-predictive-analytics'
bucket = s3.Bucket(bucket_name)

# Defining the file to be read from s3 bucket
file_key = "drug200.csv"

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# Reading the csv file
drug = pd.read_csv(file_content_stream)
drug.head(1) 

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY


In [18]:
## Frequency table of the target variable
drug['Drug'].value_counts()

DrugY    91
drugX    54
drugA    23
drugC    16
drugB    16
Name: Drug, dtype: int64

In [19]:
## Changing labels to numbers
drug['Drug_numb'] = np.where(drug['Drug'] == 'drugA', 1,
                        np.where(drug['Drug'] == 'drugB', 2,
                        np.where(drug['Drug'] == 'drugC', 3,
                        np.where(drug['Drug'] == 'drugX', 4, 5))))
drug.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug,Drug_numb
0,23,F,HIGH,HIGH,25.355,DrugY,5
1,47,M,LOW,HIGH,13.093,drugC,3
2,47,M,LOW,HIGH,10.114,drugC,3
3,28,F,NORMAL,HIGH,7.798,drugX,4
4,61,F,LOW,HIGH,18.043,DrugY,5


In [20]:
drug['BP'].value_counts()

HIGH      77
LOW       64
NORMAL    59
Name: BP, dtype: int64

In [21]:
# Sex to dummy
drug['Sex_numb'] = np.where(drug['Sex'] == 'F', 0, 1)

# BP to dummies
drug = pd.concat([drug, pd.get_dummies(drug['BP'])], axis = 1)
drug = drug.rename(columns = {'HIGH': 'BP_HIGH', 'LOW': 'BP_LOW', 'NORMAL': 'BP_NORMAL'})

# Dummies of Cholesterol
drug = pd.concat([drug, pd.get_dummies(drug['Cholesterol'])], axis = 1)
drug = drug.rename(columns = {'HIGH': 'Cho_HIGH', 'LOW': 'Cho_LOW', 'NORMAL': 'Cho_NORMAL'})

drug.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug,Drug_numb,Sex_numb,BP_HIGH,BP_LOW,BP_NORMAL,Cho_HIGH,Cho_NORMAL
0,23,F,HIGH,HIGH,25.355,DrugY,5,0,1,0,0,1,0
1,47,M,LOW,HIGH,13.093,drugC,3,1,0,1,0,1,0
2,47,M,LOW,HIGH,10.114,drugC,3,1,0,1,0,1,0
3,28,F,NORMAL,HIGH,7.798,drugX,4,0,0,0,1,1,0
4,61,F,LOW,HIGH,18.043,DrugY,5,0,0,1,0,1,0


In [22]:
# Defining the input and target variables
X = drug[['Age', 'Sex_numb', 'BP_HIGH', 'BP_LOW', 'Cho_HIGH', 'Na_to_K']]
Y = drug['Drug_numb']

# Splitting the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

In [23]:
# Building Random Forest Classifier model
one_vs_all_RF = OneVsRestClassifier(estimator = RandomForestClassifier(n_estimators = 500, max_depth = 3)).fit(X_train, Y_train)

# Predicting on test dataset
one_vs_all_RF_pred = one_vs_all_RF.predict(X_test)

# Confusion Matrix
print(classification_report(Y_test, one_vs_all_RF_pred))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         3
           3       1.00      0.33      0.50         3
           4       0.85      1.00      0.92        11
           5       1.00      1.00      1.00        18

    accuracy                           0.95        40
   macro avg       0.97      0.87      0.88        40
weighted avg       0.96      0.95      0.94        40



In [24]:
# Building Random Forest Classifier model
one_vs_all_ADA = OneVsRestClassifier(estimator = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3), n_estimators = 500, learning_rate = 0.01)).fit(X_train, Y_train)

# Predicting on test dataset
one_vs_all_ADA_pred = one_vs_all_ADA.predict_proba(X_test)
one_vs_all_ADA_pred = np.argmax(one_vs_all_ADA_pred, axis = 1) + 1

# Confusion Matrix
print(classification_report(Y_test, one_vs_all_ADA_pred))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         3
           3       1.00      1.00      1.00         3
           4       1.00      1.00      1.00        11
           5       1.00      1.00      1.00        18

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40



### Using the results from part 5 and 6, I would use the AdaBoost Classifier model to predict the drug type because it had 100% in all scores and classes.