### Import Dependencies

In [23]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

### Read Data

In [2]:
# Load the drug dataset and preview the first rows. The file is parsed as
# CSV even though its name ends in ".xls".
data_path = "drug200.xls"
my_data = pd.read_csv(data_path)
my_data.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


### Check Basic Info

In [3]:
# Print each column's dtype and non-null count to check for missing values.
my_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          200 non-null    int64  
 1   Sex          200 non-null    object 
 2   BP           200 non-null    object 
 3   Cholesterol  200 non-null    object 
 4   Na_to_K      200 non-null    float64
 5   Drug         200 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 9.5+ KB


In [4]:
# Feature matrix: every column except the target ("Drug"), as a NumPy array.
# Mixing numeric and string columns yields an object-dtype array.
feature_columns = ["Age", "Sex", "BP", "Cholesterol", "Na_to_K"]

X = my_data.loc[:, feature_columns].to_numpy()
X[:5]

array([[23, 'F', 'HIGH', 'HIGH', 25.355],
       [47, 'M', 'LOW', 'HIGH', 13.093],
       [47, 'M', 'LOW', 'HIGH', 10.114],
       [28, 'F', 'NORMAL', 'HIGH', 7.798],
       [61, 'F', 'LOW', 'HIGH', 18.043]], dtype=object)

### Encode Categorical Features as Integer Labels

In [7]:
# Replace the categorical columns of X (1 = Sex, 2 = BP, 3 = Cholesterol)
# with integer codes.
#
# Each encoder reads the raw strings from `my_data`, not from `X` itself, so
# the cell can be re-run safely. Transforming the already-encoded integers
# stored in `X` would raise a ValueError for unseen labels.
#
# LabelEncoder sorts its classes, so codes follow alphabetical order
# (e.g. BP: HIGH=0, LOW=1, NORMAL=2), not the order passed to fit().
sex = preprocessing.LabelEncoder()
sex.fit(['F', 'M'])
X[:, 1] = sex.transform(my_data["Sex"])

bp = preprocessing.LabelEncoder()
bp.fit(['LOW', 'NORMAL', 'HIGH'])
X[:, 2] = bp.transform(my_data["BP"])

# The spelling `cholestrol` is kept in case other cells reference this name.
cholestrol = preprocessing.LabelEncoder()
cholestrol.fit(['NORMAL', 'HIGH'])
X[:, 3] = cholestrol.transform(my_data["Cholesterol"])

X[0:5]

array([[23, 0, 0, 0, 25.355],
       [47, 1, 1, 0, 13.093],
       [47, 1, 1, 0, 10.114],
       [28, 0, 2, 0, 7.798],
       [61, 0, 1, 0, 18.043]], dtype=object)

In [9]:
# Target labels: the "Drug" column of the raw frame.

Y = my_data.loc[:, "Drug"]
Y.head()

0    drugY
1    drugC
2    drugC
3    drugX
4    drugY
Name: Drug, dtype: object

### Split Data into Training and Test Sets

In [11]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=3
)

# Report the shape of every partition, one per line.
for split in (X_train, X_test, Y_train, Y_test):
    print(split.shape)

(160, 5)
(40, 5)
(160,)
(40,)


### Modelling

In [20]:
# Fit a decision tree limited to depth 4, using entropy as the split criterion.
# random_state is fixed because scikit-learn permutes the candidate features
# at every split; without a seed, ties between equally good splits can make
# the fitted tree differ from one run to the next.
drugtree = DecisionTreeClassifier(criterion="entropy", max_depth=4, random_state=3)
drugtree.fit(X_train, Y_train)

### Prediction

In [22]:
# Predict labels for the held-out rows, then show the first five predictions
# next to the first five true labels for a quick visual comparison.
prediction = drugtree.predict(X_test)
print(prediction[:5])
print(Y_test.iloc[:5])

['drugY' 'drugX' 'drugX' 'drugX' 'drugX']
40     drugY
51     drugX
139    drugX
197    drugX
170    drugX
Name: Drug, dtype: object


### Evaluation

In [25]:
# Fraction of held-out samples whose predicted label equals the true label.
score = accuracy_score(y_true=Y_test, y_pred=prediction)
print(f"Decision Tree Accuracy score is : {score}")

Decision Tree Accuracy score is : 1.0
