In [None]:
# import modules
import numpy as np 
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Read the comma-separated drug200 dataset from its remote URL into a DataFrame,
# then preview the first five rows.
drug_csv_url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%203/data/drug200.csv'
my_data = pd.read_csv(drug_csv_url, delimiter=',')
print(my_data.head())

In [None]:
# Report the dataset dimensions as a (rows, columns) tuple.
n_rows, n_cols = my_data.shape
print((n_rows, n_cols))

In [None]:
# Build the feature matrix from the five predictor columns, leaving the target
# column out. Sex, BP and Cholesterol are still strings at this point.
feature_columns = ['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']
X = my_data[feature_columns].values
print(X[:5])

In [None]:
# Encode the categorical feature columns (Sex, BP, Cholesterol) as integer codes.
# NOTE: LabelEncoder assigns one integer per category (label encoding); it does
# not create dummy/indicator columns. Codes follow the sorted order of the
# labels, so BP becomes HIGH=0, LOW=1, NORMAL=2 rather than LOW < NORMAL < HIGH.
from sklearn import preprocessing

le_sex = preprocessing.LabelEncoder()
le_sex.fit(['F', 'M'])

le_BP = preprocessing.LabelEncoder()
le_BP.fit(['LOW', 'NORMAL', 'HIGH'])

le_Chol = preprocessing.LabelEncoder()
le_Chol.fit(['NORMAL', 'HIGH'])

# Transform from the raw DataFrame columns instead of from X itself: X is
# overwritten in place, so re-running this cell would otherwise pass the
# already-encoded integers to transform(), which rejects unseen labels.
# Column positions match the order used when X was built:
# Age=0, Sex=1, BP=2, Cholesterol=3, Na_to_K=4.
for col_idx, col_name, encoder in (
    (1, 'Sex', le_sex),
    (2, 'BP', le_BP),
    (3, 'Cholesterol', le_Chol),
):
    X[:, col_idx] = encoder.transform(my_data[col_name])

print(X[0:5])

In [None]:
# Extract the response vector (the "Drug" column) and preview its first five entries.
target_column = "Drug"
y = my_data[target_column]
print(y.head())

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# partition the same on every run.
from sklearn.model_selection import train_test_split

X_trainset, X_testset, y_trainset, y_testset = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=3,
)

In [None]:
# Print the shapes of the training features and labels so they can be compared
# by eye (the row counts should agree).
for label, part in (("X train set: ", X_trainset), ("y train set: ", y_trainset)):
    print(label, part.shape)

In [None]:
# Print the shapes of the testing features and labels so they can be compared
# by eye (the row counts should agree).
for label, part in (("X test set: ", X_testset), ("y test set: ", y_testset)):
    print(label, part.shape)

In [None]:
# Create the DecisionTreeClassifier instance called drugTree.
# criterion="entropy" makes each split maximise information gain, and
# max_depth=4 caps the depth of the tree.
# random_state is fixed because scikit-learn permutes the features at every
# split; without a seed, equally good splits may be chosen differently from
# run to run, so the fitted tree would not be reproducible.
drugTree = DecisionTreeClassifier(criterion="entropy", max_depth=4, random_state=3)
drugTree  # display the estimator and its parameters

In [None]:
# Train the tree on the training feature matrix (X_trainset) and the matching
# training response vector (y_trainset).
drugTree.fit(X=X_trainset, y=y_trainset)

In [None]:
# Predict on the held-out test set and compare against the true labels.
from sklearn import metrics
import matplotlib.pyplot as plt  # not used in this cell; kept so later cells can use plt

predTree = drugTree.predict(X_testset)

# Show the first five predictions next to the first five true labels.
print(predTree[:5])
print(y_testset.iloc[:5])

# accuracy_score returns the fraction of test samples whose predicted label
# exactly matches the true label.
test_accuracy = metrics.accuracy_score(y_testset, predTree)
print("DecisionTrees's Accuracy: ", test_accuracy)

In [None]:
# Visualise the fitted tree: export it to Graphviz DOT text, render that to a
# PNG file, then display the image with matplotlib.
from io import StringIO
import pydotplus
import matplotlib.image as mpimg
import matplotlib.pyplot as plt  # imported here so the cell does not rely on an earlier cell
from sklearn import tree

dot_data = StringIO()
filename = "drugtree.png"

# Pass plain Python lists: some scikit-learn releases validate feature_names /
# class_names strictly and reject a pandas Index or a NumPy array.
featureNames = my_data.columns[0:5].tolist()

# Labels in order of first appearance in the data. Kept for reference only:
# this order is NOT the one export_graphviz expects for class_names.
targetNames = my_data["Drug"].unique().tolist()

# class_names must follow the classifier's own (sorted) class order, so take
# them straight from the fitted estimator.
classNames = drugTree.classes_.tolist()

# With out_file given, export_graphviz writes into dot_data and returns None,
# so the return value is not bound.
tree.export_graphviz(
    drugTree,
    feature_names=featureNames,
    out_file=dot_data,
    class_names=classNames,
    filled=True,
    special_characters=True,
    rotate=False,
)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png(filename)
img = mpimg.imread(filename)
plt.figure(figsize=(100, 200))
plt.imshow(img, interpolation='nearest')