In [1]:
import pandas as pd

# Read the toy dataset from the workshop URL into a DataFrame.
data = pd.read_csv(
    "https://brockdsl.github.io/Python_2.0_Workshop/canadian_toy_dataset.csv"
)

# Replace the file's own header with five short, lowercase column names.
data.columns = ["city", "gender", "age", "income", "ill"]

# Rows per "ill" group, counted as the non-null "city" entries in each group.
ill_counts = data.groupby("ill")["city"].count()

print("Ill or not?")
print(ill_counts)
print("\nTotal records:", data.shape[0])

Ill or not?
ill
No     137861
Yes     12139
Name: city, dtype: int64

Total records: 150000


In [2]:
import pandas as pd
import matplotlib.pyplot as plt

#our machine learning pieces
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import export_text
from sklearn import metrics
from sklearn import tree

# Visible confirmation that every import above resolved without error.
print("Ready to proceed!")

Ready to proceed!


In [3]:
# Load the dataset from the workshop URL and apply five short column names.
csv_url = "https://brockdsl.github.io/Python_2.0_Workshop/canadian_toy_dataset.csv"
data = pd.read_csv(csv_url)
data.columns = ["city", "gender", "age", "income", "ill"]

# Preview the first five rows.
data.head(5)

Unnamed: 0,city,gender,age,income,ill
0,Montreal,Male,41,40367,No
1,Montreal,Male,54,45084,No
2,Montreal,Male,42,52483,No
3,Montreal,Male,40,40941,No
4,Montreal,Male,46,50289,No


In [4]:
# This dataset is fairly clean; we just need to represent it all as numbers
# instead of text labels. That means changing these columns:
#
#   ill    - replace the No / Yes labels with 0 and 1
#   city   - break the column out into 8 indicator columns
#   gender - break the column out into 2 indicator columns

# Assign the mapped column back instead of calling replace(..., inplace=True)
# on data["ill"]: an in-place call on a selected column is chained assignment,
# which pandas warns about and which no longer updates `data` once
# Copy-on-Write is active.
# The positive class is encoded as 1 (not 2) so that scikit-learn's binary
# metrics, whose default pos_label is 1, work on this target without extra
# arguments.
ill_codes = {"No": 0, "Yes": 1}
data["ill"] = data["ill"].map(ill_codes)

# map() turns any label missing from ill_codes into NaN; fail here with a
# clear message rather than passing a partly-encoded target to the model.
if data["ill"].isna().any():
    raise ValueError('Column "ill" contains values other than "No" / "Yes"')

#change categorical values into numeric ones
data = pd.get_dummies(data, columns=["city", "gender"])
data.head()

Unnamed: 0,age,income,ill,city_Edmonton,city_Halifax,city_Montreal,city_Ottawa,city_Regina,city_Toronto,city_Vancouver,city_Waterloo,gender_Female,gender_Male
0,41,40367,0,0,0,1,0,0,0,0,0,0,1
1,54,45084,0,0,0,1,0,0,0,0,0,0,1
2,42,52483,0,0,0,1,0,0,0,0,0,0,1
3,40,40941,0,0,0,1,0,0,0,0,0,0,1
4,46,50289,0,0,0,1,0,0,0,0,0,0,1


In [5]:
# Show the last five rows of the encoded frame.
data.tail(5)

Unnamed: 0,age,income,ill,city_Edmonton,city_Halifax,city_Montreal,city_Ottawa,city_Regina,city_Toronto,city_Vancouver,city_Waterloo,gender_Female,gender_Male
149995,48,93669,0,1,0,0,0,0,0,0,0,0,1
149996,25,96748,0,1,0,0,0,0,0,0,0,0,1
149997,26,111885,0,1,0,0,0,0,0,0,0,0,1
149998,25,111878,0,1,0,0,0,0,0,0,0,0,1
149999,37,87251,0,1,0,0,0,0,0,0,0,1,0


In [6]:
# Columns the model is allowed to learn from. The order is kept exactly as
# listed because it determines the column order of X.
features = [
    "age",
    "income",
    "city_Edmonton",
    "city_Halifax",
    "city_Montreal",
    "city_Ottawa",
    "city_Regina",
    "city_Toronto",
    "city_Vancouver",
    "city_Waterloo",
    "gender_Female",
    "gender_Male",
]

# Feature matrix: only the columns named above.
X = data[features]

# Target vector: the "ill" column.
y = data["ill"]

In [7]:
# NOTE(review): this cell carries narrative text as a bare string literal.
# Because the string is the cell's last expression, the notebook echoes its
# repr (with literal \n escapes) as the cell output. Consider moving the text
# into a markdown cell.
# NOTE(review): the text says the model has already been built, but the
# classifier is only created in a later cell -- confirm the intended wording.
'''Training and testing
Now that we have built our model we need to get the data ready for it. We do this by breaking it into two different pieces. The diagram shows a conceptualization of how this is proportioned.

Train Test Split

Training set - This is what is used to build the model. If we set this value too large the ML Model just memorizes the data so we need to be careful when setting this value. This is called overfitting the data.
Testing set - This is used to see if our guesses are correct
Before we were looking at the columns of the data, this investigation of training/testing looks at the rows of data.'''

'Training and testing\nNow that we have built our model we need to get the data ready for it. We do this by breaking it into two different pieces. The diagram shows a conceptualization of how this is proportioned.\n\nTrain Test Split\n\nTraining set - This is what is used to build the model. If we set this value too large the ML Model just memorizes the data so we need to be careful when setting this value. This is called overfitting the data.\nTesting set - This is used to see if our guesses are correct\nBefore we were looking at the columns of the data, this investigation of training/testing looks at the rows of data.'

In [8]:
# The training share and the testing share always add up to 100% of the rows.
# Baseline: hold out 30% of the rows for testing.
test_percent = 30
train_percent = 100 - test_percent

# train_test_split takes the test share as a fraction between 0 and 1.
test_fraction = test_percent / 100.0

# random_state fixes the shuffle so the same rows land in each part every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_fraction,
    random_state=10,
)

In [9]:
# Now the interesting part: build the model, train it against the training
# set, and see how it predicts against the testing set.

# Create the decision tree classifier.
# random_state is pinned because the tree builder permutes the features at
# every split; when several splits tie, an unseeded run can pick a different
# one each time and produce a different tree and different predictions.
treeClass = DecisionTreeClassifier(random_state=10)

# Train on the training rows only (fit() returns the same estimator).
treeClass = treeClass.fit(X_train, y_train)

# Predict a label for every row that was held out for testing.
y_pred = treeClass.predict(X_test)

**Accuracy of the Model**

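To see how good our machine learning model is, we need to measure how accurate its predictions are. scikit-learn has built-in functions and metrics to do this for us. Keep in mind that only about 8% of the records are labelled ill, so a model that always predicts "No" would already score roughly 92%; accuracy should be read against that baseline.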

In [10]:
# Share of test rows whose predicted label equals the true label.
accuracy = metrics.accuracy_score(y_test, y_pred)

print("Accuracy: ")
print(accuracy)

Accuracy: 
0.8458666666666667
