In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import tree
# Applying random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

# Visualize the decision tree
import graphviz

## 4.1

Download a dataset about tipping this exercise, which is sourced from the study “The Effects of a Joke on Tipping When it is Delivered at the Same Time as the Bill,” by Nicholas Gueguen (2002). Can telling a joke affect whether or not a waiter in a coffee bar receives a tip from a customer?

This study investigated this question at a coffee bar at a famous resort on the west coast of France. The waiter randomly assigned coffee-ordering customers to one of three groups: When receiving the bill one group also received a card telling a joke, another group received a card containing an advertisement for a local restaurant, and a third group received no card at all. He recorded whether or not each customer left a tip.
The dataset contains the following variables:

- Card: Type of card used: Ad, Joke, or None
- Tip: 1=customer left a tip or 0=no tip
- Ad: Indicator for Ad card
- Joke: Indicator for Joke card
- None: Indicator for no card

Use a decision tree to determine whether the waiter will receive a tip from the customer from the predictor variables.

In [2]:
df = pd.read_csv("./data/TipJoke.csv", index_col= 0)
df.head(3)

Unnamed: 0,Card,Tip,Ad,Joke,None
1,,1,0,0,1
2,Joke,1,0,1,0
3,Ad,0,1,0,0


In [3]:
# Creating X and Y
X = df.iloc[:,2:5]
y = df['Tip']

# Making training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

# Creating the DTC and fitting the model
dtree = DecisionTreeClassifier()
dtree.fit(X_train,y_train)

#Predicting on test data
predictions = dtree.predict(X_test)

#Printing the classification report and accuracy score
print(accuracy_score(y_test,predictions))
print(confusion_matrix(y_test,predictions))

#Features to vizualize dtree
features = list(df[['Ad','Joke','None']])

# DOT data
dot_data = tree.export_graphviz(dtree, out_file=None,
                                feature_names=('Ad','Joke','None'),
                                class_names=('0','1'),
                                filled=True)

dot_data
# Draw graph
graph = graphviz.Source(dot_data, format="png") 
graph.render('4_1', view=True)

0.828125
[[53  0]
 [11  0]]


'4_1.png'

## 4.2
The dataset you are going to use for this exercise is about contact lenses, which has three class labels:

- the patient should be prescribed hard contact lenses,
- the patient should be prescribed soft contact lenses,
- the patient should not be fitted with contact lenses,

The attributes are the following:

1. age of the patient: (1) young, (2) pre-presbyopic, (3) presbyopic
2. spectacle prescription: (1) myope, (2) hypermetrope
3. astigmatic: (1) no, (2) yes
4. tear production rate: (1) reduced, (2) normal

Build a decision tree-based classifier using about 80% of the data that would recommend the class label based on the other attributes from the dataset. Use the remaining data to manually test how well your model will classify new data.

In [4]:
df = pd.read_excel("./data/lenses.xlsx")
df.head(3)

Unnamed: 0,X1.1,X1.2,X1.3,X1.4,X3
0,1,1,1,1,3
1,1,1,1,2,2
2,1,1,2,1,3


In [5]:
# Creating X and Y
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

# Making training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Creating the DTC and fitting the model
dtree = DecisionTreeClassifier()
dtree.fit(X_train,y_train)

#Predicting on test data
predictions = dtree.predict(X_test)

#Printing the classification report and accuracy score
print(accuracy_score(y_test,predictions))
print(confusion_matrix(y_test,predictions))

#Features to vizualize dtree
features = list(df[["X1.1","X1.2","X1.3","X1.4"]])

# DOT data
dot_data = tree.export_graphviz(dtree, out_file=None,
                                feature_names=("X1.1","X1.2","X1.3","X1.4"),
                                class_names=('1','2', '3'),
                                filled=True)

dot_data
# Draw graph
graph = graphviz.Source(dot_data, format="png") 
graph.render('4_2', view=True)

0.8
[[0 1]
 [0 4]]


'4_2.png'

## 4.3
Download the balloons dataset (Links to an external site.). It has four attributes: color, size, act, age, and one class label, inflated (True or False). Use decision tree algorithm to learn how to predict if the balloon is inflated or not using the features available. Make sure to do appropriate split of the data for training-testing.

Try doing this using at least two different subsets of the features. What differences do you find in the nature of the tree as well as the accuracy for classification you can achieve?

In [6]:
columns = ["color", "size", "act", "age", "label"]
df = pd.read_csv("./data/adult+stretch.data", header = None, names = columns)
df.head()

Unnamed: 0,color,size,act,age,label
0,YELLOW,SMALL,STRETCH,ADULT,T
1,YELLOW,SMALL,STRETCH,ADULT,T
2,YELLOW,SMALL,STRETCH,CHILD,F
3,YELLOW,SMALL,DIP,ADULT,F
4,YELLOW,SMALL,DIP,CHILD,F


In [7]:
#creating labelEncoder
le = preprocessing.LabelEncoder()
# Converting string labels into numbers.
df["color"] = le.fit_transform(df["color"])
df["size"] = le.fit_transform(df["size"])
df["act"] = le.fit_transform(df["act"])
df["age"] = le.fit_transform(df["age"])

In [8]:
# subset 1 
# Creating X and Y
X = df.iloc[:,1:4]
y = df.iloc[:,-1]

# Making training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Creating the DTC and fitting the model
dtree = DecisionTreeClassifier()
dtree.fit(X_train,y_train)

#Predicting on test data
predictions = dtree.predict(X_test)

#Printing the classification report and accuracy score
print(accuracy_score(y_test,predictions))
print(confusion_matrix(y_test,predictions))

#Features to vizualize dtree
features = list(df[["size","act","age"]])

# DOT data
dot_data = tree.export_graphviz(dtree, out_file=None,
                                feature_names=("size","act","age"),
                                class_names=('T','F'),
                                filled=True)

dot_data
# Draw graph
graph = graphviz.Source(dot_data, format="png") 
graph.render('4_3_1', view=True)

1.0
[[4 0]
 [0 2]]


'4_3_1.png'

In [9]:
# subset 2 
# Creating X and Y
X = df.iloc[:,0:3]
y = df.iloc[:,-1]

# Making training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Creating the DTC and fitting the model
dtree = DecisionTreeClassifier()
dtree.fit(X_train,y_train)

#Predicting on test data
predictions = dtree.predict(X_test)

#Printing the classification report and accuracy score
print(accuracy_score(y_test,predictions))
print(confusion_matrix(y_test,predictions))

#Features to vizualize dtree
features = list(df[["color", "size","act"]])

# DOT data
dot_data = tree.export_graphviz(dtree, out_file=None,
                                feature_names=("color", "size","act"),
                                class_names=('T','F'),
                                filled=True)

dot_data
# Draw graph
graph = graphviz.Source(dot_data, format="png") 
graph.render('4_3_2', view=True)

0.5
[[3 2]
 [1 0]]


'4_3_2.png'

## 4.4
Take any of the problems from before -- one where we used logistic regression or kNN and try it with decision tree and random forest algorithms now. Compare and contrast these algorithms.

In [10]:
df = pd.read_csv("./data/hsbdemo.csv", index_col= 0)
df.head(3)

Unnamed: 0,id,female,ses,schtyp,prog,read,write,math,science,socst,honors,awards,cid
1,45,female,low,public,vocation,34,35,41,29,26,not enrolled,0,1
2,108,male,middle,public,general,34,33,41,36,36,not enrolled,0,1
3,15,male,high,public,vocation,39,39,44,26,42,not enrolled,0,1


In [11]:
X = df.iloc[:,5:9]
y = df["prog"]

#Creating X and Y for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

In [14]:
# Creating the DTC and fitting the model
dtree = DecisionTreeClassifier()
dtree.fit(X_train,y_train)

#Predicting on test data
predictions = dtree.predict(X_test)

#Printing the classification report and accuracy score
print(accuracy_score(y_test,predictions))
print(confusion_matrix(y_test,predictions))

#Features to vizualize dtree
features = list(df[["read","write","math","science"]])

# DOT data
dot_data = tree.export_graphviz(dtree, out_file=None,
                                feature_names=("read","write","math","science"),
                                class_names=("academic", "general", "vocation"),
                                filled=True)

dot_data
# Draw graph
graph = graphviz.Source(dot_data, format="png") 
graph.render('4_4', view=True)

0.55
[[17  1  6]
 [ 8  5  3]
 [ 5  4 11]]


In [13]:
rfc = RandomForestClassifier(n_estimators=100)
# y_train is a column vector, but 1d array is expected. Therefore, we need to
# change the shape to (n_samples,)
rfc.fit(X_train, y_train.values.ravel())

predictions = rfc.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))
print(confusion_matrix(y_test,predictions))

Accuracy: 0.5333333333333333
[[21  2  1]
 [12  1  3]
 [ 8  2 10]]
