In [1]:
import os
os.chdir("/Users/Clair/machine_learning_sp20")

In [2]:
from sklearn import tree
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import graphviz 
import pydotplus

In [3]:
#################################################################
# DATASET -- TIC-TAC-TOE
# Classification
# 
##################################################################
ttt_columns = ["top_left_square", "top_middle_square", "top_right_square", "middle_left_square", "middle_middle_square",
                 "middle_right_square", "bottom_left_square", "bottom_middle_square","bottom_right_square", "winner"]
ttt_data = pd.read_csv("tic-tac-toe.data", delimiter=",", names=ttt_columns, header=None)


ttt_data["top_left_square"] = ttt_data.top_left_square.map({"o": 0, "x": 1, "b": 3})
ttt_data["top_middle_square"] = ttt_data.top_middle_square.map({"o": 0, "x": 1, "b": 3})
ttt_data["top_right_square"] = ttt_data.top_right_square.map({"o": 0, "x": 1, "b": 3})

ttt_data["middle_left_square"] = ttt_data.middle_left_square.map({"o": 0, "x": 1, "b": 3})
ttt_data["middle_middle_square"] = ttt_data.middle_middle_square.map({"o": 0, "x": 1, "b": 3})
ttt_data["middle_right_square"] = ttt_data.middle_right_square.map({"o": 0, "x": 1, "b": 3})

ttt_data["bottom_left_square"] = ttt_data.bottom_left_square.map({"o": 0, "x": 1, "b": 3})
ttt_data["bottom_middle_square"] = ttt_data.bottom_middle_square.map({"o": 0, "x": 1, "b": 3})
ttt_data["bottom_right_square"] = ttt_data.bottom_right_square.map({"o": 0, "x": 1, "b": 3})

ttt_data["winner"] = ttt_data.winner.map({"positive": 1, "negative": 0})
ttt_data["x_wins"] = ttt_data["winner"]

ttt_targets = ttt_data["x_wins"]
ttt_data = ttt_data.drop(columns=["winner", "x_wins"])


In [4]:
train_data, test_data, train_targets, test_targets = train_test_split(ttt_data, ttt_targets, test_size = 0.3, shuffle = True)

In [5]:
# TREE #1
clf = tree.DecisionTreeClassifier(random_state=0, criterion="entropy")
clf.fit(train_data, train_targets)


print("Accuracy: ", round(clf.score(test_data, test_targets) * 100, 2), "%")

Accuracy:  92.01 %


In [6]:
dot_data = tree.export_graphviz(clf, out_file=None, filled=True, rounded=True, special_characters=True) 
graph = graphviz.Source(dot_data) 
graph
pydot_graph = pydotplus.graph_from_dot_data(dot_data)
pydot_graph.write_png('ttt_smaller_tree.png')

True

In [7]:
#################################################################
# DATASET -- CONTACT LENS
# Classification
# Number of Instances: 24
##################################################################
lens_cols = ["index","age", "prescript", "astigm", "tear_rate", "lens"]
lens_data = pd.read_csv("lenses.data", delimiter="\s+", names=lens_cols, header=None, na_values=["?"])

lens_targs = lens_data["lens"]
lens_data = lens_data.drop(columns=["lens", "index"])

In [8]:
train_data, test_data, train_targets, test_targets = train_test_split(lens_data, lens_targs, test_size = 0.3, shuffle = True)

In [9]:
# TREE #2
clf = tree.DecisionTreeClassifier(random_state=0, criterion="entropy")
clf.fit(train_data, train_targets)

print("Accuracy: ", round(clf.score(test_data, test_targets) * 100, 2), "%")

Accuracy:  75.0 %


In [10]:
dot_data = tree.export_graphviz(clf, out_file=None, filled=True, rounded=True, special_characters=True) 
graph = graphviz.Source(dot_data) 
graph
pydot_graph = pydotplus.graph_from_dot_data(dot_data)
pydot_graph.write_png('lens.png')

True

In [11]:
#################################################################
# DATASET -- MILES PER GALLON DATA
# REGRESSION
# 
##################################################################
mpg_columns = ["mpg", "cylinders", "displacement","horsepower", "weight", "acceleration", "model_year","origin", "car_name"]
mpg_data = pd.read_csv("auto-mpg.data", sep="\s+", names=mpg_columns, na_values=["?"])


In [12]:
#Set the targets
mpg_target = mpg_data["mpg"]

#Get rid of the target and the unneeded columns
mpg_data = mpg_data.drop(columns=["mpg", "car_name"])

In [13]:
# Replace missing horsepower values with the mean horsepower
mpg_data[mpg_data.isna().any(axis=1)] # shows records with NA's
mpg_data.horsepower = mpg_data.horsepower.fillna(mpg_data["horsepower"].mean())
train_data, test_data, train_targets, test_targets = train_test_split(mpg_data, mpg_target, test_size = 0.3, shuffle = True)


In [14]:
clf = tree.DecisionTreeRegressor(random_state=0, max_depth=9)
clf.fit(train_data, train_targets)

print("Accuracy: ", round(clf.score(test_data, test_targets) * 100, 2), "%")

dot_data = tree.export_graphviz(clf, out_file=None, filled=True, rounded=True, special_characters=True) 
graph = graphviz.Source(dot_data) 
graph
pydot_graph = pydotplus.graph_from_dot_data(dot_data)
pydot_graph.write_png('mpg_mean_hp.png')

Accuracy:  75.61 %


True

In [15]:
mpg_columns = ["mpg", "cylinders", "displacement","horsepower", "weight", "acceleration", "model_year","origin", "car_name"]
data_rmv_na = pd.read_csv("auto-mpg.data", sep="\s+", names=mpg_columns, na_values=["?"])

data_rmv_na = data_rmv_na.drop(columns=["car_name"])

data_rmv_na = data_rmv_na.apply(pd.to_numeric, errors='coerce')
data_rmv_na = data_rmv_na.dropna()


mpg_target = data_rmv_na["mpg"].copy()

data_rmv_na = data_rmv_na.drop(columns=["mpg"])

In [16]:
train_data, test_data, train_targets, test_targets = train_test_split(data_rmv_na, mpg_target, test_size = 0.3, shuffle = True)

In [17]:
clf = tree.DecisionTreeRegressor(random_state=0, max_depth=8)
clf.fit(train_data, train_targets)

print("Accuracy: ", round(clf.score(test_data, test_targets) * 100, 2), "%")

Accuracy:  77.62 %


In [18]:
dot_data = tree.export_graphviz(clf, out_file=None, filled=True, rounded=True, special_characters=True) 
graph = graphviz.Source(dot_data) 
graph
pydot_graph = pydotplus.graph_from_dot_data(dot_data)
pydot_graph.write_png('mpg_drop.png')

True