-
Notifications
You must be signed in to change notification settings - Fork 0
/
practical-3.py
198 lines (153 loc) · 6.33 KB
/
practical-3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
from os import system
from matplotlib import pyplot
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn import tree, metrics, linear_model, preprocessing
from numpy import arange, meshgrid, array
# --- Section 2: load the Iris dataset and print a short summary -------------
print("Section 2", "\n")

iris = load_iris()
# Short aliases for the class labels and feature names of the dataset.
classes, attributes = iris.target_names, iris.feature_names
# M is the number of rows in the feature matrix.
M = len(iris.data)

print("Classes: %s" % classes)
print("Attributes: %s" % attributes)
print("Number of instances: %s" % M)
print("\n")

# --- Section 3 ---------------------------------------------------------------
print("Section 3", "\n")
def get_counts(c):
    """Format the first three entries of ``c`` as a per-class summary string.

    Each of ``c[0]``, ``c[1]`` and ``c[2]`` is rounded to two decimal places
    and labelled setosa, versicolor and virginica respectively. Any further
    entries are ignored; fewer than three raises ``IndexError``.
    """
    rounded = tuple(round(c[position], 2) for position in range(3))
    return "setosa: %s, versicolor: %s, virginica: %s" % rounded
def count_on_attribute(X, y, no_classes, features_extractor):
    """Tally class labels per extracted feature key.

    Args:
        X: iterable of feature vectors.
        y: iterable of integer class indices, paired with ``X`` by position.
        no_classes: length of the per-key tally list.
        features_extractor: callable mapping a feature vector to a hashable key.

    Returns:
        A dict mapping each key to a list of ``no_classes`` counts, where
        entry ``i`` is how many samples with that key carry class index ``i``.
    """
    tallies = {}
    for sample, label in zip(X, y):
        # setdefault creates the zeroed tally the first time a key is seen.
        per_class = tallies.setdefault(features_extractor(sample), [0] * no_classes)
        per_class[label] += 1
    return tallies
def most_fequent_class(c):
    """Return the indices in ``c`` that share the largest count.

    The running maximum starts at 0, so an all-zero ``c`` yields every index
    and an empty ``c`` yields ``[]``. Ties are returned in ascending index
    order.
    """
    top = 0
    leaders = []
    for index, tally in enumerate(c):
        if tally == top:
            # Ties with the current best join the list.
            leaders.append(index)
        elif tally > top:
            # A strictly larger count starts a fresh list.
            top, leaders = tally, [index]
    return leaders
# Class tallies per distinct sepal-length value (feature column 0).
print("Sepal length")
sepal_lengths = count_on_attribute(iris.data, iris.target, 3, lambda fv: fv[0])
for sepal_length in sepal_lengths:
    class_counts = sepal_lengths[sepal_length]
    print("> %scm - %s" % (sepal_length, get_counts(class_counts)))
print("")

# Class tallies per distinct sepal-width value (feature column 1).
print("Sepal width")
sepal_widths = count_on_attribute(iris.data, iris.target, 3, lambda fv: fv[1])
for sepal_width in sepal_widths:
    class_counts = sepal_widths[sepal_width]
    print("> %scm - %s" % (sepal_width, get_counts(class_counts)))
print("")


def slw_extractor(x):
    """Return the (column 0, column 1) pair of one feature vector."""
    return (x[0], x[1])


# Class tallies per distinct (sepal length, sepal width) pair.
print("Sepal lengths and widths")
sepal_lengths_widths = count_on_attribute(iris.data, iris.target, 3, slw_extractor)
for length, width in sepal_lengths_widths:
    class_counts = sepal_lengths_widths[(length, width)]
    most_freq = most_fequent_class(class_counts)
    print("Sepal length: %scm, sepal width: %scm => %s" % (length, width, most_freq))
print("")

# Lookup-table classifier: each pair maps to its first most-frequent class.
clf = {
    pair: most_fequent_class(tally)[0]
    for pair, tally in sepal_lengths_widths.items()
}
# Predict every sample by looking its own pair up in the table.
y_hat = [clf[slw_extractor(sample)] for sample in iris.data]
def get_count(y, y_hat) -> int:
    """Count the positions at which ``y`` and ``y_hat`` hold equal values.

    Args:
        y: iterable of target labels.
        y_hat: iterable of predicted labels, paired with ``y`` by position.

    Returns:
        The number of equal pairs. Pairing stops at the shorter input, so
        two empty inputs give 0.
    """
    # Summing 1 per match keeps the result a plain int for any element type.
    return sum(1 for target, calculated in zip(y, y_hat) if target == calculated)
def get_accuracy(y, y_hat, M) -> float:
    """Return the number of matching labels divided by ``M``.

    Args:
        y: iterable of target labels.
        y_hat: iterable of predicted labels, paired with ``y`` by position.
        M: divisor supplied by the caller; 0 raises ``ZeroDivisionError``.

    NOTE: a zero-argument function with this same name is defined further
    down this file and replaces this one from that point on.
    """
    matches = get_count(y, y_hat)
    return matches / M
# Score the lookup-table predictions against the dataset's own labels.
print("1R classifier accuracy: %s" % round(get_accuracy(iris.target, y_hat, M), 2))

# Drop the Section 3 scratch names before the next section reuses some of them.
del (
    M, sepal_lengths, sepal_length, class_counts, sepal_width, sepal_widths,
    slw_extractor, sepal_lengths_widths, length, width, clf, y_hat
)
print("\n")

# --- Section 4: decision tree on a train/test split --------------------------
print("Section 4", "\n")
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=0
)
# fit() returns the estimator itself, so construction and fitting can be chained.
clf = tree.DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
y_hat = clf.predict(X_test)

# Hand-computed accuracy on the test split, as a percentage.
M_test = len(y_test)
count = get_count(y_test, y_hat)
accuracy = 100 * (count / M_test)
print("Number of correct predictions: %d out of %d = %f%%" % (count, M_test, accuracy))
del count, M_test, accuracy
def get_training_score():
    """Print ``clf.score`` on the training split as a percentage.

    Reads the module-level ``clf``, ``X_train`` and ``y_train`` that are
    bound at call time, so it reports on whichever model is current.
    """
    training_score = clf.score(X_train, y_train) * 100
    # "%.2f" keeps two decimals; the previous "%d" truncated the value to an
    # integer and so discarded the rounding entirely.
    print("Training score: %.2f%%" % training_score)
def get_test_score():
    """Print ``clf.score`` on the test split as a percentage.

    Reads the module-level ``clf``, ``X_test`` and ``y_test`` that are bound
    at call time, so it reports on whichever model is current.
    """
    test_score = clf.score(X_test, y_test) * 100
    # "%.2f" keeps two decimals; the previous "%d" truncated the value to an
    # integer and so discarded the rounding entirely.
    print("Test score: %.2f%%" % test_score)
def get_accuracy():
    """Print scikit-learn's accuracy for the current predictions as a percentage.

    Reads the module-level ``y_test`` and ``y_hat`` bound at call time.

    NOTE: this rebinds the name ``get_accuracy``; the three-argument helper
    defined earlier in this file is no longer reachable after this point.
    """
    sklearn_accuracy = metrics.accuracy_score(y_test, y_hat)
    # accuracy_score yields a fraction by default, so scale it by 100 before
    # printing it next to a percent sign.
    print("Accuracy (sk-learn): %.2f%%" % (sklearn_accuracy * 100))
def get_precision():
    """Print per-class precision for the current ``y_test`` / ``y_hat``.

    ``average=None`` asks for one score per class rather than a single
    aggregate; the three scores are formatted by ``get_counts``.
    """
    per_class = metrics.precision_score(y_test, y_hat, average=None)
    summary = get_counts(per_class)
    print("Precision: %s" % summary)
def get_recall():
    """Print per-class recall for the current ``y_test`` / ``y_hat``.

    ``average=None`` asks for one score per class rather than a single
    aggregate; the three scores are formatted by ``get_counts``.
    """
    per_class = metrics.recall_score(y_test, y_hat, average=None)
    summary = get_counts(per_class)
    print("Recall: %s" % summary)
def get_f1():
    """Print per-class F1 for the current ``y_test`` / ``y_hat``.

    ``average=None`` asks for one score per class rather than a single
    aggregate; the three scores are formatted by ``get_counts``.
    """
    per_class = metrics.f1_score(y_test, y_hat, average=None)
    summary = get_counts(per_class)
    print("F1: %s" % summary)
# Print every metric for the decision tree, in a fixed order.
for report in (
    get_training_score,
    get_test_score,
    get_accuracy,
    get_precision,
    get_recall,
    get_f1,
):
    report()
del report

# Disabled: export of the fitted tree and rendering through Graphviz.
# print(clf.decision_path(iris.data))
# tree.export_graphviz(clf, out_file="plots/p3-iris-decision-tree.dot", class_names=classes, impurity=True)
# system("sudo apt install graphviz")
# system("dot plots/p3-iris-decision-tree.dot -Tpng > plots/p3-iris-decision-tree.png")

# Drop the Section 4 split and model; the next section rebinds these names.
del (X_test, X_train, clf, y_hat, y_test, y_train)
print("\n")
# --- Section 5: logistic regression on the first two features ----------------
print("Section 5", "\n")

# Only columns 0 and 1 are used, so the model can be drawn in a 2-D plane.
X = iris.data[:, :2]
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# NOTE(review): recent scikit-learn releases are believed to deprecate the
# `multi_class` argument -- confirm against the installed version.
clf = linear_model.LogisticRegression(
    solver="lbfgs", multi_class="multinomial"
).fit(X_train, y_train)
y_hat = clf.predict(X_test)

# The reporters read clf / the split / y_hat from module scope, so they now
# describe the logistic-regression model.
for report in (
    get_training_score,
    get_test_score,
    get_accuracy,
    get_precision,
    get_recall,
    get_f1,
):
    report()
del report
# Decision-region plot: colour a grid by predicted class, then overlay the data.
pyplot.figure()
pyplot.set_cmap("Blues")
pyplot.xlabel(attributes[0], fontsize=14)
pyplot.ylabel(attributes[1], fontsize=14)
pyplot.xticks(fontsize=14)
pyplot.yticks(fontsize=14)

# Grid covering the data range, padded by one step on each side.
interval = 0.1
x0_range = arange(X[:, 0].min() - interval, X[:, 0].max() + interval, interval)
x1_range = arange(X[:, 1].min() - interval, X[:, 1].max() + interval, interval)
x0_mesh, x1_mesh = meshgrid(x0_range, x1_range)

# Flatten the mesh row by row into (x0, x1) points, so the predictions can be
# reshaped straight back onto the mesh.
X_pairs = array(list(zip(x0_mesh.ravel(), x1_mesh.ravel())))
y_hat_pairs = clf.predict(X_pairs)
y_hat_mesh = y_hat_pairs.reshape(x0_mesh.shape)
pyplot.pcolormesh(x0_mesh, x1_mesh, y_hat_mesh, shading="auto")

# One marker shape and RGBA face colour per class index.
markers = ["o", "<", "s"]
colours = [
    (0.957, 0.263, 0.212, 1),
    (0.545, 0.765, 0.029, 0.7),
    (0.118, 0.533, 0.898, 0.5),
]
for Xi, yi in zip(X, y):
    pyplot.plot(
        Xi[0],
        Xi[1],
        marker=markers[yi],
        markerfacecolor=colours[yi],
        markersize=9,
        markeredgecolor="w",
    )
pyplot.savefig("plots/p3-q4-logistic-regression-decision-bounaries")
# One-vs-rest ROC curves for the logistic-regression model, one per class.
pyplot.figure()
pyplot.xticks(fontsize=14)
pyplot.yticks(fontsize=14)
pyplot.xlabel("False positive rate", fontsize=14)
pyplot.ylabel("True positive rate", fontsize=14)

# A ROC curve compares scores against ground-truth labels. The previous
# version scored the plotting grid against the model's own predictions for
# that grid, so no true labels were involved. Use the held-out split instead.
conf_scores = clf.decision_function(X_test)
# One indicator column per class, in sorted label order.
y_binary = preprocessing.label_binarize(y_test, classes=sorted(set(y)))
for c in range(3):
    fpr, tpr, _ = metrics.roc_curve(y_binary[:, c], conf_scores[:, c])
    pyplot.plot(fpr, tpr, label=classes[c], color=colours[c])
pyplot.legend()
pyplot.savefig("plots/p3-q4-logistic-regression-roc-curve")
# Drop every Section 5 scratch name: data split, model, grid, plot styling
# and the ROC intermediates.
del (
    X, y, X_train, X_test, y_train, y_test, clf, y_hat,
    interval, x0_range, x1_range, X_pairs, y_hat_pairs, x0_mesh, x1_mesh, y_hat_mesh,
    markers, colours, Xi, yi, conf_scores, y_binary, c, fpr, tpr
)
print("\n")