-
Notifications
You must be signed in to change notification settings - Fork 19
/
test_logistic_vs_synthetic_data.py
105 lines (83 loc) · 2.56 KB
/
test_logistic_vs_synthetic_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
from logistic_regression import logistic_regression
def get_data(filename):
    """Load a whitespace-delimited data set of two features and one label.

    Each non-blank line must contain at least three whitespace-separated
    fields: two values parseable as float followed by a value parseable as
    int. Any additional fields are ignored.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(datamatrix, labelmatrix)`` tuple. ``datamatrix`` is a list of
        ``[1.0, x1, x2]`` rows (the leading 1.0 is a constant term prepended
        to every row) and ``labelmatrix`` is the list of integer labels, in
        file order.

    Raises:
        IOError/OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to a number.
        IndexError: If a non-blank line has fewer than three fields.
    """
    datamatrix = []
    labelmatrix = []
    # The context manager guarantees the handle is closed even when a
    # malformed line raises part-way through the file.
    with open(filename) as fr:
        for line in fr:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline at EOF) carry no sample.
                continue
            datamatrix.append([1.0, float(fields[0]), float(fields[1])])
            labelmatrix.append(int(fields[2]))
    return datamatrix, labelmatrix
def plot_fit(fit_line, datamatrix, labelmatrix):
    """Scatter-plot the two classes and overlay the fitted boundary line.

    Args:
        fit_line: Fitted weights; must provide ``getA()`` (as
            ``numpy.matrix`` does) yielding three weights ``w0, w1, w2``.
        datamatrix: Sequence of rows whose columns 1 and 2 are the plotted
            x1 and x2 coordinates (column 0 is not plotted).
        labelmatrix: Sequence of labels, one per row; rows whose label
            converts to the integer 1 are drawn as red squares, all others
            as green points.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    import matplotlib.pyplot as plt
    import numpy as np

    weights = fit_line.getA()
    points = np.asarray(datamatrix)
    row_count = points.shape[0]

    # Split the samples into the label-1 group and everything else.
    ones = []
    others = []
    for row in range(row_count):
        bucket = ones if int(labelmatrix[row]) == 1 else others
        bucket.append((points[row, 1], points[row, 2]))

    figure = plt.figure()
    axes = figure.add_subplot(111)

    # Each class gets its own colour (and marker for the label-1 group).
    axes.scatter([p[0] for p in ones], [p[1] for p in ones],
                 s=30, c='red', marker='s')
    axes.scatter([p[0] for p in others], [p[1] for p in others],
                 s=30, c='green')

    # Boundary where w0 + w1*x1 + w2*x2 == 0, solved for x2 over a fixed
    # x1 range of [-3, 3).
    w0 = weights[0]
    w1 = weights[1]
    w2 = weights[2]
    x = np.arange(-3.0, 3.0, 0.1)
    y = (-w0 - w1 * x) / w2
    axes.plot(x, y)

    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.show()
def accuracy(labels, hypotheses):
    """Return the fraction of predictions that equal their true label.

    Args:
        labels: Iterable of true labels.
        hypotheses: Iterable of predicted labels, paired positionally with
            ``labels``. Pairing stops at the shorter of the two iterables.

    Returns:
        A float in ``[0.0, 1.0]``. When there are no pairs to compare the
        result is ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    total = 0
    correct = 0
    # Count while iterating so that generators (which have no len()) work too.
    for actual, predicted in zip(labels, hypotheses):
        total += 1
        if actual == predicted:
            correct += 1
    if total == 0:
        return 0.0
    # float() keeps this a true division on Python 2 as well.
    return float(correct) / total
def print_confusion_matrix(labels, hypotheses):
    """Print a 2x2 confusion matrix for binary (0/1) predictions to stdout.

    Rows are the actual class (NO = 0, YES = 1) and columns the predicted
    class, so the table reads::

                    NO      YES
            NO      tn      fp
            YES     fn      tp

    Args:
        labels: Iterable of true labels (0 or 1).
        hypotheses: Iterable of predicted labels (0 or 1), paired
            positionally with ``labels``. Pairing stops at the shorter of
            the two iterables.

    Returns:
        None. The table is written with ``print``.
    """
    # Counts are kept as floats so they print as e.g. ``3.0``.
    tp = 0.0
    tn = 0.0
    fp = 0.0
    fn = 0.0
    for actual, predicted in zip(labels, hypotheses):
        if actual == 1 and predicted == 1:
            tp += 1.0
        elif actual == 1 and predicted == 0:
            # A positive sample that was missed is a false negative.
            fn += 1.0
        elif actual == 0 and predicted == 0:
            tn += 1.0
        else:
            # Remaining binary case: actual 0 predicted as 1, a false positive.
            fp += 1.0

    rule = '-----------------------------'
    # Single-argument print(...) calls behave the same whether print is a
    # statement (Python 2) or a function (Python 3).
    print(rule)
    print('\tConfusion Matrix')
    print(rule)
    print('\t\tPredicted')
    print('\tActual\tNO\tYES')
    print(rule)
    print('\tNO\t%s \t%s' % (tn, fp))
    print(rule)
    print('\tYES\t%s \t%s' % (fn, tp))
    print(rule)
# Driver script: these statements run at import time (no __main__ guard).
# Load the training samples from 'testSet.txt' in the working directory.
X, Y = get_data('testSet.txt')
# NOTE(review): 5000 is presumably the number of training iterations --
# confirm against the logistic_regression constructor.
clf = logistic_regression(5000)
w = clf.fit(X, Y)
print 'Weights:', w
# Display the training points together with the fitted line.
plot_fit(w, X, Y)
# Evaluate the fitted classifier on the samples in 'verify.txt'.
verify_x, verify_y = get_data('verify.txt')
hypotheses = clf.predict(verify_x)
print 'Accuracy:', accuracy(verify_y, hypotheses)
print_confusion_matrix(verify_y, hypotheses)