#!/usr/bin/python3
'''Simple implementation of linear regression.
The output is given by a linear relation y = X * theta,
where both X and theta are vectors. The number of entries in those
vectors depends on the number of features (d) used to estimate the
output. Therefore X is 1 x d and theta is d x 1. The goal is to
estimate theta (the function parameters) given X and y.
This approach can be extended further. More than one pair (X, y)
can be used to find theta. The number of samples (pairs) is given by n,
so X becomes n x d while theta stays d x 1.
First, the good old least squares method is used to estimate theta.
Next, theta is found using gradient descent. Finally, stochastic gradient
descent is applied.'''
import argparse
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
def create_training_data(number_of_samples, number_of_features = 1, mean = 0, variance = 1):
    '''Generate a set of example inputs and outputs to be used as a training set.
    The outputs are simply a linear function of the inputs, calculated with a random
    slope and bias. On top of that, random noise is added to spice things up a little bit.'''
    # First calculate the set of example inputs - X
    step = np.random.randn(number_of_features, number_of_samples)
    X = np.linspace(0.0, 1, number_of_samples)
    # Each value in the row vector X is multiplied by a corresponding value in the step
    # matrix, resulting in a number_of_features by number_of_samples matrix
    X = step * X
    # Because the least squares method calculates (X^T * X)^-1, (X^T * X) cannot be singular
    # (X^T * X) will be full rank if and only if the rank of X
    # is equal to min(number_of_features, number_of_samples)
    expected_rank = np.min((number_of_features, number_of_samples))
    actual_rank = np.linalg.matrix_rank(X)
    while actual_rank != expected_rank:
        print("The previously calculated input matrix was singular, recalculating")
        # Recalculate because the rank of X does not meet the expectations.
        # Note that step must keep its full number_of_features by number_of_samples
        # shape here; a number_of_features by 1 step would make every row of X a
        # scaled copy of the same linspace, so X would always have rank 1
        step = np.random.randn(number_of_features, number_of_samples)
        X = np.linspace(0.0, 1, number_of_samples)
        X = step * X
        actual_rank = np.linalg.matrix_rank(X)
    X = X.transpose()
    # Next calculate y as a linear function of X
    # The slope is selected randomly for each feature separately,
    # while the bias is drawn per sample
    slope = np.random.randint(1, 5, (number_of_features, 1))
    # Keep the bias small, otherwise the generated data is kind of rubbish
    bias = np.sqrt(0.1) * np.random.randn(number_of_samples, 1)
    y = X.dot(slope) + bias
    # Finally add some Gaussian noise on top of the calculated y
    # Because I am lazy, the mean and variance are common across all features
    noise = np.sqrt(variance) * np.random.randn(number_of_samples, 1) + mean
    y = y + noise
    return (X, y)
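# A minimal usage sketch (assuming the defaults above): after the transpose,
# X has one row per sample and one column per feature, and y is a column vector.
#   X, y = create_training_data(100, number_of_features = 3)
#   assert X.shape == (100, 3) and y.shape == (100, 1)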
def estimate_theta_using_least_squares(X, y):
    '''Estimate theta using the least squares method:
    theta_hat = (X^T * X)^-1 * X^T * y'''
    theta_hat = np.linalg.inv(X.transpose().dot(X)).dot(X.transpose()).dot(y)
    return theta_hat
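# An alternative sketch (not used in main below): np.linalg.lstsq solves the same
# least squares problem without explicitly forming (X^T * X)^-1, which is
# numerically safer when X is ill-conditioned.
def estimate_theta_using_lstsq(X, y):
    '''Estimate theta by solving the least squares problem directly'''
    # rcond = None keeps NumPy's current default cutoff for small singular values
    theta_hat, _, _, _ = np.linalg.lstsq(X, y, rcond = None)
    return theta_hat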
def estimate_theta_using_gradient_descent(X, y, number_of_iterations,
                                          learning_rate = 0.01):
    '''Estimate theta using gradient descent'''
    number_of_samples, number_of_features = X.shape
    theta_hat = np.zeros(number_of_features)
    for _ in range(number_of_iterations):
        corrections = np.zeros(number_of_features)
        # Sum over all samples ((theta^T * X[sample] - y[sample]) * X[sample])
        for sample in range(number_of_samples):
            # Theoretically, because theta_hat and X[sample, :] are 1-D arrays,
            # the theta_hat.transpose() is not needed
            corrections += (theta_hat.transpose().dot(X[sample, :]) - y[sample, :]) * X[sample, :]
        theta_hat -= learning_rate * corrections
    # Return a 2-D array
    theta_hat = theta_hat.reshape(number_of_features, 1)
    return theta_hat
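# A vectorized sketch of the same batch update (an alternative, not the method
# called from main): the per-sample loop above accumulates X^T * (X * theta - y),
# which NumPy can evaluate in a single matrix product.
def estimate_theta_using_vectorized_gradient_descent(X, y, number_of_iterations,
                                                     learning_rate = 0.01):
    '''Estimate theta using gradient descent with a vectorized update'''
    _, number_of_features = X.shape
    theta_hat = np.zeros((number_of_features, 1))
    for _ in range(number_of_iterations):
        # Gradient of the squared error summed over all samples
        gradient = X.transpose().dot(X.dot(theta_hat) - y)
        theta_hat -= learning_rate * gradient
    return theta_hat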
def estimate_theta_using_stochastic_gradient_descent(X, y, number_of_iterations,
                                                     learning_rate = 0.01):
    '''Estimate theta using stochastic gradient descent'''
    number_of_samples, number_of_features = X.shape
    theta_hat = np.zeros(number_of_features)
    for _ in range(number_of_iterations):
        # Theoretically, because theta_hat and X[sample, :] are 1-D arrays,
        # the theta_hat.transpose() is not needed.
        # The high end of np.random.randint is exclusive, so number_of_samples
        # is passed as-is to keep the last sample reachable
        sample = np.random.randint(0, number_of_samples)
        corrections = (theta_hat.transpose().dot(X[sample, :]) - y[sample, :]) * X[sample, :]
        theta_hat -= learning_rate * corrections
    # Return a 2-D array
    theta_hat = theta_hat.reshape(number_of_features, 1)
    return theta_hat
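# An epoch-based SGD sketch (an alternative to the uniform sampling above):
# visiting every sample exactly once per epoch, in a random order, is a common
# variant that guarantees all samples contribute.
def estimate_theta_using_shuffled_sgd(X, y, number_of_epochs, learning_rate = 0.01):
    '''Estimate theta using SGD with per-epoch shuffling'''
    number_of_samples, number_of_features = X.shape
    theta_hat = np.zeros(number_of_features)
    for _ in range(number_of_epochs):
        # np.random.permutation yields the sample indices in a random order
        for sample in np.random.permutation(number_of_samples):
            correction = (theta_hat.dot(X[sample, :]) - y[sample, 0]) * X[sample, :]
            theta_hat -= learning_rate * correction
    # Return a 2-D array, matching the other estimators
    return theta_hat.reshape(number_of_features, 1)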
def calculate_mean_squared_error(y, y_hat):
    '''Calculate the mean squared error between the training output
    and the output calculated using the estimated parameters.
    The result is rounded to two decimal places'''
    mse = np.mean(np.square(y - y_hat))
    mse = np.round(mse, decimals = 2)
    return mse
def main():
    '''The main function'''
    # First parse the input arguments.
    # Note that type = bool would treat any non-empty string (including "False")
    # as True, so the plot option is defined as a store_true flag instead
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', '--plot', action = 'store_true')
    parser.add_argument('-f', '--number_of_features', default = 1, type = int)
    parser.add_argument('-n', '--number_of_samples', default = 100, type = int)
    parser.add_argument('-i', '--number_of_iterations', default = 500, type = int)
    args = parser.parse_args()
    print("Hello")
    # Generate the training set
    print("Step 1 - generate training data")
    X, y = create_training_data(args.number_of_samples, args.number_of_features, variance = 1)
    # Estimate theta using the least squares method
    print("Step 2 - calculate theta using least squares method")
    theta_hat_least_squares = estimate_theta_using_least_squares(X, y)
    y_hat_least_squares = X.dot(theta_hat_least_squares)
    # Estimate theta using the gradient descent method
    print("Step 3 - calculate theta using gradient descent")
    theta_hat_gd = estimate_theta_using_gradient_descent(X, y, args.number_of_iterations)
    y_hat_gd = X.dot(theta_hat_gd)
    # Estimate theta using the stochastic gradient descent method
    print("Step 4 - calculate theta using stochastic gradient descent")
    theta_hat_sgd = estimate_theta_using_stochastic_gradient_descent(X, y,
                                                                     args.number_of_iterations)
    y_hat_sgd = X.dot(theta_hat_sgd)
    # Calculate the MSE of each estimate
    print("Step 5 - calculate MSE")
    mse_least_squares = calculate_mean_squared_error(y, y_hat_least_squares)
    mse_gd = calculate_mean_squared_error(y, y_hat_gd)
    mse_sgd = calculate_mean_squared_error(y, y_hat_sgd)
    print("Least squares method MSE = ", mse_least_squares)
    print("Gradient descent method MSE = ", mse_gd)
    print("Stochastic gradient descent method MSE = ", mse_sgd)
    # Plotting makes sense only if the number of features is set to 1
    if args.plot and args.number_of_features == 1:
        matplotlib.use('TkAgg')
        # Plot the output values against the values of the first feature
        plt.scatter(X[:, 0], y[:, 0], label = 'Training data')
        plt.plot(X[:, 0], y_hat_least_squares[:, 0], 'r', label = 'Least squares estimate')
        plt.plot(X[:, 0], y_hat_gd[:, 0], 'og', label = 'Gradient descent')
        plt.plot(X[:, 0], y_hat_sgd[:, 0], 'xm', label = 'Stochastic gradient descent')
        plt.title('Feature values against output values')
        plt.xlabel('Feature values')
        plt.ylabel('Output values')
        plt.legend(loc = 'lower right')
        plt.grid(True)
        print("Plot ready")
        plt.show()
    print("Goodbye")
if __name__ == "__main__":
    main()
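# Example invocations (assuming the flags defined in main above):
#   python3 linear_regression.py
#   python3 linear_regression.py --plot --number_of_samples 200 --number_of_iterations 1000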