-
Notifications
You must be signed in to change notification settings - Fork 59
/
multiple_linear_regression.py
257 lines (208 loc) · 9.11 KB
/
multiple_linear_regression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
# Example of solving multivariate linear regression in Python.
#
# Uses only Numpy, with Matplotlib for plotting.
#
# Eli Bendersky (http://eli.thegreenplace.net)
# This code is in the public domain
from __future__ import print_function
import csv
import matplotlib.pyplot as plt
import numpy as np
from timer import Timer
def read_CCPP_data(filename):
"""Read data from the given CCPP CSV file.
Returns (data, header). data is a 2D Numpy array of type np.float32, with a
sample per row. header is the names of the columns as read from the CSV
file.
"""
with open(filename, 'rb') as file:
reader = csv.reader(file)
header = reader.next()
return np.array(list(reader), dtype=np.float32), header
def feature_normalize(X):
"""Normalize the feature matrix X.
Given a feature matrix X, where each row is a vector of features, normalizes
each feature. Returns (X_norm, mu, sigma) where mu and sigma are the mean
and stddev of features (vectors).
"""
num_features = X.shape[1]
mu = X.mean(axis=0)
sigma = X.std(axis=0)
X_norm = (X - mu) / sigma
return X_norm, mu, sigma
def compute_cost(X, y, theta):
"""Compute the MSE cost of a prediction based on theta, over the whole X.
X: (k, n) each row is an input with n features (including an all-ones
column that should have been added beforehead).
y: (k, 1) observed output per input.
theta: (n, 1) regression parameters.
Note: expects y and theta to be proper column vectors.
"""
k = X.shape[0]
# Vectorized computation of yhat per sample.
yhat = np.dot(X, theta)
diff = yhat - y
# Vectorized computation using a dot product to compute sum of squares.
cost = np.dot(diff.T, diff) / k
# Cost is a 1x1 matrix, we need a scalar.
return cost.flat[0]
def gradient_descent(X, y, nsteps, learning_rate=0.1):
"""Runs gradient descent optimization to fit a line y^ = theta.dot(x).
X: (k, n) each row is an input with n features (including an all-ones column
that should have been added beforehead).
y: (k, 1) observed output per input.
nsteps: how many steps to run the optimization for.
learning_rate: learning rate of gradient descent.
Yields 'nsteps + 1' pairs of (theta, cost) where theta is the fit parameter
shaped (n, 1) for that step, and its cost vs the real y. The first pair has
the initial theta and cost; the rest carry results after each of the
iteration steps.
"""
k, n = X.shape
theta = np.zeros((n, 1))
yield theta, compute_cost(X, y, theta)
for step in range(nsteps):
# yhat becomes a (k, 1) array of predictions, per sample.
yhat = np.dot(X, theta)
diff = yhat - y
dtheta = np.zeros((n, 1))
for j in range(n):
# The sum over all samples is computed with a dot product between
# (y^-y) and the jth feature across all of X.
dtheta[j, 0] = learning_rate * np.dot(diff.T, X[:, j]) / k
theta -= dtheta
yield theta, compute_cost(X, y, theta)
def compute_normal_eqn(X, y):
"""Compute theta using the normal equation.
X is the input matrix with a leftmost column of 1s. Returns theta as a
column vector.
"""
XTX = np.dot(X.T, X)
# Using linalg.inv here, which will bomb for a singular matrix.
# Alternatively, we could use linalg.pinv to compute a pseudo-inverse.
XTX_inv = np.linalg.inv(XTX)
xdot = np.dot(XTX_inv, X.T)
return np.dot(xdot, y)
def split_dataset_to_train_test(dataset, train_proportion=0.8):
"""Splits the dataset to a train set and test set.
The split is done over a random shuffle of the rows of the dataset. Assumes
each row in the data has the expected outcome in the last column.
train_proportion:
The proportion of data to keep in the training set. The rest goes to the
test set.
Returns (X_train, y_train, X_test, y_test) where y_train/y_test are column
vectors taken from the last column of the dataset.
"""
shuffled_dataset = np.random.permutation(dataset)
k_train = int(shuffled_dataset.shape[0] * train_proportion)
X_train = shuffled_dataset[:k_train, :-1]
y_train = shuffled_dataset[:k_train, -1].reshape(-1, 1)
X_test = shuffled_dataset[k_train:, :-1]
y_test = shuffled_dataset[k_train:, -1].reshape(-1, 1)
return X_train, y_train, X_test, y_test
def compute_rsquared(X, y, theta):
"""Compute R^2 - the coefficeint of determination for theta.
X: (k, n) input.
y: (k, 1) observed output per input.
theta: (n, 1) regression parameters.
Returns the R2 - a scalar.
"""
k = X.shape[0]
yhat = np.dot(X, theta)
diff = yhat - y
SE_line = np.dot(diff.T, diff)
SE_y = len(y) * y.var()
return (1 - SE_line / SE_y).flat[0]
def plot_cost_vs_step(costs):
"""Given an array of costs, plots them vs. index.
Uses logarithmic scale for y be cause the cost tends to be very large
initially.
"""
fig, ax = plt.subplots()
ax.plot(range(len(costs)), costs)
ax.set_yscale('log')
plt.show()
def plot_correlation_heatmap(X, header):
"""Plot a heatmap of the correlation matrix for X.
This requires the seaborn package to be installed.
"""
import seaborn
cm = np.corrcoef(X.T)
hm = seaborn.heatmap(cm,
cbar=True,
annot=True,
square=True,
yticklabels=header,
xticklabels=header)
plt.show()
def sample_predictions_vs_truth(X, y, theta, nsamples=10):
"""Display a sample of predictions vs. true values."""
print('Sample of predictions vs. true values')
yhat = np.dot(X, theta)
sample_indices = np.random.choice(X.shape[0], size=nsamples, replace=False)
for index in sample_indices:
print(' sample #{0}: yhat={1}, y={2}'.format(index,
yhat[index][0],
y[index][0]))
if __name__ == '__main__':
# Follow through the code here to see how the functions are used. No
# plotting is done by default. Uncomment relevant lines to produce plots.
# For reproducibility
np.random.seed(42)
# This file was dowloaded from:
# https://archive.ics.uci.edu/ml/machine-learning-databases/00294/ and then
# unzipped.
filename = 'CCPP-dataset/data.csv'
with Timer('reading data'):
X, header = read_CCPP_data(filename)
# Plot a heatmap for the correlation matrix of X. This requires the seaborn
# package. This heatmap is a useful visualization for finding features that
# are most correlated with the result, and features that are possibly
# collinear.
#plot_correlation_heatmap(X, header)
print('Read {0} data samples from {1}'.format(len(X), filename))
X_train, y_train, X_test, y_test = split_dataset_to_train_test(X)
print('Data shapes:')
print(' X_train:', X_train.shape)
print(' y_train:', y_train.shape)
print(' X_test:', X_test.shape)
print(' y_test:', y_test.shape)
# Normalize X to bring all features into the same scale. Also, add a
# all-ones column as the first column of X ("augmented X") to serve as the
# bias term.
ktrain = X_train.shape[0]
X_train_normalized, mu, sigma = feature_normalize(X_train)
X_train_augmented = np.hstack((np.ones((ktrain, 1)), X_train_normalized))
# Run gradient descent.
NSTEPS = 500
with Timer('Running gradient descent ({0} steps)'.format(NSTEPS)):
thetas_and_costs = list(gradient_descent(X_train_augmented,
y_train, NSTEPS))
# Plot cost vs. step for the last 100 steps (the first steps have an
# enourmous errors compared to the final steps).
#plot_cost_vs_step([cost for _, cost in thetas_and_costs][:100])
last_theta = thetas_and_costs[-1][0]
print('Best theta found:', last_theta)
print('Training set MSE:',
compute_cost(X_train_augmented, y_train, last_theta))
print('Training set R^2:',
compute_rsquared(X_train_augmented, y_train, last_theta))
# Normalize the test set using the mu/sigma computed from the training set,
# and augment it with the bias column of 1s.
ktest = X_test.shape[0]
X_test_normalized = (X_test - mu) / sigma
X_test_augmented = np.hstack((np.ones((ktest, 1)), X_test_normalized))
print('Test set MSE:',
compute_cost(X_test_augmented, y_test, last_theta))
print('Test set R^2:',
compute_rsquared(X_test_augmented, y_test, last_theta))
# To assess how good the fit is, print out a random sample of predictions
# for the test set compared to the real y values for these inputs.
sample_predictions_vs_truth(X_test_augmented, y_test, last_theta)
# Compute theta using the normal equation and report MST / R^2.
theta_from_normal_eqn = compute_normal_eqn(X_train_augmented, y_train)
print('Theta from normal equation:', theta_from_normal_eqn)
print('Test set MSE / normal:',
compute_cost(X_test_augmented, y_test, theta_from_normal_eqn))
print('Test set R^2 / normal:',
compute_rsquared(X_test_augmented, y_test, theta_from_normal_eqn))