In [None]:
# NOTE(review): %pylab pulls matplotlib/numpy names into the global namespace;
# the bare plot() and legend() calls in later cells presumably rely on it.
%pylab inline

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error
import seaborn
# Seed NumPy's global RNG so the np.random.randn noise drawn below is repeatable
np.random.seed(12345)
# Global seaborn look for the figures in this notebook: 'white' style, 'talk' context
seaborn.set_style('white')
seaborn.set_context('talk')

# Generate Simple Example Data

In [None]:
# Noisy straight line: 200 evenly spaced inputs on [-3, 3] stored as a
# single-column feature matrix, with targets y = 1.5*x + standard-normal noise.
x = np.linspace(-3, 3, num=200).reshape(-1, 1)
y = 1.5 * x.ravel() + np.random.randn(len(x))

# Ordered split: the first 150 points train the model, the last 50 (the
# upper end of the x range) are held out for testing.
train_x, test_x = x[:150], x[150:]
train_y, test_y = y[:150], y[150:]


In [None]:
# Peek at the first five feature rows
x[:5]

In [None]:
# Peek at the first five targets
y[:5]

In [None]:
# Show the train/test split of the raw data. The legend tells the two series
# apart, and assigning its return value keeps the cell from echoing a repr.
plot(train_x, train_y)
plot(test_x, test_y)
_ = legend(['Train', 'Test'])

# Fit a Linear Regression

In [None]:
# Ordinary least squares on the raw feature; fit() returns the estimator itself,
# so construction and fitting can be chained.
model = LinearRegression().fit(train_x, train_y)

# Predictions on both splits, reused by the plot and RMSE cells that follow
train_pred, test_pred = model.predict(train_x), model.predict(test_x)

In [None]:
# Data first (default colour cycle), then the fitted line in red on each split.
# Only the first three lines receive a legend label.
plot(train_x, train_y, test_x, test_y)
plot(train_x, train_pred, 'r', test_x, test_pred, 'r')
_ = legend(('Train', 'Test', 'Prediction'))

In [None]:
# Root-mean-squared error on each split, printed to two decimals
train_rmse = np.sqrt(mean_squared_error(train_y, train_pred))
test_rmse = np.sqrt(mean_squared_error(test_y, test_pred))
print("Train RMSE Score: {:.2f}".format(train_rmse))
print("Test RMSE Score: {:.2f}".format(test_rmse))

# Generate Nonlinear (Cubic) Example Data

In [None]:
# Noisy cubic: same 200-point grid on [-3, 3], but the targets now follow
# y = 1.5*x**3 with standard-normal noise scaled by 3.
x = np.linspace(-3, 3, num=200).reshape(-1, 1)
y = 1.5 * x.ravel()**3 + 3 * np.random.randn(len(x))

# Ordered split: first 150 points for training, last 50 held out for testing
train_x, test_x = x[:150], x[150:]
train_y, test_y = y[:150], y[150:]


In [None]:
# Peek at the first five feature rows
x[:5]

In [None]:
# Peek at the first five targets
y[:5]

In [None]:
# Show the train/test split of the raw data. The legend tells the two series
# apart, and assigning its return value keeps the cell from echoing a repr.
plot(train_x, train_y)
plot(test_x, test_y)
_ = legend(['Train', 'Test'])

# Fit a Linear Regression

In [None]:
# Straight-line fit on the raw feature; fit() returns the estimator itself,
# so construction and fitting can be chained.
model = LinearRegression().fit(train_x, train_y)

# Predictions on both splits, reused by the plot and RMSE cells that follow
train_pred, test_pred = model.predict(train_x), model.predict(test_x)

In [None]:
# Data first (default colour cycle), then the fitted line in red on each split.
# Only the first three lines receive a legend label.
plot(train_x, train_y, test_x, test_y)
plot(train_x, train_pred, 'r', test_x, test_pred, 'r')
_ = legend(('Train', 'Test', 'Prediction'))

In [None]:
# Root-mean-squared error on each split, printed to two decimals
train_rmse = np.sqrt(mean_squared_error(train_y, train_pred))
test_rmse = np.sqrt(mean_squared_error(test_y, test_pred))
print("Train RMSE Score: {:.2f}".format(train_rmse))
print("Test RMSE Score: {:.2f}".format(test_rmse))

# Fit a Linear Regression with cubic term

#### Even though the regression is 'linear' (in its coefficients), we can feed it nonlinear transformations of the input variables, such as $x^3$

In [None]:
# Same linear model, but fitted on the cubed feature instead of the raw one.
# The identical x**3 transform must be applied before every predict() call.
model = LinearRegression().fit(train_x**3, train_y)
train_pred, test_pred = model.predict(train_x**3), model.predict(test_x**3)

In [None]:
# Predictions are plotted against the untransformed x so the curve is visible.
# Only the first three lines receive a legend label.
plot(train_x, train_y, test_x, test_y)
plot(train_x, train_pred, 'r', test_x, test_pred, 'r')
_ = legend(('Train', 'Test', 'Prediction'))

In [None]:
# Root-mean-squared error on each split, printed to two decimals
train_rmse = np.sqrt(mean_squared_error(train_y, train_pred))
test_rmse = np.sqrt(mean_squared_error(test_y, test_pred))
print("Train RMSE Score: {:.2f}".format(train_rmse))
print("Test RMSE Score: {:.2f}".format(test_rmse))

#### We get a much better fit by using the transformed variable

# Importance of Regularization

#### Often we don't know exactly which transformed variables we need, so we have to try many of them at once. Unfortunately, this can cause ordinary (unregularized) linear regression to overfit the training data and predict very badly outside it

In [None]:
# Use all powers of the variable from 1 to 7, one column per power.
# The targets were generated from x**3 alone, so the other six columns only
# give the model extra freedom to fit noise.
new_train_x = np.hstack([train_x**power for power in range(1, 8)])
new_test_x = np.hstack([test_x**power for power in range(1, 8)])

In [None]:
# Unregularised least squares on all seven polynomial columns;
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(new_train_x, train_y)
train_pred, test_pred = model.predict(new_train_x), model.predict(new_test_x)

In [None]:
# Predictions are plotted against the untransformed x.
# Only the first three lines receive a legend label.
plot(train_x, train_y, test_x, test_y)
plot(train_x, train_pred, 'r', test_x, test_pred, 'r')
_ = legend(('Train', 'Test', 'Prediction'))

In [None]:
# Root-mean-squared error on each split, printed to two decimals
train_rmse = np.sqrt(mean_squared_error(train_y, train_pred))
test_rmse = np.sqrt(mean_squared_error(test_y, test_pred))
print("Train RMSE Score: {:.2f}".format(train_rmse))
print("Test RMSE Score: {:.2f}".format(test_rmse))

#### Uh oh!

#### Using a regularized regression (such as Lasso) penalizes large coefficients, which can stop the model from overfitting like this

In [None]:
# L1-regularised fit on the same polynomial features.
# alpha is fixed at 4 here; pick this value with cross-validation.
model = Lasso(alpha=4).fit(new_train_x, train_y)
train_pred, test_pred = model.predict(new_train_x), model.predict(new_test_x)

In [None]:
# Predictions are plotted against the untransformed x.
# Only the first three lines receive a legend label.
plot(train_x, train_y, test_x, test_y)
plot(train_x, train_pred, 'r', test_x, test_pred, 'r')
_ = legend(('Train', 'Test', 'Prediction'))

In [None]:
# Root-mean-squared error on each split, printed to two decimals
train_rmse = np.sqrt(mean_squared_error(train_y, train_pred))
test_rmse = np.sqrt(mean_squared_error(test_y, test_pred))
print("Train RMSE Score: {:.2f}".format(train_rmse))
print("Test RMSE Score: {:.2f}".format(test_rmse))