In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

# Generalized Linear Models

After running K nearest neighbors, we were able to determine the accuracy of the algorithm. Another way to look at accuracy, how correct our model is, is *error*, how incorrect our model is. In some models, the error can be directly calculated based on the model itself. Imagine multiplying each datapoint in $X$ by some weight $w$ and adding a constant $b$:

$$h = X\times w + b$$

To determine how inaccurate the model is, simply subtract each ``h`` from the corresponding correct ``y``; if you square the result, you'll get the squared error, which is never negative, so the sign of the difference no longer matters. Add this error over all the samples and divide by the number of samples to get the Mean Squared Error:

$$MSE = \frac{1}{n} \sum_{i=1}^{n} (h_i - y_i)^2$$

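By default, the ``score`` method of scikit-learn's regressors reports the $R^2$ score (the coefficient of determination, where 1.0 is a perfect fit), which is slightly more complex, but MSE is a widely used error metric as well.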

Let's go back to our regression data from last time, the ``sin`` wave:

In [None]:
# Noisy 1-D regression data: a sine wave on top of a linear trend.
rng = np.random.RandomState(42)  # fixed seed keeps the noise reproducible
x = np.linspace(-3, 3, 100)
y = x + np.sin(4 * x) + rng.uniform(size=x.shape[0])
plt.plot(x, y, 'o');

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Turn the 1-D ``x`` into a single-column 2-D array, the layout the
# estimator is fitted on below.
X = x.reshape(-1, 1)

# Hold out a quarter of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [None]:
# Evaluate the fitted line (coef * x + intercept) at both ends of the data
# range; two points are enough to draw a straight line.
slope, intercept = regressor.coef_[0], regressor.intercept_
x_lo, x_hi = X.min(), X.max()
min_pt = x_lo * slope + intercept
max_pt = x_hi * slope + intercept

plt.plot([x_lo, x_hi], [min_pt, max_pt])
plt.plot(X_train, y_train, 'o');

In [None]:
# Training points, the model's predictions for them, and the fitted line.
y_pred_train = regressor.predict(X_train)

ax = plt.gca()
ax.plot(X_train, y_train, 'o', label="data")
ax.plot(X_train, y_pred_train, 'o', label="prediction")
ax.plot([X.min(), X.max()], [min_pt, max_pt], label='fit')
ax.legend(loc='best');

In [None]:
# Held-out points, the model's predictions for them, and the fitted line.
y_pred_test = regressor.predict(X_test)

ax = plt.gca()
ax.plot(X_test, y_test, 'o', label="data")
ax.plot(X_test, y_pred_test, 'o', label="prediction")
ax.plot([X.min(), X.max()], [min_pt, max_pt], label='fit')
ax.legend(loc='best');

In [None]:
# Evaluate the fitted regressor on the held-out split with its own ``score`` method.
regressor.score(X=X_test, y=y_test)

<div class="alert alert-success">
    <b>EXERCISE: Linear regression on the diabetes dataset</b>:
     <ul>
      <li>
      Download ``02_diabetes_linear.py`` from the course website. Change the feature that linear regression is used on and see if any feature can be accurately described using linear regression.
      </li>
    </ul>
</div>

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Load the diabetes dataset
diabetes = datasets.load_diabetes()

# Column index of the single feature to regress on.
# Change this value to try a different feature of ``diabetes.data``.
feature = 0

# Keep only the selected column, as a 2-D (n_samples, 1) array.
# (Previously the index was hard-coded to 0, so changing ``feature`` had no effect.)
diabetes_X = diabetes.data[:, np.newaxis, feature]

# Split the data into training/testing sets (last 20 samples are held out)
diabetes_X_train = diabetes_X[:-20]
diabetes_X_test = diabetes_X[-20:]

# Split the targets into training/testing sets
diabetes_y_train = diabetes.target[:-20]
diabetes_y_test = diabetes.target[-20:]

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(diabetes_X_train, diabetes_y_train)

# Make predictions using the testing set
diabetes_y_pred = regr.predict(diabetes_X_test)

# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(diabetes_y_test, diabetes_y_pred))
# Coefficient of determination (R^2): 1 is perfect prediction
print('Variance score: %.2f' % r2_score(diabetes_y_test, diabetes_y_pred))

# Plot outputs: held-out points in black, fitted line in blue
plt.figure(figsize=(20,10))
plt.scatter(diabetes_X_test, diabetes_y_test,  color='black')
plt.plot(diabetes_X_test, diabetes_y_pred, color='blue', linewidth=3)

# Name the axes so the figure shows which feature is being tested
plt.xlabel(diabetes.feature_names[feature])
plt.ylabel('target')

plt.xticks(())
plt.yticks(())

plt.show()

## Classification

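Linear regression can also be used for classification: fit it to the numeric class labels (here 0 and 1) and threshold its continuous output, for example at 0.5. The points where the output equals the threshold form a line that splits the dataset into its respective classes.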

In [None]:
from sklearn.datasets import make_blobs

# Two blobs of points; ``y`` holds the class label (0 or 1) of each row of ``X``.
X, y = make_blobs(centers=2, random_state=0)

# One scatter call per class so each gets its own colour, marker and legend entry.
class0, class1 = X[y == 0], X[y == 1]
plt.scatter(class0[:, 0], class0[:, 1], c='blue', s=40, label='0')
plt.scatter(class1[:, 0], class1[:, 1], c='red', s=40, label='1', marker='s')

plt.xlabel('first feature')
plt.ylabel('second feature')
plt.legend(loc='upper right');

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the points; ``stratify=y`` is passed so the split is
# stratified on the class labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1234, stratify=y
)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary linear regression directly on the numeric class labels and
# collect its outputs for the held-out points.
classifier = LinearRegression().fit(X_train, y_train)
prediction = classifier.predict(X_test)

We can compare these against the true labels:

In [None]:
# Model outputs on the first line(s), true labels below them.
print(prediction, y_test, sep="\n")

In [None]:
# Score of the fitted model on the held-out split.
classifier.score(X=X_test, y=y_test)

In [None]:
# Score of the fitted model on the data it was trained on, for comparison.
classifier.score(X=X_train, y=y_train)

Let's see what line the linear model made to separate the classes:

In [None]:
# Use the model fitted on the two-feature blobs (``classifier``). The earlier
# version read the coefficients of ``regressor``, the 1-D model fitted on the
# sine data, so the plotted line had nothing to do with this dataset.
w0, w1 = classifier.coef_
bias = classifier.intercept_

# The model outputs w0*x0 + w1*x1 + bias. The classes are labelled 0 and 1,
# so the separating line is where the output equals 0.5:
#     x1 = (0.5 - bias - w0*x0) / w1
# NOTE: this form assumes w1 != 0 (the boundary is not vertical).
x_lo, x_hi = X[:, 0].min(), X[:, 0].max()
min_pt = (0.5 - bias - w0 * x_lo) / w1
max_pt = (0.5 - bias - w0 * x_hi) / w1

plt.plot([x_lo, x_hi], [min_pt, max_pt], c='black', label='decision boundary')
plt.scatter(X[y == 0, 0], X[y == 0, 1], c='blue', s=40, label='0')
plt.scatter(X[y == 1, 0], X[y == 1, 1], c='red', s=40, label='1', marker='s')

plt.xlabel('first feature')
plt.ylabel('second feature')
plt.legend(loc='best');