In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from numpy.random import default_rng
rng = default_rng(42)

# Linear Regression

### Prepare data

We start by using linear regression on a simple 1D problem. Our task is to predict the output, y, given the input X.

In [None]:
# Inputs: evenly spaced values in [0, 10) with a step of 0.3.
X = np.arange(0, 10, 0.3)
# Targets: a line with slope 1.5 plus Gaussian noise (mean 0, standard deviation 1.5),
# drawn from the seeded generator so the data is the same on every run.
noise = rng.normal(0, 1.5, size=X.shape)
y = 1.5*X + noise
pd.DataFrame(dict(X=X, y=y)).head()

In [None]:
# Quick look at the raw data: noisy points scattered around a straight line.
plt.scatter(X, y)

### Split data

Regression is a supervised task, and since we are interested in its performance on unseen data, we split our data into two parts:

1. a training set that the learning algorithm uses to fit the model
2. a test set to evaluate the generalization performance of the model

The ``train_test_split`` function from the ``model_selection`` module does that for us -- we will use it to split a dataset into 75% training data and 25% test data.

<img width="50%" src='https://github.com/fordanic/cmiv-ai-course/blob/master/notebooks/figures/train_test_split_matrix.png?raw=1'/>


In [None]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the samples for evaluation; the fixed random_state
# makes the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)
(X_train.shape, X_test.shape)

### Train model

Training a model starts with deciding on its architecture, i.e. the family of functions it is allowed to learn. Our assumption is that the relationship between input and output is linear, so we constrain our model to be linear.

In [None]:
from sklearn.linear_model import LinearRegression

# scikit-learn estimators take a 2D (n_samples, n_features) input, so the
# 1D training vector is turned into a single-column matrix first.
X_train_column = X_train.reshape(-1, 1)
regressor = LinearRegression()
regressor.fit(X_train_column, y_train)

In [None]:
# Show the training points, the held-out test points and the fitted line in
# one figure. Labels and a legend are needed because the colours alone do not
# say which series is which.
plt.plot(X_train, y_train, 'o', c='orange', label='train')
plt.plot(X_test, y_test, 'o', c='r', label='test')
# Predict over the full input range so the line spans both splits.
plt.plot(X, regressor.predict(X[:, np.newaxis]), c='g', label='fitted line')
plt.xlabel('X')
plt.ylabel('y')
plt.legend();

## Multidimensional input

We can use the exact same procedure as above if the input data has more than one dimension.

In [None]:
from sklearn.datasets import load_linnerud
# as_frame=True so the loader returns pandas objects; the cells below use
# dataset.frame, dataset.data and dataset.target.
dataset = load_linnerud(as_frame=True)

In [None]:
# Peek at the first rows of the dataset's frame.
dataset.frame.head()

### Predict the number of situps

We can try to predict the number of situps performed by a person based on their Weight, Waist and Pulse.

In [None]:
# Note the deliberate swap: the inputs (Weight, Waist, Pulse) are taken from
# the dataset's `target` table, while the output (Situps) is a column of its
# `data` table.
X, y = dataset.target, dataset.data['Situps']

In [None]:

# Same 75% / 25% split as before, now on the multi-feature problem.
split = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split
(X_train.shape, X_test.shape)

In [None]:
from sklearn.linear_model import LinearRegression
# A fresh estimator replaces the 1D regressor fitted earlier; X_train is
# passed as-is here (no np.newaxis reshaping as in the 1D case).
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [None]:
def to_df(X, y, regressor):
    """Tabulate the input features next to the true and predicted targets.

    Parameters
    ----------
    X : object exposing ``Weight``, ``Waist`` and ``Pulse`` attributes
        (a DataFrame in this notebook); it is also passed to
        ``regressor.predict``.
    y : the ground-truth values matching the rows of ``X``.
    regressor : any object with a ``predict(X)`` method.

    Returns
    -------
    pandas.DataFrame with the columns ``Weight``, ``Waist``, ``Pulse``,
    ``Ground Truth`` and ``Prediction``, in that order.
    """
    table = {feature: getattr(X, feature) for feature in ('Weight', 'Waist', 'Pulse')}
    table['Ground Truth'] = y
    # Predictions are computed last, after the feature columns were read.
    table['Prediction'] = regressor.predict(X)
    return pd.DataFrame(data=table)

In [None]:
# Predictions next to the ground truth for the held-out test samples.
to_df(X_test, y_test, regressor)

In [None]:
# Same table for the full dataset (training and test samples together).
to_df(X, y, regressor)

In [None]:
from sklearn.decomposition import PCA

# The three input features cannot be drawn on a single axis, so project X
# onto one PCA component purely for visualisation.
pca = PCA(n_components=1)
x_pca = pca.fit_transform(X)
first_component = x_pca[:, 0]
plt.scatter(first_component, y, c='g')

In [None]:
# Compare the true values with the model's predictions along the single PCA
# component. Labels and a legend are needed because nothing else says which
# colour is which.
plt.scatter(x_pca[:, 0], y, c='g', label='ground truth')
plt.scatter(x_pca[:, 0], regressor.predict(X), c='r', label='prediction')
plt.xlabel('PCA component of X')
plt.ylabel('Situps')
plt.legend();

## Logistic regression

We will try to classify the green and orange points using Logistic regression.

In [None]:
# Re-seed so this section produces the same points when run on its own.
rng = default_rng(42)
A = rng.uniform(0, 1, size=(100, 2))

# Points with column 1 < column 0 form the "green" class, all others "orange".
is_green = A[:, 1] < A[:, 0]
green = A[is_green]
orange = A[~is_green]

# Jitter both classes so they overlap slightly near the diagonal. The noise
# comes from the seeded generator, not the global np.random state, so every
# run yields exactly the same data.
green = green + rng.normal(0, 0.1, size=green.shape)
orange = orange + rng.normal(0, 0.1, size=orange.shape)

In [None]:
# Column 1 goes on the horizontal axis and column 0 on the vertical axis.
green_col0, green_col1 = green.T
orange_col0, orange_col1 = orange.T
plt.scatter(green_col1, green_col0, c='green')
plt.scatter(orange_col1, orange_col0, c='orange')

In [None]:
# Stack both classes into one feature matrix: green rows first, then orange.
X = np.vstack((green, orange))
# Matching labels: 0 for every green point, 1 for every orange point.
y = np.repeat([0, 1], [len(green), len(orange)])

### Data split

In [None]:
from sklearn.model_selection import train_test_split
# 75% / 25% split of the classification data; fixed random_state for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
X_train.shape, X_test.shape

### Train model

Here we set the architecture with the assumption that the points can be separated linearly.

In [None]:
from sklearn.linear_model import LogisticRegression
# Estimator created with its default settings and fitted on the training split only.
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

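Let's calculate the decision boundary. It is the line where `c0*x0 + c1*x1 + intercept = 0`; solving for `x0` gives `x0 = -(c1/c0)*x1 - intercept/c0`, which assumes that `c0 != 0`.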

In [None]:
# The boundary satisfies c0*x0 + c1*x1 + intercept = 0. Written as a line over
# t (the values of column 1) it is x0 = k*t + m.
c0, c1 = classifier.coef_[0][0], classifier.coef_[0][1]
t = np.linspace(-0.3, 1.3, 100)
if c0 != 0:
    k, m = -c1/c0, -classifier.intercept_ / c0
else:
    # The line cannot be solved for x0 in this case; fall back to a flat line at 0.
    k, m = 0, 0
decision_boundary = k*t + m

In [None]:
# Column 1 is drawn on the horizontal axis and column 0 on the vertical axis,
# which is the orientation decision_boundary (a function of t) is expressed in.
for points, colour in ((green, 'green'), (orange, 'orange')):
    plt.scatter(points[:, 1], points[:, 0], c=colour)
plt.plot(t, decision_boundary, c='y')

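We can also calculate a score for the model. This is the fraction of samples that are classified correctly (the accuracy). Note that the score used to judge the model should be computed on the test set; the score on the training set is shown afterwards only for comparison.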

In [None]:
# Score on the held-out test split, i.e. on data the classifier was not fitted on.
classifier.score(X_test, y_test)

In [None]:
# Score on the training split, i.e. on the same data the classifier was fitted on.
classifier.score(X_train, y_train)