# Supervised Learning

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_absolute_error

## Seed the random number generator

In [None]:
# One shared, seeded generator so that the random steps below are repeatable
# when the notebook is run from top to bottom.
rng = np.random.RandomState(seed=2)

## Read in dataset

In [None]:
import os

# get_ipython() is injected by IPython/Jupyter and does not exist in a plain
# Python interpreter (e.g. when this notebook is exported to a script), so
# treat its absence as "not running on Colab" instead of crashing.
try:
    _shell_description = str(get_ipython())
except NameError:
    _shell_description = ""

if 'google.colab' in _shell_description:
    # On Colab the datasets live on Google Drive, which has to be mounted first.
    from google.colab import drive
    drive.mount('/content/drive')
    base_dir = "./drive/My Drive/Colab Notebooks/" # You may need to change this, depending on where your notebooks are on Google Drive
else:
    # Anywhere else, look next to the notebook itself.
    base_dir = "."
dataset_dir = os.path.join(base_dir, "datasets")

In [None]:
# Load the synthetic dataset from the datasets directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer=os.path.join(dataset_dir, "synthetic.csv"))

## Take a cheeky look - not too much!

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Print the column names, their dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics; include="all" covers every column, not only numeric ones.
df.describe(include="all")

## Train a linear model (fit)

In [None]:
# Feature matrix: the double brackets select a one-column DataFrame, so X is
# 2-D (n_samples, 1), which is the layout scikit-learn estimators expect.
# to_numpy() is the accessor pandas recommends over .values and always
# yields a NumPy array.
X = df[["feature"]].to_numpy()
# Target vector: single brackets select a Series, so y is 1-D (n_samples,).
y = df["target"].to_numpy()

In [None]:
# Linear regression fitted on the complete dataset (no hold-out yet).
linear_model = LinearRegression()
linear_model.fit(X=X, y=y)

In [None]:
# Learned parameters: the intercept and the coefficient array
# (a single coefficient here, because X has one feature column).
linear_model.intercept_, linear_model.coef_

## Use it for inference (predict)

In [None]:
# predict() takes a 2-D input: one row per example, one column per feature,
# hence the nested list for a single example with feature value 0.65.
linear_model.predict(X=[[0.65]])

## Error estimation - evaluate the model (training error - wrong! - leakage)

In [None]:
# Error measured on the very data the model was fitted on (training error).
# scikit-learn metrics take the ground truth first: metric(y_true, y_pred).
# MAE is symmetric, so the value is unchanged, but keeping the documented
# order stays correct if the metric is later swapped for an asymmetric one.
mean_absolute_error(y, linear_model.predict(X))

## Split into training set and test set (holdout)

In [None]:
# Hold out 20% of the examples as a test set; the shared rng drives the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=rng,
)

In [None]:
# Refit the same estimator, this time on the training portion only.
linear_model.fit(X=X_train, y=y_train)

## Error estimation - evaluate the model (test error)

In [None]:
# Test error: score the model on the held-out examples it was not fitted on.
# Ground truth goes first, matching the (y_true, y_pred) signature.
mean_absolute_error(y_test, linear_model.predict(X_test))

In [None]:
# Training error, for comparison with the test error.
# Ground truth goes first, matching the (y_true, y_pred) signature.
mean_absolute_error(y_train, linear_model.predict(X_train))

## Visualise the training set and the linear model

In [None]:
def plot_scatter_and_line(xs_scatter, ys_scatter, xs_line, ys_line):
    """Draw a scatter plot and a green line on one shared set of axes.

    Args:
        xs_scatter: x-coordinates of the scattered points.
        ys_scatter: y-coordinates of the scattered points.
        xs_line: x-coordinates of the line.
        ys_line: y-coordinates of the line.
    """
    # A new figure per call; only its axes are needed for drawing.
    _, axes = plt.subplots()
    sns.scatterplot(x=xs_scatter, y=ys_scatter, ax=axes)
    sns.lineplot(x=xs_line, y=ys_line, color='g', ax=axes)

In [None]:
# 50 evenly spaced x-values covering [0, 1], used to draw each model's curve.
xs_line = np.linspace(start=0, stop=1, num=50)

In [None]:
# Training points with the fitted straight line on top.
# reshape(-1, 1) turns the 1-D grid into the single-column 2-D array that
# predict() expects, whatever the number of grid points (no hard-coded 50).
plot_scatter_and_line(
    X_train.flatten(),
    y_train,
    xs_line,
    linear_model.predict(xs_line.reshape(-1, 1)),
)

## Try some other models (underfitting and overfitting)

In [None]:
# Degree-2 polynomial regression: expand the feature into polynomial terms,
# then fit a linear model on them. include_bias=False because
# LinearRegression fits its own intercept.
quadratic_model = Pipeline(
    steps=[
        ("poly", PolynomialFeatures(degree=2, include_bias=False)),
        ("predictor", LinearRegression()),
    ]
)

In [None]:
# Fit both pipeline steps on the training set.
quadratic_model.fit(X=X_train, y=y_train)

In [None]:
# Training points with the fitted quadratic curve on top.
# reshape(-1, 1) adapts to however many grid points xs_line holds.
plot_scatter_and_line(
    X_train.flatten(),
    y_train,
    xs_line,
    quadratic_model.predict(xs_line.reshape(-1, 1)),
)

In [None]:
# Degree-3 polynomial regression: polynomial feature expansion followed by
# a linear model fitted on the expanded features.
cubic_model = Pipeline(
    steps=[
        ("poly", PolynomialFeatures(degree=3, include_bias=False)),
        ("predictor", LinearRegression()),
    ]
)

In [None]:
# Fit both pipeline steps on the training set.
cubic_model.fit(X=X_train, y=y_train)

In [None]:
# Training points with the fitted cubic curve on top.
# reshape(-1, 1) adapts to however many grid points xs_line holds.
plot_scatter_and_line(
    X_train.flatten(),
    y_train,
    xs_line,
    cubic_model.predict(xs_line.reshape(-1, 1)),
)

In [None]:
# Degree-100 polynomial regression: a deliberately over-flexible model for
# the overfitting demonstration in this section.
degree_100_model = Pipeline(
    steps=[
        ("poly", PolynomialFeatures(degree=100, include_bias=False)),
        ("predictor", LinearRegression()),
    ]
)

In [None]:
# Fit both pipeline steps on the training set.
degree_100_model.fit(X=X_train, y=y_train)

In [None]:
# Training points with the fitted degree-100 curve on top.
# reshape(-1, 1) adapts to however many grid points xs_line holds.
plot_scatter_and_line(
    X_train.flatten(),
    y_train,
    xs_line,
    degree_100_model.predict(xs_line.reshape(-1, 1)),
)