# Tutorial Module 1

This tutorial covers linear quantile regression. The task is based on the Instacart example from the lecture.

You are provided with data on the distance between the shop and the buyer and the time spent delivering the buyer's order. You will see how to explore the data and how to build quantile regression models to predict delivery times.

For help, check the sklearn documentation for [linear](https://scikit-learn.org/stable/modules/linear_model.html?highlight=quantile+regression#ordinary-least-squares) and [quantile](https://scikit-learn.org/stable/modules/linear_model.html?highlight=quantile+regression#quantile-regression) regression, as well as for [cross-validation](https://scikit-learn.org/stable/modules/cross_validation.html?highlight=cross+validation).



In [1]:
# import dependencies
# DO NOT MODIFY THIS CELL
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import QuantileRegressor, LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_validate

# some magic
%matplotlib inline
%config InlineBackend.figure_format='retina' # high-resolution plots

In [None]:
# Read the tutorial dataset into the DataFrame "df".
# The file separates its columns with semicolons, so the separator is set explicitly.
url = "https://raw.githubusercontent.com/dsfb2/dsfb2-2023/main/assignment_1/data/a1_tutorial_data.csv"
df = pd.read_csv(url, sep=";")

In [None]:
# Print the first 5 rows of the DataFrame "df"
df.head(5)

In [None]:
# Visualize data: scatter plot of "time_to_buyer" against "distance"
# DO NOT MODIFY THIS CELL
plt.clf()  # clear the current figure before drawing
plt.rcParams.update({'font.size': 10, 'figure.dpi':150})  # set font size and figure dpi
sns.scatterplot(data=df, x="distance", y="time_to_buyer", marker='+')  # rows of df drawn with '+' markers
plt.grid(linestyle='-', linewidth=0.2)  # thin solid grid lines

In [None]:
# Feature matrix X: the "distance" column as a 2-D numpy array, X.shape == (len(X), 1).
# The extra axis is needed because the estimators expect one column per feature.
X = df["distance"].to_numpy()[:, np.newaxis]
# Target vector y: the "time_to_buyer" column as a 1-D numpy array, y.shape == (len(y),).
y = df["time_to_buyer"].to_numpy()

In [None]:
# Quantile levels to estimate, kept in a list named "quantiles"
quantiles = [
    0.1,  # lower quantile
    0.5,  # median
    0.9,  # upper quantile
]

In [None]:
# Empty dictionary that will collect the results: one entry per quantile level
predictions: dict = {}

In [None]:
# Estimate one quantile regression per requested quantile level.
# For each level:
#   - build a QuantileRegressor with that quantile and alpha=0 (no regularization)
#   - fit it to (X, y); fit() returns the fitted estimator itself
#   - predict on the same X and store the result in "predictions" under the quantile level
for quantile in quantiles:
    qr = QuantileRegressor(quantile=quantile, alpha=0, solver="highs").fit(X, y)
    predictions[quantile] = y_pred_qr = qr.predict(X)

In [None]:
# Ordinary least squares baseline:
#   lr        - LinearRegression estimator with default arguments, fitted to (X, y)
#   y_pred_lr - its predicted values for all data points in X
lr = LinearRegression().fit(X, y)
y_pred_lr = lr.predict(X)

In [None]:
# Visualize data: raw observations together with the quantile-regression and OLS predictions
# DO NOT MODIFY THIS CELL
# Line color for each quantile level
colordict = {
  0.1 : "orange",
  0.5 : "green",
  0.9 : "blue"
  }
plt.clf()  # clear the current figure before drawing
plt.rcParams.update({'font.size': 10, 'figure.dpi':300})  # set font size and figure dpi
# One line per entry of "predictions" (key: quantile level, value: predictions on X);
# colordict.get returns None for a quantile level that has no entry in colordict
for quantile, y_pred_qr in predictions.items():
    plt.plot(X, y_pred_qr, label=f"Quantile: {quantile}", color = colordict.get(quantile))
# Linear-regression predictions as a dotted red line, labelled "OLS"
plt.plot(X, y_pred_lr, color="red", linestyle="dotted", label = "OLS")
# Observed data points as small black dots
plt.scatter(X, y, marker = '.', color = "black", s=0.5)
plt.xlabel("distance")
plt.ylabel("time_to_buyer")
plt.legend()

In [None]:
# Linear regression model used in the error comparison:
#   linear_regression        - LinearRegression estimator with default arguments, fitted to (X, y)
#   y_pred_linear_regression - its predicted values for all data points in X
linear_regression = LinearRegression().fit(X, y)
y_pred_linear_regression = linear_regression.predict(X)

In [None]:
# Median regression model used in the error comparison:
#   quantile_regression        - QuantileRegressor with quantile=0.5 (the median) and alpha=0, fitted to (X, y)
#   y_pred_quantile_regression - its predicted values for all data points in X
quantile_regression = QuantileRegressor(quantile=0.5, alpha=0, solver="highs").fit(X, y)
y_pred_quantile_regression = quantile_regression.predict(X)


In [None]:
# Print the results: MAE and MSE of both models, computed from y and the
# predictions each model made on the same X it was fitted on (in-sample error)
# DO NOT MODIFY THIS CELL
print(
  f"""Training error (in-sample performance)
  {linear_regression.__class__.__name__}:
  MAE = {mean_absolute_error(y, y_pred_linear_regression):.3f}
  MSE = {mean_squared_error(y, y_pred_linear_regression):.3f}
  {quantile_regression.__class__.__name__}:
  MAE = {mean_absolute_error(y, y_pred_quantile_regression):.3f}
  MSE = {mean_squared_error(y, y_pred_quantile_regression):.3f}
  """
)

In [None]:
# Run 3-fold cross-validation for the linear regression model and keep the
# per-fold results; both error metrics are requested in their negated form.
cv_results_lr = cross_validate(
    estimator=linear_regression,
    X=X,
    y=y,
    scoring=["neg_mean_absolute_error", "neg_mean_squared_error"],
    cv=3,
)

In [None]:
# Run 3-fold cross-validation for the median (quantile) regression model and keep
# the per-fold results; both error metrics are requested in their negated form.
cv_results_qr = cross_validate(
    estimator=quantile_regression,
    X=X,
    y=y,
    scoring=["neg_mean_absolute_error", "neg_mean_squared_error"],
    cv=3,
)

In [None]:
# Print the results: cross-validated MAE and MSE of both models
# DO NOT MODIFY THIS CELL
# The requested scorers are negated errors ("neg_..."), so the mean of each
# test-score array is multiplied by -1 to report a positive error value.
print(
    f"""Test error (cross-validated performance)
    {linear_regression.__class__.__name__}:
    MAE = {-cv_results_lr["test_neg_mean_absolute_error"].mean():.3f}
    MSE = {-cv_results_lr["test_neg_mean_squared_error"].mean():.3f}
    {quantile_regression.__class__.__name__}:
    MAE = {-cv_results_qr["test_neg_mean_absolute_error"].mean():.3f}
    MSE = {-cv_results_qr["test_neg_mean_squared_error"].mean():.3f}
    """
)