# Section 2

In [1]:
import cvxpy as cp
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

class color:
    """ANSI escape sequences for styling printed text.

    Prepend one of the codes to a string and append ``color.END`` to it
    so that the styling is switched off again after the text.
    """

    # --- text attributes ---
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # --- foreground colours ---
    DARKCYAN = '\033[36m'
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'

    # --- reset: terminates any styling started by the codes above ---
    END = '\033[0m'

In [2]:
# Usage demo: wrap the text between a style code and color.END.
print('To use color, just append color.BOLD to the beginning of the printed string and color.END to the end:')
print(color.BOLD, 'Like This!', color.END, sep='')

To use color, just append color.BOLD to the beginning of the printed string and color.END to the end:
[1mLike This![0m


Implement the mean absolute error:
$$
MAE = \frac{1}{N}\sum_{i=1}^N |y_i-x_i^\top\theta|
$$

In [3]:
def get_MAE(theta, X, y):
    """Return the mean absolute error of the linear model ``X @ theta``.

    MAE = (1/N) * sum_i |y_i - x_i^T theta|

    Parameters
    ----------
    theta : array-like, shape (d,) or (d, 1)
        Model weights.
    X : array-like, shape (N, d)
        Design matrix, one sample per row.
    y : array-like, shape (N,) or (N, 1)
        Targets.

    Returns
    -------
    float
        The mean absolute error as a plain scalar.

    Raises
    ------
    ValueError
        If there are no samples, or if the number of predictions does not
        match the number of targets.
    """
    # Flatten both sides so a 1-D ``y`` and a column-vector prediction are
    # compared element-wise instead of being broadcast to an (N, N) matrix.
    predictions = np.asarray(np.asarray(X) @ np.asarray(theta), dtype=float).reshape(-1)
    targets = np.asarray(y, dtype=float).reshape(-1)

    if targets.size == 0:
        raise ValueError("get_MAE requires at least one sample")
    if predictions.shape != targets.shape:
        raise ValueError(
            "shape mismatch: {} predictions vs {} targets".format(
                predictions.size, targets.size
            )
        )

    return float(np.mean(np.abs(targets - predictions)))

In [4]:
from sklearn.datasets import load_diabetes

diabetes = load_diabetes()

# 50/50 train/test split with a fixed seed; the target is passed as a
# column vector so that Y and Y_test come out with shape (N, 1).
X, X_test, Y, Y_test = train_test_split(
    diabetes['data'],
    diabetes['target'].reshape(-1, 1),
    test_size=0.5,
    random_state=0,
)

# Prepend a constant column of ones so the first weight acts as the intercept.
X = np.column_stack((np.ones(X.shape[0]), X))
X_test = np.column_stack((np.ones(X_test.shape[0]), X_test))

In [5]:
# Print the description text stored under the 'DESCR' key of the loaded dataset.
print(diabetes['DESCR'])

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, T-Cells (a type of white blood cells)
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, thyroid stimulating hormone
      - s5      ltg, lamotrigine
      - s6      glu, blood sugar level

Note: Each of these 10 feature va

In [6]:
# Shape of the training design matrix after the bias column was prepended.
X.shape

(221, 11)

## Question 2.2
Implement below the mean-absolute error regression with LASSO. Use $\lambda=0.5$. Hints: in the X matrix, rows represent data samples. Also, don't forget to add the `1` column to capture the intercept. (Use the `GLPK` solver)

In [7]:
# Regularisation strength prescribed by the exercise.
lambd = 0.5

# LP variables:
#   u     - one upper bound per training sample on the absolute residual
#   v     - one upper bound per weight on its absolute value
#   theta - the regression weights (first entry multiplies the bias column)
u = cp.Variable((X.shape[0], 1))
v = cp.Variable((X.shape[1], 1))
theta = cp.Variable((X.shape[1], 1))

# Objective: mean of the residual bounds plus lambda times the sum of the
# weight bounds (i.e. MAE + lambda * ||theta||_1 at the optimum).
data_term = 1/X.shape[0] * cp.sum(u)
penalty_term = lambd * cp.sum(v)
objective = cp.Minimize(data_term + penalty_term)

# Epigraph constraints: -u <= Y - X theta <= u  and  -v <= theta <= v.
constraints = [
    Y - X @ theta <= u,
    -Y + X @ theta <= u,
    theta <= v,
    -theta <= v,
]

problem = cp.Problem(objective, constraints)
# Last expression of the cell: displays the optimal objective value.
problem.solve(solver=cp.GLPK)

120.48642533936652

In [8]:
# Keep the solved weights as a plain array; theta.value is whatever the last solve stored on the variable.
theta_final = theta.value

In [9]:
# Display the fitted weight vector (first entry multiplies the bias column).
theta_final

array([[81.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.]])

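We note that all weights in theta are 0 except for the intercept. The regularization strength $\lambda = 0.5$ is therefore too high: the L1 penalty drives every feature weight to zero, so the model reduces to a constant prediction and oversimplifies (underfits) the problem.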

In [10]:
# Report the MAE of the Question 2.2 fit on the training and test halves.
print(color.BOLD, 'Training Results', color.END, sep='')
print('MAE:', get_MAE(theta_final, X, Y))
print(end='\n\n')
print(color.BOLD, 'Test Results', color.END, sep='')
print('MAE:', get_MAE(theta_final, X_test, Y_test))

[1mTraining Results[0m
MAE: [79.98642534]


[1mTest Results[0m
MAE: [80.30769231]


## Question 2.3
Implement Cross-Validation for your MAE LASSO regression. You may recycle any functions used above. Hint: Use the `sklearn` function `train_test_split`, which can be used to randomly split the data.

To keep the cross-validation independent of Question 2.2, we use a different data split and separate variables. We append `_h` to the name of each variable or matrix to make the distinction.

In [11]:
# Hold-out split used only for selecting lambda (75% fit / 25% validation).
# NOTE(review): this split is drawn from the full dataset, so its samples can
# overlap the Question 2.2 test set (X_test); a test MAE reported for a lambda
# chosen here may therefore be optimistic -- confirm this is intended.
X_h, X_h_test, Y_h, Y_h_test = train_test_split(diabetes['data'],
                                        np.expand_dims(diabetes['target'], 1),
                                        test_size=0.25, random_state=0)

# adding a bias term
X_h = np.hstack([np.ones([X_h.shape[0], 1]), X_h])
X_h_test = np.hstack([np.ones([X_h_test.shape[0], 1]), X_h_test])

# Candidate regularisation strengths, log-spaced between 1e-5 and 1e-1.
lambdas = np.logspace(-5.0, -1.0, num=50)

u_h = cp.Variable((X_h.shape[0], 1))
v_h = cp.Variable((X_h.shape[1], 1))
theta_h = cp.Variable((X_h.shape[1], 1))

constraints_h = [ Y_h - X_h @ theta_h <= u_h,
               -Y_h + X_h @ theta_h <= u_h,
                theta_h <= v_h,
               -theta_h <= v_h,
    ]

# lambda enters the objective as a Parameter, so the problem is built once
# and only the parameter value changes between solves.
lam_h = cp.Parameter(nonneg=True)
objective_h = cp.Minimize(1/X_h.shape[0] * cp.sum(u_h) + lam_h * cp.sum(v_h))
prob_h = cp.Problem(objective_h, constraints_h)

best_mae = np.inf  # setting initial value for comparing MAEs
best_theta = None  # stays None if no candidate yields a usable solution
best_lambda = None

for lam in lambdas:

    lam_h.value = lam
    prob_h.solve(solver=cp.GLPK)

    weights = theta_h.value

    print("Lambda: " + str(lam))
    if weights is None:
        # The solver did not return a solution; skip this candidate instead
        # of failing inside get_MAE with an opaque TypeError.
        print('Solver status: {} (skipped)'.format(prob_h.status))
        print('------------------------------')
        continue

    # Each MAE is computed once and reused for both reporting and selection.
    train_mae = get_MAE(weights, X_h, Y_h)
    val_mae = get_MAE(weights, X_h_test, Y_h_test)

    print(color.BOLD + 'Training Results' + color.END)
    print('MAE: {}'.format(train_mae))
    print(color.BOLD + 'Validation Results' + color.END)
    print('MAE: {}'.format(val_mae))
    print('------------------------------')

    if val_mae < best_mae:  # check if current MAE is lower than the best
        best_mae = val_mae
        best_theta = weights
        best_lambda = lam

if best_lambda is None:
    raise RuntimeError('No lambda produced a usable solution; check the solver status above.')

print("Best Lambda: " + str(best_lambda))
print('Best validation MAE: {}'.format(best_mae))

Lambda: 1e-05
[1mTraining Results[0m
MAE: [42.44648107]
[1mValidation Results[0m
MAE: [46.51662089]
------------------------------
Lambda: 1.2067926406393289e-05
[1mTraining Results[0m
MAE: [42.44648107]
[1mValidation Results[0m
MAE: [46.51662089]
------------------------------
Lambda: 1.4563484775012445e-05
[1mTraining Results[0m
MAE: [42.44648107]
[1mValidation Results[0m
MAE: [46.51662089]
------------------------------
Lambda: 1.757510624854793e-05
[1mTraining Results[0m
MAE: [42.44648107]
[1mValidation Results[0m
MAE: [46.51662089]
------------------------------
Lambda: 2.1209508879201926e-05
[1mTraining Results[0m
MAE: [42.44648107]
[1mValidation Results[0m
MAE: [46.51662089]
------------------------------
Lambda: 2.559547922699533e-05
[1mTraining Results[0m
MAE: [42.4465326]
[1mValidation Results[0m
MAE: [46.51800284]
------------------------------
Lambda: 3.0888435964774785e-05
[1mTraining Results[0m
MAE: [42.4465326]
[1mValidation Results[0m
MAE: [4

We can see that the lowest validation MAE is reached at $\lambda = 0.004941713361323833$, which we round to $0.005$. We will use this value to determine the test performance on the initial data split defined in Question 2.2.

In [12]:
# Rounded value of the lambda selected by the validation search.
best_lam = 0.005

# Reuse the Question 2.2 variables (u, v, theta) and constraints; only the
# weight on the L1 term of the objective changes.
data_term = 1/X.shape[0] * cp.sum(u)
penalty_term = best_lam * cp.sum(v)
objective = cp.Minimize(data_term + penalty_term)

prob = cp.Problem(objective, constraints)
prob.solve(solver=cp.GLPK)

# Weights of the refit on the Question 2.2 training split.
best_theta = theta.value

In [13]:
# Report the MAE of the refit model on the Question 2.2 training and test halves.
print(color.BOLD, 'Training Results', color.END, sep='')
print('MAE:', get_MAE(best_theta, X, Y))
print(end='\n\n')
print(color.BOLD, 'Test Results', color.END, sep='')
print('MAE:', get_MAE(best_theta, X_test, Y_test))

[1mTraining Results[0m
MAE: [43.63419122]


[1mTest Results[0m
MAE: [45.75799648]
