# Ridge Regression 

Esercizio pratico su dataset 

Implementazione senza l'utilizzo di GraphLab Create

In [17]:
import graphlab
import numpy as np 
from math import sqrt


In [15]:
sales = graphlab.SFrame("Regression/data/kc_house_data.csv")

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[int,str,float,int,float,int,int,float,int,int,int,int,int,int,int,int,int,float,float,int,int]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [16]:
len(sales)

21613

##Implementazione senza GraphLab Create

In [18]:
def get_numpy_data(data_sframe, features, output):
    data_sframe['constant'] = 1 # aggiungiamo una colonna costante all' SFrame
    #aggiungi la colonna costante prima delle feature considerate:
    features = ['constant'] + features # combino le due liste
    #seleziono le colonne del SFrame contenute all'interno della lista di feature
    features_sframe = data_sframe[features]
    #Converto feature_sframe in una matrice numpy
    feature_matrix = features_sframe.to_numpy()
    #assegna la colonna corrispondente all'output a SArray output_sarray
    output_sarray = data_sframe[output]
    #converto output_sarray in un array_numpy
    output_array = output_sarray.to_numpy()
    return(feature_matrix, output_array)

In [19]:
def predict_output(feature_matrix, weights):
    predictions = []
    for col in range(feature_matrix.shape[0]):
        predictions.append(np.dot(feature_matrix[col,], weights))
    return(predictions)

### Calcolo della derivata

In [20]:
def feature_derivative(errori, feature):
    # Nel caso della feature w[0] la derivata non considera l2_penalty
        derivative = 2 * np.dot(errori, feature)
        return derivative

### Discesa del gradiente con Ridge Regression

In [50]:
def gradient_descent_RR(feature_matrix, output, initial_weights, step_size, l2_penalty, tolerance):
    converged = False
    iterations=0
    weights = np.array(initial_weights) # trasformo i pesi iniziali in un array 
    while not converged and iterations <1000:
    #calcolo le predizioni basandomi sulle feature matrix e sui pesi
        predictions = predict_output(feature_matrix, weights)
        # calcolo gli errori
        iterations=iterations+1
        errors = predictions - output
        gradient_cost=0
        #fino a che non ho raggiunto il grado di tolleranza, aggiorno i pesi delle features
        for i in range(len(weights)): # loop su ogni feature
            # calcolo la derivata per la feature[i]
            derivative = feature_derivative(errors,feature_matrix[:,i])
            if(i==0):
            #sottraggo lo step size * derivata del peso corrente
                weights[i] -= step_size * derivative
                gradient_cost = gradient_cost + derivative**2
            else:
                weights[i] = (1-2*step_size*l2_penalty)*weights[i]- step_size*derivative
                gradient_cost = gradient_cost+((derivative + 2*l2_penalty*weights[i])**2)
                
        #calcolo il costo della ridge
        gradient_magnitude = sqrt(gradient_cost)
        if iterations%10 == 0:
            print("Iterazione: ", iterations , " -> " , gradient_magnitude)
        if gradient_magnitude < tolerance:
            converged = True
    return weights

### Visualizzare gli effetti di L2 penalty

In [22]:
simple_features = ['sqft_living']
my_output = 'price'

In [23]:
train_data, test_data = sales.random_split(.8,seed=0)

In [24]:
(simple_feature_matrix, output) = get_numpy_data(train_data, simple_features, my_output)
(simple_test_feature_matrix, test_output) = get_numpy_data(test_data, simple_features, my_output)

In [25]:
len(output)

17384

In [37]:
initial_weights = np.array([-45000, 1.])
step_size = 1e-11
tolerance = 1e7

In [40]:
simple_weights_0_penalty = gradient_descent_RR(simple_feature_matrix, 
                                               output, 
                                               initial_weights, 
                                               step_size, 
                                               0.00000000001, 
                                               tolerance)

('Iterazione: ', 100, ' -> ', 12073620.313832276)


('Iterazione: ', 200, ' -> ', 12073545.327394407)


('Iterazione: ', 300, ' -> ', 12073476.443402644)


('Iterazione: ', 400, ' -> ', 12073407.559803462)


('Iterazione: ', 500, ' -> ', 12073338.676597288)


('Iterazione: ', 600, ' -> ', 12073269.793783735)


('Iterazione: ', 700, ' -> ', 12073200.911363998)


('Iterazione: ', 800, ' -> ', 12073132.02933677)


('Iterazione: ', 900, ' -> ', 12073063.147702597)


('Iterazione: ', 1000, ' -> ', 12072994.266461425)


In [41]:
simple_weights_high_penalty = gradient_descent_RR(simple_feature_matrix, 
                                                  output, 
                                                  initial_weights, 
                                                  step_size, 
                                                  1000000000, 
                                                  tolerance)

('Iterazione: ', 100, ' -> ', 211379678.16315478)


('Iterazione: ', 200, ' -> ', 211378362.26179492)


('Iterazione: ', 300, ' -> ', 211377088.74680555)


('Iterazione: ', 400, ' -> ', 211375815.23949048)


('Iterazione: ', 500, ' -> ', 211374541.73984706)


('Iterazione: ', 600, ' -> ', 211373268.24787632)


('Iterazione: ', 700, ' -> ', 211371994.7635771)


('Iterazione: ', 800, ' -> ', 211370721.2869528)


('Iterazione: ', 900, ' -> ', 211369447.8179985)


('Iterazione: ', 1000, ' -> ', 211368174.35671824)


### Calcolo del RSS

In [42]:
predictions_1 = predict_output(simple_test_feature_matrix, initial_weights)
residuals_1 = [(predictions_1[i] - test_output[i]) ** 2 for i in range(len(predictions_1))]
print sum(residuals_1)

1.98730283193e+15


In [43]:
predictions_2 = predict_output(simple_test_feature_matrix, simple_weights_0_penalty)
residuals_2 = [(predictions_2[i] - test_output[i]) ** 2 for i in range(len(predictions_2))]
print sum(residuals_2)

2.75353167186e+14


In [44]:
predictions_3 = predict_output(simple_test_feature_matrix, simple_weights_high_penalty)
residuals_3 = [(predictions_3[i] - test_output[i]) ** 2 for i in range(len(predictions_3))]
print sum(residuals_3)

2.75620251705e+14


In [45]:
simple_weights_0_penalty

array([-45000.00816622,    281.10845433])

In [46]:
simple_weights_high_penalty

array([-44997.77615304,    278.01769821])

## Multiple regression con L2 penalty

Consideriamo un modello con 2 feature: `['sqft_living', 'sqft_living15']`.

In [47]:
model_features = ['sqft_living', 'bedrooms'] 
my_output = 'price'
(feature_matrix, train_output) = get_numpy_data(train_data, model_features, my_output)
(test_feature_matrix, test_output) = get_numpy_data(test_data, model_features, my_output)

Definiamo nuovamente i pesi

In [48]:
initial_weights = np.array([0.0,0.0,0.0])
step_size = 1e-12
tolerance = 18e8

In [51]:
multiple_weights_0_penalty = gradient_descent_RR(feature_matrix, 
                                                 train_output, 
                                                 initial_weights, 
                                                 step_size, 
                                                 0, 
                                                 tolerance)

('Iterazione: ', 10, ' -> ', 7937755597232.643)


('Iterazione: ', 20, ' -> ', 1091625055434.2767)


('Iterazione: ', 30, ' -> ', 150134049232.293)


('Iterazione: ', 40, ' -> ', 20723418512.329338)


('Iterazione: ', 50, ' -> ', 3359864284.6231213)


('Iterazione: ', 60, ' -> ', 1838475515.7229328)


In [52]:
multiple_weights_high_penalty = gradient_descent_RR(feature_matrix, 
                                                    train_output, 
                                                    initial_weights, 
                                                    step_size, 
                                                    10000000, 
                                                    tolerance)

('Iterazione: ', 10, ' -> ', 7935854707977.612)


('Iterazione: ', 20, ' -> ', 1091097496443.0938)


('Iterazione: ', 30, ' -> ', 150024897597.9844)


('Iterazione: ', 40, ' -> ', 20703304211.96701)


('Iterazione: ', 50, ' -> ', 3355843521.2111254)


('Iterazione: ', 60, ' -> ', 1835988121.4125319)


In [53]:
predictions_4 = predict_output(test_feature_matrix, initial_weights)
residuals_4 = [(predictions_4[i] - test_output[i]) ** 2 for i in range(len(predictions_4))]
print sum(residuals_4)

1.78427328614e+15


In [54]:
predictions_5 = predict_output(test_feature_matrix, multiple_weights_0_penalty)
residuals_5 = [(predictions_5[i] - test_output[i]) ** 2 for i in range(len(predictions_5))]
print sum(residuals_5)

2.75723729916e+14


In [55]:
predictions_6 = predict_output(test_feature_matrix, multiple_weights_high_penalty)
residuals_6 = [(predictions_6[i] - test_output[i]) ** 2 for i in range(len(predictions_6))]
print sum(residuals_6)

2.75724177402e+14


In [56]:
first = test_data[0]
a, b, c= multiple_weights_0_penalty
p_0 = a + b * first['sqft_living'] + c * first['bedrooms']
print p_0

d, e, f = multiple_weights_high_penalty
p_high = d + e * first['sqft_living'] + f * first['bedrooms']
print p_high

376124.379361
376082.278154


In [57]:
first['price']

310000.0

In [58]:
multiple_weights_0_penalty

array([  8.76903118e-02,   2.63023430e+02,   2.62224338e-01])

In [59]:
multiple_weights_high_penalty

array([  8.83417666e-02,   2.62993981e+02,   2.65808136e-01])