# Regression Week 4: Ridge Regression (interpretation)

In [32]:
import graphlab
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

# Polynomial regression, revisited

In [1]:
def polynomial_sframe(feature, degree):
    """Build an SFrame holding successive integer powers of `feature`.

    Parameters
    ----------
    feature : column of values supporting ``.apply`` (e.g. ``sales['sqft_living']``)
    degree  : highest power to generate

    Returns
    -------
    A ``graphlab.SFrame`` whose column 'power_1' is `feature` itself and whose
    columns 'power_2' ... 'power_<degree>' hold each element raised to that
    power. When `degree` is not greater than 1 only 'power_1' is present.
    """
    powers = graphlab.SFrame()
    powers['power_1'] = feature

    # Nothing beyond the first power was requested.
    if not degree > 1:
        return powers

    # range() excludes its endpoint, hence degree + 1 to include `degree` itself.
    for exponent in range(2, degree + 1):
        column = 'power_{0}'.format(exponent)
        powers[column] = feature.apply(lambda value: value ** exponent)
    return powers

In [3]:
# Load the house-sales data from the 'kc_house_data.gl/' path (relative to the
# working directory) into an SFrame named `sales`.
sales = graphlab.SFrame('kc_house_data.gl/') # Read the data

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: C:\Users\Darshan\AppData\Local\Temp\graphlab_server_1486240216.log.0


This non-commercial license of GraphLab Create for academic use is assigned to darshanb@umd.edu and will expire on January 08, 2018.


In [4]:
# Re-bind `sales` to a copy sorted on the columns 'sqft_living' (the feature)
# and 'price' (the target), in that order.
sales = sales.sort(['sqft_living','price']) # Feature and target

In [10]:
# Near-zero L2 penalty; passed as `l2_penalty` to the models fitted in the
# following cells.
l2_small_penalty = 1e-5 # Small regularization penalty

In [20]:
poly15_data = polynomial_sframe(sales['sqft_living'], 15)
my_features = poly15_data.column_names() # get the name of the features
poly15_data['price'] = sales['price'] # add price to the data since it's the target

#Run Linear Regression
model15 = graphlab.linear_regression.create(poly15_data, target = 'price', features = my_features,
                                            validation_set = None,l2_penalty=l2_small_penalty,verbose=False)

print "Co-efficients of model trained on entire dataset for the freature: sqft_living is:\n", 
(model15.get('coefficients')).print_rows(num_rows=16)

Co-efficients of model trained on entire dataset for the freature: sqft_living is:
+-------------+-------+--------------------+--------+
|     name    | index |       value        | stderr |
+-------------+-------+--------------------+--------+
| (intercept) |  None |   167924.857726    |  nan   |
|   power_1   |  None |   103.090951289    |  nan   |
|   power_2   |  None |   0.13460455096    |  nan   |
|   power_3   |  None | -0.000129071363752 |  nan   |
|   power_4   |  None | 5.18928955754e-08  |  nan   |
|   power_5   |  None | -7.77169299595e-12 |  nan   |
|   power_6   |  None | 1.71144842837e-16  |  nan   |
|   power_7   |  None | 4.51177958161e-20  |  nan   |
|   power_8   |  None | -4.78839816249e-25 |  nan   |
|   power_9   |  None | -2.33343499941e-28 |  nan   |
|   power_10  |  None | -7.29022428496e-33 |  nan   |
|   power_11  |  None | 7.22829146954e-37  |  nan   |
|   power_12  |  None |  6.9047076722e-41  |  nan   |
|   power_13  |  None | -3.65843768148e-46 |  nan   |

# Observe overfitting

Split the sales data into four subsets of roughly equal size and call them `set_1`, `set_2`, `set_3`, and `set_4`. Use the `.random_split` function and make sure you set `seed=0`.

In [18]:
# Halve the data, then halve each half again, to obtain four subsets of
# roughly equal size. Every split uses seed=0.
semi_split1, semi_split2 = sales.random_split(0.5, seed=0)
set_1, set_2 = semi_split1.random_split(0.5, seed=0)
set_3, set_4 = semi_split2.random_split(0.5, seed=0)

In [19]:
# Run a linear regression on sqft_living for a 15th order polynomial 
set_1_data = polynomial_sframe(set_1['sqft_living'], 15)
my_features = set_1_data.column_names()
set_1_data['price'] = set_1['price'] 
model_set_1 = graphlab.linear_regression.create(set_1_data, target = 'price', features = my_features,
                                            validation_set = None,l2_penalty=l2_small_penalty,verbose=False)
print "Co-efficients of model trained on set_1 subset for the freature: sqft_living is:\n", 
model_set_1.get('coefficients').print_rows(num_rows=16)

Co-efficients of model trained on set_1 subset for the freature: sqft_living is:
+-------------+-------+--------------------+-------------------+
|     name    | index |       value        |       stderr      |
+-------------+-------+--------------------+-------------------+
| (intercept) |  None |   9306.46397814    |   835063.417729   |
|   power_1   |  None |    585.86581347    |   3484.52405144   |
|   power_2   |  None |  -0.397305884724   |   5.85325120701   |
|   power_3   |  None | 0.000141470894825  |  0.00510563501632 |
|   power_4   |  None | -1.52945974394e-08 | 2.39686540035e-06 |
|   power_5   |  None | -3.79756526062e-13 | 3.43613043735e-10 |
|   power_6   |  None |  5.9748184732e-17  |        nan        |
|   power_7   |  None | 1.06888505979e-20  |        nan        |
|   power_8   |  None | 1.59344052349e-25  |        nan        |
|   power_9   |  None | -6.9283495515e-29  | 3.72130313789e-25 |
|   power_10  |  None | -6.83813368045e-33 |        nan        |
|   power

In [21]:
# Run a linear regression on sqft_living for a 15th order polynomial 
set_2_data = polynomial_sframe(set_2['sqft_living'], 15)
my_features = set_2_data.column_names()
set_2_data['price'] = set_2['price'] 
model_set_2 = graphlab.linear_regression.create(set_2_data, target = 'price', features = my_features,
                                            validation_set = None,l2_penalty=l2_small_penalty,verbose=False)
print "Co-efficients of model trained on set_2 subset for the freature: sqft_living is:\n", 
model_set_2.get('coefficients').print_rows(num_rows=16)

Co-efficients of model trained on set_2 subset for the freature: sqft_living is:
+-------------+-------+--------------------+-------------------+
|     name    | index |       value        |       stderr      |
+-------------+-------+--------------------+-------------------+
| (intercept) |  None |   -25115.9059869   |   1675771.52039   |
|   power_1   |  None |   783.493802508    |   9984.85507514   |
|   power_2   |  None |  -0.767759300173   |   25.2592296765   |
|   power_3   |  None | 0.000438766361934  |  0.0358329790553  |
|   power_4   |  None | -1.15169161152e-07 | 3.17792630727e-05 |
|   power_5   |  None | 6.84281148707e-12  | 1.85840552557e-08 |
|   power_6   |  None |  2.5119522464e-15  | 7.34916239474e-12 |
|   power_7   |  None | -2.06440624344e-19 | 1.97638603378e-15 |
|   power_8   |  None | -4.59673058828e-23 | 3.56090427394e-19 |
|   power_9   |  None | -2.71277342492e-29 | 3.98355514126e-23 |
|   power_10  |  None | 6.21818505057e-31  |        nan        |
|   power

In [22]:
# Run a linear regression on sqft_living for a 15th order polynomial 
set_3_data = polynomial_sframe(set_3['sqft_living'], 15)
my_features = set_3_data.column_names()
set_3_data['price'] = set_3['price'] 
model_set_3 = graphlab.linear_regression.create(set_3_data, target = 'price', features = my_features,
                                            validation_set = None,l2_penalty=l2_small_penalty,verbose=False)
print "Co-efficients of model trained on set_3 subset for the freature: sqft_living is:\n", 
model_set_3.get('coefficients').print_rows(num_rows=16)

Co-efficients of model trained on set_3 subset for the freature: sqft_living is:
+-------------+-------+--------------------+-------------------+
|     name    | index |       value        |       stderr      |
+-------------+-------+--------------------+-------------------+
| (intercept) |  None |   462426.565731    |        nan        |
|   power_1   |  None |   -759.251842854   |        nan        |
|   power_2   |  None |    1.0286700473    |        nan        |
|   power_3   |  None | -0.000528264527386 |        nan        |
|   power_4   |  None | 1.15422908385e-07  |        nan        |
|   power_5   |  None | -2.26095948062e-12 |        nan        |
|   power_6   |  None | -2.08214287571e-15 |        nan        |
|   power_7   |  None | 4.08770475709e-20  |        nan        |
|   power_8   |  None |  2.570791329e-23   |        nan        |
|   power_9   |  None | 1.24311265196e-27  |        nan        |
|   power_10  |  None | -1.72025834939e-31 |        nan        |
|   power

In [23]:
# Run a linear regression on sqft_living for a 15th order polynomial 
set_4_data = polynomial_sframe(set_4['sqft_living'], 15)
my_features = set_4_data.column_names()
set_4_data['price'] = set_4['price'] 
model_set_4 = graphlab.linear_regression.create(set_4_data, target = 'price', features = my_features,
                                            validation_set = None,l2_penalty=l2_small_penalty,verbose=False)
print "Co-efficients of model trained on set_4 subset for the freature: sqft_living is:\n", 
model_set_4.get('coefficients').print_rows(num_rows=16)

Co-efficients of model trained on set_4 subset for the freature: sqft_living is:
+-------------+-------+--------------------+-------------------+
|     name    | index |       value        |       stderr      |
+-------------+-------+--------------------+-------------------+
| (intercept) |  None |   -170240.034791   |   1417346.17184   |
|   power_1   |  None |   1247.59035088    |   8978.28059127   |
|   power_2   |  None |   -1.2246091264    |   23.6158213076   |
|   power_3   |  None | 0.000555254626787  |  0.0340561499439  |
|   power_4   |  None | -6.38262361929e-08 | 2.98955350115e-05 |
|   power_5   |  None | -2.20215996475e-11 | 1.65791592065e-08 |
|   power_6   |  None | 4.81834697594e-15  | 5.63745618764e-12 |
|   power_7   |  None |  4.2146163248e-19  | 8.27510918329e-16 |
|   power_8   |  None | -7.99880749051e-23 |        nan        |
|   power_9   |  None | -1.32365907706e-26 |        nan        |
|   power_10  |  None | 1.60197797139e-31  |  5.0301150238e-27 |
|   power

# Ridge regression comes to the rescue

Generally, whenever we see weights change so much in response to change in data, we believe the variance of our estimate to be large. Ridge regression aims to address this issue by penalizing "large" weights. 

# Now let's increase the L2 penalty

In [26]:
# Run a linear regression on sqft_living for a 15th order polynomial 
set_1_data = polynomial_sframe(set_1['sqft_living'], 15)
my_features = set_1_data.column_names()
set_1_data['price'] = set_1['price'] 
model_set_1 = graphlab.linear_regression.create(set_1_data, target = 'price', features = my_features,
                                            validation_set = None,l2_penalty=1e5,verbose=False)
print"\tFor high l2 penalty\n"
print "Co-efficients of model trained on set_1 subset for the freature: sqft_living is:\n", 
model_set_1.get('coefficients').print_rows(num_rows=16)

	For high l2 penalty

Co-efficients of model trained on set_1 subset for the freature: sqft_living is:
+-------------+-------+-------------------+-------------------+
|     name    | index |       value       |       stderr      |
+-------------+-------+-------------------+-------------------+
| (intercept) |  None |   530317.024516   |    1256668.1989   |
|   power_1   |  None |   2.58738875673   |   5243.78205387   |
|   power_2   |  None |  0.00127414400592 |   8.80842639712   |
|   power_3   |  None | 1.74934226932e-07 |  0.00768335556793 |
|   power_4   |  None | 1.06022119097e-11 | 3.60698895641e-06 |
|   power_5   |  None | 5.42247604482e-16 | 5.17095558996e-10 |
|   power_6   |  None | 2.89563828343e-20 |        nan        |
|   power_7   |  None | 1.65000666351e-24 |        nan        |
|   power_8   |  None | 9.86081528409e-29 |        nan        |
|   power_9   |  None | 6.06589348254e-33 | 5.60010558786e-25 |
|   power_10  |  None |  3.7891786887e-37 |        nan        |
|

In [27]:
set_2_data = polynomial_sframe(set_2['sqft_living'], 15)
my_features = set_2_data.column_names()
set_2_data['price'] = set_2['price'] 
model_set_2 = graphlab.linear_regression.create(set_2_data, target = 'price', features = my_features,
                                            validation_set = None,l2_penalty=1e5,verbose=False)
print"\tFor high l2 penalty\n"
print "Co-efficients of model trained on set_2 subset for the freature: sqft_living is:\n", 
model_set_2.get('coefficients').print_rows(num_rows=16)

	For high l2 penalty

Co-efficients of model trained on set_2 subset for the freature: sqft_living is:
+-------------+-------+-------------------+-------------------+
|     name    | index |       value       |       stderr      |
+-------------+-------+-------------------+-------------------+
| (intercept) |  None |   519216.897383   |   2309581.08725   |
|   power_1   |  None |   2.04470474182   |   13761.3225669   |
|   power_2   |  None |  0.0011314362684  |    34.812764407   |
|   power_3   |  None | 2.93074277549e-07 |  0.0493857126218  |
|   power_4   |  None | 4.43540598453e-11 | 4.37988019645e-05 |
|   power_5   |  None | 4.80849112204e-15 | 2.56129084549e-08 |
|   power_6   |  None | 4.53091707826e-19 |  1.0128759361e-11 |
|   power_7   |  None | 4.16042910575e-23 | 2.72389388957e-15 |
|   power_8   |  None | 3.90094635128e-27 | 4.90770792111e-19 |
|   power_9   |  None |  3.7773187602e-31 | 5.49021361344e-23 |
|   power_10  |  None | 3.76650326842e-35 |        nan        |
|

In [28]:
set_3_data = polynomial_sframe(set_3['sqft_living'], 15)
my_features = set_3_data.column_names()
set_3_data['price'] = set_3['price'] 
model_set_3 = graphlab.linear_regression.create(set_3_data, target = 'price', features = my_features,
                                            validation_set = None,l2_penalty=1e5,verbose=False)
print"\tFor high l2 penalty\n"
print "Co-efficients of model trained on set_3 subset for the freature: sqft_living is:\n", 
model_set_3.get('coefficients').print_rows(num_rows=16)

	For high l2 penalty

Co-efficients of model trained on set_3 subset for the freature: sqft_living is:
+-------------+-------+-------------------+-------------------+
|     name    | index |       value       |       stderr      |
+-------------+-------+-------------------+-------------------+
| (intercept) |  None |   522911.518048   |        nan        |
|   power_1   |  None |   2.26890421877   |        nan        |
|   power_2   |  None |  0.00125905041842 |        nan        |
|   power_3   |  None | 2.77552918155e-07 |        nan        |
|   power_4   |  None |  3.2093309779e-11 |        nan        |
|   power_5   |  None | 2.87573572364e-15 |        nan        |
|   power_6   |  None | 2.50076112671e-19 |        nan        |
|   power_7   |  None | 2.24685265906e-23 |        nan        |
|   power_8   |  None | 2.09349983135e-27 |        nan        |
|   power_9   |  None | 2.00435383296e-31 |        nan        |
|   power_10  |  None | 1.95410800249e-35 |        nan        |
|

In [29]:
set_4_data = polynomial_sframe(set_4['sqft_living'], 15)
my_features = set_4_data.column_names()
set_4_data['price'] = set_4['price'] 
model_set_4 = graphlab.linear_regression.create(set_4_data, target = 'price', features = my_features,
                                            validation_set = None,l2_penalty=1e5,verbose=False)
print"\tFor high l2 penalty\n"
print "Co-efficients of model trained on set_4 subset for the freature: sqft_living is:\n", 
model_set_4.get('coefficients').print_rows(num_rows=16)

	For high l2 penalty

Co-efficients of model trained on set_4 subset for the freature: sqft_living is:
+-------------+-------+-------------------+-------------------+
|     name    | index |       value       |       stderr      |
+-------------+-------+-------------------+-------------------+
| (intercept) |  None |   513667.087087   |   1874267.58319   |
|   power_1   |  None |   1.91040938244   |   11872.6819173   |
|   power_2   |  None |  0.00110058029175 |   31.2290456676   |
|   power_3   |  None | 3.12753987879e-07 |  0.0450351079477  |
|   power_4   |  None | 5.50067886825e-11 | 3.95332017452e-05 |
|   power_5   |  None | 7.20467557825e-15 | 2.19239175825e-08 |
|   power_6   |  None | 8.24977249384e-19 | 7.45484878293e-12 |
|   power_7   |  None | 9.06503223498e-23 | 1.09428234243e-15 |
|   power_8   |  None | 9.95683160453e-27 |        nan        |
|   power_9   |  None | 1.10838127982e-30 |        nan        |
|   power_10  |  None | 1.25315224143e-34 | 6.65171410918e-27 |
|

The learned coefficients (and hence the fitted curves) should vary a lot less across the four subsets, now that you have applied a high degree of regularization.

# Selecting an L2 penalty via cross-validation

In [30]:
# Hold out a test portion (random_split fraction 0.9, seed=1), then shuffle the
# remaining train/validation rows for use in cross-validation.
train_valid, test = sales.random_split(0.9, seed=1)
train_valid_shuffled = graphlab.toolkits.cross_validation.shuffle(
    train_valid, random_seed=1)

In [35]:
#Converting the training data for sqft living to a 15th order polynomial
train_poly=polynomial_sframe(train_valid['sqft_living'],15)
my_f=train_poly.column_names()
train_poly['price']=train_valid['price']

#Converting the test data for sqft living to a 15th order polynomial
t=polynomial_sframe(test['sqft_living'],15)
my_f1=t.column_names()
t['price']=test['price']

#Creating a Linear Regression model with l2 penalty as 1e3
model_f=graphlab.linear_regression.create(train_poly,target='price',features=my_f,validation_set=None,l2_penalty=1000,verbose=False)
print "The RSS of model with l2 penalty as 1000 is:",(np.sum(np.square(model_f.predict(t)-t['price'])))

The RSS of model with l2 penalty as 1000 is: 1.28780855058e+14


Now we are ready to implement k-fold cross-validation. Write a function that computes k validation errors by designating each of the k segments as the validation set. It accepts as parameters (i) `k`, (ii) `l2_penalty`, (iii) dataframe, (iv) name of output column (e.g. `price`) and (v) list of feature names. The function returns the average validation error using k segments as validation sets.

* For each i in [0, 1, ..., k-1]:
  * Compute starting and ending indices of segment i and call 'start' and 'end'
  * Form validation set by taking a slice (start:end+1) from the data.
  * Form training set by appending slice (end+1:n) to the end of slice (0:start).
  * Train a linear model using training set just formed, with a given l2_penalty
  * Compute validation error using validation set just formed

In [38]:
# Function to create a k-fold cross validation
# Cross validation is used to select the parameters that minimize the RSS
import numpy as np
def k_fold_cross_validation(k, l2_penalty, data, output_name, features_list):
    """Return the average validation RSS of a ridge model over k folds.

    The rows of `data` are cut into k contiguous segments. Each segment is used
    once as the validation set while a model is trained on all remaining rows.

    Parameters
    ----------
    k             : number of folds
    l2_penalty    : L2 penalty passed to graphlab.linear_regression.create
    data          : sliceable table (SFrame) holding features and target
    output_name   : name of the target column (e.g. 'price')
    features_list : list of feature column names

    Returns
    -------
    float : sum of the per-fold residual sums of squares divided by k.
    """
    n = len(data)
    total_rss = 0
    for i in range(k):
        # Inclusive bounds of segment i. Floor division keeps them integers
        # regardless of whether '/' means true division in this interpreter.
        start = (n * i) // k
        end = (n * (i + 1)) // k - 1

        validation = data[start:end + 1]
        # Training set is everything before the segment followed by everything after it.
        training = data[0:start].append(data[end + 1:n])

        model = graphlab.linear_regression.create(
            training,
            target=output_name,
            features=features_list,
            validation_set=None,
            l2_penalty=l2_penalty,
            verbose=False)

        # Compare against the requested target column, not a hard-coded 'price'.
        residuals = model.predict(validation) - validation[output_name]
        total_rss = total_rss + np.sum(np.square(residuals))

    # Average RSS across the k folds for this l2_penalty.
    avg_validation_error = float(total_rss) / k
    return avg_validation_error

In [41]:
a=polynomial_sframe(train_valid_shuffled['sqft_living'],15) # Create a 15th order polynomial
my_f=a.column_names()
a['price']=train_valid_shuffled['price']
#  Calculate the RSS for each l2_penlaty value to find the  one that minimizes the RSS on validation set
for l2 in np.logspace(1,7,num=13):
    print "The RSS for l2_penalty of ",np.log10(l2),"is", '%3e' %k_fold_cross_validation(10,l2,a,'price',my_f)
print "** The l2_penalty are powers of 10**"   

The RSS for l2_penalty of  1.0 is 4.918264e+14
The RSS for l2_penalty of  1.5 is 2.875042e+14
The RSS for l2_penalty of  2.0 is 1.609090e+14
The RSS for l2_penalty of  2.5 is 1.220910e+14
The RSS for l2_penalty of  3.0 is 1.211923e+14
The RSS for l2_penalty of  3.5 is 1.239500e+14
The RSS for l2_penalty of  4.0 is 1.368372e+14
The RSS for l2_penalty of  4.5 is 1.717281e+14
The RSS for l2_penalty of  5.0 is 2.293614e+14
The RSS for l2_penalty of  5.5 is 2.529406e+14
The RSS for l2_penalty of  6.0 is 2.586825e+14
The RSS for l2_penalty of  6.5 is 2.628194e+14
The RSS for l2_penalty of  7.0 is 2.648890e+14
** The l2_penalty are powers of 10**


In [42]:
from sklearn.linear_model import LinearRegression,Ridge
# Convert `a` (degree-15 polynomial features of the shuffled train/validation
# rows plus 'price') with to_dataframe() so it can be handed to scikit-learn.
train=a.to_dataframe()

In [43]:
# NOTE: this rebinds `test`. It previously held the held-out split returned by
# sales.random_split; from here on it is the to_dataframe() conversion of `t`
# (the polynomial test features plus 'price').
test=t.to_dataframe()

In [44]:

# Fit a scikit-learn ridge model (alpha=0.1, normalize=True) on the polynomial
# training features and print its residual sum of squares on the test rows.
model_sk = Ridge(alpha=0.1, normalize=True)
model_sk.fit(train[my_f], train['price'])
print(np.sum(np.square(model_sk.predict(test[my_f]) - test['price'])))

1.27406290628e+14


In [45]:
# Disabled draft: a scikit-learn (Ridge) variant of k_fold_cross_validation.
# Every line below is commented out, so this cell defines nothing when run.
# import numpy as np
# def sk_fold_cross_validation(k, l2_penalty, data, output_name, features_list):
#     avg=0
#     n=len(data)
#     for i in range(k):
#         start = (n*i)/k
#         end = (n*(i+1))/k - 1
#         v=data[start:end+1]
#         f=data[0:start]
#         s=data[end+1:n]
#         t=f.append(s)
#         t_df=t.to_dataframe()
#         v_df=v.to_dataframe()
#         model_sk=Ridge(alpha=l2_penalty,normalize=True)
#         model_sk.fit(t_df[features_list],t['price'])   
#         avg=avg+np.sum(np.square(model_sk.predict(v_df[features_list])-v_df['price']))
#     avg_validation_error=float(avg)/k
#     return avg_validation_error

## Thank You!