In [1]:
import numpy as np
import pandas as pd
import scipy
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler

### a)

In [2]:
train = pd.read_csv('train.data.csv', index_col=0)
test = pd.read_csv('test.data.csv', index_col=0)

In [3]:
# select relevant columns for train and test data
X_train = train[['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot']]
y_train = train['price']
X_test = test[['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot']]
y_test = test['price']

In [4]:
lr = LinearRegression(fit_intercept=True).fit(X_train, y_train)

# R squared on training data
print('The R squared on the training data is:', lr.score(X_train, y_train))

# R squared on testing data
y_pred = lr.predict(X_test)
r2_test = r2_score(y_test, y_pred)
print('The R squared on the testing data is:', r2_test)

The R squared on the training data is: 0.5101138530794578
The R squared on the testing data is: 0.5049944614037092


### b)

In [5]:
# read in fancy hosue data
fancyhouse = pd.read_csv('fancyhouse.csv', index_col=0)
fancyhouse

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,zipcode,condition,grade,waterfront,view,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15
1,8,25,50000,225000,4,98039,10,10,1,4,37500,12500,1994,2010,47.627606,-122.242054,5000,40000


In [6]:
# predict the price of fancy house
prediction = lr.predict(fancyhouse[['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot']])
print("The predicted price of Bill Gates's house is:", float(prediction))

The predicted price of Bill Gates's house is: 15436769.538224256


### c)

In [7]:
# add interaction to train and test
train['bed_bath'] = train['bedrooms'] * train['bathrooms']
test['bed_bath'] = test['bedrooms'] * test['bathrooms']

# create new training and test data with the interaction
X_train2 = train[['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'bed_bath']]
X_test2 = test[['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'bed_bath']]

In [8]:
lr = LinearRegression(fit_intercept=True).fit(X_train2, y_train)

# R squared on training data
print('The R squared on the training data is:', lr.score(X_train2, y_train))

# R squared on testing data
y_pred = lr.predict(X_test2)
r2_test = r2_score(y_test, y_pred)
print('The R squared on the testing data is:', r2_test)

The R squared on the training data is: 0.5173532927738305
The R squared on the testing data is: 0.5105355458590675


### d)

In [9]:
def LinearRegressionGD(X, y, step_size=0.01, max_iter=10000, tau=0.01):
        '''
        INPUTS
        X: predictors
        y: response
        step_size: learning parameter
        max_iter: max number of iterations
        tau: threshold to terminate the algo based on norm of gradient
        
        OUTPUT
        beta: vector of regression coefficients
        '''
        n = X.shape[0]
        p = X.shape[1]
        
        # initial guess
        beta = np.zeros(p)
        
        # to count iterations
        iter_counter = 0

        # perform gradient descent for n iterations
        while True:
            
            # compute gradient
            grad = X.T @ (X @ beta - y)
            
            grad_norm = np.linalg.norm(grad)
            
            # update beta
            beta = beta - step_size * grad
            
            iter_counter += 1
            
            # if norm of gradient falls below threshold or max iterations is reached then end
            if grad_norm < tau or iter_counter >= max_iter: 
                break
            
        return beta

In [10]:
# scale the data for faster convergence
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [11]:
# R squared
def R2(y_true, y_pred):
    r = scipy.stats.linregress(y_true, y_pred).rvalue
    return r**2

#### repeat part (a)

In [12]:
beta_gd = LinearRegressionGD(X_train, y_train, step_size=0.00005, max_iter=10000, tau=0.01)
beta_gd

array([-55655.71729065,   2842.59687337, 290948.51916708, -16587.41536863])

In [13]:
# train and test predictions
y_pred_train = X_train @ beta_gd
y_pred_test = X_test @ beta_gd

print('The R squared on the training data using GD is:', R2(y_train, y_pred_train))
print('The R squared on the testing data using GD is:',  R2(y_test, y_pred_test))

The R squared on the training data using GD is: 0.5101138530794578
The R squared on the testing data using GD is: 0.5052642488895052


#### repeat part (b)

In [14]:
fh = fancyhouse[['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot']]

# scale fancyhouse data using mean and sd from training data
fh_scaled = scaler.transform(fh)
prediction_gd = fh_scaled @ beta_gd
print("The predicted price of Bill Gates's house using GD is:", float(prediction_gd))

The predicted price of Bill Gates's house using GD is: 14920927.696733491


#### repeat part (c)

In [15]:
# scale the data with the extra feature
X_train2 = scaler.fit_transform(X_train2)
X_test2 = scaler.fit_transform(X_test2)

beta_gd2 = LinearRegressionGD(X_train2, y_train, step_size=0.000009, max_iter=10000, tau=0.01)
beta_gd2

array([-120802.95365028,  -85709.65010885,  283634.61471033,
        -16503.48524082,  143171.1945134 ])

In [16]:
# train and test predictions
y_pred_train2 = X_train2 @ beta_gd2
y_pred_test2 = X_test2 @ beta_gd2

print('The R squared on the training data using GD with added feature is:', R2(y_train, y_pred_train2))
print('The R squared on the testing data using GD with added feature is:',  R2(y_test, y_pred_test2))

The R squared on the training data using GD with added feature is: 0.5173532927738306
The R squared on the testing data using GD with added feature is: 0.5108344665471076


### e)

In [17]:
def LinearRegressionSGD(X, y, step_size=0.0001, n_iters=15000, m=1):
        '''
        INPUTS
        X: predictors
        y: response
        step_size: learning parameter
        n_iters: number of iterations
        m: batch size
        
        OUTPUT
        beta: vector of regression coefficients
        '''
        n = X.shape[0]
        p = X.shape[1]
        
        # initial guess
        beta = np.zeros(p)

        # perform stochastic gradient descent for n iterations
        for i in range(n_iters):
            
            # randomly select m observations from X and y
            index = np.random.choice(n, size=m)
            X_new = X[index, :]
            y_new = np.array(y)[index]
            
            # compute gradient
            grad = X_new.T @ (X_new @ beta - y_new)
            
            grad_norm = np.linalg.norm(grad)
            
            # update beta
            beta = beta - (1/m) * step_size * grad
            
        return beta

#### repeat part (a)

In [18]:
beta_sgd = LinearRegressionSGD(X_train, y_train, 0.0008, 18000, m=1)
beta_sgd

array([-60805.0000969 ,  12123.38312398, 284186.53705644,  -9433.35103072])

In [19]:
y_pred_train = X_train @ beta_sgd
y_pred_test = X_test @ beta_sgd

print('The R squared on the training data using SGD is:', R2(y_train, y_pred_train))
print('The R squared on the testing data using SGD is:',  R2(y_test, y_pred_test))

The R squared on the training data using SGD is: 0.5093902312458867
The R squared on the testing data using SGD is: 0.5050856219424475


#### repeat part (b)

In [20]:
prediction_gd = fh_scaled @ beta_sgd
print("The predicted price of Bill Gates's house using SGD is:", float(prediction_gd))

The predicted price of Bill Gates's house using SGD is: 14850946.714640113


#### repeat part (c)

In [21]:
beta_sgd2 = LinearRegressionSGD(X_train2, y_train, 0.0009, 15000, m=1)
beta_sgd2

array([-8.32837206e+04, -4.05848896e+04,  2.58308896e+05,  2.55632281e+02,
        5.73456465e+04])

In [22]:
y_pred_train2 = X_train2 @ beta_sgd2
y_pred_test2 = X_test2 @ beta_sgd2

print('The R squared on the training data using SGD with added feature is:', R2(y_train, y_pred_train2))
print('The R squared on the testing data using SGD with added feature is:',  R2(y_test, y_pred_test2))

The R squared on the training data using SGD with added feature is: 0.5121015492618722
The R squared on the testing data using SGD with added feature is: 0.506415576370023
