## Problem 2

In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import pandas as pd

Preprocessing Data:

In [2]:
df_data = pd.read_csv('train.csv')
df_dropped = df_data.drop(['zipcode', 'price'], axis=1)
df_dropped = df_dropped.iloc[:, 1:]
train_x = np.array(df_dropped.values)
train_y = np.array(df_data['price'].values)

In [3]:
df_data = pd.read_csv('test.csv')
df_dropped = df_data.drop(['zipcode', 'price', 'id', 'date'], axis=1)
df_dropped = df_dropped.iloc[:, 1:]
test_x = np.array(df_dropped.values)
test_y = np.array(df_data['price'].values)

In [4]:
scaler = StandardScaler()
scaler.fit(train_x)
train_x = scaler.transform(train_x)

scaler = StandardScaler()
scaler.fit(test_x)
test_x = scaler.transform(test_x)

In [5]:
train_y /= 1000
test_y /= 1000

Fitting on model:

In [6]:
reg = LinearRegression().fit(train_x, train_y)

Coefficients:

In [7]:
reg.intercept_

520.414834000001

In [8]:
reg.coef_

array([-12.52196187,  18.52763251,  56.7488368 ,  10.88186845,
         8.04372084,  63.74289956,  48.20010852,  12.96426936,
        92.23147482,  48.29008886,  27.13703247, -67.64311741,
        17.27137953,  78.37573693,  -1.03520308,  45.57765781,
       -12.93009098])

Training R^2:

In [9]:
reg.score(train_x, train_y)

0.7265334318706018

In [10]:
train_y_pred = reg.predict(train_x)

Training MSE:

In [11]:
mean_squared_error(train_y, train_y_pred)

31486.167775794882

Testing R^2:

In [12]:
reg.score(test_x, test_y)

0.6414235000248596

Testing MSE:

In [13]:
test_y_pred = reg.predict(test_x)
mean_squared_error(test_y, test_y_pred)

59784.3655675167

## Problem 3

Insert bias term:

In [14]:
train_x_bias = np.insert(train_x, 0, 1, axis=1)
test_x_bias = np.insert(test_x, 0, 1, axis=1)

Calculate parameters with closed form:

In [15]:
closed_form_params = np.matmul(np.linalg.pinv(np.matmul(train_x_bias.T, train_x_bias)), np.matmul(train_x_bias.T, train_y))

In [16]:
closed_form_params

array([520.414834  , -12.52196187,  18.52763251,  56.7488368 ,
        10.88186845,   8.04372084,  63.74289956,  48.20010852,
        12.96426936,  92.23147482,  48.29008886,  27.13703247,
       -67.64311741,  17.27137953,  78.37573693,  -1.03520308,
        45.57765781, -12.93009098])

Training MSE:

In [17]:
train_y_pred = np.array([sum(closed_form_params * x) for x in train_x_bias]).reshape(-1, 1)

In [18]:
mean_squared_error(train_y, train_y_pred)

31486.167775794875

Training R^2:

In [19]:
r2_score(train_y, train_y_pred)

0.7265334318706018

Testing MSE:

In [20]:
test_y_pred = np.array([sum(closed_form_params * x) for x in test_x_bias]).reshape(-1, 1)

In [21]:
mean_squared_error(test_y, test_y_pred)

59784.36556751629

Testing R^2:

In [22]:
r2_score(test_y, test_y_pred)

0.641423500024862

## Problem 4

Predict polynomial:

In [23]:
def predict_poly(features: np.array, predict_features: np.array, y: np.array, p: int) -> float:
    poly = PolynomialFeatures(p)
    poly_feature_x = poly.fit_transform(features.reshape(-1, 1))
    poly_predict_features = poly.fit_transform(predict_features.reshape(-1, 1))
    
    parameters = np.matmul(np.linalg.pinv(np.matmul(poly_feature_x.T, poly_feature_x)), np.matmul(poly_feature_x.T, y))
    
    return np.sum(poly_predict_features * parameters, axis=1)
    

In [24]:
#get sqft_living 
sqft_living_feature = train_x[:,2]

In [25]:
df = pd.DataFrame(columns=['data', 'p', 'mse', 'r2'])

In [26]:
for p in range(1, 6):
    train_y_pred_poly = predict_poly(train_x[:,2], train_x[:, 2], train_y, p)
    mse = mean_squared_error(train_y, train_y_pred_poly)
    r2 = r2_score(train_y, train_y_pred_poly)
    df_new_row = pd.DataFrame.from_records([{'data':'train', 'p': p, 'mse': mse, 'r2': r2}])
    df = pd.concat([df, df_new_row], sort=False, ignore_index=True)

In [27]:
for p in range(1, 6):
    test_y_pred_poly = predict_poly(train_x[:,2], test_x[:,2], test_y, p)
    mse = mean_squared_error(test_y, test_y_pred_poly)
    r2 = r2_score(test_y, test_y_pred_poly)
    df_new_row = pd.DataFrame.from_records([{'data':'test', 'p': p, 'mse': mse, 'r2': r2}])
    df = pd.concat([df, df_new_row], sort=False, ignore_index=True)

In [28]:
df

Unnamed: 0,data,p,mse,r2
0,train,1,57947.526161,0.496709
1,train,2,54822.665116,0.523849
2,train,3,53785.194716,0.53286
3,train,4,52795.774758,0.541453
4,train,5,52626.111955,0.542927
5,test,1,174142.918463,-0.04448
6,test,2,181222.889992,-0.086944
7,test,3,180735.356371,-0.08402
8,test,4,153144.808508,0.081463
9,test,5,203858.764038,-0.22271


## Problem 5

In [29]:
def grad_descent(iterations: int, alpha: int, train_x: np.array, train_y: np.array) -> np.array:
    
    num_features = train_x.shape[1]
    num_samples = train_x.shape[0]
    params = np.zeros(num_features)
    
    for iteration in range(iterations):
        gradient = grad(params, train_x, train_y, num_features, num_samples)
        update = -alpha * gradient
        params += update
    
    return params

In [30]:
def grad(theta, X, Y, num_features, num_samples):
    
    gradient = np.zeros(num_features)
    
    for training_sample, training_response in zip(X, Y):
        x_theta = np.matmul(theta.T, training_sample)
        gradient += (x_theta - training_response) * training_sample
        
    gradient *= 2/num_samples
    return gradient

In [31]:
train_x = np.insert(train_x, 0, 1, axis=1)

In [32]:
lrs = [0.01, 0.1, 0.5]
iters = [10, 50, 100]

df = pd.DataFrame(columns=['data', 'iter', 'alpha', 'mse', 'r2'])
for alpha in lrs:
    for num_iter in iters:
        
        #training 
        params = grad_descent(num_iter, alpha, train_x, train_y)
        train_y_pred = np.sum(params * train_x, axis=1)
        mse = mean_squared_error(train_y, train_y_pred)
        r2 = r2_score(train_y, train_y_pred)
        df_new_row = pd.DataFrame.from_records([{'data':'train', 'iter': num_iter, 'alpha': alpha, 'mse': f'{mse:e}', 'r2': f'{r2:e}'}])
        df = pd.concat([df, df_new_row], sort=False, ignore_index=True)
        
        #testing
        params = grad_descent(num_iter, alpha, train_x, train_y)
        train_y_pred = np.sum(params * train_x, axis=1)
        mse = mean_squared_error(train_y, train_y_pred)
        r2 = r2_score(train_y, train_y_pred)
        df_new_row = pd.DataFrame.from_records([{'data':'test', 'iter': num_iter, 'alpha': alpha, 'mse': f'{mse:e}', 'r2': f'{r2:e}'}])
        df = pd.concat([df, df_new_row], sort=False, ignore_index=True)

In [33]:
df

Unnamed: 0,data,iter,alpha,mse,r2
0,train,10,0.01,235727.8,-1.047365
1,test,10,0.01,235727.8,-1.047365
2,train,50,0.01,69720.5,0.3944571
3,test,50,0.01,69720.5,0.3944571
4,train,100,0.01,36820.35,0.6802045
5,test,100,0.01,36820.35,0.6802045
6,train,10,0.1,35105.1,0.6951019
7,test,10,0.1,35105.1,0.6951019
8,train,50,0.1,31497.26,0.7264371
9,test,50,0.1,31497.26,0.7264371


## Problem 6

In [34]:
def grad_descent_ridge(iterations: int, alpha: int, lambda_coef: int, train_x: np.array, train_y: np.array) -> np.array:
    
    num_features = train_x.shape[1]
    num_samples = train_x.shape[0]
    params = np.zeros(num_features)
    
    for iteration in range(iterations):
        gradient = grad(params, train_x, train_y, num_features, num_samples)
        
        bias = params[0]
        params *= 1 - 2 * alpha * lamb
        params[0] = bias
        update = -alpha * gradient
        params += update
    
    return params

In [35]:
X = np.random.uniform(-2, 2, 1000)
e = np.random.normal(0, 2, 1000)
Y = 1 + 2*X + e
Y = Y.reshape(-1, 1)
X = X.reshape(-1, 1)
X = np.insert(X, 0, np.ones(1000), axis=1)

In [36]:
df = pd.DataFrame(columns=['model', 'lambda', 'slope', 'mse', 'r2'])
lambdas = [0.1, 1, 10, 100]
for lamb in lambdas:
    params = grad_descent_ridge(200, 0.01, lamb, X, Y)
    train_y_pred = np.sum(params.T * X, axis=1)
    mse = mean_squared_error(Y, train_y_pred)
    r2 = r2_score(Y, train_y_pred)
    slope = params[1]
    df_new_row = pd.DataFrame.from_records([{'model':'ridge', 'lambda': lamb, 'slope': slope, 'mse': mse, 'r2': r2}])
    df = pd.concat([df, df_new_row], sort=False, ignore_index=True)
    
params = grad_descent(200, 0.01, X, Y)
train_y_pred = np.sum(params * X, axis=1)
mse = mean_squared_error(Y, train_y_pred)
r2 = r2_score(Y, train_y_pred)
slope = params[1]
df_new_row = pd.DataFrame.from_records([{'model':'linear', 'lambda': None, 'slope': slope, 'mse': mse, 'r2': r2}])
df = pd.concat([df, df_new_row], sort=False, ignore_index=True)

In [37]:
df

Unnamed: 0,model,lambda,slope,mse,r2
0,ridge,0.1,1.816324,4.24256,0.538916
1,ridge,1.0,1.107781,5.159925,0.439216
2,ridge,10.0,0.225132,8.122721,0.117218
3,ridge,100.0,-4.306942,55.200867,-4.999262
4,linear,,1.953132,4.215302,0.541879
