In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from scipy.stats import pearsonr
import itertools
#import numba

## Linear regression with weight decay regularization

In [4]:
# Read the diamonds table from the repository-relative CSV and show its (rows, columns).
data = pd.read_csv(filepath_or_buffer='data/diamonds.csv')
data.shape

(53940, 10)

In [5]:
# Remove rows with missing values first, then rows that are exact duplicates.
# NOTE: `price` is not cast to float64 here; that cast was tried earlier and left disabled.
data = data.dropna().drop_duplicates()
data.shape

(53794, 10)

In [6]:
# Preview the first five rows of the cleaned table.
data.head(n=5)


Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [7]:
def split_X_and_y(dataset, lab_filter, features_filter):
    """Separate a table into its feature columns and its label columns.

    Parameters
    ----------
    dataset : indexable table (e.g. a pandas DataFrame)
        Object that is indexed with each of the two selectors.
    lab_filter : column selector
        Selector for the label (target) part.
    features_filter : column selector
        Selector for the feature part.

    Returns
    -------
    tuple
        ``(features, labels)`` -- note that the features come first in the
        result even though ``lab_filter`` precedes ``features_filter`` in
        the argument list.
    """
    features = dataset[features_filter]
    labels = dataset[lab_filter]
    return features, labels

In [9]:
class LinearRegression:
    """Linear regression with an intercept, fitted in closed form or by SGD.

    Two solvers are available:

    * ``"cf"`` -- closed-form ridge solution ``(A^T A + weight_decay * I) w = A^T y``
      where ``A`` is the training matrix with a leading column of ones.
      The penalty is applied to every weight, the intercept included.
    * ``"gd"`` -- plain per-sample stochastic gradient descent on the squared
      error, visiting the rows in their stored order for ``max_iter`` epochs.
      ``weight_decay`` is not used by this solver.

    Weights are learned the first time a prediction is requested after
    ``fit`` and are then reused, so repeated ``predict`` calls return the
    same values. Calling ``fit`` again discards the learned weights.

    Parameters
    ----------
    solver : str, default "cf"
        ``"cf"`` or ``"gd"``; any other value makes ``predict`` raise.
    learning_rate : float, default 0.0001
        Step size of the SGD solver.
    max_iter : int, default 100
        Number of full passes over the training rows made by the SGD solver.
    weight_decay : float, default 0.0001
        Ridge penalty used by the closed-form solver.

    Attributes
    ----------
    coef_ : numpy.ndarray or None
        Weights used for the most recent prediction, intercept first;
        ``None`` until a prediction has been made.
    """

    def __init__(self, solver = "cf", learning_rate = 0.0001, max_iter = 100, weight_decay = 0.0001):
        # Fixed seed: the SGD initial weights are reproducible per instance.
        self.rng = np.random.default_rng(0)
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.solver = solver
        self.weight_decay = weight_decay
        self.coef_ = None
        # Learned weights, keyed by solver name and the hyper-parameters used.
        self._weights = {}

    def fit(self, X_train, y_train):
        """Store the training data; weights are learned on the first prediction.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
        y_train : array-like with n_samples values (flattened to 1-D)

        Raises
        ------
        ValueError
            If ``X_train`` is not 2-dimensional or the row counts differ.
        """
        X = np.asarray(X_train, dtype=float)
        y = np.asarray(y_train, dtype=float).ravel()
        if X.ndim != 2:
            raise ValueError("X_train must be 2-dimensional (n_samples, n_features)")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X_train and y_train must have the same number of rows")
        self.X_train = X
        self.y_train = y
        # New data invalidates anything learned from the previous data.
        self._weights = {}
        self.coef_ = None

    def predict(self, X_test):
        """Return a 1-D array of predictions for ``X_test`` using ``self.solver``.

        Raises
        ------
        ValueError
            If ``self.solver`` is neither ``"cf"`` nor ``"gd"``.
        """
        if self.solver == "cf":
            return self.linear_regression_PI(X_test)
        if self.solver == "gd":
            return self.linear_regression_SGD(X_test)
        raise ValueError("Solver not implemented")

    @staticmethod
    def _add_intercept(X):
        """Return ``X`` as a float matrix with a leading column of ones."""
        X = np.asarray(X, dtype=float)
        return np.c_[np.ones((X.shape[0], 1)), X]

    def linear_regression_PI(self, X_test):
        """Predict with the closed-form ridge weights (learned once per fit)."""
        key = ("cf", self.weight_decay)
        w = self._weights.get(key)
        if w is None:
            A = self._add_intercept(self.X_train)
            gram = A.T @ A + self.weight_decay * np.eye(A.shape[1])
            # Solve the normal equations directly instead of forming an
            # explicit inverse: same solution, better numerical behaviour.
            w = np.linalg.solve(gram, A.T @ self.y_train[:, np.newaxis])
            self._weights[key] = w
        self.coef_ = w.ravel().copy()
        return (self._add_intercept(X_test) @ w).ravel()

    def linear_regression_SGD(self, X_test):
        """Predict with weights learned by per-sample SGD (learned once per fit)."""
        key = ("gd", self.learning_rate, self.max_iter)
        w = self._weights.get(key)
        if w is None:
            A = self._add_intercept(self.X_train)
            y = self.y_train
            lr = self.learning_rate
            m, n_params = A.shape
            # Size the weights from the fitted data, not from a notebook global.
            w = self.rng.normal(scale=0.0001, size=(n_params, 1))
            for _ in range(self.max_iter):
                for i in range(m):
                    row = A[[i]]  # keep 2-D: shape (1, n_params)
                    w += lr * (y[i] - row @ w) * row.T
            self._weights[key] = w
        self.coef_ = w.ravel().copy()
        return (self._add_intercept(X_test) @ w).ravel()

    def rmse(self, y_test, y_pred):
        """Root mean squared error between targets and predictions.

        Both inputs are flattened first so that an (m, 1) column of targets
        and an (m,) vector of predictions are compared element by element
        rather than broadcast into an (m, m) matrix.
        """
        residuals = np.asarray(y_test, dtype=float).ravel() - np.asarray(y_pred, dtype=float).ravel()
        return np.sqrt(np.mean(residuals ** 2))

In [10]:
def features_filter():
    """Return a new list with the names of the numeric feature columns."""
    return ['carat', 'depth', 'table', 'x', 'y', 'z']

In [11]:
def labels_filter():
    """Return a new single-element list naming the label column."""
    return ['price']

In [13]:
numer_of_rows = data.shape[0]
# Shuffle once with a fixed seed, then cut at 75% and 90% of the rows:
# 75% for train, 15% for validation and 10% for test.
train_end = int(.75 * numer_of_rows)
validate_end = int(.90 * numer_of_rows)
shuffled = data.sample(frac = 1, random_state = 0)
# Positional slices give the same three pieces as np.split, without going
# through DataFrame.swapaxes, which pandas has deprecated.
train = shuffled.iloc[:train_end].reset_index(drop=True)
validate = shuffled.iloc[train_end:validate_end].reset_index(drop=True)
test = shuffled.iloc[validate_end:].reset_index(drop=True)

validate.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1.13,Premium,H,SI2,62.3,57.0,4873,6.68,6.65,4.15
1,1.02,Very Good,G,IF,62.3,59.0,8624,6.36,6.45,3.99
2,0.72,Ideal,E,SI2,61.3,55.0,2401,5.75,5.84,3.55
3,1.2,Premium,H,VS2,62.4,57.0,6048,6.73,6.67,4.18
4,1.0,Very Good,H,SI1,62.2,59.0,4830,6.32,6.35,3.94


In [14]:
# Column selectors shared by all three partitions.
feat_filter, lab_filter = features_filter(), labels_filter()
# Split each partition into (features, labels), in the order test, train, validate.
(X_test, y_test), (X_train, y_train), (X_validate, y_validate) = (
    split_X_and_y(partition, lab_filter, feat_filter)
    for partition in (test, train, validate)
)
print(y_train)

       price
0        432
1       6964
2       2037
3       8451
4       1581
...      ...
40340    878
40341   3690
40342    840
40343   2032
40344   5599

[40345 rows x 1 columns]


In [15]:
# Closed-form solver (the default): fit on the training split, score on the test split.
lr = LinearRegression()
lr.fit(X_train=X_train.values, y_train=y_train['price'].values)
predictions = lr.predict(X_test=X_test)
lr.rmse(y_test=y_test['price'], y_pred=predictions)

1442.10555073594

In [16]:
# SGD solver: 10 passes over the training rows with step size 1e-4, scored on the test split.
lr_gd = LinearRegression(solver="gd", learning_rate=0.0001, max_iter=10)
lr_gd.fit(X_train=X_train.values, y_train=y_train['price'].values)
predictions = lr_gd.predict(X_test=X_test)
lr_gd.rmse(y_test=y_test['price'], y_pred=predictions)

1622.0522864136026