In [130]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt 

In [131]:
# Load the diamonds dataset; column 0 of the CSV is used as the row index.
data = pd.read_csv("diamonds.csv", index_col=0)

In [132]:
# Preview the first five rows of the raw dataset.
data.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [133]:
# Hold out 15% of the rows as the final test set. The fixed seed makes the split,
# and therefore every number computed downstream, reproducible across kernel restarts.
train, test = train_test_split(data, test_size=0.15, random_state=42)

In [134]:
# Carve a validation set (15% of the remaining rows) out of the training data,
# again with a fixed seed so the split is reproducible.
# NOTE: this rebinds `train`, so re-running this cell on its own shrinks `train` again.
train, validation = train_test_split(train, test_size=0.15, random_state=42)

In [135]:
# One-hot encode the three categorical columns of the training split.
train_one_hot_cut, train_one_hot_color, train_one_hot_clarity = (
    pd.get_dummies(train[column]) for column in ('cut', 'color', 'clarity')
)

In [136]:
# One-hot encode the validation split, then align each frame with the training dummies.
# get_dummies only creates columns for categories present in the frame it is given, so
# without the reindex a category missing from this split would leave the validation
# matrix with fewer columns than the training matrix. Missing categories become all-zero
# columns; categories never seen in training are dropped.
validation_one_hot_cut = pd.get_dummies(validation['cut']).reindex(
    columns=train_one_hot_cut.columns, fill_value=0)
validation_one_hot_color = pd.get_dummies(validation['color']).reindex(
    columns=train_one_hot_color.columns, fill_value=0)
validation_one_hot_clarity = pd.get_dummies(validation['clarity']).reindex(
    columns=train_one_hot_clarity.columns, fill_value=0)

In [137]:
# One-hot encode the test split, then align each frame with the training dummies.
# get_dummies only creates columns for categories present in the frame it is given, so
# without the reindex a category missing from this split would leave the test matrix
# with fewer columns than the training matrix. Missing categories become all-zero
# columns; categories never seen in training are dropped.
test_one_hot_cut = pd.get_dummies(test['cut']).reindex(
    columns=train_one_hot_cut.columns, fill_value=0)
test_one_hot_color = pd.get_dummies(test['color']).reindex(
    columns=train_one_hot_color.columns, fill_value=0)
test_one_hot_clarity = pd.get_dummies(test['clarity']).reindex(
    columns=train_one_hot_clarity.columns, fill_value=0)

In [138]:
# Regression target: the 'price' column of each split.
y_train, y_validation, y_test = (
    split['price'] for split in (train, validation, test)
)

In [139]:
# Feature frames: every column of each split except the target.
x_train, x_validation, x_test = (
    split.drop('price', axis='columns') for split in (train, validation, test)
)

In [140]:
# Remove the raw categorical columns; their one-hot versions are joined back later.
x_train = x_train.drop(['cut', 'color', 'clarity'], axis=1)

In [141]:
# Remove the raw categorical columns; their one-hot versions are joined back later.
x_validation = x_validation.drop(['cut', 'color', 'clarity'], axis=1)

In [142]:
# Remove the raw categorical columns; their one-hot versions are joined back later.
x_test = x_test.drop(['cut', 'color', 'clarity'], axis=1)

In [143]:
# Fit the scaler on the training features only; the same fitted scaler is later
# applied to the validation and test features.
x_scaler = StandardScaler().fit(X=x_train)

In [144]:
# Preview the numeric training features before scaling.
x_train.head(n=5)

Unnamed: 0,carat,depth,table,x,y,z
21438,1.21,59.6,60.0,6.91,6.89,4.11
30661,0.3,61.6,58.0,4.28,4.35,2.66
17895,1.21,62.3,58.0,6.84,6.77,4.24
4446,0.92,59.4,62.0,6.34,6.36,3.77
5306,0.93,61.7,60.0,6.25,6.2,3.84


In [145]:
# Preview the numeric test features before scaling.
x_test.head(n=5)

Unnamed: 0,carat,depth,table,x,y,z
30469,0.31,63.5,58.0,4.3,4.27,2.72
12541,1.2,61.3,56.0,6.8,6.71,4.14
25995,1.53,60.3,58.0,7.51,7.48,4.52
24130,1.62,62.2,56.0,7.48,7.53,4.67
81,0.26,63.4,59.0,4.0,4.04,2.55


In [146]:
# Apply the scaler to every split and wrap the resulting arrays back into DataFrames
# that keep each split's original row index and column labels.
x_train_norm, x_validation_norm, x_test_norm = (
    pd.DataFrame(
        x_scaler.transform(features),
        index=features.index,
        columns=features.columns,
    )
    for features in (x_train, x_validation, x_test)
)

In [147]:
# Preview the scaled training features.
x_train_norm.head(n=5)

Unnamed: 0,carat,depth,table,x,y,z
21438,0.864814,-1.490611,1.134325,1.046573,1.004866,0.818351
30661,-1.051561,-0.103237,0.23861,-1.294219,-1.209533,-1.263407
17895,0.864814,0.382344,0.23861,0.984271,0.900248,1.004991
4446,0.254101,-1.629348,2.030041,0.539253,0.542806,0.330215
5306,0.27516,-0.033868,1.134325,0.45915,0.403316,0.430713


In [148]:
# Preview the scaled test features.
x_test_norm.head(n=5)

Unnamed: 0,carat,depth,table,x,y,z
30469,-1.030502,1.214768,0.23861,-1.276418,-1.279278,-1.177265
12541,0.843755,-0.311343,-0.657105,0.948669,0.84794,0.861422
25995,1.538704,-1.00503,0.23861,1.580594,1.519234,1.406986
24130,1.728235,0.312975,-0.657105,1.553893,1.562824,1.62234
81,-1.135797,1.1454,0.686468,-1.543428,-1.479794,-1.421333


In [149]:
# Append the one-hot columns (cut, color, clarity) to the scaled numeric features.
x_train_norm = (
    x_train_norm
    .join(train_one_hot_cut)
    .join(train_one_hot_color)
    .join(train_one_hot_clarity)
)

In [150]:
# Append the one-hot columns (cut, color, clarity) to the scaled numeric features.
x_validation_norm = (
    x_validation_norm
    .join(validation_one_hot_cut)
    .join(validation_one_hot_color)
    .join(validation_one_hot_clarity)
)

In [151]:
# Append the one-hot columns (cut, color, clarity) to the scaled numeric features.
x_test_norm = (
    x_test_norm
    .join(test_one_hot_cut)
    .join(test_one_hot_color)
    .join(test_one_hot_clarity)
)

In [152]:
# Preview the final training matrix (scaled numerics plus one-hot columns).
x_train_norm.head(n=5)

Unnamed: 0,carat,depth,table,x,y,z,Fair,Good,Ideal,Premium,...,I,J,I1,IF,SI1,SI2,VS1,VS2,VVS1,VVS2
21438,0.864814,-1.490611,1.134325,1.046573,1.004866,0.818351,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
30661,-1.051561,-0.103237,0.23861,-1.294219,-1.209533,-1.263407,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
17895,0.864814,0.382344,0.23861,0.984271,0.900248,1.004991,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4446,0.254101,-1.629348,2.030041,0.539253,0.542806,0.330215,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5306,0.27516,-0.033868,1.134325,0.45915,0.403316,0.430713,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0


In [153]:
# Preview the final test matrix (scaled numerics plus one-hot columns).
x_test_norm.head(n=5)

Unnamed: 0,carat,depth,table,x,y,z,Fair,Good,Ideal,Premium,...,I,J,I1,IF,SI1,SI2,VS1,VS2,VVS1,VVS2
30469,-1.030502,1.214768,0.23861,-1.276418,-1.279278,-1.177265,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
12541,0.843755,-0.311343,-0.657105,0.948669,0.84794,0.861422,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
25995,1.538704,-1.00503,0.23861,1.580594,1.519234,1.406986,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
24130,1.728235,0.312975,-0.657105,1.553893,1.562824,1.62234,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
81,-1.135797,1.1454,0.686468,-1.543428,-1.479794,-1.421333,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


# Normal Equation

In [154]:
# Closed-form least squares: theta = pinv(X^T X) . (X^T y)
design_t = np.transpose(x_train_norm)
gram = np.dot(design_t, x_train_norm)    # X^T X
moment = np.dot(design_t, y_train)       # X^T y
theta_norm = np.dot(np.linalg.pinv(gram), moment)

In [155]:
# Predict validation prices with the normal-equation weights: theta . X^T
y_validation_pred_normal = np.dot(theta_norm, x_validation_norm.T)

In [156]:
# R^2 of the normal-equation predictions on the validation split.
metrics.r2_score(y_true=y_validation, y_pred=y_validation_pred_normal)

0.9188591761733607

In [157]:
# Mean absolute error of the normal-equation predictions on the validation split.
metrics.mean_absolute_error(y_true=y_validation, y_pred=y_validation_pred_normal)

742.5835456801144

# Linear Regression with Stochastic Gradient Descent

In [282]:
# Hyper-parameters shared by the scikit-learn and the hand-written SGD runs.
learning_rate = 0.001  # constant step size
iterations = 1         # number of passes (epochs) over the training rows

### Scikit-learn implementation

In [283]:
# Plain SGD: no regularisation, constant step size, no shuffling, no intercept and no
# early stopping (tol=None), so it runs exactly `iterations` passes over the rows.
# `penalty=None` replaces the string "None", which is not a documented option.
# NOTE(review): loss="squared_loss" is the spelling used by the scikit-learn release this
# notebook was run with; newer releases name it "squared_error" - confirm before upgrading.
clf = SGDRegressor(
    loss="squared_loss",
    penalty=None,
    learning_rate="constant",
    eta0=learning_rate,
    max_iter=iterations,
    tol=None,
    shuffle=False,
    fit_intercept=False,
    verbose=True,
)

In [284]:
# Start from all-zero weights, shaped and typed like the normal-equation solution.
initial_theta = np.zeros(theta_norm.shape, dtype=theta_norm.dtype)

In [285]:
# Train the scikit-learn model, starting from the all-zero weight vector.
clf.fit(X=x_train_norm, y=y_train, coef_init=initial_theta)

-- Epoch 1
Norm: 7553.07, NNZs: 26, Bias: 0.000000, T: 38971, Avg. loss: 1125654.629797
Total training time: 0.00 seconds.


SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.001,
       fit_intercept=False, l1_ratio=0.15, learning_rate='constant',
       loss='squared_loss', max_iter=1, n_iter=None, penalty='None',
       power_t=0.25, random_state=None, shuffle=False, tol=None,
       verbose=True, warm_start=False)

In [267]:
# Predict validation prices with the fitted scikit-learn model.
y_validation_pred_SKlearn = clf.predict(X=x_validation_norm)

In [268]:
# R^2 of the scikit-learn SGD predictions on the validation split.
metrics.r2_score(y_true=y_validation, y_pred=y_validation_pred_SKlearn)

0.9180072055229191

In [269]:
# Mean absolute error of the scikit-learn SGD predictions on the validation split.
metrics.mean_absolute_error(y_true=y_validation, y_pred=y_validation_pred_SKlearn)

752.0713136306576

### Our implementation

In [274]:
%matplotlib qt
def SGD(theta, x, y, learning_rate, iterations, plot=True):
    """Fit linear-regression weights with plain stochastic gradient descent.

    The rows of ``x`` are visited in their given order (no shuffling) and the
    weights are updated after every single row, using the squared-error cost
    ``(x_i . theta - y_i) ** 2 / 2``.

    Parameters
    ----------
    theta : array-like, shape (n_features,)
        Starting weights. The caller's array is not modified.
    x : DataFrame or array-like, shape (n_samples, n_features)
        Feature matrix; converted to a float array.
    y : Series or array-like, shape (n_samples,)
        Target values; converted to a float array.
    learning_rate : float
        Constant step size applied to every update.
    iterations : int
        Number of full passes (epochs) over the rows.
    plot : bool, default True
        When true, the average cost of each epoch is drawn live on matplotlib
        figure 1 (uses the notebook-level ``plt``). Pass ``False`` to run
        without any plotting, e.g. on a machine with no GUI backend.

    Returns
    -------
    numpy.ndarray
        The weights after the last update.

    Raises
    ------
    ValueError
        If ``x`` contains no rows (the average cost would be undefined).
    """
    # Coerce to float arrays so plain ndarrays work as well as DataFrames, and so a
    # frame with mixed column dtypes does not degrade into a slow object array.
    features = np.asarray(x, dtype=float)
    targets = np.asarray(y, dtype=float)
    m = features.shape[0]
    if m == 0:
        raise ValueError("SGD needs at least one training sample")
    theta = np.array(theta, dtype=float)  # private copy of the starting weights

    if plot:
        fig = plt.figure(1)
        plt.xlabel('Iterations')
        plt.ylabel('Average Cost')
        plt.ion()
        plt.show()

    for epoch in range(1, iterations + 1):
        epoch_cost = 0
        for item, price in zip(features, targets):
            loss = np.dot(item, theta) - price
            # The cost is measured before the weights are updated for this row.
            epoch_cost += loss ** 2 / 2
            gradient = item * loss
            theta = theta - learning_rate * gradient
        average_cost = epoch_cost / m
        print("Epoch {}".format(epoch))
        print(average_cost)
        if plot:
            plt.scatter(epoch, average_cost, c='b')
            fig.canvas.draw()
            plt.pause(0.0001)
    return theta

In [277]:
# Reset the starting weights to zeros, shaped and typed like the normal-equation solution.
initial_theta = np.zeros(theta_norm.shape, dtype=theta_norm.dtype)

In [279]:
# Train the hand-written SGD with the same hyper-parameters as the scikit-learn run.
theta_sgd = SGD(
    theta=initial_theta,
    x=x_train_norm,
    y=y_train,
    learning_rate=learning_rate,
    iterations=iterations,
)

Epoch 1
1125654.629796634
Epoch 2
690225.1971911865
Epoch 3
658953.4121862265
Epoch 4
652976.0675854761
Epoch 5
650947.1084133298
Epoch 6
650227.6815821404
Epoch 7
649941.4297210237
Epoch 8
649819.6143616836
Epoch 9
649764.7638501865
Epoch 10
649738.8058536106


In [250]:
# Predict validation prices with the hand-written SGD weights: theta . X^T
y_validation_pred_our = np.dot(theta_sgd, x_validation_norm.T)

In [251]:
# R^2 of the hand-written SGD predictions on the validation split.
metrics.r2_score(y_true=y_validation, y_pred=y_validation_pred_our)

0.9188200392012525

In [252]:
# Mean absolute error of the hand-written SGD predictions on the validation split.
metrics.mean_absolute_error(y_true=y_validation, y_pred=y_validation_pred_our)

743.4625574792572

## Feature Selection

In [None]:
# Wrap the already-fitted clf (prefit=True) in a selector that uses SelectFromModel's
# default threshold, then print the names of the columns it keeps.
model_sfm = SelectFromModel(estimator=clf, prefit=True)
feature_mask = model_sfm.get_support()
feature_name = np.array(x_train_norm.columns)
print(feature_name[feature_mask])

In [None]:
# Reduce the training and test matrices to the selected columns.
x_train_selected, x_test_selected = (
    model_sfm.transform(features) for features in (x_train_norm, x_test_norm)
)

In [None]:
# Refit the regressor on the selected features only (no coef_init this time).
# NOTE(review): clf is the same object that was handed to SelectFromModel(prefit=True);
# refitting it on the reduced matrix presumably makes model_sfm unusable for any further
# transform calls - confirm, or fit a separate estimator here.
clf.fit(X=x_train_selected, y=y_train)

In [None]:
# Score the refitted model on the held-out test split.
clf.score(X=x_test_selected, y=y_test)