In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt 

In [2]:
data = pd.read_csv('diamonds.csv', index_col=0)

In [3]:
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
train, test = train_test_split(data, test_size=0.15)

In [5]:
train, validation = train_test_split(train, test_size=0.15)

In [6]:
train_one_hot_cut = pd.get_dummies(train['cut'])
train_one_hot_color = pd.get_dummies(train['color'])
train_one_hot_clarity = pd.get_dummies(train['clarity'])

In [7]:
validation_one_hot_cut = pd.get_dummies(validation['cut'])
validation_one_hot_color = pd.get_dummies(validation['color'])
validation_one_hot_clarity = pd.get_dummies(validation['clarity'])

In [8]:
test_one_hot_cut = pd.get_dummies(test['cut'])
test_one_hot_color = pd.get_dummies(test['color'])
test_one_hot_clarity = pd.get_dummies(test['clarity'])

In [9]:
y_train = train.loc[:,'price']
y_validation = validation.loc[:,'price']
y_test = test.loc[:,'price']

In [10]:
x_train = train.drop('price', axis=1)
x_validation = validation.drop('price', axis=1)
x_test = test.drop('price', axis=1)

In [11]:
x_train = x_train.drop('cut', axis=1)
x_train = x_train.drop('color', axis=1)
x_train = x_train.drop('clarity', axis=1)

In [12]:
x_validation = x_validation.drop('cut', axis=1)
x_validation = x_validation.drop('color', axis=1)
x_validation = x_validation.drop('clarity', axis=1)

In [13]:
x_test = x_test.drop('cut', axis=1)
x_test = x_test.drop('color', axis=1)
x_test = x_test.drop('clarity', axis=1)

In [14]:
x_scaler = StandardScaler().fit(x_train)

In [15]:
x_train.head()

Unnamed: 0,carat,depth,table,x,y,z
12648,1.01,62.7,58.0,6.39,6.37,4.0
15643,1.6,65.0,56.0,7.35,7.26,4.8
31804,0.37,61.9,55.0,4.6,4.66,2.86
48190,0.72,65.8,59.0,5.51,5.58,3.65
30852,0.33,61.2,55.0,4.49,4.46,2.74


In [16]:
x_test.head()

Unnamed: 0,carat,depth,table,x,y,z
2067,0.32,61.5,58.0,4.35,4.4,2.69
12528,1.25,62.0,59.0,6.9,6.87,4.27
25688,0.36,63.0,59.0,4.52,4.49,2.84
14710,0.3,63.8,55.0,4.25,4.28,2.72
18897,1.31,62.2,57.0,7.0,7.03,4.36


In [17]:
x_train_norm = pd.DataFrame(x_scaler.transform(x_train), columns=x_train.columns, index=x_train.index)
x_validation_norm = pd.DataFrame(x_scaler.transform(x_validation), columns=x_validation.columns, index=x_validation.index)
x_test_norm = pd.DataFrame(x_scaler.transform(x_test), columns=x_test.columns, index=x_test.index)

In [18]:
x_train_norm.head()

Unnamed: 0,carat,depth,table,x,y,z
12648,0.44232,0.66136,0.244274,0.582797,0.550741,0.644971
15643,1.684171,2.261176,-0.656073,1.437909,1.327262,1.771001
31804,-0.904774,0.104902,-1.106246,-1.011631,-0.941228,-0.959621
48190,-0.168082,2.817633,0.694447,-0.201056,-0.138531,0.152333
30852,-0.988967,-0.381998,-1.106246,-1.109613,-1.115727,-1.128526


In [19]:
x_test_norm.head()

Unnamed: 0,carat,depth,table,x,y,z
2067,-1.010015,-0.173326,0.244274,-1.234317,-1.168077,-1.198903
12528,0.94748,0.17446,0.694447,1.037075,0.986989,1.025006
25688,-0.925822,0.870032,0.694447,-1.082891,-1.089552,-0.987772
14710,-1.052112,1.426489,-1.106246,-1.323391,-1.272776,-1.156676
18897,1.07377,0.313574,-0.2059,1.126149,1.126588,1.151684


In [20]:
x_train_norm = x_train_norm.join(train_one_hot_cut)
x_train_norm = x_train_norm.join(train_one_hot_color)
x_train_norm = x_train_norm.join(train_one_hot_clarity)

In [21]:
x_validation_norm = x_validation_norm.join(validation_one_hot_cut)
x_validation_norm = x_validation_norm.join(validation_one_hot_color)
x_validation_norm = x_validation_norm.join(validation_one_hot_clarity)

In [22]:
x_test_norm = x_test_norm.join(test_one_hot_cut)
x_test_norm = x_test_norm.join(test_one_hot_color)
x_test_norm = x_test_norm.join(test_one_hot_clarity)

In [23]:
x_train_norm.head()

Unnamed: 0,carat,depth,table,x,y,z,Fair,Good,Ideal,Premium,...,I,J,I1,IF,SI1,SI2,VS1,VS2,VVS1,VVS2
12648,0.44232,0.66136,0.244274,0.582797,0.550741,0.644971,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
15643,1.684171,2.261176,-0.656073,1.437909,1.327262,1.771001,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
31804,-0.904774,0.104902,-1.106246,-1.011631,-0.941228,-0.959621,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
48190,-0.168082,2.817633,0.694447,-0.201056,-0.138531,0.152333,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
30852,-0.988967,-0.381998,-1.106246,-1.109613,-1.115727,-1.128526,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [24]:
x_test_norm.head()

Unnamed: 0,carat,depth,table,x,y,z,Fair,Good,Ideal,Premium,...,I,J,I1,IF,SI1,SI2,VS1,VS2,VVS1,VVS2
2067,-1.010015,-0.173326,0.244274,-1.234317,-1.168077,-1.198903,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
12528,0.94748,0.17446,0.694447,1.037075,0.986989,1.025006,0,0,0,1,...,0,1,0,0,0,0,0,1,0,0
25688,-0.925822,0.870032,0.694447,-1.082891,-1.089552,-0.987772,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
14710,-1.052112,1.426489,-1.106246,-1.323391,-1.272776,-1.156676,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
18897,1.07377,0.313574,-0.2059,1.126149,1.126588,1.151684,0,0,1,0,...,1,0,0,0,0,0,1,0,0,0


# Normal Equation

In [25]:
theta_norm = np.dot(np.linalg.pinv(np.dot(np.transpose(x_train_norm),x_train_norm)),np.dot(np.transpose(x_train_norm),y_train))

In [26]:
y_validation_pred_normal = np.dot(theta_norm, np.transpose(x_validation_norm))

In [27]:
metrics.r2_score(y_validation, y_validation_pred_normal)

0.9214854791961604

In [28]:
metrics.mean_absolute_error(y_validation, y_validation_pred_normal)

737.8968017848838

# Linear Regression

In [79]:
learning_rate=0.001
iterations=100
initial_theta = np.zeros_like(theta_norm)

### Scikit-learn implementation

In [110]:
clf = SGDRegressor(eta0=learning_rate, max_iter=iterations,verbose=True, penalty="None", loss="squared_loss", learning_rate="constant", tol=None, shuffle=False, fit_intercept=False)

In [111]:
clf.fit(x_train_norm, y_train, coef_init=initial_theta)

-- Epoch 1
Norm: 9025.27, NNZs: 26, Bias: 0.000000, T: 38971, Avg. loss: 647550.783251
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 9025.27, NNZs: 26, Bias: 0.000000, T: 77942, Avg. loss: 647546.605103
Total training time: 0.01 seconds.
-- Epoch 3
Norm: 9025.32, NNZs: 26, Bias: 0.000000, T: 116913, Avg. loss: 647547.079885
Total training time: 0.01 seconds.
-- Epoch 4
Norm: 9025.34, NNZs: 26, Bias: 0.000000, T: 155884, Avg. loss: 647547.093292
Total training time: 0.02 seconds.
-- Epoch 5
Norm: 9025.35, NNZs: 26, Bias: 0.000000, T: 194855, Avg. loss: 647547.172694
Total training time: 0.02 seconds.
-- Epoch 6
Norm: 9025.36, NNZs: 26, Bias: 0.000000, T: 233826, Avg. loss: 647547.198539
Total training time: 0.03 seconds.
-- Epoch 7
Norm: 9025.37, NNZs: 26, Bias: 0.000000, T: 272797, Avg. loss: 647547.213976
Total training time: 0.03 seconds.
-- Epoch 8
Norm: 9025.37, NNZs: 26, Bias: 0.000000, T: 311768, Avg. loss: 647547.220717
Total training time: 0.04 seconds.
-- Epoch 9
Norm: 9

SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.001,
       fit_intercept=False, l1_ratio=0.15, learning_rate='constant',
       loss='squared_loss', max_iter=100, n_iter=None, penalty='None',
       power_t=0.25, random_state=None, shuffle=False, tol=None,
       verbose=True, warm_start=False)

In [99]:
y_validation_pred_SKlearn = clf.predict(x_validation_norm)

In [100]:
metrics.r2_score(y_validation, y_validation_pred_SKlearn)

0.921402156170207

In [101]:
metrics.mean_absolute_error(y_validation, y_validation_pred_SKlearn)

744.1835304252023

### Our implementation

In [124]:
%matplotlib qt
def SGD(theta, x, y, learning_rate, iterations):
    m = x.shape[0]
    fig = plt.figure(1)
    plt.xlabel('Iterations')
    plt.ylabel('Average Cost')
    plt.ion()
    plt.show()
    for i in range(1,iterations+1):
        iter_total_cost = 0
        for item,price in zip(x.values, y.values):
            y_pred = np.dot(item, theta)
            loss = y_pred - price
            cost = np.sum(loss ** 2)/2
            gradient = np.dot(item.transpose(), loss)
            theta = theta - learning_rate * gradient
            iter_total_cost += cost
        iter_average_cost = iter_total_cost/m
        print("Epoch {}".format(i))
        print(iter_average_cost)
        plt.scatter(i,iter_average_cost, c='b')
        fig.canvas.draw()
        plt.pause(0.0001)
    return theta

In [125]:
theta_sgd = SGD(initial_theta, x_validation_norm, y_validation,learning_rate, iterations)

Epoch 1
631767.6484996347
Epoch 2
630459.2046998414
Epoch 3
630143.1666578399
Epoch 4
630015.0712351819
Epoch 5
629941.4009434943
Epoch 6
629888.386126996
Epoch 7
629845.2440409795
Epoch 8
629807.7946975959
Epoch 9
629774.081457055
Epoch 10
629743.0305123472
Epoch 11
629713.9840586449
Epoch 12
629686.5105479095
Epoch 13
629660.3139383305
Epoch 14
629635.1841255666
Epoch 15
629610.9674447296
Epoch 16
629587.5482564488
Epoch 17
629564.8371457098
Epoch 18
629542.7632286806
Epoch 19
629521.2690617691
Epoch 20
629500.3072160615
Epoch 21
629479.8379254126
Epoch 22
629459.8274326269
Epoch 23
629440.2467946174
Epoch 24
629421.0709941913
Epoch 25
629402.2782609423
Epoch 26
629383.849538435
Epoch 27
629365.7680567506
Epoch 28
629348.018983298
Epoch 29
629330.5891336162
Epoch 30
629313.4667293406
Epoch 31
629296.641194253
Epoch 32
629280.1029815715
Epoch 33
629263.843427253
Epoch 34
629247.8546252493
Epoch 35
629232.1293212608
Epoch 36
629216.6608222916
Epoch 37
629201.4429196521
Epoch 38
629186.

In [126]:
y_validation_pred_our = np.dot(theta_sgd, np.transpose(x_validation_norm))

In [127]:
metrics.r2_score(y_validation, y_validation_pred_our)

0.9221993843447939

In [128]:
metrics.mean_absolute_error(y_validation, y_validation_pred_our)

739.938566964176

## Feature Selection

In [None]:
model_sfm = SelectFromModel(clf, prefit=True)
feature_mask = model_sfm.get_support()
feature_name = np.array(x_train_norm.columns)
print(np.array(feature_name)[feature_mask==True])

In [None]:
x_train_selected = model_sfm.transform(x_train_norm)
x_test_selected = model_sfm.transform(x_test_norm)

In [None]:
clf.fit(x_train_selected, y_train)

In [None]:
clf.score(x_test_selected, y_test)