In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt 

In [2]:
data = pd.read_csv('diamonds.csv', index_col=0)

In [3]:
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
train, test = train_test_split(data, test_size=0.15)

In [5]:
train, validation = train_test_split(train, test_size=0.15)

In [6]:
train_one_hot_cut = pd.get_dummies(train['cut'])
train_one_hot_color = pd.get_dummies(train['color'])
train_one_hot_clarity = pd.get_dummies(train['clarity'])

In [7]:
validation_one_hot_cut = pd.get_dummies(validation['cut'])
validation_one_hot_color = pd.get_dummies(validation['color'])
validation_one_hot_clarity = pd.get_dummies(validation['clarity'])

In [8]:
test_one_hot_cut = pd.get_dummies(test['cut'])
test_one_hot_color = pd.get_dummies(test['color'])
test_one_hot_clarity = pd.get_dummies(test['clarity'])

In [9]:
y_train = train.loc[:,'price']
y_validation = validation.loc[:,'price']
y_test = test.loc[:,'price']

In [10]:
x_train = train.drop('price', axis=1)
x_validation = validation.drop('price', axis=1)
x_test = test.drop('price', axis=1)

In [11]:
x_train = x_train.drop('cut', axis=1)
x_train = x_train.drop('color', axis=1)
x_train = x_train.drop('clarity', axis=1)

In [12]:
x_validation = x_validation.drop('cut', axis=1)
x_validation = x_validation.drop('color', axis=1)
x_validation = x_validation.drop('clarity', axis=1)

In [13]:
x_test = x_test.drop('cut', axis=1)
x_test = x_test.drop('color', axis=1)
x_test = x_test.drop('clarity', axis=1)

In [14]:
x_scaler = StandardScaler().fit(x_train)

In [15]:
x_train.head()

Unnamed: 0,carat,depth,table,x,y,z
9799,1.24,62.2,58.0,6.94,6.82,4.28
39316,0.53,60.8,58.0,5.19,5.21,3.16
35700,0.3,60.8,57.0,4.36,4.33,2.64
1140,0.65,61.3,56.0,5.58,5.61,3.43
4930,1.11,63.2,59.0,6.55,6.46,4.11


In [16]:
x_test.head()

Unnamed: 0,carat,depth,table,x,y,z
24592,1.2,62.3,60.0,6.74,6.8,4.22
33669,0.34,62.3,57.0,4.44,4.48,2.78
6588,1.01,61.3,59.0,6.5,6.46,3.97
10532,1.01,63.6,56.0,6.38,6.33,4.04
21155,1.01,60.0,55.0,6.56,6.6,3.95


In [17]:
x_train_norm = pd.DataFrame(x_scaler.transform(x_train), columns=x_train.columns, index=x_train.index)
x_validation_norm = pd.DataFrame(x_scaler.transform(x_validation), columns=x_validation.columns, index=x_validation.index)
x_test_norm = pd.DataFrame(x_scaler.transform(x_test), columns=x_test.columns, index=x_test.index)

In [18]:
x_train_norm.head()

Unnamed: 0,carat,depth,table,x,y,z
9799,0.93838,0.312731,0.243907,1.081895,0.951429,1.047443
39316,-0.566479,-0.668222,0.243907,-0.48387,-0.459097,-0.535941
35700,-1.053969,-0.668222,-0.203542,-1.22649,-1.230068,-1.271084
1140,-0.312137,-0.317882,-0.650992,-0.134928,-0.108656,-0.154232
4930,0.662842,1.013412,0.691357,0.732953,0.636032,0.807108


In [19]:
x_test_norm.head()

Unnamed: 0,carat,depth,table,x,y,z
24592,0.853599,0.382799,1.138806,0.902951,0.933907,0.962619
33669,-0.969188,0.382799,-0.203542,-1.154913,-1.098652,-1.073161
6588,0.45089,-0.317882,0.691357,0.688217,0.636032,0.609185
10532,0.45089,1.293684,-0.650992,0.58085,0.522139,0.708147
21155,0.45089,-1.228767,-1.098442,0.7419,0.758687,0.580911


In [20]:
x_train_norm = x_train_norm.join(train_one_hot_cut)
x_train_norm = x_train_norm.join(train_one_hot_color)
x_train_norm = x_train_norm.join(train_one_hot_clarity)

In [21]:
x_validation_norm = x_validation_norm.join(validation_one_hot_cut)
x_validation_norm = x_validation_norm.join(validation_one_hot_color)
x_validation_norm = x_validation_norm.join(validation_one_hot_clarity)

In [22]:
x_test_norm = x_test_norm.join(test_one_hot_cut)
x_test_norm = x_test_norm.join(test_one_hot_color)
x_test_norm = x_test_norm.join(test_one_hot_clarity)

In [23]:
x_train_norm.head()

Unnamed: 0,carat,depth,table,x,y,z,Fair,Good,Ideal,Premium,...,I,J,I1,IF,SI1,SI2,VS1,VS2,VVS1,VVS2
9799,0.93838,0.312731,0.243907,1.081895,0.951429,1.047443,0,0,0,1,...,0,1,0,0,0,1,0,0,0,0
39316,-0.566479,-0.668222,0.243907,-0.48387,-0.459097,-0.535941,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
35700,-1.053969,-0.668222,-0.203542,-1.22649,-1.230068,-1.271084,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
1140,-0.312137,-0.317882,-0.650992,-0.134928,-0.108656,-0.154232,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
4930,0.662842,1.013412,0.691357,0.732953,0.636032,0.807108,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0


In [24]:
x_test_norm.head()

Unnamed: 0,carat,depth,table,x,y,z,Fair,Good,Ideal,Premium,...,I,J,I1,IF,SI1,SI2,VS1,VS2,VVS1,VVS2
24592,0.853599,0.382799,1.138806,0.902951,0.933907,0.962619,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
33669,-0.969188,0.382799,-0.203542,-1.154913,-1.098652,-1.073161,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
6588,0.45089,-0.317882,0.691357,0.688217,0.636032,0.609185,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
10532,0.45089,1.293684,-0.650992,0.58085,0.522139,0.708147,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
21155,0.45089,-1.228767,-1.098442,0.7419,0.758687,0.580911,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [25]:
theta_norm = np.dot(np.linalg.pinv(np.dot(np.transpose(x_train_norm),x_train_norm)),np.dot(np.transpose(x_train_norm),y_train))

In [26]:
y_normal = np.dot(theta_norm, np.transpose(x_test_norm))

In [27]:
metrics.r2_score(y_test, y_normal)

0.9213980957035438

In [28]:
metrics.mean_absolute_error(y_test, y_normal)

738.7683920475756

In [93]:
clf = SGDRegressor(penalty="None", eta0=0.001, max_iter=100)

In [94]:
clf.fit(x_train_norm, y_train)

SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.001,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', max_iter=100, n_iter=None, penalty='None',
       power_t=0.25, random_state=None, shuffle=True, tol=None, verbose=0,
       warm_start=False)

In [95]:
y_test_pred_SKlearn = clf.predict(x_test_norm)

In [96]:
metrics.r2_score(y_test, y_test_pred_SKlearn)

0.9205990293955055

In [33]:
metrics.mean_absolute_error(y_test, y_test_pred_SKlearn)

742.4546959509245

In [34]:
initial_theta = np.zeros_like(theta_norm)

In [35]:
initial_theta

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [163]:
%matplotlib qt
def SGD(theta, x, y, learning_rate=0.001, iterations=100):
    m = x.shape[0]
    fig = plt.figure(1)
    plt.xlabel('Iterations')
    plt.ylabel('Average Cost')
    plt.ion()
    plt.show()
    for i in range(iterations):
        iter_total_cost = 0
        for item,price in zip(x.values, y.values):
            y_pred = np.dot(item, theta)
            loss = y_pred - price
            cost = np.sum(loss ** 2)
            gradient = np.dot(item.transpose(), loss)
            theta = theta - learning_rate * gradient
            iter_total_cost += cost
        iter_average_cost = iter_total_cost/m
        plt.scatter(i,iter_average_cost, c='b')
        fig.canvas.draw()
        plt.pause(0.0001)
    return theta

In [164]:
theta_sgd = SGD(initial_theta, x_test_norm, y_test)

In [148]:
y_test_pred_Our = np.dot(theta_sgd, np.transpose(x_test_norm))

In [159]:
metrics.r2_score(y_test, y_test_pred_Our)

0.9214944448305752

In [160]:
metrics.mean_absolute_error(y_test, y_test_pred_Our)

729.2517512485996

In [87]:
model_sfm = SelectFromModel(clf, prefit=True)
feature_mask = model_sfm.get_support()
feature_name = np.array(x_train_norm.columns)
print(np.array(feature_name)[feature_mask==True])

['carat' 'D' 'E' 'J' 'I1' 'IF' 'SI2' 'VS1' 'VVS1' 'VVS2']


In [88]:
x_train_selected = model_sfm.transform(x_train_norm)
x_test_selected = model_sfm.transform(x_test_norm)

In [89]:
clf.fit(x_train_selected, y_train)

SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.001,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', max_iter=100, n_iter=None, penalty='None',
       power_t=0.25, random_state=None, shuffle=True, tol=None, verbose=0,
       warm_start=False)

In [90]:
clf.score(x_test_selected, y_test)

0.9061149933200293