In [103]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.feature_selection import SelectFromModel

In [104]:
data = pd.read_csv('diamonds.csv', index_col=0)

In [105]:
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [106]:
train, test = train_test_split(data, test_size=0.15)

In [107]:
train_one_hot_cut = pd.get_dummies(train['cut'])
train_one_hot_color = pd.get_dummies(train['color'])
train_one_hot_clarity = pd.get_dummies(train['clarity'])

In [108]:
test_one_hot_cut = pd.get_dummies(test['cut'])
test_one_hot_color = pd.get_dummies(test['color'])
test_one_hot_clarity = pd.get_dummies(test['clarity'])

In [109]:
y_train = train.loc[:,'price']
y_test = test.loc[:,'price']

In [110]:
x_train = train.drop('price', axis=1)
x_test = test.drop('price', axis=1)

In [111]:
x_train = x_train.drop('cut', axis=1)
x_train = x_train.drop('color', axis=1)
x_train = x_train.drop('clarity', axis=1)

In [112]:
x_test = x_test.drop('cut', axis=1)
x_test = x_test.drop('color', axis=1)
x_test = x_test.drop('clarity', axis=1)

In [113]:
x_scaler = StandardScaler().fit(x_train)

In [114]:
x_train.head()

Unnamed: 0,carat,depth,table,x,y,z
17703,0.32,60.5,59.0,4.43,4.39,2.67
50870,0.7,60.0,62.0,5.73,5.83,3.47
2390,0.32,61.4,57.0,4.39,4.4,2.7
27104,2.1,60.8,60.0,8.24,8.35,5.04
33565,0.45,60.1,59.0,4.96,4.92,3.02


In [115]:
x_test.head()

Unnamed: 0,carat,depth,table,x,y,z
30011,0.32,62.0,55.0,4.38,4.42,2.73
2510,0.7,61.7,57.0,5.76,5.73,3.54
30859,0.33,60.8,58.0,4.47,4.45,2.71
43550,0.52,61.6,58.0,5.14,5.22,3.19
27993,0.3,60.4,58.0,4.3,4.34,2.61


In [116]:
x_train_norm = pd.DataFrame(x_scaler.transform(x_train), columns=x_train.columns, index=x_train.index)
x_test_norm = pd.DataFrame(x_scaler.transform(x_test), columns=x_test.columns, index=x_test.index)

In [117]:
x_train_norm.head()

Unnamed: 0,carat,depth,table,x,y,z
17703,-1.010445,-0.868038,0.690481,-1.163729,-1.210251,-1.232558
50870,-0.208147,-1.216785,2.033327,-0.003163,0.084963,-0.099343
2390,-1.010445,-0.240294,-0.20475,-1.199438,-1.201256,-1.190062
27104,2.747688,-0.65879,1.138096,2.237621,2.351586,2.124592
33565,-0.735974,-1.147035,0.690481,-0.690575,-0.73354,-0.736776


In [118]:
x_test_norm.head()

Unnamed: 0,carat,depth,table,x,y,z
30011,-1.010445,0.178202,-1.09998,-1.208366,-1.183267,-1.147567
2510,-0.208147,-0.031046,-0.20475,0.023619,-0.004983,-0.000186
30859,-0.989332,-0.65879,0.242866,-1.128019,-1.156284,-1.175897
43550,-0.588183,-0.100796,0.242866,-0.529881,-0.463704,-0.495968
27993,-1.052671,-0.937787,0.242866,-1.279785,-1.255223,-1.317549


In [119]:
x_train_norm = x_train_norm.join(train_one_hot_cut)
x_train_norm = x_train_norm.join(train_one_hot_color)
x_train_norm = x_train_norm.join(train_one_hot_clarity)

In [120]:
x_test_norm = x_test_norm.join(test_one_hot_cut)
x_test_norm = x_test_norm.join(test_one_hot_color)
x_test_norm = x_test_norm.join(test_one_hot_clarity)

In [121]:
x_train_norm.head()

Unnamed: 0,carat,depth,table,x,y,z,Fair,Good,Ideal,Premium,...,I,J,I1,IF,SI1,SI2,VS1,VS2,VVS1,VVS2
17703,-1.010445,-0.868038,0.690481,-1.163729,-1.210251,-1.232558,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
50870,-0.208147,-1.216785,2.033327,-0.003163,0.084963,-0.099343,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2390,-1.010445,-0.240294,-0.20475,-1.199438,-1.201256,-1.190062,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
27104,2.747688,-0.65879,1.138096,2.237621,2.351586,2.124592,0,0,1,0,...,1,0,0,0,1,0,0,0,0,0
33565,-0.735974,-1.147035,0.690481,-0.690575,-0.73354,-0.736776,0,0,0,1,...,1,0,0,0,0,1,0,0,0,0


In [122]:
x_test_norm.head()

Unnamed: 0,carat,depth,table,x,y,z,Fair,Good,Ideal,Premium,...,I,J,I1,IF,SI1,SI2,VS1,VS2,VVS1,VVS2
30011,-1.010445,0.178202,-1.09998,-1.208366,-1.183267,-1.147567,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2510,-0.208147,-0.031046,-0.20475,0.023619,-0.004983,-0.000186,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
30859,-0.989332,-0.65879,0.242866,-1.128019,-1.156284,-1.175897,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
43550,-0.588183,-0.100796,0.242866,-0.529881,-0.463704,-0.495968,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
27993,-1.052671,-0.937787,0.242866,-1.279785,-1.255223,-1.317549,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [123]:
theta_norm = np.dot(np.linalg.pinv(np.dot(np.transpose(x_train_norm),x_train_norm)),np.dot(np.transpose(x_train_norm),y_train))

In [124]:
y_normal = np.dot(theta_norm, np.transpose(x_test_norm))

In [125]:
metrics.r2_score(y_test, y_normal)

0.9158736289261628

In [126]:
clf = SGDRegressor(penalty="None", eta0=0.001, max_iter=1000)

In [127]:
clf.fit(x_train_norm, y_train)

SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.001,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', max_iter=1000, n_iter=None, penalty='None',
       power_t=0.25, random_state=None, shuffle=True, tol=None, verbose=0,
       warm_start=False)

In [128]:
clf.score(x_test_norm, y_test)

0.9215085178110666

In [47]:
model_sfm = SelectFromModel(clf, prefit=True)

In [60]:
feature_mask = model_sfm.get_support()
feature_name = np.array(x_train_norm.columns)
print(np.array(feature_name)[feature_mask==True])

IndexError: boolean index did not match indexed array along dimension 0; dimension is 26 but corresponding boolean dimension is 9

In [56]:
x_train_selected = model_sfm.transform(x_train_norm)
x_test_selected = model_sfm.transform(x_test_norm)

In [57]:
clf.fit(x_train_selected, y_train)

SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.001,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', max_iter=1000, n_iter=None, penalty='None',
       power_t=0.25, random_state=None, shuffle=True, tol=None, verbose=0,
       warm_start=False)

In [58]:
clf.score(x_test_selected, y_test)

0.8959678147783878

In [185]:
initial_theta = np.zeros_like(theta_norm)

In [186]:
def SGD(theta, x, y, learning_rate=0.001, iterations=1000):
    m = x.shape[0]
    loss = 0
    for i in range(iterations):
        for item,price in zip(x.values, y.values):
            y_pred = np.dot(item, theta)
            loss += y_pred - price
            cost = np.sum(loss ** 2)
            gradient = np.dot(item.transpose(), loss)
            theta = theta - learning_rate * gradient
    return theta

In [187]:
theta_sgd = SGD(initial_theta, x_test_norm, y_test)

  
  # Remove the CWD from sys.path while we load stuff.


In [188]:
y_sgd = np.dot(theta_norm, np.transpose(x_test_norm))

In [189]:
metrics.r2_score(y_test, y_sgd)

0.9158736289261628