In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

In [2]:
data = pd.read_csv('diamonds.csv', index_col=0)

In [3]:
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
train, test = train_test_split(data, test_size=0.15)

In [5]:
train_one_hot_cut = pd.get_dummies(train['cut'])
train_one_hot_color = pd.get_dummies(train['color'])
train_one_hot_clarity = pd.get_dummies(train['clarity'])

In [6]:
test_one_hot_cut = pd.get_dummies(test['cut'])
test_one_hot_color = pd.get_dummies(test['color'])
test_one_hot_clarity = pd.get_dummies(test['clarity'])

In [7]:
y_train = train.loc[:,'price']
y_test = test.loc[:,'price']

In [21]:
x_train = train.drop('price', axis=1)
x_test = test.drop('price', axis=1)

In [22]:
x_train = x_train.drop('cut', axis=1)
x_train = x_train.drop('color', axis=1)
x_train = x_train.drop('clarity', axis=1)

In [23]:
x_test = x_test.drop('cut', axis=1)
x_test = x_test.drop('color', axis=1)
x_test = x_test.drop('clarity', axis=1)

In [24]:
x_scaler = StandardScaler().fit(x_train)

In [25]:
x_train.head()

Unnamed: 0,carat,depth,table,x,y,z
29526,0.38,61.9,54.7,4.63,4.68,2.88
18542,1.51,62.2,60.0,7.27,7.32,4.54
48019,0.63,61.3,58.0,5.57,5.52,3.4
5321,0.9,61.0,59.0,6.14,6.18,3.76
4365,0.9,62.3,64.0,6.1,6.13,3.81


In [26]:
x_test.head()

Unnamed: 0,carat,depth,table,x,y,z
53478,0.7,58.2,60.0,5.75,5.79,3.36
15460,1.21,62.2,60.0,6.71,6.73,4.18
46508,0.51,62.4,57.0,5.09,5.13,3.19
16827,1.51,61.9,56.0,7.44,7.29,4.56
44090,0.5,61.6,56.0,5.1,5.13,3.15


In [27]:
x_train_norm = pd.DataFrame(x_scaler.transform(x_train), columns=x_train.columns, index=x_train.index)
x_test_norm = pd.DataFrame(x_scaler.transform(x_test), columns=x_test.columns, index=x_test.index)

In [28]:
x_train_norm.head()

Unnamed: 0,carat,depth,table,x,y,z
29526,-0.883195,0.104763,-1.230731,-0.981892,-0.919759,-0.947047
18542,1.512312,0.314084,1.134232,1.377083,1.387223,1.446667
48019,-0.353215,-0.313879,0.241793,-0.141954,-0.18572,-0.197209
5321,0.219163,-0.5232,0.688012,0.36737,0.391026,0.32191
4365,0.219163,0.383858,2.919109,0.331628,0.347333,0.39401


In [29]:
x_test_norm.head()

Unnamed: 0,carat,depth,table,x,y,z
53478,-0.204821,-2.476864,1.134232,0.018885,0.050222,-0.254889
15460,0.876337,0.314084,1.134232,0.876694,0.871648,0.927549
46508,-0.607605,0.453632,-0.204426,-0.570858,-0.526524,-0.500028
16827,1.512312,0.104763,-0.650646,1.528986,1.361007,1.475507
44090,-0.628804,-0.104558,-0.650646,-0.561923,-0.526524,-0.557708


In [30]:
x_train_norm = x_train_norm.join(train_one_hot_cut)
x_train_norm = x_train_norm.join(train_one_hot_color)
x_train_norm = x_train_norm.join(train_one_hot_clarity)

In [31]:
x_test_norm = x_test_norm.join(test_one_hot_cut)
x_test_norm = x_test_norm.join(test_one_hot_color)
x_test_norm = x_test_norm.join(test_one_hot_clarity)

In [32]:
x_train_norm.head()

Unnamed: 0,carat,depth,table,x,y,z,Fair,Good,Ideal,Premium,...,I,J,I1,IF,SI1,SI2,VS1,VS2,VVS1,VVS2
29526,-0.883195,0.104763,-1.230731,-0.981892,-0.919759,-0.947047,0,0,1,0,...,1,0,0,0,0,0,1,0,0,0
18542,1.512312,0.314084,1.134232,1.377083,1.387223,1.446667,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
48019,-0.353215,-0.313879,0.241793,-0.141954,-0.18572,-0.197209,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
5321,0.219163,-0.5232,0.688012,0.36737,0.391026,0.32191,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4365,0.219163,0.383858,2.919109,0.331628,0.347333,0.39401,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0


In [33]:
x_test_norm.head()

Unnamed: 0,carat,depth,table,x,y,z,Fair,Good,Ideal,Premium,...,I,J,I1,IF,SI1,SI2,VS1,VS2,VVS1,VVS2
53478,-0.204821,-2.476864,1.134232,0.018885,0.050222,-0.254889,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
15460,0.876337,0.314084,1.134232,0.876694,0.871648,0.927549,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
46508,-0.607605,0.453632,-0.204426,-0.570858,-0.526524,-0.500028,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
16827,1.512312,0.104763,-0.650646,1.528986,1.361007,1.475507,0,0,0,1,...,0,1,0,0,0,1,0,0,0,0
44090,-0.628804,-0.104558,-0.650646,-0.561923,-0.526524,-0.557708,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [34]:
theta_norm = np.dot(np.linalg.pinv(np.dot(np.transpose(x_train_norm),x_train_norm)),np.dot(np.transpose(x_train_norm),y_train))

In [37]:
y_normal = np.dot(theta_norm, np.transpose(x_test_norm))

In [40]:
metrics.r2_score(y_test, y_normal)

0.9196850470018753

In [41]:
clf = SGDRegressor(penalty="None", eta0=0.001, max_iter=1000)

In [42]:
clf.fit(x_train_norm, train_y)

NameError: name 'train_x_norm' is not defined

In [None]:
clf.score(test_x, test_y)