In [1]:
import pandas as pd

# Read the diamonds CSV; column 0 of the file is used as the DataFrame index
# instead of being kept as a regular column.
df = pd.read_csv(
    "datasets/diamonds.csv",
    index_col=0,
)
# Last expression of the cell: preview the first rows.
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [2]:
# Distinct labels that occur in the 'cut' column, in order of first appearance.
df.loc[:, 'cut'].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [3]:
# Ordinal codes for the cut labels: 'Fair' -> 1 up to 'Ideal' -> 5
# (presumably ordered from lowest to highest cut quality).
cut_grades = ('Fair', 'Good', 'Very Good', 'Premium', 'Ideal')
cut_class_dict = dict(zip(cut_grades, range(1, len(cut_grades) + 1)))

In [4]:
# Distinct labels that occur in the 'clarity' column, in order of first appearance.
df.loc[:, 'clarity'].unique()

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

In [5]:
# Ordinal codes for the clarity labels, 'I3' -> 1 up to 'FL' -> 11. The table
# also lists I3, I2 and FL, which the unique() output for this dataset does
# not show.
clarity_grades = ("I3", "I2", "I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF", "FL")
clarity_dict = dict(zip(clarity_grades, range(1, len(clarity_grades) + 1)))

# Ordinal codes for the color letters, 'J' -> 1 up to 'D' -> 7.
color_grades = ("J", "I", "H", "G", "F", "E", "D")
color_dict = {letter: code for code, letter in enumerate(color_grades, start=1)}

In [6]:
# Replace the three categorical columns with their ordinal codes.
#
# The encoding overwrites the columns in place, so running this cell a second
# time would push the already-encoded integers through the label dictionaries
# and turn every value into NaN. A column that is already numeric is therefore
# treated as encoded and left alone, which makes the cell safe to re-run.
ordinal_codes = {
    'cut': cut_class_dict,
    'clarity': clarity_dict,
    'color': color_dict,
}
for column, codes in ordinal_codes.items():
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].map(codes)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,5,6,4,61.5,55.0,326,3.95,3.98,2.43
2,0.21,4,6,5,59.8,61.0,326,3.89,3.84,2.31
3,0.23,2,6,7,56.9,65.0,327,4.05,4.07,2.31
4,0.29,4,2,6,62.4,58.0,334,4.2,4.23,2.63
5,0.31,2,1,4,63.3,58.0,335,4.34,4.35,2.75


In [18]:
#Stochastic Gradient Descent
import sklearn
from sklearn.linear_model import SGDRegressor

# Shuffle the rows so that the tail slice used as the hold-out set is a random
# sample of the data. The seed is an arbitrary fixed value: without it the
# train/hold-out split changes on every run.
df = sklearn.utils.shuffle(df, random_state=42)

# Feature matrix: every column except the target. Target vector: price.
X = df.drop(columns='price').to_numpy()
y = df['price'].to_numpy()

In [19]:
# Hold out the last `test_size` rows for evaluation and train on all rows
# before them.
test_size = 200

X_train, X_test = X[:-test_size], X[-test_size:]
y_train, y_test = y[:-test_size], y[-test_size:]

In [20]:
# Fit an SGDRegressor (max_iter=1000) on the training rows, then print the
# value of clf.score on the hold-out rows.
clf = SGDRegressor(max_iter=1000).fit(X_train, y_train)

holdout_score = clf.score(X_test, y_test)
print(holdout_score)

-31237894.893177114


In [21]:
# Print prediction and true price side by side for the first ten hold-out rows.
# The loop names are distinct from the module-level X / y so the feature
# matrix and target vector are not overwritten, and the ten rows are predicted
# in one batched call instead of one call per row.
for predicted, actual in zip(clf.predict(X_test[:10]), y_test[:10]):
    print(predicted, actual)

-8050643.408323288 8077
12483935.513689995 861
-19549210.78580141 3171
-11071448.587275028 2554
-29581084.265467167 4516
-38222327.34746599 613
22015392.17807579 15458
-11545332.10607338 745
-22817050.03187895 6214
-6777975.3884625435 666


In [25]:
#Support Vector Regression
from sklearn import svm

# Fit a support vector regressor with default settings on the same training
# rows, then print the value of clf.score on the hold-out rows.
clf = svm.SVR().fit(X_train, y_train)

holdout_score = clf.score(X_test, y_test)
print(holdout_score)

-0.11267072148195068


In [26]:
# Print prediction and true price side by side for the first ten hold-out rows.
# The loop names are distinct from the module-level X / y so the feature
# matrix and target vector are not overwritten, and the ten rows are predicted
# in one batched call instead of one call per row.
for predicted, actual in zip(clf.predict(X_test[:10]), y_test[:10]):
    print(predicted, actual)

2425.0722953775467 8077
2330.0868992382084 861
2409.5738716669334 3171
2354.522003994304 2554
2477.1764482010967 4516
2407.798592280461 613
2465.3697242939734 15458
2322.905381166538 745
2455.58653859225 6214
2362.4074361335734 666


In [27]:
# Retry the SGD regressor with a larger iteration budget (max_iter=10000) on
# the same training rows and print its score on the hold-out rows.
clf = SGDRegressor(max_iter=10000)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

# Print prediction and true price for the first ten hold-out rows. The loop
# names are distinct from the module-level X / y so those arrays are not
# overwritten; the rows are predicted in a single batched call.
for predicted, actual in zip(clf.predict(X_test[:10]), y_test[:10]):
    print(predicted, actual)

-38837463.2684397
10441675.06380248 8077
-15537783.394329548 861
21449153.523753643 3171
12926460.9522717 2554
31920835.39777279 4516
40834937.13530588 613
-23076485.09080267 15458
12255334.332582235 745
25782085.191161633 6214
7631017.973648787 666


In [31]:
import sklearn
from sklearn import svm, preprocessing

# Reshuffle the rows; the seed is an arbitrary fixed value so the split is the
# same on every run.
df = sklearn.utils.shuffle(df, random_state=42)

test_size = 200

X_unscaled = df.drop(columns="price").to_numpy()
y = df["price"].to_numpy()

# Learn the per-feature mean and standard deviation from the training rows
# only. Standardising the whole matrix before splitting would let the hold-out
# rows influence the scaling and make the evaluation optimistic.
scaler = preprocessing.StandardScaler().fit(X_unscaled[:-test_size])

# X stays a fully scaled matrix (now using the training statistics) so any
# later code that reads X keeps working.
X = scaler.transform(X_unscaled)

# Hold out the last `test_size` rows; train on everything before them.
X_train, X_test = X[:-test_size], X[-test_size:]
y_train, y_test = y[:-test_size], y[-test_size:]

# Fit a default support vector regressor on the standardised training rows and
# print its score on the hold-out rows.
clf = svm.SVR()
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

# Show prediction next to the true price for the first ten hold-out rows. The
# loop names are distinct from the module-level X / y so those arrays are not
# overwritten; the rows are predicted in a single batched call.
for predicted, actual in zip(clf.predict(X_test[:10]), y_test[:10]):
    print(f"model predicts {predicted}, real value: {actual}")

0.5846454474288176
model predicts 4944.368686096386, real value: 5401
model predicts 6727.869019245582, real value: 10380
model predicts 5716.2861983559305, real value: 18034
model predicts 2770.284766534557, real value: 2239
model predicts 1241.5049181906593, real value: 408
model predicts 592.5267863032645, real value: 451
model predicts 6550.553417488911, real value: 13986
model predicts 1360.7085354974597, real value: 1591
model predicts 1846.5112172014133, real value: 1662
model predicts 3646.295008427112, real value: 2923
