In [1]:
import pandas as pd

# Load the diamonds dataset; the first CSV column carries the row labels,
# so it is used as the DataFrame index instead of becoming a feature.
diamonds_csv = 'diamonds.csv'
df = pd.read_csv(diamonds_csv, index_col=0)
df.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [2]:
# List the distinct cut grades present in the data (needed to build the
# label -> number mapping in the next step).
cut_column = df['cut']
cut_column.unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [6]:
# Hand-written ordinal encodings for the three categorical columns. Explicit
# dictionaries are used (rather than automatic category codes) so that the
# numeric order follows the grading scale -- presumably worst (1) to best.
cut_class_dict = {"Fair": 1, "Good": 2, "Very Good": 3, "Premium": 4, "Ideal": 5}
clarity_dict = {"I3": 1, "I2": 2, "I1": 3, "SI2": 4, "SI1": 5, "VS2": 6, "VS1": 7, "VVS2": 8, "VVS1": 9, "IF": 10, "FL": 11}
color_dict = {"J": 1,"I": 2,"H": 3,"G": 4,"F": 5,"E": 6,"D": 7}


def encode_ordinal(column, ranking):
    """Replace the labels in ``column`` with their numeric rank.

    Parameters
    ----------
    column : pandas.Series
        Column holding either the raw labels or already-encoded ranks.
    ranking : dict
        Maps each label to its numeric rank.

    Returns
    -------
    pandas.Series
        The encoded column. If every non-null value is already one of the
        ranks, ``column`` is returned unchanged, so re-running the cell does
        not wipe the data out.

    Raises
    ------
    ValueError
        If a non-null label has no entry in ``ranking``. A plain ``.map``
        would silently turn such values into NaN.
    """
    present = column.dropna()
    if not present.empty and present.isin(list(ranking.values())).all():
        # Already encoded by an earlier run of this cell.
        return column

    encoded = column.map(ranking)
    unmapped = column[encoded.isna() & column.notna()].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"No encoding defined for {sorted(map(str, unmapped))} "
            f"in column {column.name!r}"
        )
    return encoded


df['cut'] = encode_ordinal(df['cut'], cut_class_dict)
df['clarity'] = encode_ordinal(df['clarity'], clarity_dict)
df['color'] = encode_ordinal(df['color'], color_dict)


In [7]:
# Spot-check that cut, color and clarity now hold numeric codes.
df.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,5,6,4,61.5,55.0,326,3.95,3.98,2.43
2,0.21,4,6,5,59.8,61.0,326,3.89,3.84,2.31
3,0.23,2,6,7,56.9,65.0,327,4.05,4.07,2.31
4,0.29,4,2,6,62.4,58.0,334,4.2,4.23,2.63
5,0.31,2,1,4,63.3,58.0,335,4.34,4.35,2.75


In [13]:
import sklearn
from sklearn import svm, preprocessing

# Shuffle the rows before splitting: the train/test split further down is
# positional (last N rows), so any ordering in the file would bias it.
# The seed is fixed so that the split, and every score computed from it,
# is identical on each Restart & Run All.
SHUFFLE_SEED = 42
df = sklearn.utils.shuffle(df, random_state=SHUFFLE_SEED)

In [14]:
# Spot-check that the row order is now shuffled (index labels out of order).
df.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
42380,0.59,5,6,4,62.5,57.0,1305,5.32,5.37,3.34
32432,0.33,5,6,6,62.5,55.0,792,4.46,4.41,2.77
4436,0.9,2,4,5,65.0,57.0,3615,5.96,6.01,3.89
28690,0.26,3,5,9,60.8,58.0,679,4.11,4.18,2.52
40651,0.5,3,6,4,61.3,56.0,1154,5.08,5.13,3.13


In [15]:
# Feature matrix X: every column except the target, standardised column-wise.
# NOTE(review): scaling happens before the train/test split, so the held-out
# rows contribute to the scaling statistics -- a mild leak worth confirming.
feature_frame = df.drop('price', axis=1)
X = preprocessing.scale(feature_frame.values)

# Target vector y: the price column as a NumPy array.
price_column = df['price']
y = price_column.values

In [16]:
# Display the feature matrix built in the previous cell.
# NOTE(review): the saved output below shows raw values (e.g. 0.59, 5., 6.)
# although the previous cell standardises X -- it presumably predates the
# scaling step; re-run from a fresh kernel to refresh it.
X

array([[0.59, 5.  , 6.  , ..., 5.32, 5.37, 3.34],
       [0.33, 5.  , 6.  , ..., 4.46, 4.41, 2.77],
       [0.9 , 2.  , 4.  , ..., 5.96, 6.01, 3.89],
       ...,
       [0.32, 5.  , 6.  , ..., 4.42, 4.39, 2.72],
       [0.77, 2.  , 7.  , ..., 5.83, 5.86, 3.69],
       [0.41, 4.  , 6.  , ..., 4.78, 4.75, 2.96]])

In [17]:
# Display the target vector (values of the price column).
y

array([1305,  792, 3615, ...,  972, 3489,  999])

In [21]:
# Positional hold-out split: the last `test_size` rows are kept for
# evaluation and everything before them is used for training.
test_size = 200
X_train, X_test = X[:-test_size], X[-test_size:]
y_train, y_test = y[:-test_size], y[-test_size:]

# Support-vector regressor with a linear kernel, fitted on the training rows.
clf = svm.SVR(kernel='linear')
clf.fit(X_train, y_train)

SVR(kernel='linear')

In [22]:
# Score of the linear-kernel model on the held-out rows
# (for scikit-learn regressors this is the R^2 coefficient).
linear_r2 = clf.score(X_test, y_test)
linear_r2

0.791388237633325

In [23]:
# Print each held-out prediction next to the true price.
# The loop variables are deliberately not called X / y: those names hold the
# full feature matrix and target vector, and rebinding them here would break
# any later re-run of the split cell.
# predict() is called once on the whole test set instead of once per row.
predicted_prices = clf.predict(X_test)
for predicted_price, actual_price in zip(predicted_prices, y_test):
    print(f'Model: {predicted_price}, Actual: {actual_price}')

Model: 2525.163050863399, Actual: 1770
Model: 1889.456603750683, Actual: 1746
Model: -972.8364574815223, Actual: 506
Model: 3450.324076399478, Actual: 2658
Model: 3768.3942898561854, Actual: 2456
Model: 3398.4529463527088, Actual: 2559
Model: 9680.506378180022, Actual: 18342
Model: 5856.597427602896, Actual: 5192
Model: -296.7712806369, Actual: 530
Model: 2768.2208166385226, Actual: 2278
Model: 939.7981181019059, Actual: 1181
Model: 2742.777313737248, Actual: 2351
Model: 1406.138396265511, Actual: 1030
Model: 98.28160225536158, Actual: 689
Model: 3238.643908539505, Actual: 2037
Model: 1178.396606628945, Actual: 892
Model: 2724.675972218103, Actual: 1781
Model: 4218.752004889991, Actual: 3869
Model: 1425.1798405982827, Actual: 1443
Model: 2871.2862456440253, Actual: 2304
Model: -424.31530053901406, Actual: 456
Model: 499.8450924515437, Actual: 752
Model: 1482.6653476067404, Actual: 1321
Model: 4553.145342279562, Actual: 3947
Model: 6564.844466249708, Actual: 7555
Model: 3918.79644037113

In [27]:
# Refit on the same training rows with an RBF kernel for comparison.
# This rebinds `clf`, so the linear model from earlier is no longer reachable.
clf = svm.SVR(kernel='rbf')
clf.fit(X_train, y_train)

SVR()

In [28]:
# Score of the RBF-kernel model on the held-out rows.
# NOTE(review): the saved output is negative, i.e. worse than always
# predicting the mean price. Possibly the default C / epsilon are too small
# for a target on this scale -- worth tuning before drawing conclusions.
rbf_r2 = clf.score(X_test, y_test)
rbf_r2

-0.07055366939384844

In [29]:
# Print each held-out prediction of the RBF model next to the true price.
# The loop variables are deliberately not called X / y: those names hold the
# full feature matrix and target vector, and rebinding them here would break
# any later re-run of the split cell.
# predict() is called once on the whole test set instead of once per row.
predicted_prices = clf.predict(X_test)
for predicted_price, actual_price in zip(predicted_prices, y_test):
    print(f'Model: {predicted_price}, Actual: {actual_price}')

Model: 2419.8202382614795, Actual: 1770
Model: 2372.2352665371654, Actual: 1746
Model: 2334.709272081368, Actual: 506
Model: 2401.2081519918706, Actual: 2658
Model: 2390.0725918033954, Actual: 2456
Model: 2397.127993929657, Actual: 2559
Model: 2540.4119577250503, Actual: 18342
Model: 2473.8852027795583, Actual: 5192
Model: 2304.338106530335, Actual: 530
Model: 2387.0279776867715, Actual: 2278
Model: 2371.1026642161823, Actual: 1181
Model: 2429.3602920627413, Actual: 2351
Model: 2316.605517292939, Actual: 1030
Model: 2352.4620623838828, Actual: 689
Model: 2446.0959748764885, Actual: 2037
Model: 2279.605040399125, Actual: 892
Model: 2345.2330020453614, Actual: 1781
Model: 2447.797560409144, Actual: 3869
Model: 2404.789836140562, Actual: 1443
Model: 2356.1099551831726, Actual: 2304
Model: 2302.503153207792, Actual: 456
Model: 2325.3820857880314, Actual: 752
Model: 2351.6947441002144, Actual: 1321
Model: 2462.467479540913, Actual: 3947
Model: 2473.1479975658326, Actual: 7555
Model: 2468.30