In [1]:
import pandas as pd

In [3]:
df = pd.read_csv('diamonds.csv', index_col = 0)
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53936,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53937,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53938,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53939,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [6]:
df['cut'].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [5]:
#df['cut'].astype('category').cat.codes
#converts string data to numerical so we can work with it in machine learning
#we want to preserve certain order, for example of cut, ideal > premium ... > fair 

In [7]:
cut_class_dict = {"Fair": 1, "Good": 2, "Very Good": 3, "Premium": 4, "Ideal": 5}
clarity_dict = {"I3": 1, "I2": 2, "I1": 3, "SI2": 4, "SI1": 5, "VS2": 6, "VS1": 7, "VVS2": 8, "VVS1": 9, "IF": 10, "FL": 11}
color_dict = {"J": 1,"I": 2,"H": 3,"G": 4,"F": 5,"E": 6,"D": 7}

In [8]:
df['cut'] = df['cut'].map(cut_class_dict)
df['clarity'] = df['clarity'].map(clarity_dict)
df['color'] = df['color'].map(color_dict)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,5,6,4,61.5,55.0,326,3.95,3.98,2.43
2,0.21,4,6,5,59.8,61.0,326,3.89,3.84,2.31
3,0.23,2,6,7,56.9,65.0,327,4.05,4.07,2.31
4,0.29,4,2,6,62.4,58.0,334,4.2,4.23,2.63
5,0.31,2,1,4,63.3,58.0,335,4.34,4.35,2.75


In [16]:
import sklearn
from sklearn import svm, preprocessing

In [24]:
#we will shuffle the data so that it doesnt influence the model
df = sklearn.utils.shuffle(df)

X = df.drop('price', axis = 1).values
X = preprocessing.scale(X)

y = df['price'].values

test_size = 200

#define a big train size to our model and then a small size to test, different
X_train = X[:-test_size]
y_train = y[:-test_size]

X_test = X[-test_size:]
y_test = y[-test_size:]

#classifier
clf = svm.SVR(kernel = 'linear')
clf.fit(X_train, y_train)

SVR(kernel='linear')

In [25]:
#accuracy of model prediction
clf.score(X_test, y_test)

0.8518032376665785

In [26]:
for X,y in zip(X_test,y_test):
    print(f'Model: {clf.predict([X])[0]}, Actual: {y}')

Model: -207.6747554588228, Actual: 432
Model: 631.0591356630039, Actual: 703
Model: 4639.571564601512, Actual: 4252
Model: 136.7802761932012, Actual: 743
Model: 7623.005459510185, Actual: 9979
Model: 2478.193369301575, Actual: 2423
Model: 12666.13171263319, Actual: 12220
Model: 5636.277796100663, Actual: 4391
Model: 683.7646741852759, Actual: 795
Model: 1248.1327032659087, Actual: 1202
Model: 12377.24788244524, Actual: 16641
Model: 3289.557437504253, Actual: 3478
Model: 1366.9989329434343, Actual: 1238
Model: 8279.96098653982, Actual: 12467
Model: 9.271620699028063, Actual: 524
Model: 4646.491249264916, Actual: 4308
Model: 9537.051282700191, Actual: 10574
Model: 166.25630787914088, Actual: 720
Model: 4114.954249395486, Actual: 5000
Model: 7396.9529944317255, Actual: 11231
Model: 2146.9244646133984, Actual: 1832
Model: 2887.795079999517, Actual: 2048
Model: 2289.339668250541, Actual: 1832
Model: 4439.177203000958, Actual: 4788
Model: 2756.6807491111517, Actual: 2312
Model: 4089.96179850

In [27]:
clf = svm.SVR(kernel = 'rbf')
clf.fit(X_train, y_train)

SVR()

In [28]:
clf.score(X_test, y_test)

0.6325803729492911

In [29]:
for X,y in zip(X_test,y_test):
    print(f'Model: {clf.predict([X])[0]}, Actual: {y}')

Model: 944.7988916774129, Actual: 432
Model: 783.6662600862028, Actual: 703
Model: 4132.609430514383, Actual: 4252
Model: 419.83782512269136, Actual: 743
Model: 7043.598587407558, Actual: 9979
Model: 2603.73163216196, Actual: 2423
Model: 5909.978292410255, Actual: 12220
Model: 5087.91304562509, Actual: 4391
Model: 838.0941792814219, Actual: 795
Model: 2381.3744897627103, Actual: 1202
Model: 6134.974614051282, Actual: 16641
Model: 3185.2715604736413, Actual: 3478
Model: 1657.8461880067366, Actual: 1238
Model: 5850.928627441901, Actual: 12467
Model: 931.5460849572196, Actual: 524
Model: 4363.925373618022, Actual: 4308
Model: 7196.459722949158, Actual: 10574
Model: 486.0306607710336, Actual: 720
Model: 3891.0588079276954, Actual: 5000
Model: 6732.478704879033, Actual: 11231
Model: 1763.391224595783, Actual: 1832
Model: 3079.6675801669503, Actual: 2048
Model: 2313.5283079633455, Actual: 1832
Model: 3565.9748558153738, Actual: 4788
Model: 2701.275528358206, Actual: 2312
Model: 3506.06546063