In [1]:
# Machine Learning with Scikit-learn

#Objective: Predicting the price of a diamond

In [2]:
import pandas as pd

df = pd.read_csv("diamonds.csv", index_col=0) #we dont generate duplicate columns
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
df["cut"].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [4]:
df["color"].unique()

array(['E', 'I', 'J', 'H', 'F', 'G', 'D'], dtype=object)

In [5]:
df["clarity"].unique()

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

In [6]:
df["cut"].astype("category").cat.codes

1        2
2        3
3        1
4        3
5        1
        ..
53936    2
53937    1
53938    4
53939    3
53940    2
Length: 53940, dtype: int8

In [7]:
cut_class_dict = {"Fair": 1, "Good": 2, "Very Good": 3, "Premium": 4, "Ideal": 5}
clarity_dict = {"I3":1, "I2":2, "I1":3, "SI2":4, "SI1":5, "VS2":6, "VS1":7, "VVS2":8, "VVS1":9, "IF":10, "FL":11}
color_dict = {"J":1,"I": 2, "H": 3, "G": 4, "F": 5, "E": 6, "D": 7}



df["cut"] = df["cut"].map(cut_class_dict)
df["color"] = df["color"].map(color_dict)
df["clarity"] = df["clarity"].map(clarity_dict)

df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,5,6,4,61.5,55.0,326,3.95,3.98,2.43
2,0.21,4,6,5,59.8,61.0,326,3.89,3.84,2.31
3,0.23,2,6,7,56.9,65.0,327,4.05,4.07,2.31
4,0.29,4,2,6,62.4,58.0,334,4.2,4.23,2.63
5,0.31,2,1,4,63.3,58.0,335,4.34,4.35,2.75


In [13]:
# terminal: install scikit-learn

# cheat sheet:   https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

import sklearn 
from sklearn import svm, preprocessing


#shuffle pandas by index

#shuffle sklearn

df = sklearn.utils.shuffle(df)


x = df.drop('price', axis=1).values
x = preprocessing.scale(x)
y = df['price'].values

test_size = 200

x_train = x[:-test_size]
y_train = y[:-test_size]

x_test = x[-test_size:]
y_test = y[-test_size:]

clf = svm.SVR(kernel="linear")
clf.fit(x_train, y_train)

SVR(kernel='linear')

In [14]:
clf.score(x_test, y_test)

0.8768795438469067

In [16]:
for x, y in zip(x_test, y_test):
    print(f"Model: {clf.predict([x])[0]}, Actual: {y}")

Model: 2377.2060048545727, Actual: 1622
Model: 7801.536511136797, Actual: 6238
Model: 1500.5800116719784, Actual: 1436
Model: 189.2614823544227, Actual: 605
Model: 5305.182909308455, Actual: 6335
Model: 6365.101280047873, Actual: 7106
Model: 1536.592077520801, Actual: 1200
Model: 1166.022709249948, Actual: 1052
Model: 3106.649212938115, Actual: 2808
Model: 672.2020220171257, Actual: 1163
Model: 422.74226661022385, Actual: 583
Model: 488.794931944828, Actual: 855
Model: 544.4338249939633, Actual: 928
Model: 717.2287188572864, Actual: 956
Model: 4502.707140892105, Actual: 4761
Model: 5468.698856299688, Actual: 5436
Model: 885.6094597979959, Actual: 816
Model: 707.0700172536594, Actual: 919
Model: 2556.3630012846324, Actual: 2312
Model: 10337.886325647423, Actual: 16521
Model: 4669.5179868441355, Actual: 4452
Model: 1900.0015867898776, Actual: 1838
Model: 7871.924192214185, Actual: 8161
Model: 1962.5923260127283, Actual: 1813
Model: 979.6431602950775, Actual: 814
Model: 7580.485599282487,

In [17]:
clf = svm.SVR(kernel="rbf")
clf.fit(x_train, y_train)

SVR()

In [18]:
print(clf.score(x_test,y_test))

for x, y in zip(x_test, y_test):
    print(f"Model: {clf.predict([x])[0]}, Actual: {y}")

0.6642731960556278
Model: 2035.0735151760666, Actual: 1622
Model: 5325.09387095904, Actual: 6238
Model: 1057.935596245417, Actual: 1436
Model: 725.0638211228775, Actual: 605
Model: 5610.572238616247, Actual: 6335
Model: 6543.299571640815, Actual: 7106
Model: 1142.8188639498803, Actual: 1200
Model: 2190.1023529060067, Actual: 1052
Model: 2891.8466888025514, Actual: 2808
Model: 932.4916928172188, Actual: 1163
Model: 730.8582119218845, Actual: 583
Model: 1277.8202250393094, Actual: 855
Model: 745.0885753018192, Actual: 928
Model: 625.8874787539507, Actual: 956
Model: 4040.962853608795, Actual: 4761
Model: 5127.450474618889, Actual: 5436
Model: 904.263651765667, Actual: 816
Model: 999.027999191831, Actual: 919
Model: 2885.9899305375557, Actual: 2312
Model: 7741.370681971968, Actual: 16521
Model: 4528.348848595294, Actual: 4452
Model: 1732.8473693451685, Actual: 1838
Model: 5676.069722166214, Actual: 8161
Model: 1672.9974555496267, Actual: 1813
Model: 1034.6744630543735, Actual: 814
Model: 