In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read the diamonds dataset from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="diamonds.csv")

In [4]:
# Show the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [5]:
# Remove the "Unnamed: 0" column that came in with the CSV.
# errors="ignore" keeps this cell safe to re-run: once the column is gone,
# a second execution is a no-op instead of raising KeyError.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [6]:
# Discard every row in which any of the x, y or z measurements is exactly 0.
df = df.loc[(df[["x", "y", "z"]] != 0).all(axis=1)]

In [7]:
# Keep only rows whose measurements fall strictly inside these ranges:
#   45 < depth < 75, 40 < table < 80, y < 30, 2 < z < 30
df = df[
    df["depth"].between(45, 75, inclusive="neither")
    & df["table"].between(40, 80, inclusive="neither")
    & (df["y"] < 30)
    & df["z"].between(2, 30, inclusive="neither")
]

In [8]:
# Target: the price column. Features: every remaining column.
y = df["price"]
X = df.drop(columns=["price"])

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 25% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=15,
)

In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
# Previously we encoded the columns with a single shared LabelEncoder, like this, and it worked.
# However, to save the encoders we need a separate one for each column,
# so I will use a new encoding below.
#label_encoder = LabelEncoder()
#for col in ['cut', 'color', 'clarity']:
#    X_train[col] = label_encoder.fit_transform(X_train[col])
#    X_test[col] = label_encoder.transform(X_test[col])

In [13]:
# Fit one LabelEncoder per categorical column and keep each one in `encoders`
# so it can be persisted and reused to encode new data.
#
# The encoders are always fitted on the untouched values in `X` (looked up by
# row label) rather than on X_train[col] itself. X_train and X_test are
# overwritten below, so fitting on them would make a second run of this cell
# learn the integer codes instead of the original categories and silently
# corrupt the encoders that are pickled later in the notebook.
# NOTE(review): assumes the row labels of X are unique — confirm.
encoders = {}
for col in ['cut', 'color', 'clarity']:
    train_labels = X.loc[X_train.index, col]
    test_labels = X.loc[X_test.index, col]
    encoders[col] = LabelEncoder().fit(train_labels)
    X_train[col] = encoders[col].transform(train_labels)
    # transform() raises ValueError if the test split contains a category
    # that was not present in the training split.
    X_test[col] = encoders[col].transform(test_labels)
    print(encoders[col])

LabelEncoder()
LabelEncoder()
LabelEncoder()


In [14]:
# Check the training features after encoding the categorical columns.
X_train.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
15200,1.15,2,4,2,62.4,54.0,6.71,6.76,4.2
14632,1.11,3,1,2,61.3,58.0,6.66,6.61,4.07
19151,1.21,1,2,5,63.7,58.0,6.67,6.71,4.26
29299,0.3,2,5,5,61.5,58.0,4.28,4.31,2.64
9983,1.0,4,2,2,63.1,57.0,6.37,6.33,4.01


In [15]:
# Check the test features after encoding the categorical columns.
X_test.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
12011,1.09,2,4,2,61.4,57.0,6.66,6.6,4.07
26606,1.52,2,3,6,61.4,57.0,7.38,7.4,4.54
48741,0.53,2,3,6,61.3,56.0,5.22,5.26,3.21
22795,1.71,2,6,4,61.6,57.0,7.67,7.62,4.71
18444,1.5,3,6,5,59.6,60.0,7.38,7.31,4.38


In [16]:
from sklearn.preprocessing import StandardScaler

In [17]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [18]:
from sklearn.svm import SVR
# Support vector regressor with an RBF kernel and explicit C / gamma values.
svr = SVR(
    kernel='rbf',
    C=1000,
    gamma=0.1,
)

In [19]:
from sklearn.metrics import r2_score

In [20]:
# Train on the scaled training split, predict the scaled test split, and
# report the R2 score of the predictions against the held-out prices.
svr.fit(X_train_scaled, y_train)
y_pred = svr.predict(X_test_scaled)
score = r2_score(y_true=y_test, y_pred=y_pred)
print("R2 Score", score)

R2 Score 0.9452198447140461


In [21]:
# Display the dict of fitted per-column encoders that will be saved below.
encoders

{'cut': LabelEncoder(), 'color': LabelEncoder(), 'clarity': LabelEncoder()}

In [22]:
# Display the fitted scaler and its parameters.
scaler

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [23]:
# Display the fitted SVR model and its hyperparameters.
svr

0,1,2
,kernel,'rbf'
,degree,3
,gamma,0.1
,coef0,0.0
,tol,0.001
,C,1000
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


In [24]:
#https://docs.python.org/3/library/pickle.html
import pickle

In [25]:
# Bundle everything needed to reproduce predictions (the model, the
# per-column encoders and the scaler) into a single pickle file.
# NOTE: only unpickle this file from a trusted source; pickle.load can
# execute arbitrary code.
with open("diamond_model_complete.pkl", "wb") as f:
    pickle.dump(
        dict(
            model=svr,
            encoders=encoders,
            scaler=scaler,
        ),
        f,
    )

In [26]:
# Write the scaled test features to CSV without the index column.
# The array carries no column names, so the header is the default 0..N-1.
pd.DataFrame(data=X_test_scaled).to_csv(
    path_or_buf="testdatascaled.csv",
    index=False,
)