In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC
import clean 

In [2]:
data = pd.read_csv("./data/diamonds_train.csv")

In [3]:
data.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.78,Premium,F,VS1,61.5,58.0,5.93,5.98,3.66,3446
1,1,0.31,Ideal,D,SI1,60.8,56.0,4.37,4.32,2.64,732
2,2,0.3,Ideal,F,SI1,62.3,54.0,4.3,4.34,2.69,475
3,3,1.04,Ideal,E,VVS2,62.0,58.0,6.54,6.46,4.03,9552
4,4,0.65,Ideal,J,SI1,61.4,55.0,5.58,5.62,3.44,1276


In [4]:
data.dtypes

id           int64
carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
x          float64
y          float64
z          float64
price        int64
dtype: object

In [5]:
data.clarity.unique()

array(['VS1', 'SI1', 'VVS2', 'SI2', 'VS2', 'IF', 'VVS1', 'I1'],
      dtype=object)

In [6]:
data.corr()

Unnamed: 0,id,carat,depth,table,x,y,z,price
id,1.0,0.001804,-0.003035,0.004436,0.003238,0.002021,0.002507,0.001823
carat,0.001804,1.0,0.023118,0.181725,0.976267,0.945757,0.968685,0.922345
depth,-0.003035,0.023118,1.0,-0.299534,-0.028765,-0.032894,0.092482,-0.013307
table,0.004436,0.181725,-0.299534,1.0,0.195775,0.182559,0.154399,0.126545
x,0.003238,0.976267,-0.028765,0.195775,1.0,0.967143,0.985385,0.886168
y,0.002021,0.945757,-0.032894,0.182559,0.967143,1.0,0.96035,0.860499
z,0.002507,0.968685,0.092482,0.154399,0.985385,0.96035,1.0,0.876061
price,0.001823,0.922345,-0.013307,0.126545,0.886168,0.860499,0.876061,1.0


In [7]:
data.clarity.value_counts()

SI1     9710
VS2     9164
SI2     6884
VS1     6092
VVS2    3839
VVS1    2738
IF      1357
I1       561
Name: clarity, dtype: int64

In [8]:
data.color.value_counts()

G    8462
E    7292
F    7133
H    6200
D    5094
I    4094
J    2070
Name: color, dtype: int64

In [9]:
data.cut.value_counts()

Ideal        16141
Premium      10303
Very Good     9068
Good          3631
Fair          1202
Name: cut, dtype: int64

In [10]:
data["clarity"] = data["clarity"].apply(lambda x: clean.clarityToNumber(x))
data["cut"] = data["cut"].apply(lambda x: clean.cutToNumber(x))
data["color"] = data["color"].apply(lambda x: clean.colorToNumber(x))
data = clean.dropColumns(data)

In [11]:
data

Unnamed: 0,id,carat,cut,color,clarity,x,price
0,0,0.78,4,5,5,5.93,3446
1,1,0.31,5,7,3,4.37,732
2,2,0.30,5,5,3,4.30,475
3,3,1.04,5,6,6,6.54,9552
4,4,0.65,5,1,3,5.58,1276
...,...,...,...,...,...,...,...
40340,40340,0.50,5,6,5,5.09,1716
40341,40341,0.33,5,7,4,4.40,781
40342,40342,0.40,4,4,7,4.69,1123
40343,40343,1.06,5,2,5,6.59,5651


In [12]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=["id", "price"]), data["price"], test_size=0.2)

In [13]:
rf_reg = RandomForestRegressor(n_estimators=25, max_depth=15, min_samples_leaf=6, random_state=111)
rf_reg.fit(data.drop(columns=["id", "price"]), data["price"])
svc = SVC(gamma='auto')
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [14]:
data1 = pd.read_csv("./data/diamonds_test.csv")
data1 = clean.dropColumns(data1)
data1["clarity"] = data1["clarity"].apply(lambda x: clean.clarityToNumber(x))
data1["cut"] = data1["cut"].apply(lambda x: clean.cutToNumber(x))
data1["color"] = data1["color"].apply(lambda x: clean.colorToNumber(x))
data

Unnamed: 0,id,carat,cut,color,clarity,x,price
0,0,0.78,4,5,5,5.93,3446
1,1,0.31,5,7,3,4.37,732
2,2,0.30,5,5,3,4.30,475
3,3,1.04,5,6,6,6.54,9552
4,4,0.65,5,1,3,5.58,1276
...,...,...,...,...,...,...,...
40340,40340,0.50,5,6,5,5.09,1716
40341,40341,0.33,5,7,4,4.40,781
40342,40342,0.40,4,4,7,4.69,1123
40343,40343,1.06,5,2,5,6.59,5651


In [15]:
y_pred_normal = rf_reg.predict(data.drop(columns=["id", "price"])) 
y_pred = rf_reg.predict(data1.drop(columns="id"))

In [16]:
print("MSE_normal", mean_squared_error(data["price"], y_pred_normal))
print("RMSE_normal", np.sqrt(mean_squared_error(data["price"], y_pred_normal)))

MSE_normal 221671.9377814751
RMSE_normal 470.82049422415236


In [17]:
kaggle = pd.DataFrame({"Price": y_pred})

In [18]:
kaggle.index.name = "id"

In [19]:
kaggle.to_csv("prediction.csv")

In [None]:
y_svc_predict = svc.predict(X_test)

In [None]:
print("MSE_normal", mean_squared_error(data["price"], y_svc_predict))
print("RMSE_normal", np.sqrt(mean_squared_error(data["price"], y_svc_predict)))