In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import numpy as np
from sklearn.linear_model import Ridge, Lasso
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import RidgeCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
sns.set_context("poster")
sns.set(rc={"figure.figsize": (12.,6.)})
sns.set_style("whitegrid")

# Study of the train dataset

In [3]:
diamonds = pd.read_csv("../data/train.csv")

In [4]:
diamonds.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.14,Ideal,G,VVS2,61.0,56.0,6.74,6.76,4.12,9013
1,1,0.76,Ideal,H,VS2,62.7,57.0,5.86,5.82,3.66,2692
2,2,0.84,Ideal,G,VS1,61.4,56.0,6.04,6.15,3.74,4372
3,3,1.55,Ideal,H,VS1,62.0,57.0,7.37,7.43,4.59,13665
4,4,0.3,Ideal,G,SI2,61.9,57.0,4.28,4.31,2.66,422


In [5]:
diamonds.dtypes

id           int64
carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
x          float64
y          float64
z          float64
price        int64
dtype: object

In [6]:
diamonds.drop("id",axis=1,inplace = True) #drop id cause we don't need it to fit the models

## Checking categorical features
- changing categorical into numeric

In [7]:
diamonds.cut.unique()

array(['Ideal', 'Fair', 'Very Good', 'Premium', 'Good'], dtype=object)

In [8]:
diamonds.cut.value_counts()

Ideal        16193
Premium      10301
Very Good     9030
Good          3710
Fair          1221
Name: cut, dtype: int64

In [9]:
cut_dicc = {"Ideal":5, "Premium":4,"Very Good":3,"Good":2, "Fair":1}

In [10]:
diamonds.cut = diamonds.cut.replace(cut_dicc)

In [11]:
diamonds.cut.value_counts()

5    16193
4    10301
3     9030
2     3710
1     1221
Name: cut, dtype: int64

### information about diamonds
#### colors:
    - https://www.petragems.com/education/diamond-color/
#### clarity:
    - https://4cs.gia.edu/en-us/diamond-clarity/
        - Flawless (FL) No inclusions and no blemishes visible under 10x magnification
        - Internally Flawless (IF) No inclusions visible under 10x magnification
        - Very, Very Slightly Included (VVS1 and VVS2) Inclusions so slight they are difficult for a skilled grader to see under 10x magnification
        - Very Slightly Included (VS1 and VS2) Inclusions are observed with effort under 10x magnification, but can be characterized as minor
        - Slightly Included (SI1 and SI2) Inclusions are noticeable under 10x magnification
        - Included (I1, I2, and I3) Inclusions are obvious under 10x magnification which may affect transparency and brilliance

In [12]:
diamonds.color.unique()

array(['G', 'H', 'F', 'J', 'E', 'I', 'D'], dtype=object)

In [13]:
diamonds.color.value_counts()

G    8459
E    7346
F    7174
H    6273
D    5019
I    4065
J    2119
Name: color, dtype: int64

In [14]:
color_dic = {'D':7, 'E': 6, 'F':5, 'G':4, 'H':3, 'I':2, 'J':1} 

In [15]:
diamonds.color = diamonds.color.replace(color_dic)

In [16]:
diamonds.clarity.unique()

array(['VVS2', 'VS2', 'VS1', 'SI2', 'SI1', 'VVS1', 'IF', 'I1'],
      dtype=object)

In [17]:
diamonds.clarity.value_counts()

SI1     9746
VS2     9175
SI2     6912
VS1     6137
VVS2    3822
VVS1    2732
IF      1367
I1       564
Name: clarity, dtype: int64

In [18]:
clarity_dic = {"I3" : 1, "I2" : 2, "I1" : 3,
               "SI2" : 4 ,"SI1" : 5,
               "VS2" : 6, "VS1" : 7,
               "VVS2" : 8, "VVS1":9,
               "IF" : 10 , "FL" : 11}

In [19]:
diamonds.clarity = diamonds.clarity.replace(clarity_dic)

In [20]:
diamonds.dtypes

carat      float64
cut          int64
color        int64
clarity      int64
depth      float64
table      float64
x          float64
y          float64
z          float64
price        int64
dtype: object

## Checking correlation of variables

In [21]:
diamonds.corr()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
carat,1.0,-0.134463,-0.292673,-0.352151,0.025563,0.181437,0.976119,0.968747,0.969643,0.921128
cut,-0.134463,1.0,0.020104,0.19071,-0.219509,-0.433241,-0.125122,-0.123166,-0.150823,-0.052115
color,-0.292673,0.020104,1.0,-0.026766,-0.049141,-0.023844,-0.271814,-0.26947,-0.274945,-0.172244
clarity,-0.352151,0.19071,-0.026766,1.0,-0.070941,-0.156182,-0.371832,-0.364112,-0.373575,-0.145062
depth,0.025563,-0.219509,-0.049141,-0.070941,1.0,-0.297741,-0.028409,-0.031153,0.093987,-0.015052
table,0.181437,-0.433241,-0.023844,-0.156182,-0.297741,1.0,0.195592,0.187326,0.154194,0.127691
x,0.976119,-0.125122,-0.271814,-0.371832,-0.028409,0.195592,1.0,0.991276,0.986062,0.884874
y,0.968747,-0.123166,-0.26947,-0.364112,-0.031153,0.187326,0.991276,1.0,0.98016,0.880575
z,0.969643,-0.150823,-0.274945,-0.373575,0.093987,0.154194,0.986062,0.98016,1.0,0.875011
price,0.921128,-0.052115,-0.172244,-0.145062,-0.015052,0.127691,0.884874,0.880575,0.875011,1.0


In [22]:
sns.heatmap(diamonds.corr(), annot= True, cmap="Greens")

<AxesSubplot:>

<AxesSubplot:>

### Clear collinearity between x,y,z and carat
- we drop x,y and z

In [24]:
dropeando = ["x","y","z"]
diamonds.drop(dropeando, axis=1,inplace = True)

In [25]:
diamonds.shape

(40455, 7)

In [44]:
diamonds.isna().sum() #checking for null values

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
dtype: int64

## Training diferents models to see which fits better with the data

In [27]:
diamonds.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price'], dtype='object')

In [28]:
X = diamonds.drop("price",axis=1)

In [29]:
y = diamonds.price

In [30]:
X.head()

Unnamed: 0,carat,cut,color,clarity,depth,table
0,1.14,5,4,8,61.0,56.0
1,0.76,5,3,6,62.7,57.0
2,0.84,5,4,7,61.4,56.0
3,1.55,5,3,7,62.0,57.0
4,0.3,5,4,4,61.9,57.0


In [31]:
y.head()

0     9013
1     2692
2     4372
3    13665
4      422
Name: price, dtype: int64

In [32]:
X_train, X_test, y_train, y_test = tts(X,y, test_size= 0.2)

In [33]:
models={
    'log': LinearRegression(),
    'ridge': Ridge(),
    'lasso': Lasso(),
    'sgd': SGDRegressor(),
    'knn': KNeighborsRegressor(),
    'grad': GradientBoostingRegressor(),
    'Bay':BayesianRidge(),
    'RidgeCV': RidgeCV(),
    'Dec_tree': DecisionTreeRegressor(max_depth=1),
    'Ran_Forest' : RandomForestRegressor()
}

In [34]:
for name, model in models.items():
    print("entrenando ", name)
    model.fit(X_train,y_train)

entrenando  log
entrenando  ridge
entrenando  lasso
entrenando  sgd
entrenando  knn
entrenando  grad
entrenando  Bay
entrenando  RidgeCV
entrenando  Dec_tree
entrenando  Ran_Forest


In [35]:
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"------{name}------")
    print('RMSE - ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
    print('R2 - ', metrics.r2_score(y_test, y_pred))

------log------
RMSE -  1266.6221193945944
R2 -  0.9021813389303266
------ridge------
RMSE -  1266.6135769179168
R2 -  0.9021826583622237
------lasso------
RMSE -  1266.5928596774904
R2 -  0.9021858582155149
------sgd------
RMSE -  50616286.11373752
R2 -  -156209746.61634588
------knn------
RMSE -  1924.822675900204
R2 -  0.7741037451836748
------grad------
RMSE -  650.5011552936433
R2 -  0.9741997613938811
------Bay------
RMSE -  1266.621116131812
R2 -  0.9021814938901709
------RidgeCV------
RMSE -  1266.6212488406215
R2 -  0.9021814733925207
------Dec_tree------
RMSE -  2532.2539538959168
R2 -  0.6090312067156239
------Ran_Forest------
RMSE -  590.0949997169475
R2 -  0.9787689512680534


In [36]:
#RandomForestRegressor has the best R2 and the less RMSE

In [39]:
diamonds.sample(5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price
6254,0.73,5,6,4,59.9,57.0,2059
28987,1.01,5,4,10,62.2,61.0,8957
2980,0.53,5,2,7,61.8,55.0,1552
34527,0.32,5,2,8,62.2,54.5,567
10237,0.32,5,4,10,61.8,55.0,918


In [40]:
# we export the clean dataset to train the model RandomForestRegressor
diamonds.to_csv("../data/modelo.csv", index= False)