In [1]:
#import libraries

import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

In [2]:
# Data for training
diamonds = pd.read_csv('./diamonds.csv')

# Data for kaggle
diamonds_test = pd.read_csv('./diamonds_test.csv')

In [3]:
#Drop id and unnamed

def column_drop(x,y):
    x.drop(['Unnamed: 0'], inplace = True, axis = 1)
    x.drop(['index_id'], inplace = True, axis = 1)
    y.drop(['id'], inplace = True, axis = 1)
    
column_drop(diamonds,diamonds_test)

In [4]:
#Function to drop registers with 0 in x, y, z

def drop_0s(x):
    x = x[x['x']!=0]
    x = x[x['y']!=0]
    x = x[x['z']!=0]
    return x

In [5]:
# def proporciones correctas

def uniform(x,y):
    x['uniform'] = x['z']/ x['y']
    y['uniform'] = y['z']/ y['y']

In [6]:
# Function to avoid NaN in uniform column

def fill_na(x,y):
    x['uniform'].fillna(x['depth']/100, inplace = True)
    y['uniform'].fillna(y['depth']/100, inplace = True)

In [7]:
# definition of values to explain better colors

conditions = [
    (diamonds['color'] == 'J'),
    (diamonds['color'] == 'D'),
    (diamonds['color'] == 'E'),
    (diamonds['color'] == 'F'),
    (diamonds['color'] == 'H'),
    (diamonds['color'] == 'I'),
    (diamonds['color'] == 'G')
    ]

values = [1,2,2,2,1,1,1]

In [8]:
conditions_test = [
    (diamonds_test['color'] == 'J'),
    (diamonds_test['color'] == 'D'),
    (diamonds_test['color'] == 'E'),
    (diamonds_test['color'] == 'F'),
    (diamonds_test['color'] == 'H'),
    (diamonds_test['color'] == 'I'),
    (diamonds_test['color'] == 'G')
    ]

values_test = [1,2,2,2,1,1,1]

In [9]:
# Function to select what features we want in the model

def column_sel(x):
    y = x.columns
    for i in y:
        drop_yn = input(f'drop o no drop {i}:')
        if drop_yn == 'y':
            x.drop([i], inplace = True, axis = 1)

In [10]:
# Function to encode categorical features

encode_columns = ['cut','clarity', 'color']

def hot_encoding(x):
    y = x[encode_columns]
    y = pd.get_dummies(y)
    x.drop(encode_columns, inplace = True, axis = 1)
    x = pd.merge(x, y, left_index=True, right_index=True)
    return x

In [11]:
#Creation of the good color feature

diamonds['good_color'] = np.select(conditions, values)
diamonds_test['good_color'] = np.select(conditions_test, values_test)

In [12]:
# Drop of 0s in diamonds dataset

diamonds = drop_0s(diamonds)

In [13]:
#Creation of the uniform feature

uniform(diamonds, diamonds_test)

In [14]:
# Fillna in uniform feature in both datasets

fill_na(diamonds, diamonds_test)

In [15]:
# Feature selection diamonds

column_sel(diamonds)

drop o no drop price:n
drop o no drop carat:n
drop o no drop city:y
drop o no drop depth:y
drop o no drop table:y
drop o no drop x:y
drop o no drop y:y
drop o no drop z:y
drop o no drop cut:n
drop o no drop color:n
drop o no drop clarity:n
drop o no drop good_color:n
drop o no drop uniform:n


In [16]:
diamonds

Unnamed: 0,price,carat,cut,color,clarity,good_color,uniform
0,4268,1.21,Premium,J,VS2,1,0.625920
1,4839,1.20,Premium,J,VS2,1,0.606676
2,368,0.30,Premium,J,VS2,1,0.602740
3,5053,1.20,Premium,J,VS2,1,0.629464
4,3593,1.05,Premium,J,VS2,1,0.597264
...,...,...,...,...,...,...,...
40450,2632,0.59,Very Good,F,IF,2,0.597806
40451,9972,1.01,Very Good,F,IF,2,0.628705
40452,886,0.30,Very Good,F,IF,2,0.621810
40453,3205,0.71,Fair,F,IF,2,0.590361


In [17]:
# Feature selection diamonds test

column_sel(diamonds_test)

drop o no drop carat:n
drop o no drop cut:n
drop o no drop color:n
drop o no drop clarity:n
drop o no drop depth:y
drop o no drop table:y
drop o no drop x:y
drop o no drop y:y
drop o no drop z:y
drop o no drop city:y
drop o no drop good_color:n
drop o no drop uniform:n


In [18]:
diamonds_test

Unnamed: 0,carat,cut,color,clarity,good_color,uniform
0,0.79,Very Good,F,SI1,2,0.623090
1,1.20,Ideal,J,VS1,1,0.606676
2,1.57,Premium,H,SI1,1,0.624317
3,0.90,Very Good,F,SI1,2,0.636215
4,0.50,Very Good,F,VS1,2,0.626719
...,...,...,...,...,...,...
13480,0.57,Ideal,E,SI1,2,0.620301
13481,0.71,Ideal,I,VS2,1,0.621291
13482,0.70,Ideal,F,VS1,2,0.618214
13483,0.70,Very Good,F,SI2,2,0.585739


In [19]:
# Encoding categorical features

diamonds = hot_encoding(diamonds)
diamonds_test = hot_encoding(diamonds_test)

-------------------------------------------------------------------------------------------------------------------

**MODEL**

In [20]:
#we separate features from target

X = diamonds.drop(['price'], axis = 1)
y = diamonds['price']
X

Unnamed: 0,carat,good_color,uniform,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,clarity_I1,clarity_IF,...,clarity_VS2,clarity_VVS1,clarity_VVS2,color_D,color_E,color_F,color_G,color_H,color_I,color_J
0,1.21,1,0.625920,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,1
1,1.20,1,0.606676,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,0.30,1,0.602740,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,1.20,1,0.629464,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,1.05,1,0.597264,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,0.59,2,0.597806,0,0,0,0,1,0,1,...,0,0,0,0,0,1,0,0,0,0
40451,1.01,2,0.628705,0,0,0,0,1,0,1,...,0,0,0,0,0,1,0,0,0,0
40452,0.30,2,0.621810,0,0,0,0,1,0,1,...,0,0,0,0,0,1,0,0,0,0
40453,0.71,2,0.590361,1,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0


In [21]:
#scaling

scaler = RobustScaler()
X = scaler.fit_transform(X)
X

array([[ 0.796875  ,  0.        ,  0.52831587, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.78125   ,  0.        , -0.71125351, ...,  0.        ,
         0.        ,  1.        ],
       [-0.625     ,  0.        , -0.96482223, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.625     ,  1.        ,  0.26353222, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.015625  ,  1.        , -1.76214267, ...,  0.        ,
         0.        ,  0.        ],
       [-0.515625  ,  1.        , -5.94274325, ...,  0.        ,
         0.        ,  0.        ]])

In [22]:
# Division train/test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
# Model definition with best parameters

model = MLPRegressor(hidden_layer_sizes=(100,100,100), random_state=42, max_iter=4000)

In [24]:
# Model training

weights = model.fit(X_train, y_train)

In [25]:
# Model predictions

predictions = model.predict(X_test)

In [26]:
# RMSE

real_error = round(mean_squared_error(y_test, predictions)**0.5, 5)
real_error

541.39477

-------------------------------------------------------------------------------------------------------------------

**Test predictions**

In [27]:
diamonds_test

Unnamed: 0,carat,good_color,uniform,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,clarity_I1,clarity_IF,...,clarity_VS2,clarity_VVS1,clarity_VVS2,color_D,color_E,color_F,color_G,color_H,color_I,color_J
0,0.79,2,0.623090,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1.20,1,0.606676,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1.57,1,0.624317,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0.90,2,0.636215,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0.50,2,0.626719,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13480,0.57,2,0.620301,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
13481,0.71,1,0.621291,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
13482,0.70,2,0.618214,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
13483,0.70,2,0.585739,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [28]:
# Diamonds to be predicted

X = diamonds_test

In [29]:
#Scaling

scaler = RobustScaler()
X = scaler.fit_transform(X)
X

array([[ 0.140625  ,  1.        ,  0.33724739, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.78125   ,  0.        , -0.71893347, ...,  0.        ,
         0.        ,  1.        ],
       [ 1.359375  ,  0.        ,  0.4161993 , ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  1.        ,  0.02346698, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  1.        , -2.066234  , ...,  0.        ,
         0.        ,  0.        ],
       [-0.46875   ,  0.        ,  0.23908483, ...,  0.        ,
         1.        ,  0.        ]])

In [30]:
# Predictions

predictions = model.predict(X)

In [31]:
# Formatting for Kaggle

predictions = pd.DataFrame(predictions, columns = ['Price'])
predictions.index.names = ['id']
predictions

Unnamed: 0_level_0,Price
id,Unnamed: 1_level_1
0,3021.451399
1,5362.802883
2,9994.988494
3,3863.230633
4,1512.949708
...,...
13480,1685.381561
13481,2324.654370
13482,3143.356353
13483,1972.244541


In [32]:
# Creation of file where predictions are saved

predictions.to_csv('./results_rn.csv')