# Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from joblib import dump

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from  sklearn.pipeline  import  Pipeline
from  sklearn  import  set_config
# Ask scikit-learn to show estimators as diagrams when they are displayed in a cell.
set_config(display='diagram')

In [3]:
from sklearn.neural_network import MLPRegressor

# Custom functions

In [4]:
def dataset_parameters(df, target):
    """Group the feature columns of ``df`` by kind of data type.

    The ``target`` column is dropped first; every remaining column is then
    assigned to at most one group based on its dtype:

    * categorical: object, string, boolean and ``category`` columns
    * date: any ``datetime64`` column, whatever its unit or timezone
    * numerical: any other numeric column (integers and floats of any width)

    Columns whose dtype matches none of the groups (for example timedeltas)
    are left out of every list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the features and the target column.
    target : str
        Label of the column to predict; it is excluded from the feature lists.

    Returns
    -------
    tuple
        ``(categorical_features, numerical_features, date_features,
        all_features, target)``. The first three are lists of column labels
        in their original column order; ``all_features`` is the concatenation
        numerical + categorical + date; ``target`` is returned unchanged.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df`` (raised by ``DataFrame.drop``).
    """
    df_features = df.drop([target], axis=1)
    dtype_checks = pd.api.types

    categorical_features = []
    numerical_features = []
    date_features = []

    # Iterate over (label, dtype) pairs instead of indexing by label so that
    # duplicated column labels do not break the dtype lookup.
    for column, dtype in df_features.dtypes.items():
        # Booleans count as numeric for pandas, so the categorical test must
        # run first to keep them in the categorical group.
        if (
            isinstance(dtype, pd.CategoricalDtype)
            or dtype_checks.is_bool_dtype(dtype)
            or dtype_checks.is_object_dtype(dtype)
            or dtype_checks.is_string_dtype(dtype)
        ):
            categorical_features.append(column)
        # Predicate-based check: comparing against the bare string
        # 'datetime64' does not match unit-qualified dtypes such as
        # datetime64[ns].
        elif dtype_checks.is_datetime64_any_dtype(dtype):
            date_features.append(column)
        # Covers int32/float32/nullable numerics as well as int64/float64.
        elif dtype_checks.is_numeric_dtype(dtype):
            numerical_features.append(column)

    all_features = numerical_features + categorical_features + date_features
    return categorical_features, numerical_features, date_features, all_features, target

# Load and prepare data

In [5]:
# Read the pickled diamonds DataFrame from the working directory.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
with open('diamonds_df.pkl', mode='rb') as pickle_file:
    df = pickle.load(pickle_file)

In [6]:
# Display the loaded frame; the rich output below shows only the first and last rows.
df

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43,326.0
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31,326.0
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31,327.0
3,0.29,Premium,I,VS2,62.4,58.0,4.20,4.23,2.63,334.0
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75,335.0
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,5.75,5.76,3.50,2757.0
53936,0.72,Good,D,SI1,63.1,55.0,5.69,5.75,3.61,2757.0
53937,0.70,Very Good,D,SI1,62.8,60.0,5.66,5.68,3.56,2757.0
53938,0.86,Premium,H,SI2,61.0,58.0,6.15,6.12,3.74,2757.0


In [7]:
# Group the feature columns by dtype, using 'price' as the prediction target.
(
    categorical_features,
    numerical_features,
    date_features,
    all_features,
    target,
) = dataset_parameters(df, 'price')

In [8]:
# Feature matrix: every column returned in all_features, in that order.
X = df.loc[:, all_features]

In [9]:
# Target vector: the single column named by `target`.
y = df.loc[:, target]

In [10]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=13,
)

# Set up pipeline, train and test with best hyperparameters

In [11]:
# One-hot encode the categorical features.
# handle_unknown='ignore' makes a category that was not seen during fit encode
# as an all-zero row instead of raising, so the persisted pipeline keeps
# working on new data that contains an unexpected label.
categorical_transformer = Pipeline(steps=[
    ('one', OneHotEncoder(handle_unknown='ignore'))
])

In [12]:
# Numeric branch: min-max scaling followed by a Normalizer step.
# NOTE(review): Normalizer is understood to rescale each sample (row) rather
# than each feature - confirm that this is the intended behaviour here.
numerical_steps = [
    ('scale', MinMaxScaler()),
    ('normalize', Normalizer()),
]
numerical_transformer = Pipeline(steps=numerical_steps)

In [13]:
# Route each feature group to its own transformer.
# Only the numerical and categorical groups are listed; date_features are not
# passed to any transformer here.
column_branches = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [14]:
# Full modelling pipeline: preprocessing followed by a three-hidden-layer MLP regressor.
# random_state fixes the network's random initialisation so that re-running the
# notebook reproduces the same fitted model and score; it reuses the seed
# already used for the train/test split.
# NOTE(review): power_t is documented as applying only to solver='sgd'; with the
# default solver it appears to have no effect - confirm whether it can be dropped.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', MLPRegressor(
        hidden_layer_sizes=(50, 50, 50),
        max_iter=600,
        power_t=0.25,
        random_state=13,
    ))
])

In [15]:
# Fit the preprocessing steps and the model on the training split.
pipeline.fit(X_train, y=y_train)

In [16]:
# Evaluate the fitted pipeline on the held-out test split.
pipeline.score(X_test, y=y_test)

0.9802995880118562

# Save pipeline to disk

In [17]:
# Persist the fitted pipeline (preprocessing + model) next to the notebook.
dump(pipeline, filename='pipeline.joblib')

['pipeline.joblib']