In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
df = pd.read_csv('./data/data.csv')

In [3]:
print(df.columns)
print(df.shape)

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z',
       'price'],
      dtype='object')
(40455, 10)


In [4]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,2.26,Ideal,G,SI2,61.9,57.0,8.44,8.36,5.2,12831
1,2.43,Very Good,H,SI2,63.2,57.0,8.56,8.5,5.39,16170
2,0.8,Premium,F,SI2,61.0,57.0,6.03,6.01,3.67,2797
3,0.4,Ideal,F,I1,63.3,60.0,4.68,4.64,2.95,630
4,0.31,Ideal,G,VS2,61.6,55.0,4.39,4.37,2.7,698


In [5]:
df.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
price      0
dtype: int64

In [6]:
df.dtypes

carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
x          float64
y          float64
z          float64
price        int64
dtype: object

In [7]:
df.describe()

Unnamed: 0,carat,depth,table,x,y,z,price
count,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0
mean,0.798385,61.747932,57.459085,5.732041,5.735939,3.539537,3939.242813
std,0.474353,1.432322,2.231152,1.123169,1.152802,0.709709,4000.344155
min,0.2,43.0,43.0,0.0,0.0,0.0,326.0
25%,0.4,61.0,56.0,4.71,4.72,2.91,950.0
50%,0.7,61.8,57.0,5.7,5.71,3.53,2409.0
75%,1.04,62.5,59.0,6.54,6.54,4.04,5331.0
max,5.01,79.0,79.0,10.74,58.9,31.8,18823.0


In [8]:
df_transformed = pd.get_dummies(df)

In [9]:
corr_matrix=df.corr()
corr_matrix

Unnamed: 0,carat,depth,table,x,y,z,price
carat,1.0,0.02437,0.180861,0.97501,0.944669,0.948835,0.921898
depth,0.02437,1.0,-0.296418,-0.028645,-0.032708,0.090617,-0.014383
table,0.180861,-0.296418,1.0,0.195254,0.181997,0.149342,0.125521
x,0.97501,-0.028645,0.195254,1.0,0.967161,0.966188,0.8845
y,0.944669,-0.032708,0.181997,0.967161,1.0,0.941816,0.859138
z,0.948835,0.090617,0.149342,0.966188,0.941816,1.0,0.857643
price,0.921898,-0.014383,0.125521,0.8845,0.859138,0.857643,1.0


In [10]:
def max_values(matrix):
    import operator
    empty_dict=dict()
    for e in matrix:
        empty_dict[e]=list(sorted(matrix[e].items(),key=operator.itemgetter(1), reverse=True)[1:3])
    return sorted(empty_dict.items(), key=operator.itemgetter(1))

In [11]:
max_values(corr_matrix)

[('price', [('carat', 0.9218982221675251), ('x', 0.8844995434273552)]),
 ('x', [('carat', 0.9750098928785256), ('y', 0.9671609363769315)]),
 ('table', [('x', 0.19525447946432958), ('y', 0.18199686329655315)]),
 ('z', [('x', 0.9661883961916592), ('carat', 0.9488345041078399)]),
 ('y', [('x', 0.9671609363769315), ('carat', 0.9446690298074747)]),
 ('carat', [('x', 0.9750098928785256), ('z', 0.9488345041078399)]),
 ('depth', [('z', 0.09061674288958951), ('carat', 0.024370169758772976)])]

In [12]:
df_better = df.drop(['x'], axis=1)
print(df_better.columns)

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'y', 'z',
       'price'],
      dtype='object')


In [13]:
corr_matrix_better = df_better.corr()

In [14]:
max_values(corr_matrix_better)

[('price', [('carat', 0.9218982221675251), ('y', 0.8591380044859781)]),
 ('y', [('carat', 0.9446690298074747), ('z', 0.9418163900714378)]),
 ('z', [('carat', 0.9488345041078399), ('y', 0.9418163900714378)]),
 ('table', [('y', 0.18199686329655315), ('carat', 0.1808610708852861)]),
 ('depth', [('z', 0.09061674288958951), ('carat', 0.024370169758772976)]),
 ('carat', [('z', 0.9488345041078399), ('y', 0.9446690298074747)])]

In [15]:
df_better = pd.get_dummies(df_better)
print(df_better.columns)

Index(['carat', 'depth', 'table', 'y', 'z', 'price', 'cut_Fair', 'cut_Good',
       'cut_Ideal', 'cut_Premium', 'cut_Very Good', 'color_D', 'color_E',
       'color_F', 'color_G', 'color_H', 'color_I', 'color_J', 'clarity_I1',
       'clarity_IF', 'clarity_SI1', 'clarity_SI2', 'clarity_VS1',
       'clarity_VS2', 'clarity_VVS1', 'clarity_VVS2'],
      dtype='object')


In [16]:
linear = LinearRegression()

In [17]:
X = df_better[['carat', 'depth', 'table', 'y', 'z', 'cut_Fair', 'cut_Good',
       'cut_Ideal', 'cut_Premium', 'cut_Very Good', 'color_D', 'color_E',
       'color_F', 'color_G', 'color_H', 'color_I', 'color_J', 'clarity_I1',
       'clarity_IF', 'clarity_SI1', 'clarity_SI2', 'clarity_VS1',
       'clarity_VS2', 'clarity_VVS1', 'clarity_VVS2']]

y = df_better['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [18]:
fit = linear.fit(X_train, y_train)

In [19]:
pred = fit.predict(X_test)

In [20]:
sqrt(mean_squared_error(y_test, pred))

1142.8336323365425

In [21]:
df_better_2 = df_better.drop(['carat'], axis=1)

In [22]:
matrix_corr_better2=df_better_2.corr()

In [23]:
#max_values(matrix_corr_better2)

In [24]:
X2 = df_better[['depth', 'table', 'y', 'z', 'cut_Fair', 'cut_Good',
       'cut_Ideal', 'cut_Premium', 'cut_Very Good', 'color_D', 'color_E',
       'color_F', 'color_G', 'color_H', 'color_I', 'color_J', 'clarity_I1',
       'clarity_IF', 'clarity_SI1', 'clarity_SI2', 'clarity_VS1',
       'clarity_VS2', 'clarity_VVS1', 'clarity_VVS2']]

y2 = df_better['price']

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.20)

In [25]:
fit2 = linear.fit(X_train2, y_train2)

In [26]:
pred2 = fit2.predict(X_test2)

In [27]:
sqrt(mean_squared_error(y_test, pred2))

5782.563042317385

In [28]:
from sklearn.neural_network import MLPRegressor

In [44]:
neural_regressor= MLPRegressor(hidden_layer_sizes=(100, ), activation='relu', solver='adam', 
                               alpha=0.0001, batch_size='auto', learning_rate='constant', 
                               learning_rate_init=0.001, power_t=0.5, max_iter=1000, shuffle=True, 
                               random_state=None, tol=0.0001, verbose=False, warm_start=False, 
                               momentum=0.9, nesterovs_momentum=True, early_stopping=False, 
                               validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08,
                               n_iter_no_change=10)

In [45]:
neural_regressor.fit(X_train, y_train)



MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(100,), learning_rate='constant',
             learning_rate_init=0.001, max_iter=1000, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='adam', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)

In [46]:
n_pred = neural_regressor.predict(X_test)

In [47]:
sqrt(mean_squared_error(y_test, n_pred))

587.330184388376