In [1]:
import warnings 
warnings.filterwarnings("ignore")

### Description:
<li> price in US dollars (\$326--\$18,823)

<li>carat weight of the diamond (0.2--5.01)

<li>cut quality of the cut (Fair, Good, Very Good, Premium, Ideal)

<li>colour diamond colour, from J (worst) to D (best)

<li>clarity a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1,VVS2, VVS1, IF (best))

<li>x length in mm (0--10.74)

<li>y width in mm (0--58.9)

<li>z depth in mm (0--31.8)

<li>depth total depth percentage = z / mean (x, y) = 2 * z / (x + y) (43--79)

<li>table width of top of diamond relative to widest point (43--95)

In [2]:
import pandas as pd
import numpy as np
import tensorflow

### Steps to Perform the Model:
<ol>1.Loading the data</ol>
<ol>2. Preprocessing.</ol>
        a) Print the first 5 rows of the dataset

        b) Check the features in the dataset

        c) Check the missing values

        d) Check the numerical features in the dataset

        e) Check the distribution of categorical columns
<ol>3. Separate features and Labels</ol>
<ol>4. Splitting the Data into Training and Testing</ol>
<ol>5. Creating Deep Learning- Artificial Neural Networks(ANN) model</ol>
<ol>6. Hyperparameter tuning of ANN Find the best set of parameters using grid search</ol>
<ol>7. Training the ANN model with the best parameters</ol>
<ol>8. Finding the accuracy of the model</ol>
<ol>9. Visualize train and validation Accuracy and Losses for every mode</ol>

In [3]:
df = pd.read_csv("diamonds.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  53940 non-null  int64  
 1   carat       53940 non-null  float64
 2   cut         53940 non-null  object 
 3   color       53940 non-null  object 
 4   clarity     53940 non-null  object 
 5   depth       53940 non-null  float64
 6   table       53940 non-null  float64
 7   price       53940 non-null  int64  
 8   x           53940 non-null  float64
 9   y           53940 non-null  float64
 10  z           53940 non-null  float64
dtypes: float64(6), int64(2), object(3)
memory usage: 4.5+ MB


In [9]:
df.drop('Unnamed: 0', axis=1, inplace =True)

### Check the numerical features in the dataset

In [10]:
# selecting only the integers and float values from the data
numerical = df.select_dtypes(include=['int64','float64'])
numerical.head()

Unnamed: 0,carat,depth,table,price,x,y,z
0,0.23,61.5,55.0,326,3.95,3.98,2.43
1,0.21,59.8,61.0,326,3.89,3.84,2.31
2,0.23,56.9,65.0,327,4.05,4.07,2.31
3,0.29,62.4,58.0,334,4.2,4.23,2.63
4,0.31,63.3,58.0,335,4.34,4.35,2.75


In [13]:
numerical.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


### Check the categorical features in the dataset

In [11]:
# selecting all the categorical columns.
categorical = df.select_dtypes(include=['object'])
categorical.head()

Unnamed: 0,cut,color,clarity
0,Ideal,E,SI2
1,Premium,E,SI1
2,Good,E,VS1
3,Premium,I,VS2
4,Good,J,SI2


###  Check the distribution of categorical columns

In [28]:
categorical.describe()

Unnamed: 0,cut,color,clarity
count,53940.0,53940.0,53940.0
mean,2.904097,3.405803,3.05102
std,1.1166,1.701105,1.647136
min,0.0,0.0,0.0
25%,2.0,2.0,2.0
50%,3.0,3.0,3.0
75%,4.0,5.0,4.0
max,4.0,6.0,7.0


In [14]:
# finding the unique attributes in the column for label encoding.
categorical['cut'].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [15]:
# finding the unique attributes in the column for label encoding.
categorical['color'].unique()

array(['E', 'I', 'J', 'H', 'F', 'G', 'D'], dtype=object)

In [16]:
# finding the unique attributes in the column for label encoding.
categorical['clarity'].unique()

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

### Label Encoding

In [18]:
# cut label encoding order (Fair, Good, Very Good, Premium, Ideal)
encoder = {"Fair":0, "Good":1, "Very Good":2, "Premium":3, "Ideal":4}

categorical['cut'] = categorical['cut'].apply(lambda x: encoder[x])
categorical.head()

Unnamed: 0,cut,color,clarity
0,4,E,SI2
1,3,E,SI1
2,1,E,VS1
3,3,I,VS2
4,1,J,SI2


In [19]:
# color label encoding order from J (worst) to D (best) ('E', 'I', 'J', 'H', 'F', 'G', 'D')
encoder = {"J":0, "I":1, "H":2, "G":3, "F":4, "E":5, "D":6}

categorical['color'] = categorical['color'].apply(lambda x: encoder[x])
categorical.head()

Unnamed: 0,cut,color,clarity
0,4,5,SI2
1,3,5,SI1
2,1,5,VS1
3,3,1,VS2
4,1,0,SI2


In [20]:
# clarity label encoding order (I1 (worst), SI2, SI1, VS2, VS1,VVS2, VVS1, IF (best))
encoder = {"I1":0, "SI2":1, "SI1":2, "VS2":3, "VS1":4, "VVS2":5, "VVS1":6, "IF":7}

categorical['clarity'] = categorical['clarity'].apply(lambda x: encoder[x])
categorical.head()

Unnamed: 0,cut,color,clarity
0,4,5,1
1,3,5,2
2,1,5,4
3,3,1,3
4,1,0,1


### concating numerical and categorical columns

In [30]:
temp_df = pd.concat([numerical, categorical],axis = 1)
temp_df.head()

Unnamed: 0,carat,depth,table,price,x,y,z,cut,color,clarity
0,0.23,61.5,55.0,326,3.95,3.98,2.43,4,5,1
1,0.21,59.8,61.0,326,3.89,3.84,2.31,3,5,2
2,0.23,56.9,65.0,327,4.05,4.07,2.31,1,5,4
3,0.29,62.4,58.0,334,4.2,4.23,2.63,3,1,3
4,0.31,63.3,58.0,335,4.34,4.35,2.75,1,0,1


###  Separate features and Labels

In [31]:
inputs= temp_df.drop('price',axis=1)
target = temp_df['price']

In [32]:
inputs.head()

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
0,0.23,61.5,55.0,3.95,3.98,2.43,4,5,1
1,0.21,59.8,61.0,3.89,3.84,2.31,3,5,2
2,0.23,56.9,65.0,4.05,4.07,2.31,1,5,4
3,0.29,62.4,58.0,4.2,4.23,2.63,3,1,3
4,0.31,63.3,58.0,4.34,4.35,2.75,1,0,1


###  Splitting the Data into Training and Testing

In [33]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(inputs,target, train_size =0.75, random_state=100)

In [34]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_train.shape)

(40455, 9) (40455,)
(13485, 9) (40455,)


### Standardization

In [38]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
X_train.head()

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
0,-1.050294,0.099874,-1.495687,-1.293573,-1.220387,-1.249413,0.980027,1.529921,0.576753
1,-0.186302,-1.57225,2.031741,0.10509,0.003601,-0.128311,0.084616,0.941277,-0.641148
2,-1.050294,0.30889,-0.647318,-1.302482,-1.237749,-1.249413,0.980027,1.529921,0.576753
3,0.46696,0.796592,0.692211,0.612885,0.550489,0.690955,0.084616,0.352633,1.185703
4,-1.050294,-1.154219,1.585231,-1.257938,-1.24643,-1.364397,0.084616,-0.824654,0.576753


In [39]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_test = pd.DataFrame(scaler.fit_transform(X_test), columns = X_test.columns)
X_test.head()

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
0,-0.479543,-0.159848,-0.216121,-0.338117,-0.369181,-0.349953,0.985825,0.92494,-0.027353
1,0.769407,-0.159848,-1.117476,0.91245,0.97069,0.861382,0.985825,-0.246162,0.574378
2,-0.606555,1.033738,0.234556,-0.606095,-0.585,-0.458837,0.985825,-0.246162,-0.629084
3,-0.797073,-0.791746,-0.666799,-0.802613,-0.77384,-0.812711,0.985825,0.339389,0.574378
4,0.007336,0.612473,0.234556,0.144245,0.179356,0.221688,0.089711,-0.246162,2.37957
