In [5]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import  LinearRegression, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [6]:
df = pd.read_csv(r"C:\Users\conno\workspace\projects\Diamond_Price_Prediction\Resources\processed_diamond_data")

In [7]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,xy
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,0.992462
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,1.013021
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,0.995086
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63,0.992908
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,0.997701


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53933 entries, 0 to 53932
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53933 non-null  float64
 1   cut      53933 non-null  object 
 2   color    53933 non-null  object 
 3   clarity  53933 non-null  object 
 4   depth    53933 non-null  float64
 5   table    53933 non-null  float64
 6   price    53933 non-null  int64  
 7   x        53933 non-null  float64
 8   y        53933 non-null  float64
 9   z        53933 non-null  float64
 10  xy       53933 non-null  float64
dtypes: float64(7), int64(1), object(3)
memory usage: 4.5+ MB


In [9]:
df.describe()

Unnamed: 0,carat,depth,table,price,x,y,z,xy
count,53933.0,53933.0,53933.0,53933.0,53933.0,53933.0,53933.0,53933.0
mean,0.797884,61.749333,57.457002,3932.155026,5.731901,5.73527,3.539193,0.999409
std,0.473983,1.432501,2.234052,3988.700283,1.119932,1.140339,0.704592,0.012434
min,0.2,43.0,43.0,326.0,0.0,3.68,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91,0.992625
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53,0.995745
75%,1.04,62.5,59.0,5324.0,6.54,6.54,4.04,1.006944
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8,1.615572


In [10]:
df.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z', 'xy'],
      dtype='object')

In [11]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,xy
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,0.992462
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,1.013021
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,0.995086
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63,0.992908
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,0.997701


In [12]:
# Select only columns with numerical data types
numerical_data = df.select_dtypes(include=['float64', 'int64'])

# Calculate correlation matrix for numerical data
corr_matrix = numerical_data.corr()


## Functions

In [13]:
# Creating a heatmap function for future use
def corr_heatmap(corr_matrix):
    """"Convient way to initiate a detailed correlation matrix"""


    plt.figure(figsize=(14,12))
    sns.heatmap(corr_matrix,
                annot=True,
                linewidths=0.5,
                cmap='seismic',
                cbar=True,
                square=True,
                center=0)
    plt.show()
   


In [14]:
def train_model(X, y):
    """Input 'X' and 'y' to build and train model"""

    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test

# Notes about data set
### Three C's
- Cut
- Color
- Clarity
### Other important metrics
- Depth
- Table: top part or flato part
- Price (Target Variable): self explanitor
- x, y, z (Stone Deminsions) **high correlation**
    - 

### It is usually reccomended to drop highly correlated columns / features due
- Think of the correlation between Sales volume & Units volume

In [15]:
df['xy'] = df['x']/df['y'] # This creates 7 rows of NaN values that will need to be dropped
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,xy
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,0.992462
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,1.013021
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,0.995086
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63,0.992908
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,0.997701


In [16]:
df = df.dropna(axis=0)

In [17]:
# Encode categorical variables into machine readable values
d_df = pd.get_dummies(df)
X = d_df.drop(['price', 'x', 'y', 'z'], axis=1) # Dropping target variable & highly correlated columns
y = d_df['price'] # Target variable

In [18]:
# corr_heatmap(d_df.corr())

In [19]:
d_df.head() # True & False = 1 & 0

Unnamed: 0,carat,depth,table,price,x,y,z,xy,cut_Fair,cut_Good,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.23,61.5,55.0,326,3.95,3.98,2.43,0.992462,False,False,...,False,False,False,False,False,True,False,False,False,False
1,0.21,59.8,61.0,326,3.89,3.84,2.31,1.013021,False,False,...,False,False,False,False,True,False,False,False,False,False
2,0.23,56.9,65.0,327,4.05,4.07,2.31,0.995086,False,True,...,False,False,False,False,False,False,True,False,False,False
3,0.29,62.4,58.0,334,4.2,4.23,2.63,0.992908,False,False,...,True,False,False,False,False,False,False,True,False,False
4,0.31,63.3,58.0,335,4.34,4.35,2.75,0.997701,False,True,...,False,True,False,False,False,True,False,False,False,False


In [20]:
# 'Scaler' is a sklearn preproscessing module that normalizes the features of our data.
# 'Normalization' in this context means ajusting the features so that they have a mean of '0' and standard deviation of 1
s = StandardScaler()
X = s.fit_transform(X)

In [21]:
X_train, X_test, y_train, y_test = train_model(X, y)

In [22]:
# Calculating a baseline
ypred_null = y_train.mean()

## K-Neighbours Regressor

In [23]:
knn = KNeighborsRegressor(n_neighbors=7)
knn.fit(X_train, y_train)

In [24]:
y_pred = knn.predict(X_test)

## Random Forrest Regression

In [25]:
rf = RandomForestRegressor(n_estimators=10)
rf.fit(X_train, y_train)

In [26]:
y_pred2 = rf.predict(X_test)

## Linear Regression

In [27]:
lin = LinearRegression()
lin = lin.fit(X_train, y_train)

In [28]:
y_pred3 = lin.predict(X_test)

## Lasso Regression

In [29]:
lasso = Lasso()
lasso.fit(X_train, y_train)

In [30]:
y_pred4 = lasso.predict(X_test)

In [31]:
# Creating a data frame to hold results
models_eval = pd.DataFrame(index=['KNN', 'MLR'], columns=['RMSE'])

models_eval.loc['KNN', 'RMSE'] = np.sqrt(mean_squared_error(y_test, y_pred))
models_eval.loc['RF', 'RMSE'] = np.sqrt(mean_squared_error(y_test, y_pred2))
models_eval.loc['MLR', 'RMSE'] = np.sqrt(mean_squared_error(y_test, y_pred3))
models_eval.loc['Lasso', 'RMSE'] = np.sqrt(mean_squared_error(y_test, y_pred4))
models_eval.loc['Null', 'RMSE'] = ypred_null

In [32]:
models_eval

Unnamed: 0,RMSE
KNN,1170.659452
MLR,1120.830488
RF,550.491166
Lasso,1120.723179
Null,3942.168776


# Model Evaluation
The random forest regressor model had the lowest 'RMSE' score therby making it the most effective model when trying to predict a diamond's selling price based of the variables trained on. We will use the RF model in our Power BI report.

In [34]:
# Save ML model to disk
import pickle

directory_path = r"C:\Users\conno\workspace\projects\Diamond_Price_Prediction\Resources"
file_name = 'random_forest_model.pkl'
full_path = f"{directory_path}\\{file_name}"

with open(full_path, 'wb') as file:
    pickle.dump(knn, file)

In [None]:
# Saving the processed data as a csv
processed_data = 'processed_diamond_data'

df.to_csv(f"{directory_path}\\{processed_data}", index=False)