In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [3]:
# Point at the diamonds dataset (path is relative to the notebook) and load it
csv_path = Path('Resources/diamonds.csv')
df = pd.read_csv(filepath_or_buffer=csv_path)
# Preview the loaded frame; head(-1) requests every row except the last one
df.head(-1)

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...,...
53934,53935,0.72,Premium,D,SI1,62.7,59.0,2757,5.69,5.73,3.58
53935,53936,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,53937,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,53938,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56


In [4]:
# Remove the 'Unnamed: 0' column (presumably a row index that was saved into
# the CSV -- its values run 1, 2, 3, ... next to the DataFrame index).
# errors='ignore' keeps this cell idempotent: because df is rebound from
# itself, re-running the cell without reloading the CSV would otherwise raise
# KeyError once the column is already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')
df.head(-1)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53934,0.72,Premium,D,SI1,62.7,59.0,2757,5.69,5.73,3.58
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56


In [5]:
# List the distinct cut labels present in the data
df["cut"].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [6]:
# Cut dictionary: maps each cut label to an integer code, numbered from 1 in
# the order the labels were reported by df.cut.unique()
cut_num = {
    label: code
    for code, label in enumerate(
        ['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'],
        start=1,
    )
}

In [7]:
# Add a cut_num column holding the integer code of each row's cut label.
# The dictionary lookup is used directly as the mapper, so a label that is
# missing from cut_num still raises KeyError.
df["cut_num"] = df["cut"].map(cut_num.__getitem__)
df.head(-1)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,cut_num
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,1
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,2
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,3
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,2
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,3
...,...,...,...,...,...,...,...,...,...,...,...
53934,0.72,Premium,D,SI1,62.7,59.0,2757,5.69,5.73,3.58,2
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,1
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,3
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,4


In [8]:
# Dropping the text cut column now that cut_num carries the same information.
# A rebinding drop replaces the in-place one so the frame object displayed by
# earlier cells is not mutated, and errors="ignore" lets this cell be re-run
# without a KeyError once the column has already been removed.
df = df.drop(columns=["cut"], errors="ignore")
df.head(-1)

Unnamed: 0,carat,color,clarity,depth,table,price,x,y,z,cut_num
0,0.23,E,SI2,61.5,55.0,326,3.95,3.98,2.43,1
1,0.21,E,SI1,59.8,61.0,326,3.89,3.84,2.31,2
2,0.23,E,VS1,56.9,65.0,327,4.05,4.07,2.31,3
3,0.29,I,VS2,62.4,58.0,334,4.20,4.23,2.63,2
4,0.31,J,SI2,63.3,58.0,335,4.34,4.35,2.75,3
...,...,...,...,...,...,...,...,...,...,...
53934,0.72,D,SI1,62.7,59.0,2757,5.69,5.73,3.58,2
53935,0.72,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,1
53936,0.72,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,3
53937,0.70,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,4


In [9]:
# One-hot encode the remaining text columns: color and clarity are each
# replaced by one indicator column per distinct value (color_D, clarity_SI1, ...)
df_enc = pd.get_dummies(
    data=df,
    columns=['color', 'clarity'],
)
df_enc.head(-1)

Unnamed: 0,carat,depth,table,price,x,y,z,cut_num,color_D,color_E,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.23,61.5,55.0,326,3.95,3.98,2.43,1,0,1,...,0,0,0,0,0,1,0,0,0,0
1,0.21,59.8,61.0,326,3.89,3.84,2.31,2,0,1,...,0,0,0,0,1,0,0,0,0,0
2,0.23,56.9,65.0,327,4.05,4.07,2.31,3,0,1,...,0,0,0,0,0,0,1,0,0,0
3,0.29,62.4,58.0,334,4.20,4.23,2.63,2,0,0,...,1,0,0,0,0,0,0,1,0,0
4,0.31,63.3,58.0,335,4.34,4.35,2.75,3,0,0,...,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53934,0.72,62.7,59.0,2757,5.69,5.73,3.58,2,1,0,...,0,0,0,0,1,0,0,0,0,0
53935,0.72,60.8,57.0,2757,5.75,5.76,3.50,1,1,0,...,0,0,0,0,1,0,0,0,0,0
53936,0.72,63.1,55.0,2757,5.69,5.75,3.61,3,1,0,...,0,0,0,0,1,0,0,0,0,0
53937,0.70,62.8,60.0,2757,5.66,5.68,3.56,4,1,0,...,0,0,0,0,1,0,0,0,0,0


In [10]:
# Separate the encoded frame into the label to predict and the model inputs

# Target: the integer-encoded cut
y = df_enc["cut_num"]

# Features: every other column of the encoded frame
X = df_enc.drop(columns=["cut_num"])

In [11]:
# Check the balance of our target values: number of rows per encoded cut class
y.value_counts()

1    21551
2    13791
4    12082
3     4906
5     1610
Name: cut_num, dtype: int64

In [12]:
# Partition features and target into training and testing subsets.
# No test_size is passed, so train_test_split's default proportion applies;
# random_state pins the shuffle so the split is repeatable.
# NOTE(review): the class counts of y are uneven -- consider stratify=y so both
# subsets keep the same class mix (not applied here: it would change the
# reported metrics).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [13]:
# Creating StandardScaler instance (still unfitted at this point)
scaler = StandardScaler()

In [14]:
# Fitting the StandardScaler on the training features only; the result is
# kept as X_scaler, the name used to transform the data afterwards
X_scaler = scaler.fit(X_train)

In [15]:
# Apply the fitted scaler to the training split and then the testing split
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [16]:
# Build the (unfitted) random forest classifier: 500 trees, fixed random seed
rf_model = RandomForestClassifier(
    n_estimators=500,
    random_state=7,
)

In [17]:
# Fit the model on the scaled training features and the training labels;
# the result of fit() is bound back to rf_model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [18]:
# Making predictions using the scaled testing data
predictions = rf_model.predict(X_test_scaled)

In [21]:
# Calculating the confusion matrix.
# The class labels are derived from the fitted model and the test labels
# instead of being hard-coded as five names. The same label list is passed to
# confusion_matrix and used to name the rows/columns, so the matrix shape and
# the names always agree -- even if a class is missing from this particular
# test split or the set of classes changes.
class_labels = np.union1d(rf_model.classes_, y_test)
cm = confusion_matrix(y_test, predictions, labels=class_labels)
cm_df = pd.DataFrame(
    cm,
    # Rows are the true classes, columns the predicted classes
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [22]:
# Displaying results: the confusion matrix table, the overall accuracy,
# then the per-class classification report
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(
    "Classification Report",
    classification_report(y_test, predictions),
    sep="\n",
)

Confusion Matrix


Unnamed: 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4,Predicted 5
Actual 1,4943,226,6,190,4
Actual 2,366,2824,20,237,0
Actual 3,30,78,813,299,40
Actual 4,675,706,180,1420,1
Actual 5,4,8,47,10,358


Accuracy Score : 0.7681127178346311
Classification Report
              precision    recall  f1-score   support

           1       0.82      0.92      0.87      5369
           2       0.74      0.82      0.77      3447
           3       0.76      0.65      0.70      1260
           4       0.66      0.48      0.55      2982
           5       0.89      0.84      0.86       427

    accuracy                           0.77     13485
   macro avg       0.77      0.74      0.75     13485
weighted avg       0.76      0.77      0.76     13485

