In [57]:
## Importing necessary libraries ##
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [58]:
data = pd.read_csv("/config/workspace/Fariz_Gemstone_Prediction/data/gemstone.csv")
data.head(3)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772


In [59]:
data = data.drop(labels=["id"],axis=1)

In [60]:
# Seggregating dependent and independent features
X = data.drop(labels=["price"], axis=1)
y = data[["price"]]

In [61]:
y

Unnamed: 0,price
0,13619
1,13387
2,2772
3,666
4,14453
...,...
193568,1130
193569,2874
193570,3036
193571,681


In [62]:
X

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,0.70,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.50
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,1.70,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77
...,...,...,...,...,...,...,...,...,...
193568,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67
193569,0.70,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47
193570,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62
193571,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81


In [63]:
categorical_columns = X.select_dtypes(include="object").columns
numerical_columns = X.select_dtypes(exclude="object").columns

In [64]:
categorical_columns

Index(['cut', 'color', 'clarity'], dtype='object')

In [65]:
numerical_columns

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [66]:
cut_categories = list(data["cut"].unique())

In [67]:
cut_categories

['Premium', 'Very Good', 'Ideal', 'Good', 'Fair']

In [68]:
color_categories=list(data["color"].unique())
# color_categories = list(data["color"].unique())

In [69]:
color_categories

['F', 'J', 'G', 'E', 'D', 'H', 'I']

In [70]:
clarity_categories = list(data["clarity"].unique())

In [71]:
clarity_categories

['VS2', 'SI2', 'VS1', 'SI1', 'IF', 'VVS2', 'VVS1', 'I1']

In [72]:
type(color_categories)

list

In [73]:
# Define the custom ranking for each ordinal variable
#cut_categories = ['Fair', 'Good', 'Very Good','Premium','Ideal']
#color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
#clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

# Automating EDA

In [74]:
from sklearn.impute import SimpleImputer ## Handling Missing Values
from sklearn.preprocessing import StandardScaler # Handling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
## Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

## Numerical Pipeline

In [75]:
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")), # We chose median incase of presence of outliers.
        ("scaler", StandardScaler())
        ])

## Categorical Pipeline

In [76]:
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ordinal_encoder",OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
        ("scaler", StandardScaler())
    ]
)

## Final Pipeline

In [77]:
## Combining pipelines for both numerical as well as categorical features using column transformer.
pre_processor = ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_columns),
    ('cat_pipeline',categorical_pipeline,categorical_columns)
])

## Train test split

In [78]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=20)

In [79]:
X_train

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
94263,1.18,Ideal,H,SI1,62.1,55.0,6.79,6.74,4.21
73052,0.70,Ideal,H,VVS1,61.4,57.0,5.73,5.76,3.52
117658,1.50,Very Good,I,SI2,62.7,57.0,7.24,7.30,4.55
105104,1.25,Premium,H,SI2,62.1,57.0,6.88,6.83,4.24
165397,0.34,Premium,I,VVS2,60.3,59.0,4.56,4.52,2.74
...,...,...,...,...,...,...,...,...,...
178569,1.00,Ideal,H,VS1,61.5,56.0,6.48,6.44,3.97
31962,1.00,Good,E,SI2,63.6,59.0,6.29,6.32,4.01
23775,0.51,Premium,E,VS1,61.3,57.0,5.16,5.07,3.14
37135,0.70,Ideal,F,SI2,62.0,57.0,5.71,5.65,3.52


In [80]:
X_test

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
108124,0.70,Very Good,F,VS2,62.4,59.0,5.61,5.65,3.51
84143,0.78,Good,D,SI2,63.8,55.0,5.87,5.84,3.73
5293,0.60,Very Good,G,VS2,60.4,63.0,5.42,5.48,3.29
164452,0.33,Ideal,F,VVS1,62.1,56.0,4.46,4.49,2.78
18447,0.31,Ideal,G,VS1,61.1,56.0,4.43,4.39,2.68
...,...,...,...,...,...,...,...,...,...
100546,1.11,Premium,J,SI2,62.0,58.0,6.66,6.67,4.13
75003,0.40,Ideal,G,SI1,61.4,55.0,4.77,4.74,2.93
28887,0.70,Premium,F,SI1,61.6,57.0,5.70,5.66,3.50
24792,0.31,Ideal,I,VS1,62.3,55.0,4.34,4.37,2.71


In [81]:
X_train['color'].value_counts()

color
G    31043
E    24976
F    23967
H    21669
D    17004
I    12339
J     4503
Name: count, dtype: int64

In [82]:
X_test['color'].value_counts()

color
G    13348
E    10893
F    10291
H     9130
D     7282
I     5175
J     1953
Name: count, dtype: int64

In [83]:
X_train_df = pd.DataFrame(pre_processor.fit_transform(X_train))
X_test_df = pd.DataFrame(pre_processor.transform(X_test))

In [84]:
X_train_new=pd.DataFrame(pre_processor.fit_transform(X_train),columns=pre_processor.get_feature_names_out())
X_test_new=pd.DataFrame(pre_processor.transform(X_test),columns=pre_processor.get_feature_names_out())

In [85]:
X_test_df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-0.197641,0.535805,0.922132,-0.096711,-0.065432,-0.03696,-0.383443,-1.540071,-1.204067
1,-0.024795,1.829627,-1.1623,0.137635,0.106901,0.281831,1.690065,0.589807,-0.6426
2,-0.413698,-1.312512,3.006564,-0.267963,-0.219624,-0.355751,-0.383443,-0.475132,-1.204067


In [86]:
X_test_new.head(3)

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.197641,0.535805,0.922132,-0.096711,-0.065432,-0.03696,-0.383443,-1.540071,-1.204067
1,-0.024795,1.829627,-1.1623,0.137635,0.106901,0.281831,1.690065,0.589807,-0.6426
2,-0.413698,-1.312512,3.006564,-0.267963,-0.219624,-0.355751,-0.383443,-0.475132,-1.204067
