In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, RobustScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

import pickle

In [3]:
# Load the raw diamonds dataset from a CSV file in the working directory.
df_raw = pd.read_csv("diamonds.csv")
# Display the frame (pandas truncates the rendering to the first and last rows).
df_raw

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...,...
53935,53936,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,53937,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,53938,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,53939,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [4]:
# List the column labels to spot columns that should not be used as features.
df_raw.columns

Index(['Unnamed: 0', 'carat', 'cut', 'color', 'clarity', 'depth', 'table',
       'price', 'x', 'y', 'z'],
      dtype='object')

In [5]:
# Drop the leftover 'Unnamed: 0' column that came in with the CSV.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after the column is already gone
# is a no-op rather than a KeyError.
df_raw = df_raw.drop(columns='Unnamed: 0', errors='ignore')

In [6]:
# Print dtypes, non-null counts and memory usage for every column.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [7]:
# Summary statistics for every column whose dtype is not 'object'.
# NOTE(review): the rendered output shows a minimum of 0.0 for x, y and z and
# maxima far above the 75th percentile for y and z; these look like invalid or
# outlier measurements and are not filtered anywhere in the visible cells —
# confirm whether they should be cleaned before modelling.
df_raw.describe(exclude='object')

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [8]:
# Summary (count, unique, top, freq) for the object-dtype columns only.
df_raw.describe(include='object')

Unnamed: 0,cut,color,clarity
count,53940,53940,53940
unique,5,7,8
top,Ideal,G,SI1
freq,21551,11292,13065


In [9]:
# Partition the column labels by dtype: object columns are treated as
# categorical, everything else as numerical (this still includes the target).
categorical_cols = list(df_raw.select_dtypes(include='object').columns)
numerical_cols = list(df_raw.select_dtypes(exclude='object').columns)

print("Categorical columns:", categorical_cols)
print("Numerical columns:", numerical_cols)

Categorical columns: ['cut', 'color', 'clarity']
Numerical columns: ['carat', 'depth', 'table', 'price', 'x', 'y', 'z']


In [10]:
# Count missing values per column.
df_raw.isna().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [11]:
# Report how many rows are exact duplicates of an earlier row.
print(df_raw.duplicated().sum())
# Keep the first occurrence of each row; df_raw itself is left untouched.
df_cleaned = df_raw.drop_duplicates()

146


In [12]:
# Print the frequency of every level for each categorical column.
for col_name in categorical_cols:
    level_counts = df_cleaned[col_name].value_counts()
    print(level_counts)

cut
Ideal        21488
Premium      13748
Very Good    12069
Good          4891
Fair          1598
Name: count, dtype: int64
color
G    11262
E     9776
F     9520
H     8272
D     6755
I     5407
J     2802
Name: count, dtype: int64
clarity
SI1     13032
VS2     12229
SI2      9150
VS1      8156
VVS2     5056
VVS1     3647
IF       1784
I1        740
Name: count, dtype: int64


In [13]:
# Target: the price column.
y = df_cleaned['price']
# Features: every remaining column.
X = df_cleaned.drop(columns='price')

In [14]:
# Exclude the target from the list of numeric feature columns.
# Rebuilding the list (rather than calling list.remove) keeps the cell
# re-runnable: a second execution is a no-op instead of raising ValueError.
numerical_cols = [col for col in numerical_cols if col != 'price']

In [15]:
# Hold out 20% of the rows for testing; the fixed seed makes the shuffled
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

In [16]:
# Fit the scaler on the training features only, then apply the same fitted
# transform to both splits so no test-set statistics leak into training.
rb_scaler = RobustScaler().fit(X_train[numerical_cols])
X_train[numerical_cols] = rb_scaler.transform(X_train[numerical_cols])
X_test[numerical_cols] = rb_scaler.transform(X_test[numerical_cols])

In [17]:
# Explicit level orderings for the three categorical columns. They are passed
# to OrdinalEncoder below, so a level's position in its list becomes its
# integer code (first entry -> 0).
# NOTE(review): presumably each list runs from lowest to highest quality
# grade — confirm against the dataset documentation.
cut_order = ["Fair", "Good", "Very Good", "Premium", "Ideal"]
color_order = ["J", "I", "H", "G", "F", "E", "D"]
clarity_order = ["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"]

In [18]:
# Encode the categorical columns as ordered integer codes.
# The `categories` entries are matched to columns by position, so they must be
# listed in the same order as `categorical_cols` (cut, color, clarity).
ordinal_encoder = OrdinalEncoder(
    categories=[cut_order, color_order, clarity_order],
).fit(X_train[categorical_cols])

X_train[categorical_cols] = ordinal_encoder.transform(X_train[categorical_cols])
X_test[categorical_cols] = ordinal_encoder.transform(X_test[categorical_cols])

In [19]:
# Preview the first transformed training rows; the index is reset only for
# this display (the result is not assigned back to X_train).
X_train.head().reset_index(drop=True)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,-0.609375,3.0,6.0,1.0,0.2,0.666667,-0.737705,-0.773481,-0.741071
1,-0.3125,2.0,0.0,3.0,0.933333,1.0,-0.349727,-0.38674,-0.303571
2,-0.609375,2.0,5.0,2.0,-1.133333,1.333333,-0.737705,-0.723757,-0.794643
3,-0.65625,2.0,1.0,4.0,-0.6,0.0,-0.808743,-0.80663,-0.839286
4,-0.5,4.0,2.0,2.0,-0.4,-0.633333,-0.551913,-0.541436,-0.571429


In [20]:
# Train Random Forest
# Fit a Random Forest regressor with default hyperparameters (seeded for
# reproducibility) and predict prices for the held-out test set.
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)
rf_pred = rf.predict(X_test)

In [21]:
# Train XGBoost
# Fit an XGBoost regressor (seeded, logging silenced) and predict prices for
# the held-out test set.
xgb = XGBRegressor(random_state=42, verbosity=0).fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)

In [24]:
# Evaluate Random Forest
# Score the Random Forest predictions against the test targets (R^2).
rf_r2 = r2_score(y_true=y_test, y_pred=rf_pred)
print(f"Random Forest R2: {rf_r2}")

Random Forest R2: 0.9823118802397329


In [23]:
# Evaluate XGBoost
# Score the XGBoost predictions against the test targets (R^2).
xgb_r2 = r2_score(y_true=y_test, y_pred=xgb_pred)
print(f"XGBoost R2: {xgb_r2}")

XGBoost R2: 0.9824733138084412


In [26]:
# Simpan model Random Forest
# Persist everything needed to reproduce predictions later, one pickle file
# per artifact, written in the order listed.
artifacts_to_save = (
    # Trained Random Forest model
    ("random_forest_model.pkl", rf),
    # Trained XGBoost model
    ("xgboost_model.pkl", xgb),
    # Fitted ordinal encoder for the categorical columns
    ("ordinal_encoder.pkl", ordinal_encoder),
    # Fitted scaler for the numerical columns
    ("robust_scaler.pkl", rb_scaler),
    # Feature list (column order matters when predicting later)
    ("feature_columns.pkl", X_train.columns.tolist()),
)

for artifact_path, artifact in artifacts_to_save:
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)
