In [358]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os
from copy import deepcopy
from functools import partial
from itertools import combinations

# Import sklearn classes for model selection, cross validation, and performance evaluation
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score, accuracy_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from category_encoders import OneHotEncoder, OrdinalEncoder, CountEncoder

# Import libraries for Hypertuning
import optuna
# Import libraries for gradient boosting
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoost, CatBoostRegressor, CatBoostClassifier
from catboost import Pool
from hyperopt import STATUS_OK,Trials, fmin,hp,tpe

# Suppress warnings
import warnings
warnings.filterwarnings("ignore", category=UserWarning)


In [357]:
pip install hyperopt

Collecting hyperopt
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
     ---------------------------------------- 1.6/1.6 MB 2.3 MB/s eta 0:00:00
Collecting py4j
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
     -------------------------------------- 200.5/200.5 kB 2.0 MB/s eta 0:00:00
Installing collected packages: py4j, hyperopt
Successfully installed hyperopt-0.2.7 py4j-0.10.9.7
Note: you may need to restart the kernel to use updated packages.


In [317]:
df_train = pd.read_csv('train.csv',index_col='id')
df_train['is_original']= False
df_test = pd.read_csv('test.csv',index_col='id')
df_test['is_original']= False

In [320]:
original = pd.read_csv('original_dataset.csv',index_col=[0])

# depth has missing values
original = original[-original.depth.isna()]
df_train = pd.concat([df_train, original]).drop_duplicates()

In [321]:
def process(df):
    df['cut'] = df['cut'].apply(lambda x: cut_dic[x])
    df['color'] = df['color'].apply(lambda x:color_dic[x])
    df['clarity'] = df['clarity'].apply(lambda x:clarity_dic[x])
    df["volume"] = df["x"] * df["y"] * df["z"]
    df["surface_area"] = 2 * (df["x"] * df["y"] + df["y"] * df["z"] + df["z"] * df["x"])
    df["aspect_ratio_xy"] = df["x"] / df["y"]
    df["aspect_ratio_yz"] = df["y"] / df["z"]
    df["aspect_ratio_zx"] = df["z"] / df["x"]
    df["diagonal_distance"] = np.sqrt(df["x"] ** 2 + df["y"] ** 2 + df["z"] ** 2)
    df["relative_height"] = (df["z"] - df["z"].min()) / (df["z"].max() - df["z"].min())
    df["relative_position"] = (df["x"] + df["y"] + df["z"]) / (df["x"] + df["y"] + df["z"]).sum()
    df["volume_ratio"] = df["x"] * df["y"] * df["z"] / (df["x"].mean() * df["y"].mean() * df["z"].mean())
    df["length_ratio"] = df["x"] / df["x"].mean()
    df["width_ratio"] = df["y"] / df["y"].mean()
    df["height_ratio"] = df["z"] / df["z"].mean()
    df["sphericity"] = 1.4641 * (6 * df["volume"])**(2/3) / df["surface_area"]
    df["compactness"] = df["volume"]**(1/3) / df["x"]
    return df

cut_dic = {'Fair':0,'Good':1,'Very Good':2,'Premium':3,'Ideal':4}
color_dic = {'D':6,'E':5,'F':4,'G':3,'H':2,'I':1,'J':0}
clarity_dic = {'FL':10, 'IF':9, 'VVS1':8, 'VVS2':7, 'VS1':6, 'VS2':5, 'SI1':4, 'SI2':3, 'I1':2, 'I2':1, 'I3':0}
train_df = process(df_train)
test_df = process(df_test)

In [339]:
train_df = train_df.drop(columns = "is_original")

In [453]:
X_train = X_train.fillna(0)
Y = train_df.price

In [384]:
X_train = train_df.drop(columns = "price",axis = 1)

In [417]:
Y_train  = train_df.price

In [451]:
Y_train.ndim

1

In [456]:
x_train,x_test,y_train,y_test = train_test_split(X_train,Y,test_size=0.2, random_state= 23)

In [457]:
y_train

32709      919
187440     630
16674     2677
74886      525
57911     4390
          ... 
9704       739
76726     4739
9256      2415
127718     631
107091     684
Name: price, Length: 175847, dtype: int64

In [381]:
test_df.replace([np.inf, -np.inf], np.nan, inplace=True)

In [382]:
test_df = test_df.fillna(0)

In [431]:
x_train.shape

(175847, 23)

In [458]:
from sklearn.ensemble import VotingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [459]:
lgbm_model = LGBMRegressor()
xgb_model = XGBRegressor()
cat_model = CatBoostRegressor()

from sklearn.ensemble import VotingRegressor
# Define the ensemble model
# Create ensemble model
ensemble_model = VotingRegressor(estimators=[('lgbm', lgbm_model), ('xgb', xgb_model), ('cat', cat_model)], weights=[1, 1, 1])
# Fit ensemble model on training data
ensemble_model.fit(x_train, y_train)

  'float64': 'float',


Learning rate set to 0.092664
0:	learn: 3691.0422672	total: 158ms	remaining: 2m 37s
1:	learn: 3381.6316650	total: 171ms	remaining: 1m 25s
2:	learn: 3108.2153408	total: 185ms	remaining: 1m 1s
3:	learn: 2855.1942576	total: 198ms	remaining: 49.2s
4:	learn: 2627.7141005	total: 212ms	remaining: 42.2s
5:	learn: 2428.9279321	total: 224ms	remaining: 37.2s
6:	learn: 2243.6350863	total: 236ms	remaining: 33.4s
7:	learn: 2069.8154103	total: 247ms	remaining: 30.6s
8:	learn: 1915.4887167	total: 258ms	remaining: 28.4s
9:	learn: 1779.0255458	total: 268ms	remaining: 26.5s
10:	learn: 1655.7636704	total: 279ms	remaining: 25s
11:	learn: 1544.4330068	total: 289ms	remaining: 23.8s
12:	learn: 1445.7382394	total: 300ms	remaining: 22.8s
13:	learn: 1353.2507563	total: 311ms	remaining: 21.9s
14:	learn: 1274.6921865	total: 322ms	remaining: 21.1s
15:	learn: 1203.2528010	total: 333ms	remaining: 20.5s
16:	learn: 1139.8072844	total: 344ms	remaining: 19.9s
17:	learn: 1083.7568625	total: 354ms	remaining: 19.3s
18:	lear

VotingRegressor(estimators=[('lgbm', LGBMRegressor()),
                            ('xgb',
                             XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None,
                                          enable_categorical=False, gamma=None,
                                          gpu_id=None, importance_type=None,
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=nan,
                                          monotone_constraints=None,
                                          n_estimators=100, n_jobs=None,
                                          num_para

In [460]:
y_pred = ensemble_model.predict(x_test)

In [461]:
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# y_true and y_pred are the true and predicted values, respectively
# Let's assume they are numpy arrays of shape (n_samples,)

# Calculate R2 score
r2 = r2_score(y_test, y_pred)
print('R2 Score:', r2)

# Calculate MSE
mse = mean_squared_error(y_test, y_pred)
print('MSE:', mse)

# Calculate RMSE
rmse = np.sqrt(mse)
print('RMSE:', rmse)

R2 Score: 0.9794832551230636
MSE: 334451.49497058574
RMSE: 578.3178148480174


In [462]:
preds =  ensemble_model.predict(test)
preds

array([1257.36851088, 2641.28830458, 2163.41190516, ..., 7027.24733257,
       6098.95793076, 4221.54572973])

In [485]:
sample = pd.read_csv("sample_submission.csv")

In [487]:
sample.price = preds

In [491]:
sample.to_csv('katilim.csv' ,index = False)