In [19]:
#importing libs
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from IPython.display import clear_output

# model
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Work with data

In [2]:
# loading data
df_train = pd.read_csv("briliants_train.csv", index_col=0)
df_test = pd.read_csv("briliants_test.csv", index_col=0)

df = pd.concat([df_train,df_test])

In [3]:
df.describe()

Unnamed: 0,carat,depth,table,x,y,z,price
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,40455.0
mean,0.79794,61.749405,57.457184,5.731157,5.734526,3.538734,3942.538302
std,0.474011,1.432621,2.234491,1.121761,1.142135,0.705699,3997.986695
min,0.2,43.0,43.0,0.0,0.0,0.0,326.0
25%,0.4,61.0,56.0,4.71,4.72,2.91,956.0
50%,0.7,61.8,57.0,5.7,5.71,3.53,2400.0
75%,1.04,62.5,59.0,6.54,6.54,4.04,5357.5
max,5.01,79.0,95.0,10.74,58.9,31.8,18818.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53940 entries, 0 to 13484
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   x        53940 non-null  float64
 7   y        53940 non-null  float64
 8   z        53940 non-null  float64
 9   price    40455 non-null  float64
dtypes: float64(7), object(3)
memory usage: 4.5+ MB


In [5]:
df

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.25,Good,E,VVS2,64.9,58.0,3.95,3.97,2.57,476.0
1,0.84,Ideal,J,SI1,61.8,56.0,6.04,6.07,3.74,2493.0
2,1.05,Premium,J,VS2,61.1,58.0,6.56,6.51,3.99,4145.0
3,1.02,Ideal,F,SI2,60.7,56.0,6.53,6.50,3.95,4541.0
4,0.61,Ideal,F,VS1,61.8,57.0,5.43,5.47,3.37,2283.0
...,...,...,...,...,...,...,...,...,...,...
13480,0.56,Very Good,J,SI1,60.3,59.0,5.30,5.34,3.21,
13481,1.63,Good,F,SI1,59.8,56.0,7.57,7.64,4.55,
13482,1.21,Ideal,E,VS2,62.4,54.0,6.86,6.80,4.26,
13483,1.01,Premium,F,SI2,62.2,59.0,6.41,6.45,4.00,


In [6]:
# make a copy to not remove the data
df_encoded = df.copy()

In [7]:
# # categorical features: cut, color, clarity
# label_encoder = LabelEncoder()

# #cut
# df_encoded['cut'] = label_encoder.fit_transform(df['cut'])
# print('Уникальные категории cut:', dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))

# #clarity
# df_encoded['clarity'] = label_encoder.fit_transform(df['clarity'])
# print('Уникальные категории clarity:', dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))

In [8]:
# categorical features: cut, color, clarity
#cut
label_encoder = LabelEncoder()
desired_order_cut = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']

df_encoded['cut'] = df_encoded['cut'].astype('category')
df_encoded['cut'] = df_encoded['cut'].cat.reorder_categories(desired_order_cut, ordered=True)
df_encoded['cut'] = df_encoded['cut'].cat.codes

In [9]:
#clarity
label_encoder = LabelEncoder()
desired_order_clarity = ['I1', 'SI1', 'SI2', 'VS1', 'VS2', 'VVS1', 'VVS2', 'IF']

df_encoded['clarity'] = df_encoded['clarity'].astype('category')
df_encoded['clarity'] = df_encoded['clarity'].cat.reorder_categories(desired_order_clarity, ordered=True)
df_encoded['clarity'] = df_encoded['clarity'].cat.codes

In [10]:
#color (we cannot ranl it, therefore make onecode-method)
df_encoded = pd.get_dummies(df_encoded, columns=['color'])

In [11]:
#final data
df_encoded

Unnamed: 0,carat,cut,clarity,depth,table,x,y,z,price,color_D,color_E,color_F,color_G,color_H,color_I,color_J
0,0.25,1,6,64.9,58.0,3.95,3.97,2.57,476.0,0,1,0,0,0,0,0
1,0.84,4,1,61.8,56.0,6.04,6.07,3.74,2493.0,0,0,0,0,0,0,1
2,1.05,3,4,61.1,58.0,6.56,6.51,3.99,4145.0,0,0,0,0,0,0,1
3,1.02,4,2,60.7,56.0,6.53,6.50,3.95,4541.0,0,0,1,0,0,0,0
4,0.61,4,3,61.8,57.0,5.43,5.47,3.37,2283.0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13480,0.56,2,1,60.3,59.0,5.30,5.34,3.21,,0,0,0,0,0,0,1
13481,1.63,1,1,59.8,56.0,7.57,7.64,4.55,,0,0,1,0,0,0,0
13482,1.21,4,4,62.4,54.0,6.86,6.80,4.26,,0,1,0,0,0,0,0
13483,1.01,3,2,62.2,59.0,6.41,6.45,4.00,,0,0,1,0,0,0,0


# Model

In [12]:
# make the sample
X_train = df_encoded[~df_encoded.price.isnull()].drop('price', axis=1) # features
X_test =  df_encoded[df_encoded.price.isnull()].drop('price', axis=1)
y_train = df_encoded[~df_encoded.price.isnull()]['price']  # target

X_train_tr, X_test_tr, y_train_tr, y_test_tr = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [13]:
model = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=6, random_state=42)
model.fit(X_train_tr, y_train_tr)

predictions = model.predict(X_test_tr)
mse = mean_squared_error(y_test_tr, predictions)
print(f"RMSE: {np.sqrt(mse)}")

RMSE: 548.136890896518


In [14]:
#find params

# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'learning_rate': [0.1, 0.2, 1],
#     'max_depth': [3, 4, 5, 6, 8]
# }

# grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error', verbose=10)
# grid_search.fit(X_train, y_train)
# clear_output()

# print("Наилучшие параметры:", grid_search.best_params_)
# print("Лучшее значение MSE:", -grid_search.best_score_)

# Валидация (Эрдэни)

In [15]:
#prediction for test
model = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=6, random_state=42)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

np.savetxt('Yana_Ivanova_test_1.csv', predictions, delimiter=',', fmt='%d')

In [16]:
len(predictions)

13485

In [17]:
df_true = pd.read_csv("briliants_y_true.csv")
y_true = df_true['price'].values
y_true

array([ 559, 2201, 1238, ..., 9215, 4416, 3564])

In [18]:
mean_squared_error(y_true, predictions, squared=False)

541.2797836565446

In [20]:
mean_absolute_error(y_true, predictions)

275.31351266811856