# Preprocessing + Encoding

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error

In [None]:
# Locations of the training and test CSV files.
train_csv = "/content/briliants_train.csv"
test_csv = "/content/briliants_test.csv"

# The first column of each file is used as the row index.
df = pd.read_csv(train_csv, index_col=0)
test = pd.read_csv(test_csv, index_col=0)

In [None]:
# Discard every training row that contains at least one missing value, then
# print the resulting schema and non-null counts.
df = df.dropna(axis=0, how="any")
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40455 entries, 0 to 40454
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    40455 non-null  float64
 1   cut      40455 non-null  object 
 2   color    40455 non-null  object 
 3   clarity  40455 non-null  object 
 4   depth    40455 non-null  float64
 5   table    40455 non-null  float64
 6   x        40455 non-null  float64
 7   y        40455 non-null  float64
 8   z        40455 non-null  float64
 9   price    40455 non-null  int64  
dtypes: float64(6), int64(1), object(3)
memory usage: 3.4+ MB


In [None]:
# Print the schema and non-null counts of the test frame for comparison with
# the training frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13485 entries, 0 to 13484
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    13485 non-null  float64
 1   cut      13485 non-null  object 
 2   color    13485 non-null  object 
 3   clarity  13485 non-null  object 
 4   depth    13485 non-null  float64
 5   table    13485 non-null  float64
 6   x        13485 non-null  float64
 7   y        13485 non-null  float64
 8   z        13485 non-null  float64
dtypes: float64(6), object(3)
memory usage: 1.0+ MB


In [None]:
# List the distinct labels of each categorical column before encoding them.
for column in ('cut', 'color', 'clarity'):
    print(df[column].unique())

['Good' 'Ideal' 'Premium' 'Fair' 'Very Good']
['E' 'J' 'F' 'G' 'D' 'H' 'I']
['VVS2' 'SI1' 'VS2' 'SI2' 'VS1' 'IF' 'VVS1' 'I1']


In [None]:
# One-hot encode the categorical columns in both frames.
categorical_columns = ['cut', 'color', 'clarity']
df = pd.get_dummies(df, columns=categorical_columns)
test = pd.get_dummies(test, columns=categorical_columns)

# Encoding the two frames independently yields different columns whenever a
# category is absent from one of them. Force the test frame onto the training
# feature columns (everything except the target) so a model fitted on `df`
# can score `test`; indicator columns missing from `test` are filled with 0.
test = test.reindex(columns=df.columns.drop('price'), fill_value=0)

In [None]:
# Print the training frame's schema again to check the columns produced by the
# one-hot encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40455 entries, 0 to 40454
Data columns (total 27 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   carat          40455 non-null  float64
 1   depth          40455 non-null  float64
 2   table          40455 non-null  float64
 3   x              40455 non-null  float64
 4   y              40455 non-null  float64
 5   z              40455 non-null  float64
 6   price          40455 non-null  int64  
 7   cut_Fair       40455 non-null  uint8  
 8   cut_Good       40455 non-null  uint8  
 9   cut_Ideal      40455 non-null  uint8  
 10  cut_Premium    40455 non-null  uint8  
 11  cut_Very Good  40455 non-null  uint8  
 12  color_D        40455 non-null  uint8  
 13  color_E        40455 non-null  uint8  
 14  color_F        40455 non-null  uint8  
 15  color_G        40455 non-null  uint8  
 16  color_H        40455 non-null  uint8  
 17  color_I        40455 non-null  uint8  
 18  color_

In [None]:
# Feature matrix: the numeric measurements followed by the one-hot indicator
# columns for cut, color and clarity. The target column `price` is left out.
feature_columns = [
    'carat', 'depth', 'table', 'x', 'y', 'z',
    'cut_Fair', 'cut_Good', 'cut_Ideal', 'cut_Premium', 'cut_Very Good',
    'color_D', 'color_E', 'color_F', 'color_G', 'color_H', 'color_I', 'color_J',
    'clarity_I1', 'clarity_IF', 'clarity_SI1', 'clarity_SI2',
    'clarity_VS1', 'clarity_VS2', 'clarity_VVS1', 'clarity_VVS2',
]
data_X = df[feature_columns]

In [None]:
# Target: `price`, selected with double brackets so it stays a one-column
# DataFrame rather than a Series.
data_Y = df[['price']]

In [None]:
# Hold out 10% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data_X,
    data_Y,
    test_size=0.1,
    random_state=42,
)

# RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Baseline random forest with default hyperparameters. The seed is fixed (same
# value as the train/test split) so the reported score is reproducible; the
# one-column target frame is flattened to a 1-D array before fitting.
tree = RandomForestRegressor(random_state=42).fit(X_train, y_train.values.ravel())

In [None]:
# Predict prices for the hold-out features with the baseline random forest.
y_pred = tree.predict(X_test)

In [None]:
# RMSE of the random forest on the hold-out split. The square root is taken
# explicitly because the `squared=False` argument is deprecated since
# scikit-learn 1.4 and removed in 1.6.
mean_squared_error(y_test, y_pred) ** 0.5

554.2836672825694

# GradientBoostingRegressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Gradient boosting with default hyperparameters, fitted on the flattened
# (1-D) training target.
y_train_1d = y_train.values.ravel()
gradient = GradientBoostingRegressor().fit(X_train, y_train_1d)

In [None]:
# Predict prices for the hold-out features with the gradient boosting model.
y_pred1 = gradient.predict(X_test)

In [None]:
# RMSE of the gradient boosting model on the hold-out split. The square root
# is taken explicitly because the `squared=False` argument is deprecated since
# scikit-learn 1.4 and removed in 1.6.
mean_squared_error(y_test, y_pred1) ** 0.5

750.3405450025116

# LinearRegression


In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Ordinary least-squares baseline, fitted on the one-column target frame as is.
linmodel = LinearRegression().fit(
    X=X_train,
    y=y_train,
)

In [None]:
# Predict prices for the hold-out features with the linear model.
y_pred2 = linmodel.predict(X_test)

In [None]:
# RMSE of the linear model on the hold-out split. The square root is taken
# explicitly because the `squared=False` argument is deprecated since
# scikit-learn 1.4 and removed in 1.6.
mean_squared_error(y_test, y_pred2) ** 0.5

1131.1830849449223

# GridSearchCV for RandomForestRegressor

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter grid explored by the random-forest grid search.
# `max_features="auto"` was deprecated in scikit-learn 1.1 and removed in 1.3;
# for a forest regressor it meant "use all features", which is spelled 1.0.
parameters = {
    "n_estimators": [10, 50, 100, 200],
    "max_depth": [4, 6, 8, 10],
    "max_features": [1.0, "sqrt", "log2"],
}

In [None]:
# Base estimator for the grid search. It is seeded so that repeated searches
# score each candidate identically and `best_params_` does not change between
# runs because of forest randomness.
func = RandomForestRegressor(random_state=42)

In [None]:
# Exhaustive search over `parameters` with 5-fold cross-validation, using all
# available CPU cores.
grid = GridSearchCV(
    estimator=func,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Run the search on the training split, passing the target as a 1-D array.
y_train_1d = y_train.values.ravel()
grid.fit(X_train, y_train_1d)

  warn(


In [None]:
# Show the hyperparameter combination the grid search selected.
grid.best_params_

{'max_depth': 10, 'max_features': 'auto', 'n_estimators': 200}

In [None]:
# Refit a forest with hand-picked hyperparameters. It is seeded like the split
# so its score can be compared with the baseline forest without run-to-run
# noise.
# NOTE(review): max_depth=14 / n_estimators=150 are not the values reported by
# `grid.best_params_`, and max_depth=14 lies outside the searched grid
# (4, 6, 8, 10) - confirm this manual choice is intended.
tree_grid = RandomForestRegressor(
    max_depth=14,
    n_estimators=150,
    random_state=42,
).fit(X_train, y_train.values.ravel())

In [None]:
# Predict prices for the hold-out features with the tuned random forest.
y_pred_grid = tree_grid.predict(X_test)

In [None]:
# RMSE of the tuned random forest on the hold-out split. The square root is
# taken explicitly because the `squared=False` argument is deprecated since
# scikit-learn 1.4 and removed in 1.6.
mean_squared_error(y_test, y_pred_grid) ** 0.5

556.0847585177478