# Предобработка + Encoding

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error

In [2]:
df = pd.read_csv("briliants_train.csv", index_col=0)
test = pd.read_csv("briliants_test.csv", index_col=0)

In [3]:
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40455 entries, 0 to 40454
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    40455 non-null  float64
 1   cut      40455 non-null  object 
 2   color    40455 non-null  object 
 3   clarity  40455 non-null  object 
 4   depth    40455 non-null  float64
 5   table    40455 non-null  float64
 6   x        40455 non-null  float64
 7   y        40455 non-null  float64
 8   z        40455 non-null  float64
 9   price    40455 non-null  int64  
dtypes: float64(6), int64(1), object(3)
memory usage: 3.4+ MB


In [4]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13485 entries, 0 to 13484
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    13485 non-null  float64
 1   cut      13485 non-null  object 
 2   color    13485 non-null  object 
 3   clarity  13485 non-null  object 
 4   depth    13485 non-null  float64
 5   table    13485 non-null  float64
 6   x        13485 non-null  float64
 7   y        13485 non-null  float64
 8   z        13485 non-null  float64
dtypes: float64(6), object(3)
memory usage: 1.0+ MB


In [5]:
print(df['cut'].unique())
print(df['color'].unique())
print(df['clarity'].unique())

['Good' 'Ideal' 'Premium' 'Fair' 'Very Good']
['E' 'J' 'F' 'G' 'D' 'H' 'I']
['VVS2' 'SI1' 'VS2' 'SI2' 'VS1' 'IF' 'VVS1' 'I1']


In [6]:
df = pd.get_dummies(df, columns = ['cut', 'color', 'clarity'])
test = pd.get_dummies(test, columns = ['cut', 'color', 'clarity'])

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40455 entries, 0 to 40454
Data columns (total 27 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   carat          40455 non-null  float64
 1   depth          40455 non-null  float64
 2   table          40455 non-null  float64
 3   x              40455 non-null  float64
 4   y              40455 non-null  float64
 5   z              40455 non-null  float64
 6   price          40455 non-null  int64  
 7   cut_Fair       40455 non-null  uint8  
 8   cut_Good       40455 non-null  uint8  
 9   cut_Ideal      40455 non-null  uint8  
 10  cut_Premium    40455 non-null  uint8  
 11  cut_Very Good  40455 non-null  uint8  
 12  color_D        40455 non-null  uint8  
 13  color_E        40455 non-null  uint8  
 14  color_F        40455 non-null  uint8  
 15  color_G        40455 non-null  uint8  
 16  color_H        40455 non-null  uint8  
 17  color_I        40455 non-null  uint8  
 18  color_

In [8]:
data_X = df[['carat', 'depth', 'table', 'x',	'y',	'z',	'cut_Fair',	'cut_Good',	'cut_Ideal', 'cut_Premium', 'cut_Very Good', 'color_D', 'color_E', 'color_F', 'color_G', 'color_H', 'color_I', 'color_J', 'clarity_I1',	'clarity_IF',	'clarity_SI1',	'clarity_SI2',	'clarity_VS1',	'clarity_VS2',	'clarity_VVS1',	'clarity_VVS2']]

In [9]:
data_Y = df[['price']]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(data_X, data_Y, random_state=42, test_size=0.1)

# RandomForestRegressor

In [11]:
from sklearn.ensemble import RandomForestRegressor

In [12]:
tree = RandomForestRegressor().fit(X_train, y_train.values.ravel())

In [13]:
y_pred = tree.predict(X_test)

In [14]:
score1 = mean_squared_error(y_test, y_pred, squared=False)

In [15]:
print('RMSE: ', score1)

RMSE:  552.683619396358


In [16]:
y_pred_test = tree.predict(test)
#y_pred_test = pd.DataFrame(y_pred_test)

In [17]:
len(y_pred_test)

13485

# Валидация (Эрдэни)

In [None]:
from google.colab import files
y_pred_test.to_excel("Test_predictions_bril.xlsx")
files.download("Test_predictions_bril.xlsx")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [18]:
df_true = pd.read_csv("briliants_y_true.csv")
y_true = df_true['price'].values
y_true

array([ 559, 2201, 1238, ..., 9215, 4416, 3564])

In [19]:
mean_squared_error(y_true, y_pred_test, squared=False)

552.468984086726

In [20]:
MAE(y_true, y_pred_test)

274.482035488285