In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the training data; the first CSV column is used as the row index.
df = pd.read_csv("briliants_train.csv", index_col=0)

In [3]:
# Preview the training frame as loaded from the CSV.
df

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.25,Good,E,VVS2,64.9,58.0,3.95,3.97,2.57,476
1,0.84,Ideal,J,SI1,61.8,56.0,6.04,6.07,3.74,2493
2,1.05,Premium,J,VS2,61.1,58.0,6.56,6.51,3.99,4145
3,1.02,Ideal,F,SI2,60.7,56.0,6.53,6.50,3.95,4541
4,0.61,Ideal,F,VS1,61.8,57.0,5.43,5.47,3.37,2283
...,...,...,...,...,...,...,...,...,...,...
40450,1.05,Very Good,I,VS2,62.4,59.0,6.48,6.51,4.05,4975
40451,0.47,Ideal,D,VS1,61.0,55.0,5.03,5.01,3.06,1617
40452,0.33,Very Good,F,IF,60.3,58.0,4.49,4.46,2.70,1014
40453,0.90,Premium,J,SI1,62.8,59.0,6.13,6.03,3.82,2871


# Обработка данных


In [4]:
# Summarize column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40455 entries, 0 to 40454
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    40455 non-null  float64
 1   cut      40455 non-null  object 
 2   color    40455 non-null  object 
 3   clarity  40455 non-null  object 
 4   depth    40455 non-null  float64
 5   table    40455 non-null  float64
 6   x        40455 non-null  float64
 7   y        40455 non-null  float64
 8   z        40455 non-null  float64
 9   price    40455 non-null  int64  
dtypes: float64(6), int64(1), object(3)
memory usage: 3.4+ MB


In [5]:
# List the distinct labels found in the 'color' column.
df['color'].unique()

array(['E', 'J', 'F', 'G', 'D', 'H', 'I'], dtype=object)

In [6]:
# List the distinct labels found in the 'cut' column.
df['cut'].unique()

array(['Good', 'Ideal', 'Premium', 'Fair', 'Very Good'], dtype=object)

In [7]:
# List the distinct labels found in the 'clarity' column.
df['clarity'].unique()

array(['VVS2', 'SI1', 'VS2', 'SI2', 'VS1', 'IF', 'VVS1', 'I1'],
      dtype=object)

In [8]:
# Lookup tables that turn each categorical label into an integer code.
# Codes start at 1 and follow the order in which the labels are listed.
color_mapping = {
    'D': 1, 'E': 2, 'F': 3, 'G': 4,
    'H': 5, 'I': 6, 'J': 7,
}
cut_mapping = {
    'Ideal': 1, 'Premium': 2, 'Very Good': 3,
    'Good': 4, 'Fair': 5,
}
clarity_mapping = {
    'IF': 1, 'VVS1': 2, 'VVS2': 3, 'VS1': 4,
    'VS2': 5, 'SI1': 6, 'SI2': 7, 'I1': 8,
}

# Swap the text labels of every categorical column for their integer codes.
encoders = {'color': color_mapping, 'cut': cut_mapping, 'clarity': clarity_mapping}
for column, codes in encoders.items():
    df[column] = df[column].map(codes)

# LabelEncoder

In [None]:
# Disabled alternative: encode the categorical columns with LabelEncoder
# instead of the explicit mapping dictionaries.
# NOTE(review): LabelEncoder is not imported in the visible imports cell;
# re-enabling this would need `from sklearn.preprocessing import LabelEncoder`.
#le = LabelEncoder()

#df['color']=le.fit_transform(df['color'])
#df['cut']=le.fit_transform(df['cut'])
#df['clarity']=le.fit_transform(df['clarity'])

In [9]:
# Show the frame again to inspect the integer-coded cut, color and clarity columns.
df

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.25,4,2,3,64.9,58.0,3.95,3.97,2.57,476
1,0.84,1,7,6,61.8,56.0,6.04,6.07,3.74,2493
2,1.05,2,7,5,61.1,58.0,6.56,6.51,3.99,4145
3,1.02,1,3,7,60.7,56.0,6.53,6.50,3.95,4541
4,0.61,1,3,4,61.8,57.0,5.43,5.47,3.37,2283
...,...,...,...,...,...,...,...,...,...,...
40450,1.05,3,6,5,62.4,59.0,6.48,6.51,4.05,4975
40451,0.47,1,1,4,61.0,55.0,5.03,5.01,3.06,1617
40452,0.33,3,3,1,60.3,58.0,4.49,4.46,2.70,1014
40453,0.90,2,7,6,62.8,59.0,6.13,6.03,3.82,2871


# Research

In [10]:
# Separate the predictors (X) from the target (y), then hold out 20% of the
# rows as a test set. The split is seeded so it is the same on every run.
feature_columns = ['carat', 'cut', 'color', 'clarity', 'depth']
X = df[feature_columns]
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# LinearRegression baseline: fit on the training split, score on the test split.
linmodel = LinearRegression()
linmodel.fit(X_train, y_train)
y_pred = linmodel.predict(X_test)

# RMSE is computed as sqrt(MSE) explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
# whereas this form works on every version.
print("RMSE: ", mean_squared_error(y_test, y_pred) ** 0.5)

RMSE:  1254.1431178154432


In [12]:
# KNeighborsRegressor with 5 neighbours: fit on the training split,
# score on the test split.
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred2 = knn.predict(X_test)

# RMSE is computed as sqrt(MSE) explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
# whereas this form works on every version.
print("RMSE: ", mean_squared_error(y_test, y_pred2) ** 0.5)

RMSE:  1193.075120488816


In [13]:
# RandomForestRegressor with 1000 trees.
# random_state pins the forest's internal randomness so the reported score is
# reproducible across runs (the other ensemble cells are seeded the same way);
# n_jobs=-1 builds the trees on all available cores and does not change results.
randomforest = RandomForestRegressor(n_estimators=1000, random_state=0, n_jobs=-1)
randomforest.fit(X_train, y_train)
y_pred3 = randomforest.predict(X_test)

# RMSE is computed as sqrt(MSE) explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
# whereas this form works on every version.
print("RMSE: ", mean_squared_error(y_test, y_pred3) ** 0.5)

RMSE:  558.0271664262921


In [14]:
# AdaBoostRegressor with 100 estimators, seeded for reproducibility.
adaboost = AdaBoostRegressor(random_state=0, n_estimators=100)
adaboost.fit(X_train, y_train)

y_pred4 = adaboost.predict(X_test)

# RMSE is computed as sqrt(MSE) explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
# whereas this form works on every version.
print("RMSE: ", mean_squared_error(y_test, y_pred4) ** 0.5)

RMSE:  1235.945843375882


In [15]:
# GradientBoostingRegressor with default hyper-parameters, seeded for reproducibility.
gradboost = GradientBoostingRegressor(random_state=0)
gradboost.fit(X_train, y_train)

y_pred5 = gradboost.predict(X_test)

# RMSE is computed as sqrt(MSE) explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
# whereas this form works on every version.
print("RMSE: ", mean_squared_error(y_test, y_pred5) ** 0.5)

RMSE:  622.9297663919658


# Итого

In [16]:
# Summary: RandomForestRegressor score.
# The model is not refitted here (the fit is kept below for reference only);
# this cell reuses y_pred3 produced by the RandomForestRegressor cell above,
# so that cell must have been run first.

# randomforest = RandomForestRegressor(n_estimators=1000)
# randomforest.fit(X_train, y_train)
# y_pred3 = randomforest.predict(X_test)

# RMSE is computed as sqrt(MSE) explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
# whereas this form works on every version.
print("RMSE: ", mean_squared_error(y_test, y_pred3) ** 0.5)

RMSE:  558.0271664262921


# Валидация (Эрдэни)

In [17]:
# Load the test file; the first CSV column is used as the row index.
df2 = pd.read_csv("briliants_test.csv", index_col=0)

In [18]:
# Encode the categorical columns of the test frame with the same lookup
# tables that were applied to the training frame.
test_encoders = {'color': color_mapping, 'cut': cut_mapping, 'clarity': clarity_mapping}
for column, codes in test_encoders.items():
    df2[column] = df2[column].map(codes)

In [20]:
# Predict prices for the test file with the fitted random forest.
# The model was fitted on a DataFrame, so a DataFrame with the same columns in
# the same order is passed here. Passing a bare array (`.values`) would drop
# the column names and make scikit-learn warn that X has no valid feature names.
y_pred_test = randomforest.predict(df2[['carat', 'cut', 'color', 'clarity', 'depth']])



In [21]:
# Number of predictions produced for the test file.
len(y_pred_test)

13485

In [22]:
# Read the reference prices for the test file and expose them as a NumPy array.
df_true = pd.read_csv("briliants_y_true.csv")
price_column = df_true['price']
y_true = price_column.to_numpy()
y_true

array([ 559, 2201, 1238, ..., 9215, 4416, 3564])

In [23]:
# RMSE of the random-forest predictions against the reference prices.
# Computed as sqrt(MSE) explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
# whereas this form works on every version.
print("RMSE: ", mean_squared_error(y_true, y_pred_test) ** 0.5)

RMSE:  570.8881618070014


In [24]:
# Mean absolute error of the random-forest predictions against the reference prices.
mean_absolute_error(y_true, y_pred_test)

291.4098259858102