In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.metrics import *
from sklearn.model_selection import train_test_split

In [46]:
d = pd.read_csv('diamonds_train.csv')

In [50]:
d.columns

Index(['id', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y',
       'z', 'price'],
      dtype='object')

In [51]:
d.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.78,Premium,F,VS1,61.5,58.0,5.93,5.98,3.66,3446
1,1,0.31,Ideal,D,SI1,60.8,56.0,4.37,4.32,2.64,732
2,2,0.3,Ideal,F,SI1,62.3,54.0,4.3,4.34,2.69,475
3,3,1.04,Ideal,E,VVS2,62.0,58.0,6.54,6.46,4.03,9552
4,4,0.65,Ideal,J,SI1,61.4,55.0,5.58,5.62,3.44,1276


In [7]:
d.columns

Index(['id', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y',
       'z', 'price'],
      dtype='object')

In [8]:
cuts = {'Fair': 1,
        'Good': 2,
        'Very Good': 3,
        'Ideal': 4,
        'Premium': 5}
d.cut = d.cut.map(cuts)

In [9]:
colors = {'J': 1,
          'I': 2,
          'H': 3,
          'G': 4,
          'F': 5,
          'E': 6,
          'D': 7}
d.color = d.color.map(colors)

In [10]:
clar = {'I1':1,
        'SI2':2,
        'SI1':3,
        'VS2':4,
        'VS1':5,
        'VVS2':6,
        'VVS1':7,
        'IF':8}
d.clarity = d.clarity.map(clar)

In [11]:
d.to_csv('d.csv')

In [12]:
d

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.78,5,5,5,61.5,58.0,5.93,5.98,3.66,3446
1,1,0.31,4,7,3,60.8,56.0,4.37,4.32,2.64,732
2,2,0.30,4,5,3,62.3,54.0,4.30,4.34,2.69,475
3,3,1.04,4,6,6,62.0,58.0,6.54,6.46,4.03,9552
4,4,0.65,4,1,3,61.4,55.0,5.58,5.62,3.44,1276
...,...,...,...,...,...,...,...,...,...,...,...
40340,40340,0.50,4,6,5,61.9,56.0,5.09,5.12,3.16,1716
40341,40341,0.33,4,7,4,61.8,56.0,4.40,4.44,2.73,781
40342,40342,0.40,5,4,7,61.5,58.0,4.69,4.74,2.90,1123
40343,40343,1.06,4,2,5,61.2,57.0,6.59,6.56,4.03,5651


In [27]:
rf_reg = RandomForestRegressor(n_estimators=15, max_depth=3, min_samples_leaf=3, random_state=111)
selector = RFECV(rf_reg, step=1, cv=5)

In [21]:
d.columns

Index(['id', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y',
       'z', 'price'],
      dtype='object')

In [23]:
cols = list(d.columns[1:7])
cols

['carat', 'cut', 'color', 'clarity', 'depth', 'table']

In [24]:
X = d[cols]
y = d['price']

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
testeo = rf_reg.fit(X_train, y_train)
testeo

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=3, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=3,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=15, n_jobs=None, oob_score=False,
                      random_state=111, verbose=0, warm_start=False)

In [29]:
rf_reg.feature_importances_

array([0.97110102, 0.        , 0.        , 0.02889898, 0.        ,
       0.        ])

In [30]:
print(rf_reg.score(X_train, y_train))
print(rf_reg.score(X_test, y_test))

0.8942689360276888
0.8956099442572785


In [31]:
print('METRIC SUMMARY')
print('MSE', mean_squared_error(y_test, rf_reg.predict(X_test)))
print('RMSE', np.sqrt(mean_squared_error(y_test, rf_reg.predict(X_test))))
print('MSLE', mean_squared_log_error(y_test, rf_reg.predict(X_test)))
print('MAE', mean_absolute_error(y_test, rf_reg.predict(X_test)))
print('R2', r2_score(y_test, rf_reg.predict(X_test)))

METRIC SUMMARY
MSE 1635649.142490213
RMSE 1278.9249948649112
MSLE 0.06781532964194177
MAE 729.9337835070905
R2 0.8956099442572785


In [33]:
df = pd.read_csv('diamonds_test.csv')

In [55]:
df.to_csv("df.csv", index=False)

In [56]:
df

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,1.10,5,3,2,62.2,58.0,6.69,6.60,4.13
1,1,0.51,4,2,3,62.5,57.0,5.07,5.10,3.18
2,2,2.03,5,4,3,61.9,59.0,8.14,8.09,5.02
3,3,1.21,5,5,3,60.0,60.0,6.96,6.91,4.16
4,4,0.55,4,5,3,61.8,55.0,5.27,5.22,3.24
...,...,...,...,...,...,...,...,...,...,...
13444,13444,0.30,5,6,4,60.0,58.0,4.37,4.40,2.63
13445,13445,0.55,4,7,6,60.8,56.0,5.31,5.34,3.24
13446,13446,0.23,3,6,7,61.3,58.0,3.94,3.96,2.42
13447,13447,0.30,4,7,4,61.4,58.0,4.29,4.31,2.64


In [35]:
df.cut = df.cut.map(cuts)
df.color = df.color.map(colors)
df.clarity = df.clarity.map(clar)

In [54]:
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,1.1,5,3,2,62.2,58.0,6.69,6.6,4.13
1,1,0.51,4,2,3,62.5,57.0,5.07,5.1,3.18
2,2,2.03,5,4,3,61.9,59.0,8.14,8.09,5.02
3,3,1.21,5,5,3,60.0,60.0,6.96,6.91,4.16
4,4,0.55,4,5,3,61.8,55.0,5.27,5.22,3.24


In [38]:
cols

['carat', 'cut', 'color', 'clarity', 'depth', 'table']

In [39]:
X_test = df[cols]

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [42]:
y_test_pred = testeo.predict(X_test)

In [43]:
df['price']=y_test_pred

ValueError: Length of values does not match length of index

In [44]:
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,1.1,5,3,2,62.2,58.0,6.69,6.6,4.13
1,1,0.51,4,2,3,62.5,57.0,5.07,5.1,3.18
2,2,2.03,5,4,3,61.9,59.0,8.14,8.09,5.02
3,3,1.21,5,5,3,60.0,60.0,6.96,6.91,4.16
4,4,0.55,4,5,3,61.8,55.0,5.27,5.22,3.24


In [237]:
sub = df[['id', 'price']]

In [238]:
sub

Unnamed: 0,id,price
0,0,4997.417164
1,1,1669.846534
2,2,14862.674678
3,3,4997.417164
4,4,1669.846534
...,...,...
13444,13444,786.850712
13445,13445,1669.846534
13446,13446,786.850712
13447,13447,786.850712


In [241]:
sub.to_csv("sub.csv", index=False)