In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import sys
sys.path.append('../src')
import traintest as trf

In [2]:
diamonds = pd.read_csv("../data/modelo.csv")

In [3]:
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price
0,1.14,5,4,8,61.0,56.0,9013
1,0.76,5,3,6,62.7,57.0,2692
2,0.84,5,4,7,61.4,56.0,4372
3,1.55,5,3,7,62.0,57.0,13665
4,0.3,5,4,4,61.9,57.0,422


# Training train dataset with model RandomForestRegressor

In [4]:
X = diamonds.drop("price",axis=1)
y = diamonds.price

In [5]:
X_train, X_test, y_train, y_test = tts(X,y, test_size= 0.2)

In [6]:
Ran_Forest = RandomForestRegressor()

In [7]:
Ran_Forest.fit(X_train,y_train)

RandomForestRegressor()

In [8]:
y_pred = Ran_Forest.predict(X_test)

In [9]:
print('RMSE - ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R2 - ', metrics.r2_score(y_test, y_pred))

RMSE -  544.7040344672191
R2 -  0.9812779402933928


In [10]:
#training with test set to check how fit is our model

In [11]:
y_pred_train = Ran_Forest.predict(X_train)

In [12]:
print('RMSE - ', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train)))
print('R2 - ', metrics.r2_score(y_train, y_pred_train))

RMSE -  210.89123454733777
R2 -  0.9972181007541568


## testing training and testing errors to get the fittest depth of the random forest model

In [13]:
inicial = 1
final = 25
forest_df = trf.treedepth(inicial, final,X_train,y_train,X_test,y_test) #function on src folder

In [14]:
forest_df.sample(5)

Unnamed: 0,model,depth,train_error,test_error
19,"(DecisionTreeRegressor(max_depth=20, max_featu...",20,47299.26,291305.59
13,"(DecisionTreeRegressor(max_depth=14, max_featu...",14,101379.82,288560.61
10,"(DecisionTreeRegressor(max_depth=11, max_featu...",11,192380.55,289869.7
14,"(DecisionTreeRegressor(max_depth=15, max_featu...",15,81848.15,287002.25
5,"(DecisionTreeRegressor(max_depth=6, max_featur...",6,597484.04,588030.34


In [15]:
#max depth acording to test_error to fit model
minimo = forest_df["test_error"].min()
depth_rf = forest_df.depth[forest_df["test_error"] == minimo].values[0]
depth_rf

13

In [16]:
Ran_Forest_max = RandomForestRegressor(max_depth=depth_rf)

In [17]:
Ran_Forest_max.fit(X_train,y_train)

RandomForestRegressor(max_depth=13)

In [18]:
y_pred = Ran_Forest_max.predict(X_test)

In [19]:
print('RMSE - ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R2 - ', metrics.r2_score(y_test, y_pred))

RMSE -  533.2200060564861
R2 -  0.9820590550782003


## Apply the already trained model to test.csv (after doing same cleaning)

In [27]:
diamonds_test = pd.read_csv("../data/test.csv")

In [28]:
diamonds_test.drop("id",axis=1,inplace = True)

In [29]:
diamonds_test.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.0,Fair,J,SI2,56.3,64.0,6.58,6.54,3.69
1,0.83,Premium,G,SI1,62.3,58.0,6.01,5.97,3.73
2,1.0,Fair,E,SI2,67.0,53.0,6.19,6.13,4.13
3,1.0,Fair,H,SI2,66.5,62.0,6.19,6.1,4.09
4,1.2,Very Good,I,SI1,62.6,57.0,6.74,6.77,4.23


In [30]:
cut_dicc = {"Ideal":5, "Premium":4,"Very Good":3,"Good":2, "Fair":1}
color_dic = {'D':7, 'E': 6, 'F':5, 'G':4, 'H':3, 'I':2, 'J':1} 
clarity_dic = {"I3" : 1, "I2" : 2, "I1" : 3,
               "SI2" : 4 ,"SI1" : 5,
               "VS2" : 6, "VS1" : 7,
               "VVS2" : 8, "VVS1":9,
               "IF" : 10 , "FL" : 1
                }
diamonds_test.cut = diamonds_test.cut.replace(cut_dicc)
diamonds_test.color = diamonds_test.color.replace(color_dic)
diamonds_test.clarity = diamonds_test.clarity.replace(clarity_dic)
dropeando = ["x","y","z"]
diamonds_test.drop(dropeando, axis=1,inplace = True)

In [31]:
diamonds_test.head()

Unnamed: 0,carat,cut,color,clarity,depth,table
0,1.0,1,1,4,56.3,64.0
1,0.83,4,4,5,62.3,58.0
2,1.0,1,6,4,67.0,53.0
3,1.0,1,3,4,66.5,62.0
4,1.2,3,2,5,62.6,57.0


In [32]:
y_prediction = Ran_Forest_max.predict(diamonds_test)

In [33]:
diamonds_test["price"] = y_prediction

In [34]:
diamonds_test.drop(['carat', 'cut', 'color', 'clarity', 'depth', 'table'],axis=1,inplace = True)

In [35]:
diamonds_test.reset_index(inplace = True)
diamonds_test.rename({"index":"id"},axis=1,inplace=True)

In [36]:
diamonds_test.head()

Unnamed: 0,id,price
0,0,3326.871409
1,1,2962.788695
2,2,3302.181468
3,3,3161.754542
4,4,5230.822989


In [37]:
diamonds_test.to_csv("../data/submission_05.csv",index=False) #exporting submission