In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
import statsmodels.formula.api as smf

In [2]:
df = pd.read_csv('../data/train_data_encoded.csv')
print('shape:', df.shape)
df.head(5)

shape: (36944, 8)


Unnamed: 0,quality,color,clarity,price,weight_ES,depth_percent_ES,table_percent_ES,volume_ES
0,3,6,1,6.353,-0.625,0.357143,0.333333,-0.591767
1,4,5,5,9.183,0.484375,0.571429,-0.333333,0.550637
2,4,4,3,7.983,0.03125,-0.071429,0.666667,0.052269
3,2,3,1,8.371,0.59375,0.928571,0.0,0.624043
4,3,3,4,6.588,-0.53125,0.285714,0.666667,-0.511469


In [3]:
#results using stats library
results = smf.ols("price ~  quality + color + clarity + weight_ES + depth_percent_ES + table_percent_ES + volume_ES", data=df).fit() 
results.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.924
Model:,OLS,Adj. R-squared:,0.924
Method:,Least Squares,F-statistic:,64320.0
Date:,"Thu, 26 May 2022",Prob (F-statistic):,0.0
Time:,18:54:01,Log-Likelihood:,-3953.7
No. Observations:,36944,AIC:,7923.0
Df Residuals:,36936,BIC:,7992.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,6.9232,0.007,974.699,0.000,6.909,6.937
quality,0.0212,0.002,12.736,0.000,0.018,0.024
color,0.0846,0.001,97.352,0.000,0.083,0.086
clarity,0.0932,0.001,101.233,0.000,0.091,0.095
weight_ES,0.6229,0.064,9.757,0.000,0.498,0.748
depth_percent_ES,0.0138,0.002,6.223,0.000,0.009,0.018
table_percent_ES,0.0186,0.003,6.843,0.000,0.013,0.024
volume_ES,0.9778,0.064,15.374,0.000,0.853,1.102

0,1,2,3
Omnibus:,2730.333,Durbin-Watson:,1.993
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3611.965
Skew:,-0.656,Prob(JB):,0.0
Kurtosis:,3.792,Cond. No.,377.0


In [4]:
#split dara train and test
X = df.drop(["price"], axis = 1)
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size   = 0.8, random_state = 42)

print("Datos de entrenamiento")
print("-----------------------")
print(y_train.describe())
print('\n')
print("Datos de testeo")
print("-----------------------")
print(y_test.describe())

Datos de entrenamiento
-----------------------
count    29555.000000
mean         7.702645
std          0.976815
min          5.814000
25%          6.809000
50%          7.689000
75%          8.505000
max          9.842000
Name: price, dtype: float64


Datos de testeo
-----------------------
count    7389.000000
mean        7.699702
std         0.982903
min         5.787000
25%         6.802000
50%         7.695000
75%         8.511000
max         9.841000
Name: price, dtype: float64


In [5]:
X.sample(5)

Unnamed: 0,quality,color,clarity,weight_ES,depth_percent_ES,table_percent_ES,volume_ES
12427,3,4,1,0.0,-0.928571,0.0,0.05902
14952,3,3,4,-0.453125,0.142857,1.0,-0.426188
27304,4,4,3,-0.484375,-0.5,0.0,-0.45199
26589,4,6,1,0.0,-0.928571,-0.133333,0.049992
9120,4,5,5,-0.46875,-0.214286,-0.333333,-0.429198


In [6]:
pd.DataFrame(y).sample(5)

Unnamed: 0,price
29118,9.204
6390,7.472
27798,8.409
11358,8.281
23995,8.561


In [7]:
#predictor variables importance model
regressor = DecisionTreeRegressor(random_state = 0) 
regressor.fit(X_train, y_train)
max_features = np.sqrt(len(X_train.columns))
print('max_features:', max_features)
print('max_depth:', regressor.tree_.max_depth)
y_pred_test_dt = regressor.predict(X_test)
y_pred_train_dt = regressor.predict(X_train)

predictor_importance = pd.DataFrame({'predictor': X_train.columns, 'importance': regressor.feature_importances_})

predictor_importance.sort_values( ascending=False, by= "importance", inplace=True)
print("predictor variables importance")
print("-------------------------------------------")
predictor_importance

max_features: 2.6457513110645907
max_depth: 32
predictor variables importance
-------------------------------------------


Unnamed: 0,predictor,importance
3,weight_ES,0.829231
6,volume_ES,0.114811
2,clarity,0.033403
1,color,0.016696
4,depth_percent_ES,0.002462
0,quality,0.001933
5,table_percent_ES,0.001464


In [9]:
#average price per weight range and color
df['weight_range'] = df['weight_ES'].round(0).astype('category')
weight_clarity_analysis = pd.DataFrame(df.groupby(['weight_range', 'clarity']).mean().head(50).sort_index(ascending=False)['price']).dropna()
weight_clarity_analysis

Unnamed: 0_level_0,Unnamed: 1_level_0,price
weight_range,clarity,Unnamed: 2_level_1
2.0,7,9.606
2.0,6,9.56
2.0,5,9.56332
2.0,4,9.536557
2.0,3,9.500393
2.0,2,9.446124
2.0,1,9.302644
2.0,0,8.75616
1.0,7,9.373285
1.0,6,9.284297
