In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
import statsmodels.formula.api as smf

In [28]:
# Read the CSV at ../data/train_data_encoded.csv (relative to the notebook) into `df`.
df = pd.read_csv(filepath_or_buffer='../data/train_data_encoded.csv')
# Report (rows, columns) as a quick sanity check on the load.
print('shape:', df.shape)
# Preview the first five rows.
df.head(n=5)

shape: (28942, 7)


Unnamed: 0,weight,quality,color,clarity,table_percent,price,depth_percent
0,0.3,3,6,1,0.58,6.353,0.624
1,1.01,4,5,5,0.56,9.183,0.627
2,0.72,4,4,3,0.59,7.983,0.618
3,1.08,2,3,1,0.57,8.371,0.632
4,0.36,3,3,4,0.59,6.588,0.623


In [30]:
# Ordinary least squares via the statsmodels formula API:
# `price` is regressed on the six predictor columns of `df`.
ols_model = smf.ols(
    "price ~  weight + quality + color + clarity + table_percent + depth_percent",
    data=df,
)
results = ols_model.fit()
# Display the full regression summary (coefficients, fit statistics, diagnostics).
results.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.878
Model:,OLS,Adj. R-squared:,0.878
Method:,Least Squares,F-statistic:,34840.0
Date:,"Mon, 23 May 2022",Prob (F-statistic):,0.0
Time:,21:18:37,Log-Likelihood:,-10353.0
No. Observations:,28942,AIC:,20720.0
Df Residuals:,28935,BIC:,20780.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,4.9415,0.150,32.841,0.000,4.647,5.236
weight,2.0946,0.005,439.801,0.000,2.085,2.104
quality,0.0269,0.002,12.309,0.000,0.023,0.031
color,0.0869,0.001,71.122,0.000,0.085,0.089
clarity,0.0944,0.001,71.575,0.000,0.092,0.097
table_percent,0.4203,0.115,3.667,0.000,0.196,0.645
depth_percent,0.5009,0.170,2.954,0.003,0.169,0.833

0,1,2,3
Omnibus:,3642.533,Durbin-Watson:,1.994
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5556.356
Skew:,-0.913,Prob(JB):,0.0
Kurtosis:,4.129,Cond. No.,655.0


In [24]:
# Split the data into train and test sets.
# Features are every column of `df` except the target; the target is `price`.
X = df.drop(columns=["price"])
y = df['price']

# Keep 80% of the rows for training; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

# Print summary statistics of the target for each partition so the two
# distributions can be compared by eye.
print("Datos de entrenamiento", "-----------------------", y_train.describe(), sep="\n")
print('\n')
print("Datos de testeo", "-----------------------", y_test.describe(), sep="\n")

Datos de entrenamiento
-----------------------
count    23153.000000
mean         7.992375
std          0.993806
min          5.787000
25%          7.156000
50%          8.082000
75%          8.755000
max          9.842000
Name: price, dtype: float64


Datos de testeo
-----------------------
count    5789.000000
mean        8.006556
std         0.986775
min         5.866000
25%         7.189000
50%         8.089000
75%         8.755000
max         9.841000
Name: price, dtype: float64


In [31]:
# Preview five randomly chosen feature rows.
# A fixed seed keeps the displayed rows identical across kernel restarts.
X.sample(n=5, random_state=42)

Unnamed: 0,weight,quality,color,clarity,table_percent,depth_percent
7316,1.01,2,0,1,0.55,0.627
27298,0.55,3,4,3,0.58,0.59
25008,0.31,4,2,2,0.56,0.619
2640,1.01,4,3,3,0.56,0.627
23854,1.05,2,1,3,0.59,0.624


In [34]:
# Preview five randomly chosen target values, shown as a one-column frame.
# A fixed seed keeps the displayed rows identical across kernel restarts.
y.to_frame().sample(n=5, random_state=42)

Unnamed: 0,price
6764,7.772
14893,8.214
26432,8.161
24561,8.792
10747,8.771


In [35]:
# Fit a decision tree on the training split and rank the predictors by importance.
regressor = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)

# Square root of the number of predictor columns.
# NOTE(review): this value is only printed; it is not passed to the regressor in this cell.
max_features = np.sqrt(X_train.shape[1])
print('max_features:', max_features)
print('max_depth:', regressor.tree_.max_depth)

# Predictions on both partitions (not consumed further in this cell).
y_pred_train_dt = regressor.predict(X_train)
y_pred_test_dt = regressor.predict(X_test)

# One row per predictor column, highest importance first.
predictor_importance = pd.DataFrame(
    {'predictor': X_train.columns, 'importance': regressor.feature_importances_}
).sort_values(by="importance", ascending=False)

print("predictor variables importance")
print("-------------------------------------------")
predictor_importance

max_features: 2.449489742783178
max_depth: 31
predictor variables importance
-------------------------------------------


Unnamed: 0,predictor,importance
0,weight,0.934484
3,clarity,0.038469
2,color,0.017513
5,depth_percent,0.004796
4,table_percent,0.002481
1,quality,0.002257


In [36]:
# Average price per rounded-weight bucket and clarity value.
# NOTE(review): this adds a `weight_range` column to `df` itself, so any cell that
# derives features from `df` and is re-run afterwards will see the extra column.
df['weight_range'] = df['weight'].round(0).astype('category')

# Only `price` is aggregated, so no means are computed for columns that would be
# thrown away. `weight_range` is categorical: observed=False is stated explicitly
# so every (weight_range, clarity) combination is produced regardless of the pandas
# default; combinations with no rows get a NaN mean and are removed by dropna().
weight_clarity_analysis = (
    df.groupby(['weight_range', 'clarity'], observed=False)['price']
    .mean()
    .head(50)                      # keep the first 50 group rows, as before
    .sort_index(ascending=False)   # show the largest weight bucket first
    .to_frame()
    .dropna()
)
weight_clarity_analysis

Unnamed: 0_level_0,Unnamed: 1_level_0,price
weight_range,clarity,Unnamed: 2_level_1
3.0,4,9.7745
3.0,2,9.754375
3.0,1,9.693298
3.0,0,9.161667
2.0,7,9.549821
2.0,6,9.4583
2.0,5,9.438352
2.0,4,9.441877
2.0,3,9.448293
2.0,2,9.379058
