# Árboles de Regresión

In [1]:
import pandas as pd

In [2]:
# Load the Boston housing CSV (path is relative to the notebook's directory).
data = pd.read_csv("../datasets/boston/Boston.csv")

In [3]:
# Preview the first rows to check the columns were parsed as expected.
data.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [4]:
# (rows, columns) of the loaded frame.
data.shape

(506, 14)

In [5]:
# Split the frame by column position: the first 13 columns are the features
# and the column at position 13 is the response.
# Positional slicing keeps this cell correct even if it is re-run after later
# cells have appended extra columns to `data`.
colnames = list(data.columns)
predictors, target = colnames[:13], colnames[13]
X, Y = data[predictors], data[target]

In [6]:
from sklearn.tree import DecisionTreeRegressor

In [7]:
# Regression tree with pre-pruning: a node needs at least 30 samples before it
# may be split, and every leaf must keep at least 10 samples.
# random_state is pinned so repeated fits produce the same tree.
regtree = DecisionTreeRegressor(
    min_samples_leaf=10,
    min_samples_split=30,
    random_state=0,
)

In [8]:
# Fit the tree on the full dataset (no hold-out split is made here).
regtree.fit(X,Y)

DecisionTreeRegressor(min_samples_leaf=10, min_samples_split=30, random_state=0)

In [9]:
# In-sample predictions: these are the same rows the tree was fitted on, so
# the agreement with the target is optimistic.
preds = regtree.predict(data[predictors])

In [10]:
# Store the tree's predictions next to the observed values.
# Note: this adds a column to `data` in place.
data["preds"] = preds

In [11]:
# Side-by-side view of predicted vs. observed target.
data[['preds', 'medv']]

Unnamed: 0,preds,medv
0,22.840000,24.0
1,22.840000,21.6
2,35.247826,34.7
3,35.247826,33.4
4,35.247826,36.2
...,...,...
501,22.840000,22.4
502,20.624138,20.6
503,28.978261,23.9
504,31.170000,22.0


In [12]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import numpy as np

In [13]:
# Leave-one-out cross-validation: using as many folds as there are rows means
# each fold holds out exactly one observation.
cv = KFold(n_splits=len(X), shuffle=True, random_state=1)
scores = cross_val_score(
    estimator=regtree,
    X=X,
    y=Y,
    scoring="neg_mean_squared_error",
    cv=cv,
    n_jobs=1,
)
print(scores)
# Scores are negated MSE values, so the mean is negative; closer to 0 is better.
score = scores.mean()
print(score)

[-9.65519849e-01 -1.22500000e+01 -1.27417958e+01 -2.03062500e+00
 -1.19008264e+00 -1.68100000e+01 -1.48833518e+01 -3.04073469e+01
 -1.30561778e+01 -7.70878893e+00 -1.37573554e+01 -1.47617729e-01
 -2.56000000e+00 -2.23238754e+00 -7.96213018e-01 -4.49605540e+01
 -2.33841327e+01 -2.57293817e+01 -1.58040816e+00 -3.39432892e-01
 -1.40892985e+01 -9.16438017e+00 -5.10575207e+01 -9.50625000e-01
 -7.20711634e+01 -6.81790123e-02 -3.19470699e-01 -8.43321600e+00
 -4.26636147e+00 -1.24163265e+00 -2.90260631e-02 -2.51843100e+00
 -2.57293817e+01 -1.96000000e+00 -1.23201000e+01 -8.70250000e+00
 -5.75069444e-01 -9.90929752e-01 -1.45852071e+00 -1.13140496e-01
 -1.91873800e+00 -2.83798347e+01 -8.79251418e+00 -5.98941399e-01
 -5.15017301e-01 -2.11834793e+02 -2.57293817e+01 -1.67474048e+00
 -1.93181179e+01 -2.90309172e+00 -7.38037190e-01 -1.35424000e+01
 -3.69200907e+01 -5.70958449e+00 -3.84400000e+01 -3.24000000e+00
 -4.67758291e+01 -2.33653061e+00 -4.33396694e+00 -1.09812247e+01
 -8.65051903e+00 -4.71909

In [14]:
# Pair each predictor name with the importance the fitted tree assigned to it.
[(name, weight) for name, weight in zip(predictors, regtree.feature_importances_)]

[('crim', 0.03421203230639308),
 ('zn', 0.0),
 ('indus', 0.0011605887788380146),
 ('chas', 0.0),
 ('nox', 0.01856163073811432),
 ('rm', 0.6308568014337028),
 ('age', 0.01725115143448847),
 ('dis', 0.0013745115995791378),
 ('rad', 0.0),
 ('tax', 0.0023698305298803803),
 ('ptratio', 0.009333247332530954),
 ('black', 0.0),
 ('lstat', 0.28488020584647283)]

## Random Forests

In [15]:
from sklearn.ensemble import RandomForestRegressor

In [43]:
# Random forest of 500 trees. oob_score=True makes scikit-learn keep the
# out-of-bag predictions and score, which are read in the following cells.
# random_state is pinned (like the regression tree's random_state=0) so the
# OOB predictions, MSE and score are reproducible on Restart & Run All;
# without it every run gives different numbers.
forest = RandomForestRegressor(
    n_estimators=500,
    oob_score=True,
    n_jobs=2,
    random_state=0,
)
forest.fit(X, Y)

RandomForestRegressor(n_estimators=500, n_jobs=2, oob_score=True)

In [44]:
# Out-of-bag predictions, available because the forest was built with
# oob_score=True. Stored as a new column on `data` (in place).
data["rforest_pred"] = forest.oob_prediction_
# Compare the OOB predictions with the observed target.
data[["rforest_pred", "medv"]]

Unnamed: 0,rforest_pred,medv
0,28.925568,24.0
1,22.663043,21.6
2,34.778804,34.7
3,35.091758,33.4
4,34.164583,36.2
...,...,...
501,24.250254,22.4
502,19.063793,20.6
503,27.912644,23.9
504,26.383146,22.0


In [45]:
# Squared out-of-bag error per row, then its mean (the OOB mean squared error).
# Series.mean() is the vectorised replacement for the Python-level
# sum(...)/len(...) over the column.
data["rforest_error2"] = (data["rforest_pred"] - data["medv"]) ** 2
data["rforest_error2"].mean()

10.29229804664033

In [46]:
# Out-of-bag score of the fitted forest (R^2 under scikit-learn's default
# scoring for regressors); available because oob_score=True was set.
forest.oob_score_

0.8780815901519209