### Regresión de Árbol aleatorio

Es un algoritmo de machine learning que combina múltiples árboles de decisión para predecir valores numéricos. 
Cada árbol se entrena con un subconjunto aleatorio de datos y sus predicciones se promedian para obtener una predicción final.

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('dataset_fish.csv')

In [3]:
df.dtypes

Species     object
Weight     float64
Length1    float64
Length2    float64
Length3    float64
Height     float64
Width      float64
dtype: object

In [4]:
df.Species.unique()

array(['Bream', 'Roach', 'Whitefish', 'Parkki', 'Perch', 'Pike', 'Smelt'],
      dtype=object)

In [5]:
#df.Species = df.Species.map({'Bream': 1, 'Roach': 2, 'Whitefish': 3, 'Parkki': 4, 'Perch': 5, 'Pike': 6, 'Smelt': 7})

#prueba de 'mapeo' con codificacion one-hot. Se crearán columnas binarias para cada valor de Species
df_nuevo = pd.get_dummies(df, columns=['Species'], prefix=['Species'])

In [6]:
df_nuevo.dtypes

Weight               float64
Length1              float64
Length2              float64
Length3              float64
Height               float64
Width                float64
Species_Bream           bool
Species_Parkki          bool
Species_Perch           bool
Species_Pike            bool
Species_Roach           bool
Species_Smelt           bool
Species_Whitefish       bool
dtype: object

In [7]:
df_nuevo.tail()

Unnamed: 0,Weight,Length1,Length2,Length3,Height,Width,Species_Bream,Species_Parkki,Species_Perch,Species_Pike,Species_Roach,Species_Smelt,Species_Whitefish
154,12.2,11.5,12.2,13.4,2.0904,1.3936,False,False,False,False,False,True,False
155,13.4,11.7,12.4,13.5,2.43,1.269,False,False,False,False,False,True,False
156,12.2,12.1,13.0,13.8,2.277,1.2558,False,False,False,False,False,True,False
157,19.7,13.2,14.3,15.2,2.8728,2.0672,False,False,False,False,False,True,False
158,19.9,13.8,15.0,16.2,2.9322,1.8792,False,False,False,False,False,True,False


In [8]:
df_nuevo.describe()

Unnamed: 0,Weight,Length1,Length2,Length3,Height,Width
count,159.0,159.0,159.0,159.0,159.0,159.0
mean,398.326415,26.24717,28.415723,31.227044,8.970994,4.417486
std,357.978317,9.996441,10.716328,11.610246,4.286208,1.685804
min,0.0,7.5,8.4,8.8,1.7284,1.0476
25%,120.0,19.05,21.0,23.15,5.9448,3.38565
50%,273.0,25.2,27.3,29.4,7.786,4.2485
75%,650.0,32.7,35.5,39.65,12.3659,5.5845
max,1650.0,59.0,63.4,68.0,18.957,8.142


### Entrenamiento del modelo

In [9]:
corr = round(df_nuevo.corr(), 4)
corr.style.background_gradient()

Unnamed: 0,Weight,Length1,Length2,Length3,Height,Width,Species_Bream,Species_Parkki,Species_Perch,Species_Pike,Species_Roach,Species_Smelt,Species_Whitefish
Weight,1.0,0.9157,0.9186,0.923,0.7243,0.8865,0.3268,-0.186,-0.0332,0.3106,-0.2618,-0.3371,0.0736
Length1,0.9157,1.0,0.9995,0.992,0.6254,0.867,0.2164,-0.2057,-0.0378,0.5635,-0.2132,-0.4674,0.0507
Length2,0.9186,0.9995,1.0,0.9941,0.6404,0.8735,0.2334,-0.206,-0.0361,0.5528,-0.218,-0.4798,0.0538
Length3,0.923,0.992,0.9941,1.0,0.7034,0.8785,0.3272,-0.1987,-0.1055,0.5229,-0.2051,-0.4884,0.0529
Height,0.7243,0.6254,0.6404,0.7034,1.0,0.7929,0.7724,-0.0005,-0.1914,-0.1018,-0.2021,-0.4917,0.049
Width,0.8865,0.867,0.8735,0.8785,0.7929,1.0,0.3193,-0.1941,0.144,0.1377,-0.1715,-0.569,0.1244
Species_Bream,0.3268,0.2164,0.2334,0.3272,0.7724,0.3193,1.0,-0.1448,-0.3917,-0.1838,-0.2015,-0.1651,-0.1052
Species_Parkki,-0.186,-0.2057,-0.206,-0.1987,-0.0005,-0.1941,-0.1448,1.0,-0.201,-0.0943,-0.1034,-0.0847,-0.054
Species_Perch,-0.0332,-0.0378,-0.0361,-0.1055,-0.1914,0.144,-0.3917,-0.201,1.0,-0.2551,-0.2797,-0.2291,-0.146
Species_Pike,0.3106,0.5635,0.5528,0.5229,-0.1018,0.1377,-0.1838,-0.0943,-0.2551,1.0,-0.1312,-0.1075,-0.0685


In [10]:
data_x = df_nuevo.drop(['Weight', 'Species_Parkki', 'Species_Perch', 'Species_Roach', 'Species_Smelt'], axis='columns')
data_y = df_nuevo.Weight

In [11]:
train_x, test_x, train_y, test_y = train_test_split(data_x, data_y, test_size=0.2)

In [12]:
random_forest_regression = RandomForestRegressor(n_estimators=300, max_depth=8)

In [13]:
random_forest_regression.fit(train_x, train_y)

In [14]:
y_predicted = random_forest_regression.predict(test_x)

In [15]:
print(f'Calificación de prediccion: {round(random_forest_regression.score(test_x, test_y), 4) * 100}%')


Calificación de prediccion: 96.99%


In [16]:
train_x

Unnamed: 0,Length1,Length2,Length3,Height,Width,Species_Bream,Species_Pike,Species_Whitefish
107,26.9,28.7,30.1,7.5852,4.6354,False,False,False
87,20.0,22.0,23.5,5.6400,3.5250,False,False,False
91,20.5,22.5,24.0,5.8560,3.6240,False,False,False
134,40.0,42.5,45.5,7.2800,4.3225,False,True,False
59,33.7,36.4,39.6,11.7612,6.5736,False,False,True
...,...,...,...,...,...,...,...,...
80,17.8,19.6,20.8,5.1376,3.0368,False,False,False
0,23.2,25.4,30.0,11.5200,4.0200,True,False,False
142,56.0,60.0,64.0,9.6000,6.1440,False,True,False
133,36.0,38.5,41.0,6.3960,3.9770,False,True,False


In [32]:
len1 = 26.9
len2 = 28.7	
len3 = 30.1
height = 7.5852
width = 4.6354
bream = False
pike = False
whitefish = False

In [33]:
def predict_fish_weight(len1, len2, len3, height, width, bream, pike, whitefish, model):
    result = model.predict([[len1, len2, len3, height, bream, pike, whitefish,width]])
    print(f"El pez con longitud 1: {len1} , longitud 2: {len2}, longitud 3: {len3}, con ancho: {width} y altura: {height} pesa: {result[0]}")


In [34]:
predict_fish_weight(len1, len2, len3, height, width, bream, pike, whitefish, random_forest_regression)

El pez con longitud 1: 26.9 , longitud 2: 28.7, longitud 3: 30.1, con ancho: 4.6354 y altura: 7.5852 pesa: 249.45774603174607


