<a href="https://colab.research.google.com/github/gabrielfernandorey/GGGR/blob/main/Arboles/SRT%20v4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SRT v4 - Ensayo
### Arboles de regresion - Nuevo Dataset

In [1]:
import random
import string
import pandas as pd
import numpy as np
from datetime import date

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set() # setting seaborn default for plots

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


### Importamos Datos

In [2]:
!wget https://raw.githubusercontent.com/gabrielfernandorey/GGGR/main/Regresion%20lineal/serie_acc_SRT.csv

--2023-05-05 23:31:41--  https://raw.githubusercontent.com/gabrielfernandorey/GGGR/main/Regresion%20lineal/serie_acc_SRT.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26984867 (26M) [text/plain]
Saving to: ‘serie_acc_SRT.csv.2’


2023-05-05 23:31:41 (121 MB/s) - ‘serie_acc_SRT.csv.2’ saved [26984867/26984867]



In [3]:
df = pd.read_csv('/content/serie_acc_SRT.csv', header=0, sep=';',
                 names=['año','mes','codigo','total_cp'],
                 dtype={'año': str, 'mes': str, 'codigo': str, 'total_cp': str})

In [4]:
column_order = ['codigo', 'año', 'mes', 'total_cp']
new_df = df.reindex(columns=column_order)
df = new_df.copy()

In [5]:
df

Unnamed: 0,codigo,año,mes,total_cp
0,01657000,1995,1,1
1,08300000,1995,7,1
2,01900000,1995,9,1
3,01900000,1995,11,1
4,01650000,1996,3,1
...,...,...,...,...
1470273,05582000,2023,5,1
1470274,01754000,2023,5,7
1470275,03503000,2023,5,3
1470276,X5000FJA,2023,5,1


### EDA

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470278 entries, 0 to 1470277
Data columns (total 4 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   codigo    1469958 non-null  object
 1   año       1470278 non-null  object
 2   mes       1470278 non-null  object
 3   total_cp  1470278 non-null  object
dtypes: object(4)
memory usage: 44.9+ MB


#### Ajustamos tipo de datos

In [7]:
df['año'] = pd.to_numeric(df['año'], downcast='integer') #corregimos el formato de la columna 

In [8]:
df['mes'] = pd.to_numeric(df['mes']) #corregimos el formato de la columna 

In [9]:
df['total_cp'] = pd.to_numeric(df['total_cp']) #corregimos el formato de la columna 

In [10]:
df.dtypes

codigo      object
año          int16
mes          int64
total_cp     int64
dtype: object

In [11]:
df.head()

Unnamed: 0,codigo,año,mes,total_cp
0,1657000,1995,1,1
1,8300000,1995,7,1
2,1900000,1995,9,1
3,1900000,1995,11,1
4,1650000,1996,3,1


### Eliminamos Nan

In [12]:
df = df.dropna(subset=['codigo'])
df

Unnamed: 0,codigo,año,mes,total_cp
0,01657000,1995,1,1
1,08300000,1995,7,1
2,01900000,1995,9,1
3,01900000,1995,11,1
4,01650000,1996,3,1
...,...,...,...,...
1470273,05582000,2023,5,1
1470274,01754000,2023,5,7
1470275,03503000,2023,5,3
1470276,X5000FJA,2023,5,1


In [13]:
df = df.sort_values(['año','mes'], ascending=True)
df

Unnamed: 0,codigo,año,mes,total_cp
0,01657000,1995,1,1
1,08300000,1995,7,1
2,01900000,1995,9,1
3,01900000,1995,11,1
4,01650000,1996,3,1
...,...,...,...,...
1470273,05582000,2023,5,1
1470274,01754000,2023,5,7
1470275,03503000,2023,5,3
1470276,X5000FJA,2023,5,1


In [14]:
# Verificar cuantos registros tienen el codpos mayor a 8 caracteres
df_codigo = df.loc[df['codigo'].apply(lambda x: len(str(x)) > 8)]
df_codigo

Unnamed: 0,codigo,año,mes,total_cp


In [15]:
# Verificar cuantos registros tienen el codpos menor a 4 caracteres
df_codigo = df.loc[df['codigo'].apply(lambda x: len(str(x)) < 4)]
df_codigo

Unnamed: 0,codigo,año,mes,total_cp


### Convertimos el codigo postal a 4 caracteres

In [16]:
#df.codigo = df['codigo'].str.slice(start=1, stop=5)
#df

#### Verificamos cantidad de registros por codigo

In [17]:
# codigos unicos
contador = df['codigo'].value_counts()
contador

01900000    315
01437000    306
02000000    300
01001000    299
05000000    290
           ... 
X5127ACB      1
C1416DCH      1
X5013DAE      1
B1804HUF      1
C1431CRD      1
Name: codigo, Length: 207967, dtype: int64

In [18]:
mayores_a_12_datos = contador[contador > 12]

In [19]:
len(mayores_a_12_datos)

22897

In [20]:
df = df[df['codigo'].isin(mayores_a_12_datos.index)]

In [21]:
df

Unnamed: 0,codigo,año,mes,total_cp
0,01657000,1995,1,1
1,08300000,1995,7,1
2,01900000,1995,9,1
3,01900000,1995,11,1
4,01650000,1996,3,1
...,...,...,...,...
1470273,05582000,2023,5,1
1470274,01754000,2023,5,7
1470275,03503000,2023,5,3
1470276,X5000FJA,2023,5,1


### Transformacion

#### LabelEncoder

In [22]:
from sklearn.preprocessing import LabelEncoder

In [23]:
# Crear una instancia de OneHotEncoder
encoder = LabelEncoder()

In [24]:
# Codificar los valores alfanuméricos
X_encoded = encoder.fit_transform(df.codigo.values.reshape(-1,))

In [25]:
X_encoded.shape

(996749,)

#### Embeddings

In [26]:
tamano_embedding = 50

In [27]:
matriz_embedding = np.random.rand(len(encoder.classes_), tamano_embedding)

In [28]:
matriz_embedding

array([[0.75065066, 0.17768879, 0.08442293, ..., 0.98263879, 0.46365607,
        0.51208969],
       [0.84752884, 0.92009784, 0.02993433, ..., 0.8374046 , 0.26976752,
        0.75106958],
       [0.04083885, 0.41738566, 0.53446623, ..., 0.8888751 , 0.46896913,
        0.96829454],
       ...,
       [0.14090303, 0.40334792, 0.9999996 , ..., 0.17480473, 0.92760792,
        0.45777411],
       [0.29851384, 0.55132407, 0.78182484, ..., 0.73328213, 0.60457005,
        0.76313941],
       [0.59896546, 0.20791686, 0.55869074, ..., 0.59296823, 0.86047156,
        0.04874945]])

In [29]:
encoder.classes_

array(['        ', '0   0000', '00000000', ..., 'Z9405DHZ', 'Z9407CPE',
       'Z9407DIE'], dtype=object)

In [30]:
# Vectores de embeddings asociados a un valor por cada atributo
atributos_embeddings = matriz_embedding[X_encoded]

In [31]:
atributos_embeddings.shape

(996749, 50)

#### Verificacion embeddings

In [32]:
i = 0
encoder.classes_[i]

'        '

In [33]:
matriz_embedding[i]

array([7.50650657e-01, 1.77688792e-01, 8.44229289e-02, 6.11496690e-01,
       2.51974581e-01, 6.36908689e-01, 6.12235724e-01, 2.92541711e-01,
       7.23304940e-01, 7.96412993e-01, 9.82418455e-01, 2.26766305e-01,
       8.70439758e-01, 6.90682355e-01, 3.47644662e-04, 6.01441956e-01,
       5.11246728e-01, 5.22246641e-01, 3.75450777e-01, 1.31053542e-01,
       9.24926400e-01, 8.32157103e-01, 2.93214511e-01, 7.41680700e-01,
       7.29741680e-01, 5.92071555e-01, 9.11930269e-01, 2.93403338e-01,
       5.91467696e-01, 4.72990342e-01, 6.06854387e-01, 4.17502548e-01,
       2.98194332e-01, 7.10907176e-01, 8.99260163e-01, 8.39249251e-02,
       9.95678788e-01, 9.35141163e-01, 2.47684956e-01, 3.49724458e-01,
       6.68868590e-01, 9.93454247e-01, 4.23872417e-04, 4.05092726e-01,
       1.77388267e-01, 7.21187582e-01, 3.79463005e-01, 9.82638792e-01,
       4.63656073e-01, 5.12089689e-01])

In [34]:
matriz_embedding.shape

(22897, 50)

#### Dataframe con datos de embeddings

In [35]:
# Creamos un nuevo dataframe con los valores del embedding
embedding_df = pd.DataFrame(matriz_embedding, columns=[f'emb_{i}' for i in range(matriz_embedding.shape[1])])

In [36]:
embedding_df

Unnamed: 0,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,...,emb_40,emb_41,emb_42,emb_43,emb_44,emb_45,emb_46,emb_47,emb_48,emb_49
0,0.750651,0.177689,0.084423,0.611497,0.251975,0.636909,0.612236,0.292542,0.723305,0.796413,...,0.668869,0.993454,0.000424,0.405093,0.177388,0.721188,0.379463,0.982639,0.463656,0.512090
1,0.847529,0.920098,0.029934,0.095806,0.406458,0.372271,0.602118,0.322010,0.359750,0.790602,...,0.545421,0.299370,0.511922,0.964377,0.179305,0.147712,0.074055,0.837405,0.269768,0.751070
2,0.040839,0.417386,0.534466,0.692311,0.190499,0.939614,0.201885,0.129226,0.264504,0.682586,...,0.991173,0.407506,0.699751,0.070682,0.989595,0.183763,0.249622,0.888875,0.468969,0.968295
3,0.142712,0.402303,0.366357,0.643081,0.328984,0.907380,0.349406,0.973775,0.013449,0.902859,...,0.513869,0.795339,0.519394,0.335175,0.230065,0.440960,0.557001,0.713332,0.407249,0.965706
4,0.438221,0.711872,0.322800,0.921337,0.537523,0.635537,0.016219,0.145340,0.748705,0.281298,...,0.125058,0.512906,0.917000,0.934554,0.998077,0.071442,0.719200,0.076946,0.325761,0.257092
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22892,0.491790,0.883620,0.189683,0.120390,0.197363,0.176571,0.196714,0.276963,0.843139,0.041635,...,0.732075,0.049575,0.481910,0.311429,0.508707,0.356263,0.863088,0.408652,0.870609,0.942027
22893,0.609781,0.160721,0.240282,0.312588,0.769954,0.393776,0.805088,0.868398,0.163777,0.453708,...,0.319583,0.272113,0.136574,0.748940,0.375089,0.637654,0.113700,0.388207,0.903981,0.329242
22894,0.140903,0.403348,1.000000,0.073702,0.952070,0.810907,0.666908,0.669251,0.076849,0.167721,...,0.100719,0.472893,0.475795,0.922311,0.841443,0.851414,0.159572,0.174805,0.927608,0.457774
22895,0.298514,0.551324,0.781825,0.531518,0.605272,0.890254,0.394207,0.097411,0.447967,0.218923,...,0.038244,0.686751,0.067083,0.640662,0.929688,0.737828,0.608147,0.733282,0.604570,0.763139


In [37]:
embedding_df['codigo'] = encoder.classes_

In [38]:
# Utilizamos el codigo como indice (para joinear)
embedding_df = embedding_df.set_index('codigo')
embedding_df

Unnamed: 0_level_0,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,...,emb_40,emb_41,emb_42,emb_43,emb_44,emb_45,emb_46,emb_47,emb_48,emb_49
codigo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,0.750651,0.177689,0.084423,0.611497,0.251975,0.636909,0.612236,0.292542,0.723305,0.796413,...,0.668869,0.993454,0.000424,0.405093,0.177388,0.721188,0.379463,0.982639,0.463656,0.512090
0 0000,0.847529,0.920098,0.029934,0.095806,0.406458,0.372271,0.602118,0.322010,0.359750,0.790602,...,0.545421,0.299370,0.511922,0.964377,0.179305,0.147712,0.074055,0.837405,0.269768,0.751070
00000000,0.040839,0.417386,0.534466,0.692311,0.190499,0.939614,0.201885,0.129226,0.264504,0.682586,...,0.991173,0.407506,0.699751,0.070682,0.989595,0.183763,0.249622,0.888875,0.468969,0.968295
00001000,0.142712,0.402303,0.366357,0.643081,0.328984,0.907380,0.349406,0.973775,0.013449,0.902859,...,0.513869,0.795339,0.519394,0.335175,0.230065,0.440960,0.557001,0.713332,0.407249,0.965706
00005000,0.438221,0.711872,0.322800,0.921337,0.537523,0.635537,0.016219,0.145340,0.748705,0.281298,...,0.125058,0.512906,0.917000,0.934554,0.998077,0.071442,0.719200,0.076946,0.325761,0.257092
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Z9405CLL,0.491790,0.883620,0.189683,0.120390,0.197363,0.176571,0.196714,0.276963,0.843139,0.041635,...,0.732075,0.049575,0.481910,0.311429,0.508707,0.356263,0.863088,0.408652,0.870609,0.942027
Z9405DHC,0.609781,0.160721,0.240282,0.312588,0.769954,0.393776,0.805088,0.868398,0.163777,0.453708,...,0.319583,0.272113,0.136574,0.748940,0.375089,0.637654,0.113700,0.388207,0.903981,0.329242
Z9405DHZ,0.140903,0.403348,1.000000,0.073702,0.952070,0.810907,0.666908,0.669251,0.076849,0.167721,...,0.100719,0.472893,0.475795,0.922311,0.841443,0.851414,0.159572,0.174805,0.927608,0.457774
Z9407CPE,0.298514,0.551324,0.781825,0.531518,0.605272,0.890254,0.394207,0.097411,0.447967,0.218923,...,0.038244,0.686751,0.067083,0.640662,0.929688,0.737828,0.608147,0.733282,0.604570,0.763139


In [39]:
# Utilizamos el codigo como indice (para joinear)
df = df.set_index('codigo')
df

Unnamed: 0_level_0,año,mes,total_cp
codigo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01657000,1995,1,1
08300000,1995,7,1
01900000,1995,9,1
01900000,1995,11,1
01650000,1996,3,1
...,...,...,...
05582000,2023,5,1
01754000,2023,5,7
03503000,2023,5,3
X5000FJA,2023,5,1


In [40]:
df.total_cp.max()

5313

### Transformar la variable de salida

In [41]:
mmscaler = MinMaxScaler()

In [42]:
df['total_cp'] = mmscaler.fit_transform(df['total_cp'].values.reshape(-1,1))

In [43]:
df.head()

Unnamed: 0_level_0,año,mes,total_cp
codigo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1657000,1995,1,0.0
8300000,1995,7,0.0
1900000,1995,9,0.0
1900000,1995,11,0.0
1650000,1996,3,0.0


#### Joins de Dataframes

In [44]:
df_join = embedding_df.join(df, lsuffix='_izq', rsuffix='_der')
df_join

Unnamed: 0_level_0,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,...,emb_43,emb_44,emb_45,emb_46,emb_47,emb_48,emb_49,año,mes,total_cp
codigo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,0.750651,0.177689,0.084423,0.611497,0.251975,0.636909,0.612236,0.292542,0.723305,0.796413,...,0.405093,0.177388,0.721188,0.379463,0.982639,0.463656,0.512090,2001,11,0.000000
,0.750651,0.177689,0.084423,0.611497,0.251975,0.636909,0.612236,0.292542,0.723305,0.796413,...,0.405093,0.177388,0.721188,0.379463,0.982639,0.463656,0.512090,2011,7,0.000000
,0.750651,0.177689,0.084423,0.611497,0.251975,0.636909,0.612236,0.292542,0.723305,0.796413,...,0.405093,0.177388,0.721188,0.379463,0.982639,0.463656,0.512090,2012,8,0.000000
,0.750651,0.177689,0.084423,0.611497,0.251975,0.636909,0.612236,0.292542,0.723305,0.796413,...,0.405093,0.177388,0.721188,0.379463,0.982639,0.463656,0.512090,2013,2,0.000000
,0.750651,0.177689,0.084423,0.611497,0.251975,0.636909,0.612236,0.292542,0.723305,0.796413,...,0.405093,0.177388,0.721188,0.379463,0.982639,0.463656,0.512090,2013,9,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Z9407DIE,0.598965,0.207917,0.558691,0.913485,0.738199,0.094040,0.216571,0.926080,0.782726,0.395117,...,0.690914,0.116587,0.377514,0.431713,0.592968,0.860472,0.048749,2016,2,0.000188
Z9407DIE,0.598965,0.207917,0.558691,0.913485,0.738199,0.094040,0.216571,0.926080,0.782726,0.395117,...,0.690914,0.116587,0.377514,0.431713,0.592968,0.860472,0.048749,2016,3,0.000000
Z9407DIE,0.598965,0.207917,0.558691,0.913485,0.738199,0.094040,0.216571,0.926080,0.782726,0.395117,...,0.690914,0.116587,0.377514,0.431713,0.592968,0.860472,0.048749,2016,5,0.000188
Z9407DIE,0.598965,0.207917,0.558691,0.913485,0.738199,0.094040,0.216571,0.926080,0.782726,0.395117,...,0.690914,0.116587,0.377514,0.431713,0.592968,0.860472,0.048749,2016,6,0.000000


#### Datos de entrenamiento

In [45]:
df_final = df_join.reset_index()
df_final

Unnamed: 0,codigo,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,...,emb_43,emb_44,emb_45,emb_46,emb_47,emb_48,emb_49,año,mes,total_cp
0,,0.750651,0.177689,0.084423,0.611497,0.251975,0.636909,0.612236,0.292542,0.723305,...,0.405093,0.177388,0.721188,0.379463,0.982639,0.463656,0.512090,2001,11,0.000000
1,,0.750651,0.177689,0.084423,0.611497,0.251975,0.636909,0.612236,0.292542,0.723305,...,0.405093,0.177388,0.721188,0.379463,0.982639,0.463656,0.512090,2011,7,0.000000
2,,0.750651,0.177689,0.084423,0.611497,0.251975,0.636909,0.612236,0.292542,0.723305,...,0.405093,0.177388,0.721188,0.379463,0.982639,0.463656,0.512090,2012,8,0.000000
3,,0.750651,0.177689,0.084423,0.611497,0.251975,0.636909,0.612236,0.292542,0.723305,...,0.405093,0.177388,0.721188,0.379463,0.982639,0.463656,0.512090,2013,2,0.000000
4,,0.750651,0.177689,0.084423,0.611497,0.251975,0.636909,0.612236,0.292542,0.723305,...,0.405093,0.177388,0.721188,0.379463,0.982639,0.463656,0.512090,2013,9,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996744,Z9407DIE,0.598965,0.207917,0.558691,0.913485,0.738199,0.094040,0.216571,0.926080,0.782726,...,0.690914,0.116587,0.377514,0.431713,0.592968,0.860472,0.048749,2016,2,0.000188
996745,Z9407DIE,0.598965,0.207917,0.558691,0.913485,0.738199,0.094040,0.216571,0.926080,0.782726,...,0.690914,0.116587,0.377514,0.431713,0.592968,0.860472,0.048749,2016,3,0.000000
996746,Z9407DIE,0.598965,0.207917,0.558691,0.913485,0.738199,0.094040,0.216571,0.926080,0.782726,...,0.690914,0.116587,0.377514,0.431713,0.592968,0.860472,0.048749,2016,5,0.000188
996747,Z9407DIE,0.598965,0.207917,0.558691,0.913485,0.738199,0.094040,0.216571,0.926080,0.782726,...,0.690914,0.116587,0.377514,0.431713,0.592968,0.860472,0.048749,2016,6,0.000000


In [46]:
X_data = df_final.drop(['codigo','total_cp'], axis=1).to_numpy()

In [47]:
X_data.shape

(996749, 52)

In [48]:
y_data = df_final['total_cp'].to_numpy()

In [49]:
y_data

array([0.        , 0.        , 0.        , ..., 0.00018825, 0.        ,
       0.        ])

In [50]:
X_train, X_test, y_train, y_test = train_test_split( X_data, y_data, test_size=0.30, random_state=43) 

### Modelo

In [51]:
regr_2 = DecisionTreeRegressor(max_depth=10, min_samples_split=3, min_samples_leaf=3)
regr_3 = RandomForestRegressor(max_depth=5, min_samples_split=3, min_samples_leaf=3)
regr_4 = ExtraTreesRegressor(max_depth=5, min_samples_split=3, min_samples_leaf=3)

### Entrenamiento

In [52]:
regr_2.fit(X_train, y_train)
#regr_3.fit(X_train, y_train)
#regr_4.fit(X_train, y_train)

### Validacion

##### DecisionTreeRegressor

In [53]:
pred_2_train = regr_2.predict(X_train)
pred_2_test = regr_2.predict(X_test)

In [54]:
mse_2_train = mean_squared_error(y_train, pred_2_train)
mse_2_test  = mean_squared_error(y_test, pred_2_test)

In [55]:
mse_2_train, mse_2_test

(3.4284427409616e-05, 3.5504312391970816e-05)

In [56]:
rmse_2_train = np.sqrt(mse_2_train)
rmse_2_test  = np.sqrt(mse_2_test)
print('RMSE:', round(rmse_2_train,4), round(rmse_2_test,4))

RMSE: 0.0059 0.006


In [57]:
score_2 = r2_score(y_test, pred_2_test)
print(f'R2 score: {score_2:.5f}')

R2 score: 0.63855


##### RandomForestRegressor(max_depth=5)

In [58]:
pred_3_train = regr_3.predict(X_train)
pred_3_test = regr_3.predict(X_test)

NotFittedError: ignored

In [None]:
mse_3_train = mean_squared_error(y_train, pred_3_train)
mse_3_test  = mean_squared_error(y_test, pred_3_test)

In [None]:
mse_3_train, mse_3_test

In [None]:
rmse_3_train = np.sqrt(mse_3_train)
rmse_3_test  = np.sqrt(mse_3_test)
print('RMSE:', round(rmse_3_train,4), round(rmse_3_test,4))

In [None]:
score_3 = r2_score(y_test, pred_3_test)
print(f'R2 score: {score_3:.5f}')

##### ExtraTreesRegressor(max_depth=5)

In [None]:
pred_4_train = regr_4.predict(X_train)
pred_4_test = regr_4.predict(X_test)

In [None]:
mse_4_train = mean_squared_error(y_train, pred_4_train)
mse_4_test  = mean_squared_error(y_test, pred_4_test)

In [None]:
mse_4_train, mse_4_test

In [None]:
rmse_4_train = np.sqrt(mse_4_train)
rmse_4_test  = np.sqrt(mse_4_test)
print('RMSE:', round(rmse_4_train,4), round(rmse_4_test,4))

In [None]:
score_4 = r2_score(y_test, pred_4_test)
print(f'R2 score: {score_4:.5f}')

### Prediccion

In [None]:
# Ingresar codigo
codigo_in = random.choice(encoder.classes_) 
new_codigo_in = encoder.transform([codigo_in])
new_codigo_embedding = matriz_embedding[new_codigo_in]


In [None]:
# Ingresar año y mes
anio_in = 2023
mes_in = 6

In [None]:
atributos_in = np.concatenate((new_codigo_embedding, anio_in, mes_in), axis=None)

In [None]:
atributos_in = atributos_in.reshape(1, 52)

In [None]:
pred = regr_2.predict(atributos_in)

In [None]:
pred = mmscaler.inverse_transform(pred.reshape(-1, 1))

In [None]:
print(f"Prediccion para el código {codigo_in} Año {anio_in} Mes {mes_in}: {np.round(pred[0][0],2)} incidentes")  

### Verificacion grafica

In [None]:
filtro = df_final['codigo'] == codigo_in
df_filtrado = df_final[filtro]
df_filtrado

In [None]:
df_filtrado = df_filtrado.sort_values(['año','mes'], ascending=True)

In [None]:
fechas = []
totales = []

for row in df_filtrado.iterrows():
    fechas.append(str(row[1][51])+'-'+str(row[1][52])+'-01')
    totales.append(row[1][53])

In [None]:
fechas[0]

In [None]:
totales = mmscaler.inverse_transform([totales])

In [None]:
df_graf = pd.DataFrame({'fechas':fechas, 'totales':totales.reshape(-1,)})

In [None]:
plt.figure(figsize=(20, 5))
sns.set(color_codes=True)
plt.xticks(rotation=90)
sns.scatterplot(x='fechas', y='totales', data=df_graf)
plt.scatter(x=(str(anio_in)+'-'+str(mes_in)+'-01'), y=pred, color='red')
plt.plot(df_graf.fechas, df_graf.totales)
plt.title("Codigo: "+codigo_in)
plt.show()