<a href="https://colab.research.google.com/github/gabrielfernandorey/GGGR/blob/main/LSTM/SRT%20v5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SRT v4 - Ensayo
### Árboles de regresión - Nuevo Dataset

In [None]:
import random
import string
import pandas as pd
import numpy as np
from datetime import date

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set() # setting seaborn default for plots

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


### Importamos Datos

In [None]:
!wget https://raw.githubusercontent.com/gabrielfernandorey/GGGR/main/Regresion%20lineal/serie_acc_SRT.csv

--2023-05-07 00:15:30--  https://raw.githubusercontent.com/gabrielfernandorey/GGGR/main/Regresion%20lineal/serie_acc_SRT.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26984867 (26M) [text/plain]
Saving to: ‘serie_acc_SRT.csv’


2023-05-07 00:15:32 (35.7 MB/s) - ‘serie_acc_SRT.csv’ saved [26984867/26984867]



In [None]:
# Load the CSV from the current working directory, which is where the
# preceding `!wget` cell saves it. The former hard-coded '/content/' prefix
# only exists on Colab and broke the notebook anywhere else.
# header=0 discards the file's own header row in favour of `names`; every
# column is read as str so the leading zeros of 'codigo' are preserved.
df = pd.read_csv('serie_acc_SRT.csv', header=0, sep=';',
                 names=['año','mes','codigo','total_cp'],
                 dtype={'año': str, 'mes': str, 'codigo': str, 'total_cp': str})

In [None]:
column_order = ['codigo', 'año', 'mes', 'total_cp']
new_df = df.reindex(columns=column_order)
df = new_df.copy()

In [None]:
df

Unnamed: 0,codigo,año,mes,total_cp
0,01657000,1995,1,1
1,08300000,1995,7,1
2,01900000,1995,9,1
3,01900000,1995,11,1
4,01650000,1996,3,1
...,...,...,...,...
1470273,05582000,2023,5,1
1470274,01754000,2023,5,7
1470275,03503000,2023,5,3
1470276,X5000FJA,2023,5,1


### EDA

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470278 entries, 0 to 1470277
Data columns (total 4 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   codigo    1469958 non-null  object
 1   año       1470278 non-null  object
 2   mes       1470278 non-null  object
 3   total_cp  1470278 non-null  object
dtypes: object(4)
memory usage: 44.9+ MB


#### Ajustamos tipo de datos

In [None]:
df['año'] = pd.to_numeric(df['año'], downcast='integer') # parse the year strings as numbers, downcast to the smallest integer dtype

In [None]:
df['mes'] = pd.to_numeric(df['mes']) # parse the month strings as numbers

In [None]:
df['total_cp'] = pd.to_numeric(df['total_cp']) # parse the count strings as numbers

In [None]:
df.dtypes

codigo      object
año          int16
mes          int64
total_cp     int64
dtype: object

In [None]:
df.head()

Unnamed: 0,codigo,año,mes,total_cp
0,1657000,1995,1,1
1,8300000,1995,7,1
2,1900000,1995,9,1
3,1900000,1995,11,1
4,1650000,1996,3,1


### Eliminamos NaN

In [None]:
# Discard the rows whose postal code is missing, then display the result.
df = df[df['codigo'].notna()]
df

Unnamed: 0,codigo,año,mes,total_cp
0,01657000,1995,1,1
1,08300000,1995,7,1
2,01900000,1995,9,1
3,01900000,1995,11,1
4,01650000,1996,3,1
...,...,...,...,...
1470273,05582000,2023,5,1
1470274,01754000,2023,5,7
1470275,03503000,2023,5,3
1470276,X5000FJA,2023,5,1


In [None]:
# Order the records chronologically: by year first, then by month.
df = df.sort_values(by=['año', 'mes'])
df

Unnamed: 0,codigo,año,mes,total_cp
0,01657000,1995,1,1
1,08300000,1995,7,1
2,01900000,1995,9,1
3,01900000,1995,11,1
4,01650000,1996,3,1
...,...,...,...,...
1470273,05582000,2023,5,1
1470274,01754000,2023,5,7
1470275,03503000,2023,5,3
1470276,X5000FJA,2023,5,1


In [None]:
# Check how many records have a postal code longer than 8 characters
df_codigo = df[df['codigo'].astype(str).str.len() > 8]
df_codigo

Unnamed: 0,codigo,año,mes,total_cp


In [None]:
# Check how many records have a postal code shorter than 4 characters
df_codigo = df[df['codigo'].astype(str).str.len() < 4]
df_codigo

Unnamed: 0,codigo,año,mes,total_cp


### Convertimos el codigo postal a 4 caracteres

In [None]:
# Keep positions 1..4 of the raw code (skipping its first character) as the
# 4-character postal code.
df['codigo'] = df['codigo'].str[1:5]
df

Unnamed: 0,codigo,año,mes,total_cp
0,1657,1995,1,1
1,8300,1995,7,1
2,1900,1995,9,1
3,1900,1995,11,1
4,1650,1996,3,1
...,...,...,...,...
1470273,5582,2023,5,1
1470274,1754,2023,5,7
1470275,3503,2023,5,3
1470276,5000,2023,5,1


#### Verificamos cantidad de registros por codigo

In [None]:
# Number of records per postal code, most frequent first
contador = df['codigo'].value_counts()
contador

5000    21916
2000    18100
4000    13459
5500    12482
1425    12020
        ...  
4361        1
6412        1
4677        1
5884        1
3033        1
Name: codigo, Length: 3355, dtype: int64

### Conservamos los códigos con más de 12 registros

In [None]:
# Postal codes with strictly more than 12 records
mayores_a_12_datos = contador.loc[contador.gt(12)]

In [None]:
len(mayores_a_12_datos)

2610

In [None]:
# Keep only the records whose postal code passed the >12-records threshold
df = df[df['codigo'].isin(mayores_a_12_datos.index)]

In [None]:
df

Unnamed: 0,codigo,año,mes,total_cp
0,1657,1995,1,1
1,8300,1995,7,1
2,1900,1995,9,1
3,1900,1995,11,1
4,1650,1996,3,1
...,...,...,...,...
1470273,5582,2023,5,1
1470274,1754,2023,5,7
1470275,3503,2023,5,3
1470276,5000,2023,5,1


### Eliminamos códigos no válidos

In [None]:
df = df.loc[df.codigo != "    "]

In [None]:
df = df.loc[df.codigo != "   0"]

In [None]:
df = df.loc[df.codigo != "0000"]

In [None]:
df = df.loc[df.codigo != "0001"]

In [None]:
df = df.loc[df.codigo != "0005"]

### Transformacion

#### LabelEncoder

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Create a LabelEncoder instance
encoder = LabelEncoder()

In [None]:
# Map every postal-code string to its integer class index
X_encoded = encoder.fit_transform(df['codigo'].to_numpy())

In [None]:
X_encoded.shape

(1466626,)

#### Embeddings

In [None]:
tamano_embedding = 50

In [None]:
# Draw the embedding table from a seeded generator so that a kernel restart
# reproduces the same feature vectors. Unseeded np.random.rand produced
# different embeddings, and therefore different models, on every run.
# Shape: one row per encoded postal code, `tamano_embedding` columns,
# values uniformly distributed in [0, 1).
matriz_embedding = np.random.default_rng(43).random((len(encoder.classes_), tamano_embedding))

In [None]:
matriz_embedding

array([[0.34765086, 0.42374962, 0.2411239 , ..., 0.94840079, 0.48719588,
        0.84148951],
       [0.42881301, 0.59511559, 0.11500518, ..., 0.72682298, 0.80507346,
        0.50576475],
       [0.14543468, 0.62766832, 0.36054143, ..., 0.33139388, 0.53791822,
        0.11189219],
       ...,
       [0.29422632, 0.78238901, 0.96971616, ..., 0.74674117, 0.12927801,
        0.53738548],
       [0.51170871, 0.55817769, 0.8609915 , ..., 0.37126118, 0.04843083,
        0.88303488],
       [0.95602335, 0.74197389, 0.5121128 , ..., 0.15962132, 0.33340065,
        0.83528841]])

In [None]:
encoder.classes_

array(['1000', '1001', '1002', ..., '9412', '9420', '9999'], dtype=object)

In [None]:
# Look up the embedding row that corresponds to the encoded postal code of every record
atributos_embeddings = matriz_embedding[X_encoded]

In [None]:
atributos_embeddings.shape

(1466626, 50)

#### Verificacion embeddings

In [None]:
i = 0
encoder.classes_[i]

'1000'

In [None]:
matriz_embedding[i]

array([0.34765086, 0.42374962, 0.2411239 , 0.58207705, 0.23169438,
       0.84414058, 0.55776054, 0.16373834, 0.03545   , 0.61431216,
       0.08825849, 0.38014731, 0.40840566, 0.57176366, 0.96356093,
       0.66218209, 0.88253113, 0.84456628, 0.69433916, 0.03064177,
       0.41298018, 0.83640492, 0.88983881, 0.89838062, 0.08304833,
       0.35937368, 0.13133554, 0.04734081, 0.70692098, 0.56247501,
       0.814896  , 0.74794829, 0.12861773, 0.80285844, 0.07457162,
       0.69100405, 0.8315956 , 0.59410756, 0.09630605, 0.24705336,
       0.64602416, 0.9846566 , 0.75460887, 0.50760403, 0.40117309,
       0.57365195, 0.13782538, 0.94840079, 0.48719588, 0.84148951])

In [None]:
matriz_embedding.shape

(2605, 50)

#### Dataframe con datos de embeddings

In [None]:
# Wrap the embedding matrix in a dataframe with one 'emb_<k>' column per dimension
n_dims = matriz_embedding.shape[1]
embedding_df = pd.DataFrame(
    matriz_embedding,
    columns=[f'emb_{i}' for i in range(n_dims)],
)

In [None]:
embedding_df

Unnamed: 0,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,...,emb_40,emb_41,emb_42,emb_43,emb_44,emb_45,emb_46,emb_47,emb_48,emb_49
0,0.347651,0.423750,0.241124,0.582077,0.231694,0.844141,0.557761,0.163738,0.035450,0.614312,...,0.646024,0.984657,0.754609,0.507604,0.401173,0.573652,0.137825,0.948401,0.487196,0.841490
1,0.428813,0.595116,0.115005,0.152337,0.871535,0.202826,0.257287,0.737266,0.948735,0.299486,...,0.224652,0.636091,0.737981,0.825898,0.952003,0.931050,0.994735,0.726823,0.805073,0.505765
2,0.145435,0.627668,0.360541,0.068103,0.337055,0.595825,0.805673,0.042375,0.127257,0.472042,...,0.604337,0.534223,0.273597,0.983368,0.518469,0.662990,0.809795,0.331394,0.537918,0.111892
3,0.320327,0.290243,0.145959,0.244058,0.211176,0.541624,0.615542,0.722359,0.706872,0.001551,...,0.960283,0.050008,0.464598,0.396550,0.287473,0.879473,0.864345,0.109365,0.933852,0.498087
4,0.157442,0.458735,0.160317,0.518039,0.359396,0.883476,0.055859,0.019694,0.745267,0.513579,...,0.698513,0.002063,0.221436,0.340213,0.908890,0.096180,0.289594,0.886545,0.557095,0.483195
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2600,0.961880,0.838975,0.561755,0.409904,0.149273,0.936763,0.601275,0.394840,0.040919,0.314967,...,0.657267,0.496934,0.266962,0.087383,0.953331,0.544004,0.390692,0.179309,0.164014,0.284258
2601,0.818498,0.131161,0.111914,0.010867,0.612989,0.546496,0.688689,0.058273,0.747988,0.173909,...,0.398481,0.662757,0.930753,0.235281,0.245825,0.707305,0.400735,0.938704,0.687425,0.304161
2602,0.294226,0.782389,0.969716,0.658410,0.336029,0.260681,0.412973,0.923362,0.508144,0.797258,...,0.199664,0.997770,0.693548,0.070060,0.316671,0.098183,0.491740,0.746741,0.129278,0.537385
2603,0.511709,0.558178,0.860991,0.359487,0.707062,0.844868,0.123673,0.981296,0.193968,0.023786,...,0.831215,0.127630,0.002831,0.530014,0.282470,0.422988,0.545911,0.371261,0.048431,0.883035


In [None]:
embedding_df['codigo'] = encoder.classes_

In [None]:
# Use the postal code as the index (the key for the join performed later)
embedding_df = embedding_df.set_index('codigo')
embedding_df

Unnamed: 0_level_0,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,...,emb_40,emb_41,emb_42,emb_43,emb_44,emb_45,emb_46,emb_47,emb_48,emb_49
codigo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000,0.347651,0.423750,0.241124,0.582077,0.231694,0.844141,0.557761,0.163738,0.035450,0.614312,...,0.646024,0.984657,0.754609,0.507604,0.401173,0.573652,0.137825,0.948401,0.487196,0.841490
1001,0.428813,0.595116,0.115005,0.152337,0.871535,0.202826,0.257287,0.737266,0.948735,0.299486,...,0.224652,0.636091,0.737981,0.825898,0.952003,0.931050,0.994735,0.726823,0.805073,0.505765
1002,0.145435,0.627668,0.360541,0.068103,0.337055,0.595825,0.805673,0.042375,0.127257,0.472042,...,0.604337,0.534223,0.273597,0.983368,0.518469,0.662990,0.809795,0.331394,0.537918,0.111892
1003,0.320327,0.290243,0.145959,0.244058,0.211176,0.541624,0.615542,0.722359,0.706872,0.001551,...,0.960283,0.050008,0.464598,0.396550,0.287473,0.879473,0.864345,0.109365,0.933852,0.498087
1004,0.157442,0.458735,0.160317,0.518039,0.359396,0.883476,0.055859,0.019694,0.745267,0.513579,...,0.698513,0.002063,0.221436,0.340213,0.908890,0.096180,0.289594,0.886545,0.557095,0.483195
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9408,0.961880,0.838975,0.561755,0.409904,0.149273,0.936763,0.601275,0.394840,0.040919,0.314967,...,0.657267,0.496934,0.266962,0.087383,0.953331,0.544004,0.390692,0.179309,0.164014,0.284258
9410,0.818498,0.131161,0.111914,0.010867,0.612989,0.546496,0.688689,0.058273,0.747988,0.173909,...,0.398481,0.662757,0.930753,0.235281,0.245825,0.707305,0.400735,0.938704,0.687425,0.304161
9412,0.294226,0.782389,0.969716,0.658410,0.336029,0.260681,0.412973,0.923362,0.508144,0.797258,...,0.199664,0.997770,0.693548,0.070060,0.316671,0.098183,0.491740,0.746741,0.129278,0.537385
9420,0.511709,0.558178,0.860991,0.359487,0.707062,0.844868,0.123673,0.981296,0.193968,0.023786,...,0.831215,0.127630,0.002831,0.530014,0.282470,0.422988,0.545911,0.371261,0.048431,0.883035


In [None]:
# Use the postal code as the index (the key for the join performed later)
df = df.set_index('codigo')
df

Unnamed: 0_level_0,año,mes,total_cp
codigo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1657,1995,1,1
8300,1995,7,1
1900,1995,9,1
1900,1995,11,1
1650,1996,3,1
...,...,...,...
5582,2023,5,1
1754,2023,5,7
3503,2023,5,3
5000,2023,5,1


In [None]:
df.total_cp.max()

5313

### Transformar la variable de salida

In [None]:
mmscaler = MinMaxScaler()

In [None]:
# Scale the target with the MinMaxScaler; the column is reshaped to a 2-D
# (n_samples, 1) array because the scaler works on feature columns.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split further down, so the test-set range influences the
# scaling — confirm whether that is acceptable here.
df['total_cp'] = mmscaler.fit_transform(df['total_cp'].values.reshape(-1,1))

In [None]:
df.head()

Unnamed: 0_level_0,año,mes,total_cp
codigo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1657,1995,1,0.0
8300,1995,7,0.0
1900,1995,9,0.0
1900,1995,11,0.0
1650,1996,3,0.0


#### Joins de Dataframes

In [None]:
# Attach the embedding vector of each postal code to all of its records.
# Both frames are indexed by 'codigo' and share no column names, so no
# suffixes are needed to disambiguate columns.
df_join = embedding_df.join(df)
df_join

Unnamed: 0_level_0,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,...,emb_43,emb_44,emb_45,emb_46,emb_47,emb_48,emb_49,año,mes,total_cp
codigo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000,0.347651,0.423750,0.241124,0.582077,0.231694,0.844141,0.557761,0.163738,0.035450,0.614312,...,0.507604,0.401173,0.573652,0.137825,0.948401,0.487196,0.841490,1996,12,0.000000
1000,0.347651,0.423750,0.241124,0.582077,0.231694,0.844141,0.557761,0.163738,0.035450,0.614312,...,0.507604,0.401173,0.573652,0.137825,0.948401,0.487196,0.841490,1997,10,0.000188
1000,0.347651,0.423750,0.241124,0.582077,0.231694,0.844141,0.557761,0.163738,0.035450,0.614312,...,0.507604,0.401173,0.573652,0.137825,0.948401,0.487196,0.841490,1997,11,0.000000
1000,0.347651,0.423750,0.241124,0.582077,0.231694,0.844141,0.557761,0.163738,0.035450,0.614312,...,0.507604,0.401173,0.573652,0.137825,0.948401,0.487196,0.841490,1998,2,0.000941
1000,0.347651,0.423750,0.241124,0.582077,0.231694,0.844141,0.557761,0.163738,0.035450,0.614312,...,0.507604,0.401173,0.573652,0.137825,0.948401,0.487196,0.841490,1998,9,0.000188
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999,0.956023,0.741974,0.512113,0.375719,0.660897,0.278925,0.691733,0.937478,0.655478,0.270477,...,0.963484,0.028042,0.682683,0.636836,0.159621,0.333401,0.835288,2013,7,0.000188
9999,0.956023,0.741974,0.512113,0.375719,0.660897,0.278925,0.691733,0.937478,0.655478,0.270477,...,0.963484,0.028042,0.682683,0.636836,0.159621,0.333401,0.835288,2013,8,0.000188
9999,0.956023,0.741974,0.512113,0.375719,0.660897,0.278925,0.691733,0.937478,0.655478,0.270477,...,0.963484,0.028042,0.682683,0.636836,0.159621,0.333401,0.835288,2013,9,0.000188
9999,0.956023,0.741974,0.512113,0.375719,0.660897,0.278925,0.691733,0.937478,0.655478,0.270477,...,0.963484,0.028042,0.682683,0.636836,0.159621,0.333401,0.835288,2013,12,0.000000


#### Datos de entrenamiento

In [None]:
df_final = df_join.reset_index()
df_final

Unnamed: 0,codigo,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,...,emb_43,emb_44,emb_45,emb_46,emb_47,emb_48,emb_49,año,mes,total_cp
0,1000,0.347651,0.423750,0.241124,0.582077,0.231694,0.844141,0.557761,0.163738,0.035450,...,0.507604,0.401173,0.573652,0.137825,0.948401,0.487196,0.841490,1996,12,0.000000
1,1000,0.347651,0.423750,0.241124,0.582077,0.231694,0.844141,0.557761,0.163738,0.035450,...,0.507604,0.401173,0.573652,0.137825,0.948401,0.487196,0.841490,1997,10,0.000188
2,1000,0.347651,0.423750,0.241124,0.582077,0.231694,0.844141,0.557761,0.163738,0.035450,...,0.507604,0.401173,0.573652,0.137825,0.948401,0.487196,0.841490,1997,11,0.000000
3,1000,0.347651,0.423750,0.241124,0.582077,0.231694,0.844141,0.557761,0.163738,0.035450,...,0.507604,0.401173,0.573652,0.137825,0.948401,0.487196,0.841490,1998,2,0.000941
4,1000,0.347651,0.423750,0.241124,0.582077,0.231694,0.844141,0.557761,0.163738,0.035450,...,0.507604,0.401173,0.573652,0.137825,0.948401,0.487196,0.841490,1998,9,0.000188
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1466621,9999,0.956023,0.741974,0.512113,0.375719,0.660897,0.278925,0.691733,0.937478,0.655478,...,0.963484,0.028042,0.682683,0.636836,0.159621,0.333401,0.835288,2013,7,0.000188
1466622,9999,0.956023,0.741974,0.512113,0.375719,0.660897,0.278925,0.691733,0.937478,0.655478,...,0.963484,0.028042,0.682683,0.636836,0.159621,0.333401,0.835288,2013,8,0.000188
1466623,9999,0.956023,0.741974,0.512113,0.375719,0.660897,0.278925,0.691733,0.937478,0.655478,...,0.963484,0.028042,0.682683,0.636836,0.159621,0.333401,0.835288,2013,9,0.000188
1466624,9999,0.956023,0.741974,0.512113,0.375719,0.660897,0.278925,0.691733,0.937478,0.655478,...,0.963484,0.028042,0.682683,0.636836,0.159621,0.333401,0.835288,2013,12,0.000000


In [None]:
# Feature matrix: every column except the postal-code label and the target,
# i.e. the embedding columns followed by 'año' and 'mes'.
X_data = df_final.drop(['codigo','total_cp'], axis=1).to_numpy()

In [None]:
X_data.shape

(1466626, 52)

In [None]:
y_data = df_final['total_cp'].to_numpy()

In [None]:
y_data

array([0.        , 0.00018825, 0.        , ..., 0.00018825, 0.        ,
       0.        ])

In [None]:
# Hold out 30% of the records for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.30,
    random_state=43,
)

### Modelo

In [None]:
# Candidate tree-based regressors. A fixed random_state makes the fitted
# models reproducible across kernel restarts: bootstrap sampling, feature
# sub-sampling and tie-breaking between equally good splits are all random
# otherwise.
regr_2 = DecisionTreeRegressor(max_depth=10, min_samples_split=3, min_samples_leaf=3, random_state=43)
regr_3 = RandomForestRegressor(max_depth=5, min_samples_split=3, min_samples_leaf=3, random_state=43)
regr_4 = ExtraTreesRegressor(max_depth=5, min_samples_split=3, min_samples_leaf=3, random_state=43)

### Entrenamiento

In [None]:
# Fit all three regressors. The validation and prediction cells below call
# regr_2 and regr_4 as well as regr_3, so leaving them unfitted makes a
# fresh "Restart & Run All" fail with NotFittedError at the first predict().
regr_2.fit(X_train, y_train)
regr_3.fit(X_train, y_train)
regr_4.fit(X_train, y_train)

### Validacion

##### DecisionTreeRegressor

In [None]:
pred_2_train = regr_2.predict(X_train)
pred_2_test = regr_2.predict(X_test)

In [None]:
mse_2_train = mean_squared_error(y_train, pred_2_train)
mse_2_test  = mean_squared_error(y_test, pred_2_test)

In [None]:
mse_2_train, mse_2_test

(6.432420660259106e-05, 6.772408730520266e-05)

In [None]:
rmse_2_train = np.sqrt(mse_2_train)
rmse_2_test  = np.sqrt(mse_2_test)
print('RMSE:', round(rmse_2_train,4), round(rmse_2_test,4))

RMSE: 0.008 0.0082


In [None]:
score_2 = r2_score(y_test, pred_2_test)
print(f'R2 score: {score_2:.5f}')

R2 score: 0.00128


##### RandomForestRegressor(max_depth=5)

In [None]:
pred_3_train = regr_3.predict(X_train)
pred_3_test = regr_3.predict(X_test)

In [None]:
mse_3_train = mean_squared_error(y_train, pred_3_train)
mse_3_test  = mean_squared_error(y_test, pred_3_test)

In [None]:
mse_3_train, mse_3_test

In [None]:
rmse_3_train = np.sqrt(mse_3_train)
rmse_3_test  = np.sqrt(mse_3_test)
print('RMSE:', round(rmse_3_train,4), round(rmse_3_test,4))

In [None]:
score_3 = r2_score(y_test, pred_3_test)
print(f'R2 score: {score_3:.5f}')

##### ExtraTreesRegressor(max_depth=5)

In [None]:
pred_4_train = regr_4.predict(X_train)
pred_4_test = regr_4.predict(X_test)

In [None]:
mse_4_train = mean_squared_error(y_train, pred_4_train)
mse_4_test  = mean_squared_error(y_test, pred_4_test)

In [None]:
mse_4_train, mse_4_test

In [None]:
rmse_4_train = np.sqrt(mse_4_train)
rmse_4_test  = np.sqrt(mse_4_test)
print('RMSE:', round(rmse_4_train,4), round(rmse_4_test,4))

In [None]:
score_4 = r2_score(y_test, pred_4_test)
print(f'R2 score: {score_4:.5f}')

### Prediccion

In [None]:
# Postal code to predict for (here: a random one among the codes seen by the encoder)
codigo_in = random.choice(encoder.classes_) 
# Encode it and fetch its embedding row, mirroring how the training features were built
new_codigo_in = encoder.transform([codigo_in])
new_codigo_embedding = matriz_embedding[new_codigo_in]


In [None]:
# Year and month to predict for
anio_in = 2023
mes_in = 6

In [None]:
# axis=None flattens every input before joining, giving a single 1-D vector:
# the embedding values followed by the year and the month.
atributos_in = np.concatenate((new_codigo_embedding, anio_in, mes_in), axis=None)

In [None]:
# Shape the feature vector as a single-sample 2-D array for predict().
# Using -1 instead of the literal 52 keeps this cell valid when
# `tamano_embedding` is changed.
atributos_in = atributos_in.reshape(1, -1)

In [None]:
pred = regr_2.predict(atributos_in)

In [None]:
pred = mmscaler.inverse_transform(pred.reshape(-1, 1))

In [None]:
print(f"Prediccion para el código {codigo_in} Año {anio_in} Mes {mes_in}: {np.round(pred[0][0],2)} incidentes")  

### Verificacion grafica

In [None]:
filtro = df_final['codigo'] == codigo_in
df_filtrado = df_final[filtro]
df_filtrado

In [None]:
df_filtrado = df_filtrado.sort_values(['año','mes'], ascending=True)

In [None]:
# Build the plot inputs from the named columns. The former row-by-row loop
# read positions 51/52/53, which silently depend on the embedding size and
# use positional indexing on a label-indexed Series (deprecated in recent
# pandas versions).
# 'fechas' holds '<year>-<month>-01' strings; 'totales' holds the scaled targets.
fechas = (df_filtrado['año'].astype(str) + '-' + df_filtrado['mes'].astype(str) + '-01').tolist()
totales = df_filtrado['total_cp'].tolist()

In [None]:
fechas[0]

In [None]:
# Undo the target scaling. The scaler was fitted on a single-feature column,
# so the values are passed as an (n_samples, 1) array. Passing them as one
# row of n_samples "features" only worked through NumPy broadcasting.
totales = mmscaler.inverse_transform(np.asarray(totales, dtype=float).reshape(-1, 1))

In [None]:
df_graf = pd.DataFrame({'fechas':fechas, 'totales':totales.reshape(-1,)})

In [None]:
# Plot the historical totals of the selected postal code and overlay the
# model's prediction for the requested year/month as a red point.
plt.figure(figsize=(20, 5))
sns.set(color_codes=True)
plt.xticks(rotation=90)  # date labels are long strings; rotate them to avoid overlap
sns.scatterplot(x='fechas', y='totales', data=df_graf)
# Predicted value, placed at the date string built the same way as 'fechas'
plt.scatter(x=(str(anio_in)+'-'+str(mes_in)+'-01'), y=pred, color='red')
plt.plot(df_graf.fechas, df_graf.totales)
plt.title("Codigo: "+codigo_in)
plt.show()