# Notebook principal para modelado

```{mermaid}
graph LR
A[Notebook A] -->|1-Ejecuta limpieza | B[Notebook B]
B<--> |2-Lectura de datos| C[(Storage)]
B--> |3-Limpieza de datos|B
B--> |4-Guarda datos limpios|C
A --> |5-Ejecuta preprocesamiento|D[Notebook C]
D<--> |6-Lectura datos| C[(Storage)]
D-->|7-Preprocesamiento|D
D--> |8-Guarda datos preprocesados|C
A-->|9-Ejecuta modelado|E[[Ajuste de modelo]]
E--> |10-Guarda modelo|C
```

In [25]:
import os
import pandas as pd
import papermill as pm
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Params

In [10]:
# Cleaning stage: the raw CSV it reads and the parquet file it writes.
input_path_data_cleaning, output_path_data_cleaning = (
    'storage/01_titanic_dataset.csv',
    'storage/02_data_cleaned.parquet',
)

# Preprocessing stage: its input is chained to the cleaning stage's output,
# so changing the cleaned-data path above updates both stages.
input_path_data_preprocessing, output_path_data_preprocessing = (
    output_path_data_cleaning,
    'storage/03_data_preprocessed.parquet',
)

## Ejecución de limpieza de datos

In [11]:
# Values injected into Notebook_B's parameters by papermill.
cleaning_parameters = {
    'INPUT_DATA': input_path_data_cleaning,
    'OUTPUT_DATA': output_path_data_cleaning
}
# Path where papermill writes the executed copy of the notebook.
cleaning_executed_copy = 'Notebook_B_out.ipynb'

pm.execute_notebook(
    input_path='Notebook_B.ipynb',
    output_path=cleaning_executed_copy,
    parameters=cleaning_parameters,  # parameters passed to the input notebook
)

# The executed copy is not kept; delete it once the run has finished.
os.remove(cleaning_executed_copy)

Black is not installed, parameters wont be formatted
Executing: 100%|██████████| 12/12 [00:01<00:00,  6.01cell/s]


## Ejecución de preprocesamiento

In [21]:
# Values injected into Notebook_C's parameters by papermill.
preprocessing_parameters = {
    'INPUT_DATA': input_path_data_preprocessing,
    'OUTPUT_DATA': output_path_data_preprocessing
}
# Path where papermill writes the executed copy of the notebook.
preprocessing_executed_copy = 'Notebook_C_out.ipynb'

pm.execute_notebook(
    input_path='Notebook_C.ipynb',
    output_path=preprocessing_executed_copy,
    parameters=preprocessing_parameters,  # parameters passed to the input notebook
)

# The executed copy is not kept; delete it once the run has finished.
os.remove(preprocessing_executed_copy)

Black is not installed, parameters wont be formatted
Executing: 100%|██████████| 16/16 [00:02<00:00,  7.96cell/s]


## Lectura de datos resultantes de los pasos anteriores

In [22]:
# Load the parquet file written to the preprocessing output path and
# preview its first five rows as the cell output.
df = pd.read_parquet(path=output_path_data_preprocessing)
df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,adult_male,alone,embarked_C,embarked_Q,embarked_S,family_size,is_child,age_group_0-18,age_group_19-30,age_group_31-50,age_group_51+
1,1,1,1,38.0,1,0,71.2833,False,False,1,0,0,2,False,0,0,1,0
3,1,1,1,35.0,1,0,53.1,False,False,0,0,1,2,False,0,0,1,0
6,0,1,0,54.0,0,0,51.8625,True,True,0,0,1,1,False,0,0,0,1
10,1,3,1,4.0,1,1,16.7,False,False,0,0,1,3,True,1,0,0,0
11,1,1,1,58.0,0,0,26.55,False,True,0,0,1,1,False,0,0,0,1


## Modelado

In [23]:
# Name of the target column to predict.
CLASE = "survived"

# Features are every column except the target; the target is kept as its own series.
X = df.drop(columns=[CLASE])
y = df[CLASE]

# Shuffled 70/30 train/test split, stratified on the target, with a fixed seed
# so the same partition is produced on every run.
split_options = dict(test_size=0.3, shuffle=True, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [24]:
# Random forest with default hyperparameters; the seed is fixed for repeatable fits.
# NOTE(review): the held-out X_test / y_test split is not evaluated in the
# visible cells — confirm whether a scoring step is intended before saving.
clf = RandomForestClassifier(random_state=42)
clf.fit(X=X_train, y=y_train)

## Guardado del modelo

In [26]:
# Persist the fitted classifier; the call's return value is shown as the cell output.
joblib.dump(value=clf, filename='storage/model.joblib')

['storage/model.joblib']