In [4]:

import os
import sys
import pandas as pd
import mlflow

# Adiciona o diretório src ao PYTHONPATH
project_dir = os.path.abspath(os.path.join(os.getcwd(), '../../../'))
if project_dir not in sys.path:
    sys.path.append(project_dir)

from src.config import (
    get_config,
    get_data_path,
    configure_mlflow,
    CONFIG,
    logger,
    USE_S3,
    DATA_PATH,
    SAMPLE_RATE,
    COLD_START_THRESHOLD
)
from src.features.constants import USERS_COLS_TO_EXPLODE, USERS_DTYPES, NEWS_COLS_TO_DROP
from storage.io import Storage
from src.features.utils import concatenate_csv_files

from src.train.utils import prepare_features, load_train_data
from src.train.core import (
    log_model_to_mlflow,
    log_encoder_mapping,
    log_basic_metrics,
    get_run_name,
)
from src.train.pipeline import (
    load_features,
    prepare_and_save_train_data,
    validate_and_load_train_data,
    train_and_log_model
)



In [3]:
configure_mlflow()
storage = Storage(use_s3=USE_S3)

2025-03-01 18:39:54,725 - config - INFO - factory.py - Inicializando armazenamento local


In [5]:
final_feats = load_features(storage)


2025-03-01 18:41:13,098 - src.config - INFO - pipeline.py - Carregando features de /Users/mauricioaraujo/Eng_ML/FIAP/Fase_5-MLOps/ML_Engineer_Datathon/data/features/final_feats_with_target.parquet...
2025-03-01 18:41:14,447 - src.config - INFO - pipeline.py - Shape: (802824, 17)


In [6]:

# trusted = prepare_and_save_train_data(storage, final_feats)

trusted = prepare_features(final_feats)
# train_dir = os.path.join(DATA_PATH, "train")
# for key, data in trusted.items():
#     if key == "encoder_mapping":
#         continue
#     if not isinstance(data, pd.DataFrame):
#         data = pd.DataFrame(data)
#     file_path = os.path.join(train_dir, f"{key}.parquet")
#     logger.info("Salvando '%s' em %s, shape: %s", key, file_path, data.shape)
#     storage.write_parquet(data, file_path)




2025-03-01 18:42:21,822 - src.config - INFO - utils.py - Separando features do target...
2025-03-01 18:42:22,058 - src.config - INFO - utils.py - Removido 64467 registros cold start
2025-03-01 18:42:22,138 - src.config - INFO - utils.py - Proporção de registros cold_start: 8.03 %
2025-03-01 18:42:22,139 - src.config - INFO - utils.py - Dividindo dados em treino e teste...
2025-03-01 18:42:22,286 - src.config - INFO - utils.py - Aplicando Frequency Encoding nas variáveis categóricas...
2025-03-01 18:42:22,422 - src.config - INFO - utils.py - Removendo identificadores...


In [9]:
X_train, y_train, group_train = validate_and_load_train_data(storage)


2025-03-01 18:43:03,117 - src.config - INFO - pipeline.py - Validando dados de treino...
2025-03-01 18:43:03,119 - src.config - INFO - utils.py - Carregando X_train de /Users/mauricioaraujo/Eng_ML/FIAP/Fase_5-MLOps/ML_Engineer_Datathon/data/train/X_train.parquet...
2025-03-01 18:43:03,233 - src.config - INFO - utils.py - Carregando y_train de /Users/mauricioaraujo/Eng_ML/FIAP/Fase_5-MLOps/ML_Engineer_Datathon/data/train/y_train.parquet...
2025-03-01 18:43:03,238 - src.config - INFO - pipeline.py - X_train: (516849, 11), y_train: (516849, 1)
2025-03-01 18:43:03,266 - src.config - INFO - pipeline.py - Dados: X_train (516849, 11), y_train (516849, 1), group_train (17047, 2)


In [10]:
X_train.head()

Unnamed: 0,isWeekend,relLocalState,relLocalRegion,relThemeMain,relThemeSub,userTypeFreq,dayPeriodFreq,localStateFreq,localRegionFreq,themeMainFreq,themeSubFreq
0,False,0.044944,0.044944,,,0.530528,0.322764,0.042995,0.042995,,
1,False,0.933333,0.933333,,,0.530528,0.139795,0.009825,0.008729,,
2,False,0.014085,0.014085,,,0.469472,0.395706,0.054219,0.054219,,
3,False,,,0.16,,0.530528,0.141736,,,0.116119,
4,False,0.136364,0.136364,,,0.530528,0.139795,0.235105,0.221997,,


In [16]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 516849 entries, 0 to 516848
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   isWeekend        516849 non-null  bool   
 1   relLocalState    328338 non-null  float64
 2   relLocalRegion   328338 non-null  float64
 3   relThemeMain     189245 non-null  float64
 4   relThemeSub      51377 non-null   float64
 5   userTypeFreq     516849 non-null  float64
 6   dayPeriodFreq    516849 non-null  float64
 7   localStateFreq   328338 non-null  float64
 8   localRegionFreq  328338 non-null  float64
 9   themeMainFreq    189245 non-null  float64
 10  themeSubFreq     51377 non-null   float64
dtypes: bool(1), float64(10)
memory usage: 39.9 MB


In [15]:
X_train.describe()

Unnamed: 0,relLocalState,relLocalRegion,relThemeMain,relThemeSub,userTypeFreq,dayPeriodFreq,localStateFreq,localRegionFreq,themeMainFreq,themeSubFreq
count,328338.0,328338.0,189245.0,51377.0,516849.0,516849.0,328338.0,328338.0,189245.0,51377.0
mean,0.215626,0.168085,0.110239,0.084915,0.501864,0.300391,0.142669,0.080966,0.116576,0.128502
std,0.224882,0.210149,0.127348,0.141364,0.030471,0.104545,0.107214,0.081872,0.081895,0.09878
min,0.000524,0.000279,0.000279,0.000279,0.469472,0.139795,0.003167,3e-06,5e-06,1.9e-05
25%,0.052632,0.030435,0.033981,0.016667,0.469472,0.141736,0.039965,0.016711,0.057518,0.041127
50%,0.148973,0.084337,0.073826,0.038043,0.530528,0.322764,0.06493,0.042995,0.116119,0.09261
75%,0.267442,0.205128,0.138158,0.085526,0.530528,0.395706,0.261718,0.11853,0.149246,0.210853
max,1.0,1.0,1.0,1.0,0.530528,0.395706,0.261718,0.221997,0.239753,0.248964


In [17]:
y_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 516849 entries, 0 to 516848
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   TARGET  516849 non-null  int64
dtypes: int64(1)
memory usage: 3.9 MB


In [18]:
group_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17047 entries, 0 to 17046
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   userId      17047 non-null  object
 1   groupCount  17047 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 266.5+ KB


In [13]:
group_train

Unnamed: 0,userId,groupCount
0,0003f534fe8124d98eef78b01baa3b69e3ffcda6e5a657...,5
1,00061025222fb1d2f9861cae9d91aa71aaaae57c460b87...,5
2,000a2a60d1b7655d40beb4042e0610cc8cd4c2e8a51300...,154
3,000daf04a745c346dc2c43d55a00a1d0361826c5a3d939...,13
4,0010cbb9e4fd6b426ac740d39063c6822e854f3d45893e...,249
...,...,...
17042,ffe1086ec2b886715129f72b78a2123cda63acdcf87b48...,7
17043,ffe133162533bd67689c667be6c302b7342f8a682d28d7...,213
17044,ffe9c4900999b26fde3c3f8e4ac440d83aac150280e519...,5
17045,ffea39849846ad10de9ec9bf0dd8475d2e22b03fe41b2f...,29


In [19]:
trusted.keys()

dict_keys(['X_train_full', 'X_train', 'X_test_full', 'X_test', 'y_train', 'y_test', 'encoder_mapping', 'group_train', 'group_test'])

In [14]:
trusted

{'X_train_full':                                                    userId  \
 0       ed93a78d03476cd479f22ec4c9f119f76edcdc3842d014...   
 1       40b221679af85bda14fec9e8706a7ee27b2e46be3e4a6c...   
 2       6241ed429e1c6e0046106a23a5304c06512ef71ea707d0...   
 3       c5717e991473faa1c8046c86bf70b6f1abfab977db1931...   
 4       9db411578e57e00ba8763a82f3282f8b37c0f40ece0ff4...   
 ...                                                   ...   
 516844  5a530638c80d13f4925067df6af58d5c780d70117a37cb...   
 516845  7f3c6b1250b0fcf1db9c74ea0fbecfb322de935999e55c...   
 516846  2da138542452090bfff95a5264d571ec215e8b257b702d...   
 516847  e9c048b887823974af6416dde3de201598a75b95219ab3...   
 516848  2a7cd7c8956e888c10bf3988b929ec568d289a61f58d12...   
 
                                       pageId    userType  isWeekend  \
 0       6acdd9ff-e022-451b-8732-7493e9d41112  Non-Logged      False   
 1       4ab20151-8a01-49b8-943c-6530b6c9bad0  Non-Logged      False   
 2       e4a32f87-4d4a

In [None]:


# train_and_log_model(X_train, y_train, group_train, trusted)