In [28]:
import sys

# Adiciona o diretório src ao PYTHONPATH
project_dir = os.path.abspath(os.path.join(os.getcwd(), '../../../'))
if project_dir not in sys.path:
    sys.path.append(project_dir)


import os
import json
import numpy as np
import pandas as pd
import mlflow
import re
from typing import List


from src.config import logger, DATA_PATH, USE_S3, get_config
from storage.io import Storage
from src.train.core import load_model_from_mlflow
from src.data.data_loader import get_predicted_news


In [3]:
storage = Storage()

2025-03-02 10:37:53,446 - config - INFO - factory.py - Inicializando S3 no bucket 'fiap-mleng-datathon-data-grupo57'
2025-03-02 10:37:54,259 - config - INFO - s3.py - S3 válido para bucket 'fiap-mleng-datathon-data-grupo57'


In [34]:
validation_file = os.path.join(DATA_PATH, "challenge-webmedia-e-globo-2023/val_data/validacao.csv")
validation_data = storage.read_csv(validation_file)

validation_data.info()
validation_data.head(3)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112184 entries, 0 to 112183
Data columns (total 4 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   userId            112184 non-null  object
 1   userType          112184 non-null  object
 2   history           112184 non-null  object
 3   timestampHistory  112184 non-null  object
dtypes: object(4)
memory usage: 3.4+ MB


Unnamed: 0,userId,userType,history,timestampHistory
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,['be89a7da-d9fa-49d4-9fdc-388c27a15bc8'\n '01c...,[1660533136590 1660672113513]
1,d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...,Logged,['77901133-aee7-4f7b-afc0-652231d76fe9'],[1660556860253]
2,755062dd39a48809880cf363b04268c3af2c003088cde0...,Logged,['857aa90f-a7ec-410d-ba82-dfa4f85d4e71'],[1660561649242]


In [37]:
def explode_history(df: pd.DataFrame) -> pd.DataFrame:
    """
    Converte a coluna 'history' (que contém strings com hashes de pageIds)
    em múltiplas linhas, extraindo os valores entre aspas simples.
    
    Exemplo de entrada na coluna 'history':
        "'be89a7da-d9fa-49d4-9fdc-388c27a15bc8'\n '01c59ff6-fb82-4258-918f-2910cb2d4c52'"
    
    A função irá retornar um DataFrame com uma coluna 'pageId' contendo cada hash.
    
    Args:
        df (pd.DataFrame): DataFrame que contém a coluna 'history'.
    
    Returns:
        pd.DataFrame: DataFrame com a coluna 'history' explodida e renomeada para 'pageId'.
    """
    def parse_history_str(s: str) -> list:
        # Remove colchetes, se existirem, e quebras de linha
        s = s.strip().replace("[", "").replace("]", "")
        # Utiliza regex para extrair os valores entre aspas simples
        tokens = re.findall(r"'([^']+)'", s)
        return tokens

    # Aplica a função de parsing somente se o valor for string
    df["history"] = df["history"].apply(lambda x: parse_history_str(x) if isinstance(x, str) else x)
    # Explode a coluna para ter 1 hash por linha
    df_exploded = df.explode("history").reset_index(drop=True)
    # Renomeia a coluna para 'pageId'
    df_exploded.rename(columns={"history": "pageId"}, inplace=True)
    return df_exploded



validation_exploded = explode_history(validation_data)
# logger.info(f"Explodido dataset de validação. Shape: {validation_exploded.shape}")
validation_exploded.head(5)


Unnamed: 0,userId,userType,pageId,timestampHistory
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,be89a7da-d9fa-49d4-9fdc-388c27a15bc8,[1660533136590 1660672113513]
1,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,01c59ff6-fb82-4258-918f-2910cb2d4c52,[1660533136590 1660672113513]
2,d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...,Logged,77901133-aee7-4f7b-afc0-652231d76fe9,[1660556860253]
3,755062dd39a48809880cf363b04268c3af2c003088cde0...,Logged,857aa90f-a7ec-410d-ba82-dfa4f85d4e71,[1660561649242]
4,ec1639851d99586c7f4da928deb49187303aec6e3b8d66...,Logged,b7b90e18-7613-4ca0-a8fc-fd69addfcd85,[1660533830245 1660540831707 1660542659111 166...


In [36]:
validation_data["history"].str.replace("[", "").str.replace("]", "").iloc[0]

"'be89a7da-d9fa-49d4-9fdc-388c27a15bc8'\n '01c59ff6-fb82-4258-918f-2910cb2d4c52'"

In [40]:
validation_data["userId"].iloc[0]

'e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4b901419051126488b9'

In [41]:
validation_exploded["userId"].iloc[0]

'e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4b901419051126488b9'

In [42]:
validation_data.query("userId == 'e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4b901419051126488b9'").head(5)

Unnamed: 0,userId,userType,history,timestampHistory
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,"[be89a7da-d9fa-49d4-9fdc-388c27a15bc8, 01c59ff...",[1660533136590 1660672113513]


In [43]:
validation_exploded.query("userId == 'e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4b901419051126488b9'").head(5)

Unnamed: 0,userId,userType,pageId,timestampHistory
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,be89a7da-d9fa-49d4-9fdc-388c27a15bc8,[1660533136590 1660672113513]
1,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,01c59ff6-fb82-4258-918f-2910cb2d4c52,[1660533136590 1660672113513]


In [14]:
# Notebook cell
from config import configure_mlflow


configure_mlflow()

model_name = get_config("MODEL_NAME", "news-recommender-dev")  # ou "news-recommender-prod"
model_alias = "champion"  # ou "staging" etc.
model = load_model_from_mlflow(model_name, model_alias=model_alias)

if model is None:
    raise RuntimeError("Não foi possível carregar o modelo do MLflow!")


2025-03-02 11:00:02,327 - src.config - INFO - core.py - 🔄 [Core] Carregando modelo do MLflow: models:/news-recommender-prod@champion
  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00,  7.24it/s]
2025-03-02 11:00:05,360 - src.config - INFO - core.py - ✅ [Core] Modelo carregado com sucesso!
