In [1]:

import os
import sys
import pandas as pd
import mlflow

# Make the project root (three directories above the current working
# directory) importable, so the `src.*` imports that follow can resolve.
project_dir = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir, os.pardir)
)
# Guarded so that re-running this cell does not add the same entry twice.
if project_dir not in sys.path:
    sys.path.append(project_dir)

from src.config import (
    get_config,
    get_data_path,
    configure_mlflow,
    CONFIG,
    logger,
    USE_S3,
    DATA_PATH,
    SAMPLE_RATE,
    COLD_START_THRESHOLD
)
from src.features.constants import USERS_COLS_TO_EXPLODE, USERS_DTYPES, NEWS_COLS_TO_DROP
from storage.io import Storage
from src.features.utils import concatenate_csv_files

from src.data.data_loader import get_client_features, get_predicted_news
from src.train.core import load_model_from_mlflow




2025-03-01 19:18:02,984 - src.config - INFO - config.py - Ambiente: dev
2025-03-01 19:18:02,991 - config - INFO - config.py - Ambiente: dev


In [2]:
# Run the project's MLflow configuration helper first, so it is in effect
# before the model is loaded at the end of this cell.
configure_mlflow()
# Location of the prediction feature table: <DATA_PATH>/predict/X_train_full.parquet.
pred_path = os.path.join(DATA_PATH, "predict", "X_train_full.parquet")
# Storage backend is chosen by the USE_S3 config flag.
storage = Storage(use_s3=USE_S3)
# Full feature table; later cells filter it per user.
full_df = storage.read_parquet(pred_path)
# Model is fetched through the project helper (which model/alias is resolved
# is decided inside src.train.core, not here).
model = load_model_from_mlflow()


2025-03-01 19:18:03,118 - config - INFO - factory.py - Inicializando armazenamento local
2025-03-01 19:18:03,292 - src.config - INFO - core.py - Carregando modelo models:/news-recommender-dev@champion do MLflow
  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 815.58it/s]  


In [3]:
# Preview the first rows to sanity-check the loaded columns.
full_df.head()

Unnamed: 0,userId,pageId,userType,isWeekend,dayPeriod,issuedDatetime,timestampHistoryDatetime,coldStart,localState,localRegion,...,relLocalState,relLocalRegion,relThemeMain,relThemeSub,userTypeFreq,dayPeriodFreq,localStateFreq,localRegionFreq,themeMainFreq,themeSubFreq
0,ed93a78d03476cd479f22ec4c9f119f76edcdc3842d014...,6acdd9ff-e022-451b-8732-7493e9d41112,Non-Logged,False,night,2022-07-28 18:32:30,2022-07-12 20:52:09,False,sc,santa-catarina,...,0.044944,0.044944,,,0.530528,0.322764,0.042995,0.042995,,
1,40b221679af85bda14fec9e8706a7ee27b2e46be3e4a6c...,4ab20151-8a01-49b8-943c-6530b6c9bad0,Non-Logged,False,dawn,2022-02-18 02:54:17,2022-07-28 00:31:56,False,pa,para,...,0.933333,0.933333,,,0.530528,0.139795,0.009825,0.008729,,
2,6241ed429e1c6e0046106a23a5304c06512ef71ea707d0...,e4a32f87-4d4a-42dd-b1d0-b4d0564b03bf,Logged,False,afternoon,2022-07-07 12:28:32,2022-08-05 13:14:13,False,go,goias,...,0.014085,0.014085,,,0.469472,0.395706,0.054219,0.054219,,
3,c5717e991473faa1c8046c86bf70b6f1abfab977db1931...,d44cb6e8-1e0c-471a-b843-993a590b3ffc,Non-Logged,False,morning,2022-04-26 20:08:46,2022-07-05 11:35:17,False,,,...,,,0.16,,0.530528,0.141736,,,0.116119,
4,9db411578e57e00ba8763a82f3282f8b37c0f40ece0ff4...,bd4e7054-4043-4acf-9a49-7d883152189d,Non-Logged,False,dawn,2022-07-13 00:46:16,2022-08-03 01:51:42,False,rj,rio-de-janeiro,...,0.136364,0.136364,,,0.530528,0.139795,0.235105,0.221997,,


In [4]:
# Manual, step-by-step walk-through for a single user. The one-call form
# `predict_for_userId(userId, full_df, model)` is intentionally not used here
# (that helper is not imported in this notebook).
userId = "ed93a78d03476cd479f22ec4c9f119f76edcdc3842d0147564b8901a4044b54f"

# Distinct pages that appear in this user's rows of full_df.
user_rows = full_df["userId"].eq(userId)
seen = full_df.loc[user_rows, "pageId"].unique()

# Candidate rows: every row of full_df whose pageId is not one of those pages.
# Copied so later cells can modify it without touching full_df.
already_seen = full_df["pageId"].isin(seen)
non_viewed = full_df.loc[~already_seen].copy()

In [7]:

# Pull this user's feature record out of full_df via the project helper.
# The displayed result is a single labelled record (one value per column).
client_feat = get_client_features(userId, full_df)
client_feat


userId                      ed93a78d03476cd479f22ec4c9f119f76edcdc3842d014...
pageId                                   6acdd9ff-e022-451b-8732-7493e9d41112
userType                                                           Non-Logged
isWeekend                                                               False
dayPeriod                                                               night
issuedDatetime                                            2022-07-28 18:32:30
timestampHistoryDatetime                                  2022-07-12 20:52:09
coldStart                                                               False
localState                                                                 sc
localRegion                                                    santa-catarina
themeMain                                                                None
themeSub                                                                 None
relLocalState                                                   

In [None]:

# Wrap the user's feature record in a one-row frame.
client_df = pd.DataFrame([client_feat])

# Pair every candidate row with the user's features (cartesian product).
# The candidates are stamped with the target userId, and userId is dropped
# from the user side so that column is not duplicated.
# NOTE(review): client_feat as displayed earlier appears to carry the same
# column names as non_viewed (pageId, userType, ...); pandas will suffix such
# overlapping columns with _x/_y in the merged frame -- confirm this is what
# the model expects.
client_cols = client_df.drop(columns=["userId"])
candidates = non_viewed.assign(userId=userId)
model_input = candidates.merge(client_cols, how="cross")


In [None]:


# Recommendation parameters. `n` and `score_threshold` were referenced by this
# cell but never defined anywhere in the notebook, so it raised NameError on a
# fresh kernel. They are now set explicitly here.
# NOTE(review): the values below are placeholders -- confirm against the
# defaults/semantics of get_predicted_news before relying on the output.
n = 5                  # presumably the number of news items to return
score_threshold = 0.0  # presumably the minimum score for a recommendation

# Score every candidate row, then let the project helper turn the scores into
# the final list of recommended news for this user.
scores = model.predict(model_input)
pred_news = get_predicted_news(
    scores,
    non_viewed,
    n=n,
    score_threshold=score_threshold,
)
pred_news