In [14]:
# IMPORTS GLOBAIS
import pandas as pd
import numpy as np
import os
import pickle
import mlflow.pyfunc

In [15]:
# CHANGE WORKING DIRECTORY
# Later cells read relative paths (e.g. "data/train/..."), so the notebook has
# to run from the project root. The root used to be hard-coded to a single
# machine; it can now be overridden with the ML_DATATHON_ROOT environment
# variable. The original path stays as the default, so existing setups keep
# working unchanged.
PROJECT_ROOT = os.environ.get(
    "ML_DATATHON_ROOT",
    "c://Users//gufer//OneDrive//Documentos//FIAP//Fase_05//ML_Engineer_Datathon",
)
os.chdir(PROJECT_ROOT)
os.getcwd()

'c:\\Users\\gufer\\OneDrive\\Documentos\\FIAP\\Fase_05\\ML_Engineer_Datathon'

In [16]:
# IMPORTS LOCAIS
from src.recommendation_model.lgbm_ranker import LightGBMRanker

In [17]:
# LOAD TRAINING DATA
# Read the four training parquet files in one pass; the unpacking order
# matches the order of the paths below.
X_train, y_train, X_train_full, group_train = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "data/train/X_train.parquet",
        "data/train/y_train.parquet",
        "data/train/X_train_full.parquet",
        "data/train/group_train.parquet",
    )
)

In [18]:
# Display the column labels of X_train_full
X_train_full.columns

Index(['userId', 'pageId', 'userType', 'isWeekend', 'dayPeriod',
       'issuedDatetime', 'timestampHistoryDatetime', 'coldStart', 'localState',
       'localRegion', 'themeMain', 'themeSub', 'relLocalState',
       'relLocalRegion', 'relThemeMain', 'relThemeSub', 'userTypeFreq',
       'dayPeriodFreq', 'localStateFreq', 'localRegionFreq', 'themeMainFreq',
       'themeSubFreq'],
      dtype='object')

In [19]:
# INSTANTIATE AND TRAIN THE MODEL
# Pull the raw arrays out of the frames first so the train() call reads clearly.
train_matrix = X_train.values
train_labels = y_train.values.ravel()  # flattened to a 1-D array
train_group_counts = group_train["groupCount"].values

model = LightGBMRanker()
model.train(train_matrix, train_labels, train_group_counts)

In [20]:
# FEATURE COLUMNS
# Columns that go under the "client_features" key of the model input.
CLIENT_FEATURES = ['userTypeFreq', 'isWeekend', 'dayPeriodFreq']

# Columns that go under the "news_features" key of the model input.
NEWS_FEATURES = [
    'relLocalState', 'relLocalRegion', 'relThemeMain', 'relThemeSub',
    'localStateFreq', 'localRegionFreq', 'themeMainFreq', 'themeSubFreq',
]

# Split the training frame into the two column groups.
client_features, news_features = X_train[CLIENT_FEATURES], X_train[NEWS_FEATURES]

# Show both column indexes to confirm the split.
client_features.columns, news_features.columns

(Index(['userTypeFreq', 'isWeekend', 'dayPeriodFreq'], dtype='object'),
 Index(['relLocalState', 'relLocalRegion', 'relThemeMain', 'relThemeSub',
        'localStateFreq', 'localRegionFreq', 'themeMainFreq', 'themeSubFreq'],
       dtype='object'))

In [21]:
# BUILD THE INPUT DICTIONARY
# The two feature groups are passed to the model under fixed keys.
input_data = dict(
    client_features=client_features,
    news_features=news_features,
)

In [22]:
# LOCAL MODEL
# Score the dictionary input with the model object currently bound to `model`
# and print the resulting predictions.
predicoes = model.predict(input_data)
print("Predições:", predicoes)


Predições: [-1.88824921 -2.49841121 -2.4491163  ... -1.29674705 -1.9811297
 -1.82170664]


In [23]:
# FUNÇÃO PARA COMPATIBILIZAR PREDICT
def flatten_input(input_data: dict, expected_columns=None) -> pd.DataFrame:
    """
    Merge the client and news feature tables into a single DataFrame.

    Parameters
    ----------
    input_data : dict
        Mapping with the keys 'client_features' and 'news_features'. Each
        value is a DataFrame, or any object ``pd.DataFrame(...)`` accepts.
    expected_columns : sequence of str, optional
        Columns (and their order) of the returned frame. When omitted, the
        11-column order below is used, which is the order this function has
        always applied for the model signature.

    Returns
    -------
    pd.DataFrame
        The two tables placed side by side, restricted to and ordered by
        ``expected_columns``.

    Raises
    ------
    ValueError
        If 'client_features' or 'news_features' is missing or None.
    KeyError
        If any expected column is absent from the combined frame; the
        message lists every missing column.
    """
    client_df = input_data.get("client_features")
    news_df = input_data.get("news_features")

    if client_df is None or news_df is None:
        raise ValueError("O input deve conter 'client_features' e 'news_features'.")

    # Accept dicts / lists of records by converting them to DataFrames.
    if not isinstance(client_df, pd.DataFrame):
        client_df = pd.DataFrame(client_df)
    if not isinstance(news_df, pd.DataFrame):
        news_df = pd.DataFrame(news_df)

    if expected_columns is None:
        expected_columns = [
            'isWeekend',
            'relLocalState', 'relLocalRegion', 'relThemeMain', 'relThemeSub',
            'userTypeFreq', 'dayPeriodFreq',
            'localStateFreq', 'localRegionFreq', 'themeMainFreq', 'themeSubFreq',
        ]
    else:
        # Materialize so generators / tuples / Index objects all work below.
        expected_columns = list(expected_columns)

    # Side-by-side concatenation. pd.concat aligns rows on index labels, so
    # both frames are expected to share the same index; labels present in
    # only one frame yield NaN-filled cells for the other frame's columns.
    df = pd.concat([client_df, news_df], axis=1)

    # Fail with one explicit message naming every missing column.
    missing = [col for col in expected_columns if col not in df.columns]
    if missing:
        raise KeyError(f"Colunas ausentes no input: {missing}")

    return df[expected_columns]

In [24]:
# MLFLOW MODEL

# Point the MLflow client at the tracking server
mlflow.set_tracking_uri("http://localhost:5001")

# Load the model from MLflow
# NOTE(review): this rebinds `model`, which an earlier cell bound to a
# LightGBMRanker instance. Re-running the local prediction cell after this one
# would call the MLflow-loaded model with the dict input instead.
model = mlflow.pyfunc.load_model("models:/news-recommender-dev@champion")

# "Flatten" the input into a single DataFrame
flattened_input = flatten_input(input_data)

# Run the prediction with the loaded model's predict method
# (currently disabled: `flattened_input` is built but not used in this cell)
# predicoes = model.predict(flattened_input)
# print("Predições:", predicoes)


Downloading artifacts: 100%|██████████| 7/7 [00:02<00:00,  3.22it/s]


In [29]:
# Load the news feature table and print its summary (columns, non-null counts, dtypes)
news_feats = pd.read_parquet("data/features/news_feats.parquet")
news_feats.info()

<class 'pandas.core.frame.DataFrame'>
Index: 80208 entries, 0 to 255601
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   pageId        80208 non-null  object
 1   url           80208 non-null  object
 2   title         80208 non-null  object
 3   issuedDate    80208 non-null  object
 4   issuedTime    80208 non-null  object
 5   modifiedDate  80208 non-null  object
 6   modifiedTime  80208 non-null  object
 7   localState    59697 non-null  object
 8   localRegion   59697 non-null  object
 9   themeMain     25358 non-null  object
 10  themeSub      9100 non-null   object
dtypes: object(11)
memory usage: 7.3+ MB


In [None]:
# Keep only the identifier and display columns of each news item.
# Passing `columns=` lets the parquet reader load just these three columns
# instead of reading the whole table and discarding the rest afterwards.
news_metadata = pd.read_parquet(
    "data/features/news_feats.parquet",
    columns=["pageId", "url", "title"],
)
news_metadata

Unnamed: 0,pageId,url,title
0,13db0ab1-eea2-4603-84c4-f40a876c7400,http://g1.globo.com/am/amazonas/noticia/2022/0...,Caso Bruno e Dom: 3º suspeito tem prisão tempo...
1,92907b73-5cd3-4184-8d8c-e206aed2bf1c,http://g1.globo.com/pa/santarem-regiao/noticia...,Linguajar dos santarenos é diferenciado e chei...
2,61e07f64-cddf-46f2-b50c-ea0a39c22050,http://g1.globo.com/mundo/noticia/2022/07/08/e...,Ex-premiê Shinzo Abe morre após ser baleado no...
5,a9fd6d34-6f40-4c90-849b-2ad36f04fd6f,http://g1.globo.com/politica/noticia/2021/08/2...,O que é o marco temporal sobre terras indígena...
7,682da2fa-6f5b-4017-be35-7968990f62b9,http://g1.globo.com/pop-arte/musica/noticia/20...,Como Cornershop criou hit 'complicado' com mis...
...,...,...,...
255594,fbf456df-ce43-4fed-80ff-4a07c301c038,http://g1.globo.com/bemestar/coronavirus/notic...,Segunda onda de Covid? As cidades e regiões do...
255596,9999021c-bf95-46dd-941e-ffeb9e7e63aa,http://g1.globo.com/tecnologia/noticia/2022/06...,Mercado de tecnologia oferece vagas para quem ...
255599,d21c1bfc-6a90-4e2d-8c4c-ff1daee1b4f2,http://g1.globo.com/especial-publicitario/vae/...,"Em 10 passos, saiba o que você precisa fazer p..."
255600,abc5ecd9-81e1-40cf-b706-0b5fa7bea3be,http://g1.globo.com/sp/campinas-regiao/terra-d...,Ratão-do-banhado é roedor adaptado para viver ...
