In [1]:

import os
import sys
import pandas as pd
import mlflow

# Put the directory three levels above the current working directory on
# sys.path (only once), so the `src.*` imports that follow can be resolved.
project_dir = os.path.abspath(os.path.join(os.getcwd(), '../../../'))
if sys.path.count(project_dir) == 0:
    sys.path.append(project_dir)

from src.config import get_config, get_data_path, CONFIG, logger, USE_S3, DATA_PATH, SAMPLE_RATE, COLD_START_THRESHOLD, USERS_DIRECTORY

from src.config import SAMPLE_RATE, COLD_START_THRESHOLD, USERS_DIRECTORY, NEWS_DIRECTORY
from src.features.pp_mix import preprocess_mix_feats, generate_suggested_feats
from src.features.pp_news import preprocess_news
from src.features.pp_target import preprocess_target
from src.features.pp_users import preprocess_users
from src.features.constants import USERS_COLS_TO_EXPLODE, USERS_DTYPES, NEWS_COLS_TO_DROP
from storage.io import Storage
from src.features.utils import concatenate_csv_files


from src.features.pipeline import (
    _preprocess_and_save_users,
    _preprocess_and_save_news,
    _preprocess_and_save_mix_feats,
    _assemble_and_save_suggested_feats,
    _preprocess_and_save_target,
    _assemble_and_save_final_feats,
)

2025-03-01 18:24:25,968 - src.config - INFO - config.py - Ambiente: dev
2025-03-01 18:24:26,593 - config - INFO - config.py - Ambiente: dev


In [2]:
# Storage handle shared by the pipeline steps below. USE_S3 (from src.config)
# is forwarded as-is; in this run the log reports that local storage was used.
storage = Storage(use_s3=USE_S3)

2025-03-01 18:24:27,898 - config - INFO - factory.py - Inicializando armazenamento local


# Users

In [3]:
from src.features.pp_users import (
    _process_history_columns,
    _process_timestamp,
    _extract_time_features
)

In [4]:
# users_df = _preprocess_and_save_users(DATA_PATH, storage)


In [5]:
# Load every CSV found under USERS_DIRECTORY into a single DataFrame
# (the log output lists the individual files and their row counts).
df = concatenate_csv_files(USERS_DIRECTORY)

2025-03-01 18:24:29,593 - config - INFO - factory.py - Inicializando armazenamento local
2025-03-01 18:24:29,595 - src.config - INFO - utils.py - Encontrados 6 CSVs em /Users/mauricioaraujo/Eng_ML/FIAP/Fase_5-MLOps/ML_Engineer_Datathon/data/challenge-webmedia-e-globo-2023/files/treino
2025-03-01 18:24:30,609 - src.config - INFO - utils.py - Arquivo: treino_parte3.csv, linhas: 100000, cols: 10
2025-03-01 18:24:31,598 - src.config - INFO - utils.py - Arquivo: treino_parte2.csv, linhas: 100000, cols: 10
2025-03-01 18:24:32,737 - src.config - INFO - utils.py - Arquivo: treino_parte1.csv, linhas: 100000, cols: 10
2025-03-01 18:24:33,761 - src.config - INFO - utils.py - Arquivo: treino_parte5.csv, linhas: 100000, cols: 10
2025-03-01 18:24:34,802 - src.config - INFO - utils.py - Arquivo: treino_parte4.csv, linhas: 100000, cols: 10
2025-03-01 18:24:35,790 - src.config - INFO - utils.py - Arquivo: treino_parte6.csv, linhas: 77942, cols: 10
2025-03-01 18:24:36,064 - src.config - INFO - utils.py 

In [6]:
# Preview the first rows of the users frame exactly as loaded from the CSVs.
df.head()

Unnamed: 0,userId,userType,historySize,history,timestampHistory,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory,timestampHistory_new
0,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,Non-Logged,2,"80aa7bb2-adce-4a55-9711-912c407927a1, d9e5f15d...","1657908085200, 1659634203762","0, 0","71998, 115232","81.58, 73.36","1, 1","1657908085200, 1659634203762"
1,17f1083e6079b0f28f7820a6803583d1c1b405c0718b11...,Non-Logged,2,"19ba89fc-1e06-4c5d-9c57-4a3088dc0511, e273dba4...","1657111508570, 1657481309920","68, 12","131495, 43733","51.74, 35.49","1, 1","1657111508570, 1657481309920"
2,528a8d7a2af73101da8d6709c1ec875b449a5a58749a99...,Non-Logged,2,"59a61a8a-cc52-453f-b1cd-2bd019e9d574, a0562805...","1657823890328, 1660141444328","55, 9","159042, 10336","62.19, 48.28","1, 1","1657823890328, 1660141444328"
3,2dd18b58a634a4e77181a202cf152df6169dfb3e4230ef...,Non-Logged,2,"233f8238-2ce0-470f-a9d5-0e0ac530382a, 037155f4...","1656963373076, 1657091888917","0, 0","193579, 20519","31.03, 31.9","1, 1","1656963373076, 1657091888917"
4,97e1439d485b0630e12818d3df84ff67d08475ef6ebeb0...,Logged,2,"385044ad-3876-4188-83fa-f560435c1a9c, 2f754502...","1657618607633, 1659536839832","57, 38","220000, 130000","52.65, 53.37","1, 1","1657618607633, 1659536839832"


In [7]:

# Manual, step-by-step run of the users preprocessing on a reproducible
# sample of the raw frame (fixed random_state).
df = df.sample(frac=SAMPLE_RATE, random_state=42)
df = _process_history_columns(df)
df = df.astype(USERS_DTYPES)
df = _extract_time_features(_process_timestamp(df))

# Flag users whose history is shorter than the configured threshold.
df = df.assign(coldStart=df["historySize"] < COLD_START_THRESHOLD)

# "history" becomes the join key "pageId"; the raw timestamp columns are
# dropped once the time features have been extracted.
df = df.rename(columns={"history": "pageId"})
df = df.drop(columns=["timestampHistory", "timestampHistory_new"])

# Keep an independent copy: the name `df` is reused by later cells.
users_df = df.copy()

In [8]:
# Preview the processed users frame (the same data that was copied into users_df).
df.head()

Unnamed: 0,userId,userType,historySize,pageId,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory,minutesSinceLastVisit,timestampHistoryDate,timestampHistoryTime,timestampHistoryWeekday,timestampHistoryHour,isWeekend,dayPeriod,coldStart
0,00007a4e5949a3dba7c977503c53e0873643fe17d0802a...,Non-Logged,1,3101f64e-7048-426b-9cb6-b5a3d8ce78f3,1,30000,10.48,1,0.0,2022-07-04,21:05:57,0,21,False,night,True
1,00014a8ad79d212f0410c3dd3260e37a093054b9d9a6a8...,Non-Logged,3,c0a176b2-622f-42a4-8e5d-2a08032f8e08,0,252934,61.42,1,0.0,2022-07-01,12:05:08,4,12,False,afternoon,True
2,00014a8ad79d212f0410c3dd3260e37a093054b9d9a6a8...,Non-Logged,3,97ac6c4c-5b3d-4e36-9816-0791226573e5,2,70000,53.84,1,1.0,2022-07-01,12:06:22,4,12,False,afternoon,True
3,00014a8ad79d212f0410c3dd3260e37a093054b9d9a6a8...,Non-Logged,3,ece7f38e-1744-4885-83d9-2ffb89e40b11,0,10000,11.37,1,115.0,2022-07-01,14:01:37,4,14,False,afternoon,True
4,00024d84a3d619b1972e9dd5efa9c930557d85f6463fd6...,Logged,1,c274ed4b-e679-41ce-9dfb-c53874916511,11,32436,46.58,1,0.0,2022-07-30,18:58:05,5,18,True,night,True


# News

In [9]:
from src.features.pp_news import (
    _download_resource,
    _extract_url_mid_section,
    _extract_location,
    _extract_theme,
    _preprocess_text,
)

In [10]:
# Distinct page ids present in the sampled users frame; used further down to
# restrict the news items to pages that users actually visited.
unique_page_ids = users_df["pageId"].unique()
selected_pageIds = list(unique_page_ids)
# news_df = _preprocess_and_save_news(DATA_PATH, selected_pageIds, storage)

In [11]:
# Make sure the text resources used by the news helpers are available
# (presumably NLTK corpora, judging by the resource names — confirm in pp_news).
_download_resource("stopwords", ["corpora/stopwords"])
_download_resource("wordnet", ["corpora/wordnet", "corpora/wordnet.zip"])
_download_resource("omw-1.4", ["corpora/omw-1.4", "corpora/omw-1.4.zip"])

# Load all news CSVs and align the key column name with users_df.
df = concatenate_csv_files(NEWS_DIRECTORY)
df = df.rename(columns={"page": "pageId"})

# Keep only pages that appear in the sampled users' history. The explicit
# .copy() makes the filtered frame independent of the original, so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
df = df[df["pageId"].isin(selected_pageIds)].copy()

# Split each timestamp column into separate date and time columns.
for col in ["issued", "modified"]:
    df[col] = pd.to_datetime(df[col])
    df[f"{col}Date"] = df[col].dt.date
    df[f"{col}Time"] = df[col].dt.time

df["urlExtracted"] = df["url"].apply(_extract_url_mid_section)

# Location is a "/"-separated value: first part is the state, second the region.
df["local"] = df["urlExtracted"].apply(_extract_location)
local_parts = df["local"].str.split("/")
df["localState"] = local_parts.str[0]
df["localRegion"] = local_parts.str[1]

# Theme follows the same "/"-separated layout: main theme, then sub theme.
df["theme"] = df["urlExtracted"].apply(_extract_theme)
theme_parts = df["theme"].str.split("/")
df["themeMain"] = theme_parts.str[0]
df["themeSub"] = theme_parts.str[1]

df = df.drop(columns=NEWS_COLS_TO_DROP)

# Keep an independent copy: the name `df` is reused by later cells.
news_df = df.copy()





2025-03-01 18:24:42,966 - src.config - INFO - pp_news.py - Recurso 'stopwords' já baixado.
2025-03-01 18:24:42,970 - src.config - INFO - pp_news.py - Recurso 'wordnet' já baixado.
2025-03-01 18:24:42,974 - src.config - INFO - pp_news.py - Recurso 'omw-1.4' já baixado.
2025-03-01 18:24:42,974 - config - INFO - factory.py - Inicializando armazenamento local
2025-03-01 18:24:42,975 - src.config - INFO - utils.py - Encontrados 3 CSVs em /Users/mauricioaraujo/Eng_ML/FIAP/Fase_5-MLOps/ML_Engineer_Datathon/data/challenge-webmedia-e-globo-2023/itens/itens
2025-03-01 18:24:46,111 - src.config - INFO - utils.py - Arquivo: itens-parte2.csv, linhas: 100000, cols: 7
2025-03-01 18:24:48,429 - src.config - INFO - utils.py - Arquivo: itens-parte3.csv, linhas: 55603, cols: 7
2025-03-01 18:24:51,788 - src.config - INFO - utils.py - Arquivo: itens-parte1.csv, linhas: 100000, cols: 7
2025-03-01 18:24:51,985 - src.config - INFO - utils.py - Linhas após concatenação: 255603


In [12]:
# Preview the filtered news frame with the extracted date/location/theme columns.
news_df.head()

Unnamed: 0,pageId,issuedDate,issuedTime,modifiedDate,modifiedTime,localState,localRegion,themeMain,themeSub
8,56ed4cb1-65c2-42f7-80f4-ea73f0aad59c,2022-08-01,17:46:54,2022-08-01,18:10:14,ba,bahia,,
9,d770d1f3-01cf-4545-8bd0-22ef77d37986,2022-02-21,08:03:26,2022-02-21,14:19:57,ba,bahia,,
12,6446e307-8d5f-4b92-a8ec-48b9a9faa843,2020-01-21,15:01:25,2020-01-21,18:31:37,ce,ceara,,
14,a5e207ec-4208-4e92-ac02-d37475f7000d,2018-01-01,13:59:46,2018-01-01,13:59:47,df,distrito-federal,,
15,9ef14199-0b53-40d4-8fc7-e946bf929d68,2017-12-04,07:00:48,2017-12-04,07:00:50,df,distrito-federal,,


# Mix Feats

In [13]:
# mix_dfs = _preprocess_and_save_mix_feats(DATA_PATH, news_df, users_df, storage)

In [14]:
# Build all mix-feature frames from the news and users frames.
mix_df, gap_df, state_df, region_df, tm_df, ts_df = preprocess_mix_feats(news_df, users_df)

# Relative parquet path -> frame, in the order the frames were returned.
dfs = dict(
    [
        ("mix_feats/mix_df.parquet", mix_df),
        ("mix_feats/gap_feats.parquet", gap_df),
        ("mix_feats/state_feats.parquet", state_df),
        ("mix_feats/region_feats.parquet", region_df),
        ("mix_feats/theme_main_feats.parquet", tm_df),
        ("mix_feats/theme_sub_feats.parquet", ts_df),
    ]
)

# Saving is currently disabled (commented out), so this loop only resolves the
# destination path of each frame. Note that it rebinds the notebook-level `df`.
for rel_file in dfs:
    df = dfs[rel_file]
    file_path = os.path.join(DATA_PATH, "features", rel_file)
    # _save_df_parquet(df, file_path, storage)

# Suggested Feats

In [15]:
# Bundle the mix-feature frames in the order they were returned, mimicking the
# return value of the (commented-out) _preprocess_and_save_mix_feats call.
mix_dfs = (mix_df, gap_df, state_df, region_df, tm_df, ts_df)

In [17]:
# Name-only view of what the [1:] slice selects: every frame except mix_df.
["mix_df", "gap_df", "state_df", "region_df", "tm_df", "ts_df"][1:]

['gap_df', 'state_df', 'region_df', 'tm_df', 'ts_df']

In [16]:
# Show the frames that are forwarded to the suggested-feats step below.
# NOTE(review): this renders the repr of five frames at once; consider
# displaying .head() of each frame instead to keep the output small.
mix_dfs[1:]

(                                                   userId  \
 0       00007a4e5949a3dba7c977503c53e0873643fe17d0802a...   
 1       00014a8ad79d212f0410c3dd3260e37a093054b9d9a6a8...   
 2       00014a8ad79d212f0410c3dd3260e37a093054b9d9a6a8...   
 3       00014a8ad79d212f0410c3dd3260e37a093054b9d9a6a8...   
 4       00024d84a3d619b1972e9dd5efa9c930557d85f6463fd6...   
 ...                                                   ...   
 471537  ffee16e0d7c3fb56f17185c7e261254aff7588581adaae...   
 471538  fff0d25554deb27c5eb4d3bc6650bfe2e7dbfdbfcd8071...   
 471539  fff20236850ec3ad28e1cebf64e3cd5ee0b2afe46cfa89...   
 471540  fff63afe0a59f15604fcf2634b3a39245fb3c03b768484...   
 471541  fffbc41ebcd9f5302bbb0c79b9dd30faf7a52432c52133...   
 
                                       pageId  timeGapDays  timeGapHours  \
 0       3101f64e-7048-426b-9cb6-b5a3d8ce78f3         1866  44807.727222   
 1       c0a176b2-622f-42a4-8e5d-2a08032f8e08           78   1894.595556   
 2       97ac6c4c-5b3d-4e3

In [None]:

# Remaining pipeline stages, run through the pipeline helpers themselves.
# Everything except mix_df (gap, state, region, theme-main and theme-sub
# frames) feeds the suggested-feats step.
suggested_inputs = mix_dfs[1:]
suggested = _assemble_and_save_suggested_feats(
    DATA_PATH, *suggested_inputs, storage
)

# mix_dfs[1] is gap_df, the second frame returned by preprocess_mix_feats.
gap_feats = mix_dfs[1]
target_df = _preprocess_and_save_target(DATA_PATH, users_df, gap_feats, storage)

_assemble_and_save_final_feats(DATA_PATH, suggested, target_df, storage)
logger.info("Pipeline concluído com sucesso!")