In [1]:
"""Preprocessing for users features."""

import os
from pathlib import Path

# Root of the project checkout. The DATATHON_PROJECT_ROOT environment variable
# overrides it so the notebook is not tied to a single machine; the fallback is
# the location this notebook was originally written against.
PROJECT_ROOT = Path(
    os.environ.get(
        "DATATHON_PROJECT_ROOT",
        "c:/Users/gufer/OneDrive/Documentos/FIAP/Fase_05/ML_Engineer_Datathon",
    )
)

# `utils` and `constants` are imported by bare name, so the working directory
# is switched to src/features while the imports run.
# NOTE(review): presumably both modules live in src/features and are resolved
# through the current directory on sys.path - confirm.
os.chdir(PROJECT_ROOT / "src" / "features")

import pandas as pd
from utils import concatenate_csv_to_df
from constants import (
    USERS_TEMP_PATH,
    USERS_N_CSV_FILES,
    COLS_TO_EXPLODE,
    USERS_DTYPES,
    COLD_START_THRESHOLD
)

# Back to the project root for the rest of the notebook.
# NOTE(review): presumably the paths held in `constants` are relative to the
# project root - confirm.
os.chdir(PROJECT_ROOT)

In [7]:
def pre_process_users() -> pd.DataFrame:
    """
    Pre-process the users data into one row per history entry.

    Steps:
    - Concatenate the users CSV files.
    - Split the comma-separated history columns and explode them.
    - Cast columns to the dtypes declared in USERS_DTYPES.
    - Convert the epoch-millisecond timestamp to datetime and sort by user/time.
    - Derive minutes since the previous visit, date/time parts, weekend flag,
      day period and the cold-start flag.
    - Rename `history` to `historyId`, drop the raw timestamp columns and
      downcast the numeric columns.

    Note: a later cell of this notebook defines `pre_process_users` again
    (split into helpers); once that cell runs it replaces this definition.

    Returns:
        pd.DataFrame: the processed users frame with a fresh RangeIndex.
    """
    # Load every users CSV into a single frame
    df_users = concatenate_csv_to_df(USERS_TEMP_PATH, USERS_N_CSV_FILES)

    # History columns arrive as comma-separated strings: turn them into lists
    df_users[COLS_TO_EXPLODE] = df_users[COLS_TO_EXPLODE].apply(lambda col: col.str.split(','))

    # Explode to one row per list element, then strip surrounding whitespace
    df_users = df_users.explode(COLS_TO_EXPLODE)
    df_users[COLS_TO_EXPLODE] = df_users[COLS_TO_EXPLODE].apply(lambda col: col.str.strip())

    # Cast columns to their declared dtypes
    df_users = df_users.astype(USERS_DTYPES)

    # Epoch milliseconds -> datetime. unit="ms" converts the values directly
    # instead of going through a float division by 1000, which cannot represent
    # every millisecond value exactly.
    df_users["timestampHistory"] = pd.to_datetime(df_users["timestampHistory"], unit="ms")
    df_users = df_users.sort_values(by=["userId", "timestampHistory"]).reset_index(drop=True)

    # Minutes elapsed since the same user's previous visit (0 for the first one)
    elapsed = df_users.groupby("userId")["timestampHistory"].diff()
    df_users["minutesSinceLastVisit"] = (elapsed.dt.total_seconds() / 60.0).fillna(0).round()

    # Calendar/time parts of the timestamp
    df_users["timestampHistoryDate"] = df_users["timestampHistory"].dt.date
    df_users["timestampHistoryTime"] = df_users["timestampHistory"].dt.strftime("%H:%M")
    df_users["timestampHistoryWeekday"] = df_users["timestampHistory"].dt.dayofweek
    df_users["timestampHistoryHour"] = df_users["timestampHistory"].dt.hour

    # Weekend flag: dayofweek is 0 (Monday) .. 6 (Sunday), so 5 and 6 are the
    # weekend. Vectorized comparison instead of a per-row Python lambda.
    df_users["isWeekend"] = df_users["timestampHistoryWeekday"] >= 5

    # Day period from right-closed hour intervals:
    # (-1, 5] dawn, (5, 11] morning, (11, 17] afternoon, (17, 23] night
    df_users["dayPeriod"] = pd.cut(
        df_users["timestampHistoryHour"],
        bins=[-1, 5, 11, 17, 23],
        labels=["dawn", "morning", "afternoon", "night"],
        right=True
    )

    # Cold-start flag: history shorter than the configured threshold
    df_users["coldStart"] = df_users["historySize"] < COLD_START_THRESHOLD

    # Rename the secondary key column and drop the raw timestamp columns
    df_users = (
        df_users
        .rename(columns={"history": "historyId"})
        .drop(columns=["timestampHistory", "timestampHistory_new"])
    )

    # Finish by downcasting the numeric columns
    df_users['historySize'] = pd.to_numeric(df_users['historySize'], downcast='integer')
    df_users['numberOfClicksHistory'] = pd.to_numeric(df_users['numberOfClicksHistory'], downcast='integer')
    df_users['timeOnPageHistory'] = pd.to_numeric(df_users['timeOnPageHistory'], downcast='integer')
    df_users['pageVisitsCountHistory'] = pd.to_numeric(df_users['pageVisitsCountHistory'], downcast='integer')
    df_users['scrollPercentageHistory'] = pd.to_numeric(df_users['scrollPercentageHistory'], downcast='float')
    df_users['minutesSinceLastVisit'] = pd.to_numeric(df_users['minutesSinceLastVisit'], downcast='float')
    df_users['timestampHistoryWeekday'] = df_users['timestampHistoryWeekday'].astype('int16')
    df_users['timestampHistoryHour'] = df_users['timestampHistoryHour'].astype('int16')

    return df_users

In [9]:
def pre_process_users() -> pd.DataFrame:
    """
    Run the users pre-processing pipeline end to end.

    The pipeline:
    - concatenates the users CSV files;
    - explodes the history columns;
    - casts columns to the dtypes in USERS_DTYPES;
    - converts and sorts the timestamp, deriving minutes since the last visit;
    - extracts date/time parts, the weekend flag, the day period and the
      cold-start flag;
    - renames `history` to `historyId`, drops the raw timestamp columns and
      downcasts the numeric columns.

    Returns:
        pd.DataFrame: the processed users frame.
    """
    # Load -> explode history -> cast dtypes -> timestamp handling -> time parts
    users = (
        concatenate_csv_to_df(USERS_TEMP_PATH, USERS_N_CSV_FILES)
        .pipe(_process_history_columns)
        .astype(USERS_DTYPES)
        .pipe(_process_timestamp)
        .pipe(_extract_time_features)
    )

    # Derived flags and categories
    users["isWeekend"] = users["timestampHistoryWeekday"].ge(5)
    users["dayPeriod"] = _classify_day_period(users)
    users["coldStart"] = users["historySize"].lt(COLD_START_THRESHOLD)

    # Secondary key gets its final name; raw timestamp columns are no longer needed
    users = (
        users
        .rename(columns={"history": "historyId"})
        .drop(columns=["timestampHistory", "timestampHistory_new"])
    )

    # Shrink numeric dtypes as the last step
    return _downcast_columns(users)


def _process_history_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Converte colunas de histórico de string para lista, explode e remove espaços."""
    # Transforma colunas de histórico de string para lista
    df[COLS_TO_EXPLODE] = df[COLS_TO_EXPLODE].apply(lambda col: col.str.split(','))
    
    # Explode o dataframe e remove espaços das strings
    df = df.explode(COLS_TO_EXPLODE)
    df[COLS_TO_EXPLODE] = df[COLS_TO_EXPLODE].apply(lambda col: col.str.strip())
    
    return df


def _process_timestamp(df: pd.DataFrame) -> pd.DataFrame:
    """Converte timestamp para datetime e ordena por usuário e data/hora."""
    df["timestampHistory"] = pd.to_datetime(df["timestampHistory"] / 1000, unit="s")
    df = df.sort_values(by=["userId", "timestampHistory"]).reset_index(drop=True)
    
    # Calcula diferença em minutos desde o último acesso
    df["minutesSinceLastVisit"] = df.groupby("userId")["timestampHistory"].diff().dt.total_seconds() / 60.0
    df["minutesSinceLastVisit"] = df["minutesSinceLastVisit"].fillna(0).round()
    
    return df


def _extract_time_features(df: pd.DataFrame) -> pd.DataFrame:
    """Extrai informações temporais do timestamp."""
    df["timestampHistoryDate"] = df["timestampHistory"].dt.date
    df["timestampHistoryTime"] = df["timestampHistory"].dt.strftime("%H:%M")
    df["timestampHistoryWeekday"] = df["timestampHistory"].dt.dayofweek
    df["timestampHistoryHour"] = df["timestampHistory"].dt.hour
    
    return df


def _classify_day_period(df: pd.DataFrame) -> pd.Series:
    """Classifica o período do dia com base na hora."""
    return pd.cut(
        df["timestampHistoryHour"],
        bins=[-1, 5, 11, 17, 23],
        labels=["dawn", "morning", "afternoon", "night"],
        right=True
    )


def _downcast_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Faz o downcast das colunas numéricas para reduzir uso de memória."""
    df['historySize'] = pd.to_numeric(df['historySize'], downcast='integer')
    df['numberOfClicksHistory'] = pd.to_numeric(df['numberOfClicksHistory'], downcast='integer')
    df['timeOnPageHistory'] = pd.to_numeric(df['timeOnPageHistory'], downcast='integer')
    df['pageVisitsCountHistory'] = pd.to_numeric(df['pageVisitsCountHistory'], downcast='integer')
    df['scrollPercentageHistory'] = pd.to_numeric(df['scrollPercentageHistory'], downcast='float')
    df['minutesSinceLastVisit'] = pd.to_numeric(df['minutesSinceLastVisit'], downcast='float')
    df['timestampHistoryWeekday'] = df['timestampHistoryWeekday'].astype('int16')
    df['timestampHistoryHour'] = df['timestampHistoryHour'].astype('int16')
    
    return df

In [10]:
# Run the full users pre-processing pipeline and display the resulting frame.
df_users = pre_process_users()
df_users

Unnamed: 0,userId,userType,historySize,historyId,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory,minutesSinceLastVisit,timestampHistoryDate,timestampHistoryTime,timestampHistoryWeekday,timestampHistoryHour,isWeekend,dayPeriod,coldStart
0,000044b36375e7f1a66a9476affc2ddc83c2ec6dd18951...,Logged,9,9442c690-21b3-42bc-9e5a-f880de0fa0a8,15,10603,46.23,1,0.0,2022-07-02,22:27,5,22,True,night,False
1,000044b36375e7f1a66a9476affc2ddc83c2ec6dd18951...,Logged,9,253ef8e5-15bf-45dc-ab5b-16689db9b16b,40,85168,56.58,1,4142.0,2022-07-05,19:29,1,19,False,night,False
2,000044b36375e7f1a66a9476affc2ddc83c2ec6dd18951...,Logged,9,553fada2-1c79-48f2-9010-f5c751b63bb1,55,168616,54.10,1,8233.0,2022-07-11,12:42,0,12,False,afternoon,False
3,000044b36375e7f1a66a9476affc2ddc83c2ec6dd18951...,Logged,9,9185d27e-9524-4d65-aa2c-f1d35fb0a1d4,34,70000,56.22,1,563.0,2022-07-11,22:05,0,22,False,night,False
4,000044b36375e7f1a66a9476affc2ddc83c2ec6dd18951...,Logged,9,4e73dd4e-0e2b-4a11-8f5b-658619b08027,62,153094,52.20,1,4257.0,2022-07-14,21:02,3,21,False,night,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123946,ffffee5eea1777ae6686e5286c79e1d3358ff76a73d4ee...,Non-Logged,17,9bc6f6fa-94fc-4748-b4ff-40aaa6c4f525,1,164477,70.56,1,5828.0,2022-08-01,13:08,0,13,False,afternoon,False
8123947,ffffee5eea1777ae6686e5286c79e1d3358ff76a73d4ee...,Non-Logged,17,855d20b7-53f2-4678-a10f-55402d085018,0,20000,15.66,1,1362.0,2022-08-02,11:50,1,11,False,morning,False
8123948,ffffee5eea1777ae6686e5286c79e1d3358ff76a73d4ee...,Non-Logged,17,2c1e7356-9421-42e3-8880-831916c6c738,1,100926,56.24,1,2914.0,2022-08-04,12:23,3,12,False,afternoon,False
8123949,ffffee5eea1777ae6686e5286c79e1d3358ff76a73d4ee...,Non-Logged,17,39155663-356f-4d11-a471-32209ba83a6d,0,130000,56.08,1,6034.0,2022-08-08,16:58,0,16,False,afternoon,False


In [11]:
# Print the column dtypes and memory usage of the processed frame.
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8123951 entries, 0 to 8123950
Data columns (total 16 columns):
 #   Column                   Dtype   
---  ------                   -----   
 0   userId                   object  
 1   userType                 category
 2   historySize              int16   
 3   historyId                object  
 4   numberOfClicksHistory    int16   
 5   timeOnPageHistory        int32   
 6   scrollPercentageHistory  float64 
 7   pageVisitsCountHistory   int16   
 8   minutesSinceLastVisit    float32 
 9   timestampHistoryDate     object  
 10  timestampHistoryTime     object  
 11  timestampHistoryWeekday  int16   
 12  timestampHistoryHour     int16   
 13  isWeekend                bool    
 14  dayPeriod                category
 15  coldStart                bool    
dtypes: bool(2), category(2), float32(1), float64(1), int16(5), int32(1), object(4)
memory usage: 480.4+ MB
