- Ideia de feature: distância temporal entre a data da notícia e a data em que o usuário consumiu a notícia

In [2]:
import os
# Move to the project root so that relative "data/..." paths resolve.
# The location can be overridden with the ML_DATATHON_ROOT environment variable;
# the original machine-specific path is kept as the fallback so existing runs still work.
os.chdir(os.environ.get("ML_DATATHON_ROOT", "c://Users//gufer//OneDrive//Documentos//FIAP//Fase_05//ML_Engineer_Datathon/"))

In [20]:
import pandas as pd

# Load the processed feature tables; both paths are relative to the current working directory.
news = pd.read_parquet("data/processed_data/features/news.parquet")
users = pd.read_parquet("data/processed_data/features/users.parquet")

In [11]:
# Inspect the publication-time column of the news table (object dtype; values render as HH:MM:SS).
news.issuedTime

29333    15:43:42
29039    21:17:49
46573    19:38:20
70146    12:38:20
16500    17:44:22
           ...   
38686    13:14:26
66356    21:11:03
53893    15:42:59
27862    11:27:59
77785    11:25:25
Name: issuedTime, Length: 25560, dtype: object

In [9]:
# Inspect the consumption-time column of the user history table.
# NOTE: the values render as HH:MM (no seconds component), unlike news.issuedTime.
users.timestampHistoryTime

0         14:10
1         19:44
2         19:45
3         20:19
4         18:58
          ...  
808073    01:28
808074    00:44
808075    11:03
808076    06:40
808077    01:03
Name: timestampHistoryTime, Length: 808078, dtype: object

In [21]:
# Summarise the user history table: column names, non-null counts and dtypes.
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 808078 entries, 0 to 808077
Data columns (total 16 columns):
 #   Column                   Non-Null Count   Dtype   
---  ------                   --------------   -----   
 0   userId                   808078 non-null  object  
 1   userType                 808078 non-null  category
 2   historySize              808078 non-null  int16   
 3   pageId                   808078 non-null  object  
 4   numberOfClicksHistory    808078 non-null  int16   
 5   timeOnPageHistory        808078 non-null  int32   
 6   scrollPercentageHistory  808078 non-null  float64 
 7   pageVisitsCountHistory   808078 non-null  int16   
 8   minutesSinceLastVisit    808078 non-null  float32 
 9   timestampHistoryDate     808078 non-null  object  
 10  timestampHistoryTime     808078 non-null  object  
 11  timestampHistoryWeekday  808078 non-null  int16   
 12  timestampHistoryHour     808078 non-null  int16   
 13  isWeekend                808078 non-null  bo

In [22]:
# Summarise the news table: column names, non-null counts and dtypes.
news.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25560 entries, 29333 to 77785
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   pageId          25560 non-null  object
 1   issuedDate      25560 non-null  object
 2   issuedTime      25560 non-null  object
 3   modifiedDate    25560 non-null  object
 4   modifiedTime    25560 non-null  object
 5   localState      19299 non-null  object
 6   localRegion     19299 non-null  object
 7   themeMain       7879 non-null   object
 8   themeSub        3045 non-null   object
 9   bodyCleaned     25560 non-null  object
 10  titleCleaned    25560 non-null  object
 11  captionCleaned  25560 non-null  object
dtypes: object(12)
memory usage: 2.5+ MB


In [None]:
def preprocess_mix_feats(df_news: pd.DataFrame, df_users: pd.DataFrame) -> pd.DataFrame:
    """
    Pré-processa e combina os dataframes de notícias e de usuários, criando features temporais,
    flags baseadas na quantidade de notícias consumidas por usuário e proporções relativas em 
    relação ao total de notícias consumidas.
    
    Parâmetros:
    -----------
    df_news : pd.DataFrame
        DataFrame contendo os dados das notícias, com colunas como 'issuedDate', 'issuedTime', 
        'localState', 'localRegion', 'themeMain' e 'themeSub', entre outras.
        
    df_users : pd.DataFrame
        DataFrame contendo o histórico de interação dos usuários, com colunas como 'timestampHistoryDate',
        'timestampHistoryTime', 'userId' e outras informações do comportamento do usuário.
        
    Retorna:
    --------
    pd.DataFrame
        DataFrame resultante do merge entre os dados dos usuários e das notícias, enriquecido com:
          - Diferenças temporais entre a publicação da notícia e o consumo.
          - Flags temporais (mesmo dia, mesmo dia da semana).
          - Contagens de notícias consumidas por usuário para as categorias: localState, localRegion, 
            themeMain e themeSub.
          - Proporções relativas dessas contagens em relação ao total de notícias consumidas pelo usuário.
    """
    MIX_FEATS_COLS = [
        "userId",
        "pageId",
        "issuedDate",
        "issuedTime",
        "issuedDatetime",
        "timestampHistoryDate",
        "timestampHistoryTime",
        "timestampHistoryDatetime",
        "localState",
        "localRegion",
        "themeMain",
        "themeSub",
    ]
    
    # Conversão das datas para datetime
    df_news['issuedDate'] = pd.to_datetime(df_news['issuedDate'], format='%Y-%m-%d')
    df_users['timestampHistoryDate'] = pd.to_datetime(df_users['timestampHistoryDate'], format='%Y-%m-%d')

    # Garantindo que os horários estão no formato correto
    df_news['issuedTime'] = pd.to_datetime(df_news['issuedTime'], format='%H:%M:%S', errors='coerce').dt.time
    df_users['timestampHistoryTime'] = pd.to_datetime(df_users['timestampHistoryTime'], format='%H:%M:%S', errors='coerce').dt.time
    
    # Conversão dos horários para timedelta e soma com a data para obter timestamp completo
    df_news['issuedDatetime'] = df_news['issuedDate'] + df_news['issuedTime'].apply(
        lambda t: pd.Timedelta(hours=t.hour, minutes=t.minute, seconds=t.second) if pd.notnull(t) else pd.Timedelta(0)
    )
    df_users['timestampHistoryDatetime'] = df_users['timestampHistoryDate'] + df_users['timestampHistoryTime'].apply(
        lambda t: pd.Timedelta(hours=t.hour, minutes=t.minute, seconds=t.second) if pd.notnull(t) else pd.Timedelta(0)
    )
    
    # Merge dos dataframes a partir da coluna 'pageId'
    df_mix = pd.merge(df_users, df_news, on='pageId', how='inner')
    df_mix = df_mix[MIX_FEATS_COLS]
    
    # Cálculo do gap temporal entre a data/hora de publicação e a data/hora de consumo
    timeGap = df_mix['timestampHistoryDatetime'] - df_mix['issuedDatetime']
    df_mix['timeGapDays'] = timeGap.dt.days
    df_mix['timeGapHours'] = timeGap / pd.Timedelta(hours=1)
    df_mix['timeGapMinutes'] = timeGap / pd.Timedelta(minutes=1)
    
    # Criação de flags baseadas em datas
    df_mix['isSameDay'] = df_mix['timestampHistoryDate'].dt.date == df_mix['issuedDate'].dt.date
    df_mix['newsWeekday'] = df_mix['issuedDate'].dt.weekday
    df_mix['userWeekday'] = df_mix['timestampHistoryDate'].dt.weekday
    df_mix['isSameWeekday'] = df_mix['newsWeekday'] == df_mix['userWeekday']
    
    # Agregação da contagem de notícias consumidas por cada usuário para as categorias desejadas
    df_mix['countLocalStateUser'] = df_mix.groupby('userId')['localState'].transform(lambda x: x.notnull().sum())
    df_mix['countLocalRegionUser'] = df_mix.groupby('userId')['localRegion'].transform(lambda x: x.notnull().sum())
    df_mix['countThemeMainUser'] = df_mix.groupby('userId')['themeMain'].transform(lambda x: x.notnull().sum())
    df_mix['countThemeSubUser'] = df_mix.groupby('userId')['themeSub'].transform(lambda x: x.notnull().sum())
    
    # Contagem total de notícias consumidas por cada usuário
    df_mix['totalUserNews'] = df_mix.groupby('userId')['pageId'].transform('count')
    
    # Cálculo das proporções relativas para cada categoria
    df_mix['relLocalState'] = df_mix['countLocalStateUser'] / df_mix['totalUserNews']
    df_mix['relLocalRegion'] = df_mix['countLocalRegionUser'] / df_mix['totalUserNews']
    df_mix['relThemeMain'] = df_mix['countThemeMainUser'] / df_mix['totalUserNews']
    df_mix['relThemeSub'] = df_mix['countThemeSubUser'] / df_mix['totalUserNews']
    
    return df_mix

# Run the feature pipeline on the loaded tables; the result is only displayed here, not bound to a name.
preprocess_mix_feats(news, users)

Unnamed: 0,userId,pageId,issuedDate,issuedTime,issuedDatetime,timestampHistoryDate,timestampHistoryTime,timestampHistoryDatetime,localState,localRegion,...,isSameWeekday,countLocalState,countLocalRegion,countThemeMain,countThemeSub,totalUserNews,relLocalState,relLocalRegion,relThemeMain,relThemeSub
0,0006ea19bc2de7e755b09f3a5a8f8d677f0860e4b8f64e...,fc1d78dd-8bcd-4b58-8564-078c024e9a28,2020-07-09,23:01:36,2020-07-09 23:01:36,2022-07-02,NaT,2022-07-02,,,...,False,0,0,1,1,1,0.0,0.0,1.0,1.0
1,0007faff9dc5595d8bc98d13619c483c8ed73d5bf93822...,2775a8d1-78bc-4ad7-89bd-b5b6131e0ffe,2022-08-03,12:28:19,2022-08-03 12:28:19,2022-08-06,NaT,2022-08-06,pe,pernambuco,...,False,1,1,1,0,1,1.0,1.0,1.0,0.0
2,000daf04a745c346dc2c43d55a00a1d0361826c5a3d939...,66a9efac-fd43-4fd1-9824-c404b08efa5d,2022-08-01,11:40:33,2022-08-01 11:40:33,2022-07-18,NaT,2022-07-18,rj,sul-do-rio-costa-verde,...,True,2,2,0,0,2,1.0,1.0,0.0,0.0
3,000daf04a745c346dc2c43d55a00a1d0361826c5a3d939...,6a83890a-d9e9-4f6b-a6c6-90d031785bbf,2022-07-27,13:54:29,2022-07-27 13:54:29,2022-07-22,NaT,2022-07-22,pi,piaui,...,False,2,2,0,0,2,1.0,1.0,0.0,0.0
4,00124e39dfc38f8ef58500e19812f2ed5592022dc31629...,bd47bcf5-51ef-485f-92fd-f09a8e484bb1,2022-08-06,19:21:33,2022-08-06 19:21:33,2022-07-08,NaT,2022-07-08,sp,sao-paulo,...,False,1,1,0,0,1,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84400,ffe712e6ed65e6957dd8cc4d9bd99852fcdc2a652312ea...,79acc216-7fa2-44d7-94c2-04df106d45e7,2021-02-04,09:00:43,2021-02-04 09:00:43,2022-08-07,NaT,2022-08-07,sp,sao-paulo,...,False,1,1,0,0,1,1.0,1.0,0.0,0.0
84401,fff3ab996fd24e7ea351fcec4f4dd18cd159684a8d3ae8...,dec897ab-ba10-4083-903d-de674ea6cfbe,2022-07-11,08:30:22,2022-07-11 08:30:22,2022-07-21,NaT,2022-07-21,,,...,False,0,0,1,0,1,0.0,0.0,1.0,0.0
84402,fff8b83a17df3c162182b2f4a4ed6093f446fc869e0e2d...,d7b34225-e694-4564-9825-833e76ca13d0,2022-06-17,08:04:17,2022-06-17 08:04:17,2022-07-18,NaT,2022-07-18,,,...,False,0,0,1,0,1,0.0,0.0,1.0,0.0
84403,ffff2c95e9c668ba32f163d1b2573ae67c95ea49e7b380...,a1344ecc-8aff-42ad-abfc-08b84462e545,2021-03-30,19:46:38,2021-03-30 19:46:38,2022-08-06,NaT,2022-08-06,,,...,False,0,0,2,0,2,0.0,0.0,1.0,0.0


In [17]:
# Build the merged frame in this cell so it also runs on a fresh kernel, and use the
# column name that preprocess_mix_feats actually produces ('countLocalStateUser').
df_mix = preprocess_mix_feats(news, users)
df_mix["countLocalStateUser"]

0        0
1        1
2        2
3        2
4        1
        ..
84400    1
84401    0
84402    0
84403    0
84404    0
Name: count_localState, Length: 84405, dtype: int64