- Ideia de feature: distância temporal entre a data de publicação da notícia e a data em que o usuário a consumiu

In [1]:
import os

# Move to the project root so the relative "data/..." paths used when loading
# the feature tables resolve. Set the DATATHON_ROOT environment variable to
# point at your own checkout; when it is unset the original hard-coded path is
# used, which is specific to one machine.
os.chdir(
    os.environ.get(
        "DATATHON_ROOT",
        "c://Users//gufer//OneDrive//Documentos//FIAP//Fase_05//ML_Engineer_Datathon/",
    )
)

In [3]:
import pandas as pd

# Load the pre-computed feature tables. The paths are relative, so they depend
# on the working directory chosen in the first cell.
news = pd.read_parquet("data/processed_data/features/news_feats.parquet")
users = pd.read_parquet("data/processed_data/features/users_feats.parquet")
# Cached output of a previous feature-mixing run (only inspected below).
mix = pd.read_parquet("data/processed_data/features/mix_df.parquet")

In [5]:
# Preview the localState column of the cached mix frame; missing values show up as None.
mix["localState"]

0        None
1          pe
2          rj
3          pi
4          sp
         ... 
84400      sp
84401    None
84402    None
84403    None
84404    None
Name: localState, Length: 84405, dtype: object

In [4]:
# Column dtypes and non-null counts of the news feature table.
news.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25560 entries, 29333 to 77785
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   pageId          25560 non-null  object
 1   issuedDate      25560 non-null  object
 2   issuedTime      25560 non-null  object
 3   modifiedDate    25560 non-null  object
 4   modifiedTime    25560 non-null  object
 5   localState      19299 non-null  object
 6   localRegion     19299 non-null  object
 7   themeMain       7879 non-null   object
 8   themeSub        3045 non-null   object
 9   bodyCleaned     25560 non-null  object
 10  titleCleaned    25560 non-null  object
 11  captionCleaned  25560 non-null  object
dtypes: object(12)
memory usage: 2.5+ MB


In [5]:
# Column dtypes and non-null counts of the user history feature table.
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 808078 entries, 0 to 808077
Data columns (total 16 columns):
 #   Column                   Non-Null Count   Dtype   
---  ------                   --------------   -----   
 0   userId                   808078 non-null  object  
 1   userType                 808078 non-null  category
 2   historySize              808078 non-null  int16   
 3   pageId                   808078 non-null  object  
 4   numberOfClicksHistory    808078 non-null  int16   
 5   timeOnPageHistory        808078 non-null  int32   
 6   scrollPercentageHistory  808078 non-null  float64 
 7   pageVisitsCountHistory   808078 non-null  int16   
 8   minutesSinceLastVisit    808078 non-null  float32 
 9   timestampHistoryDate     808078 non-null  object  
 10  timestampHistoryTime     808078 non-null  object  
 11  timestampHistoryWeekday  808078 non-null  int16   
 12  timestampHistoryHour     808078 non-null  int16   
 13  isWeekend                808078 non-null  bo

In [6]:
# Column groups used to slice the merged user/news frame.

# Columns kept right after merging user history with news metadata.
MIX_FEATS_COLS = [
    "userId", "pageId",
    "issuedDate", "issuedTime", "issuedDatetime",
    "timestampHistoryDate", "timestampHistoryTime", "timestampHistoryDatetime",
    "localState", "localRegion", "themeMain", "themeSub",
]

# Per-user count / relative share of each category value.
STATE_COLS = ["userId", "localState", "countLocalStateUser", "relLocalState"]
REGION_COLS = ["userId", "localRegion", "countLocalRegionUser", "relLocalRegion"]
THEME_MAIN_COLS = ["userId", "themeMain", "countThemeMainUser", "relThemeMain"]
THEME_SUB_COLS = ["userId", "themeSub", "countThemeSubUser", "relThemeSub"]

# Time elapsed between publication and consumption, in several units.
GAP_COLS = [
    "userId", "pageId",
    "timeGapDays", "timeGapHours", "timeGapMinutes",
    "timeGapLessThanOneDay",
]

def preprocess_mix_feats(df_news: pd.DataFrame, df_users: pd.DataFrame):
    """
    Merge user history with news metadata and attach full timestamps.

    Builds ``issuedDatetime`` (news) and ``timestampHistoryDatetime`` (users)
    from the respective date and time columns, inner-joins the two frames on
    ``pageId`` and keeps only the columns listed in ``MIX_FEATS_COLS``.

    The input frames are left untouched; all conversions happen on copies, so
    the cell can be re-run without changing ``df_news`` / ``df_users``.

    Note: this definition previously carried the time-gap, category-count and
    frame-splitting steps after an unconditional ``return df_mix``. That code
    could never run and has been removed; the value returned is unchanged.

    Parameters
    ----------
    df_news : pd.DataFrame
        News data. Must contain 'pageId', 'issuedDate' ('%Y-%m-%d'),
        'issuedTime' ('%H:%M:%S') and the news columns named in
        ``MIX_FEATS_COLS``.
    df_users : pd.DataFrame
        User interaction history. Must contain 'userId', 'pageId',
        'timestampHistoryDate' ('%Y-%m-%d') and 'timestampHistoryTime'
        ('%H:%M:%S').

    Returns
    -------
    pd.DataFrame
        Inner join of users and news on 'pageId', restricted to
        ``MIX_FEATS_COLS``.
    """

    def _with_timestamp(frame, date_col, time_col, out_col):
        # Return a copy of `frame` with parsed date/time columns and their sum.
        dates = pd.to_datetime(frame[date_col], format='%Y-%m-%d')
        parsed = pd.to_datetime(frame[time_col], format='%H:%M:%S', errors='coerce')
        # Offset from midnight; unparseable times (NaT) count as 00:00:00.
        offsets = (parsed - parsed.dt.normalize()).fillna(pd.Timedelta(0))
        return frame.assign(**{
            date_col: dates,
            time_col: parsed.dt.time,
            out_col: dates + offsets,
        })

    news_ts = _with_timestamp(df_news, 'issuedDate', 'issuedTime', 'issuedDatetime')
    users_ts = _with_timestamp(
        df_users, 'timestampHistoryDate', 'timestampHistoryTime', 'timestampHistoryDatetime'
    )

    # Keep only interactions whose pageId exists in the news table.
    df_mix = pd.merge(users_ts, news_ts, on='pageId', how='inner')
    return df_mix[MIX_FEATS_COLS]

# Build the merged user/news frame from the loaded feature tables.
df_mix = preprocess_mix_feats(news, users)
# Alternative call that unpacks five frames, left disabled:
#gap_df, state_df, region_df, tm_df, ts_df = preprocess_mix_feats(news, users)
def get_unread_news(news: pd.DataFrame, users: pd.DataFrame) -> pd.DataFrame:
    """
    Return every (userId, news row) pair that is absent from the browsing history.

    Each distinct user is paired with every row of ``news``; pairs whose
    (userId, pageId) appears in ``users`` are then discarded.

    Neither input is modified (no helper column is written into ``news``).

    Caution: the intermediate cross join holds
    ``number of distinct users * len(news)`` rows, so memory grows with the
    product of both tables.

    Parameters
    ----------
    news : pd.DataFrame
        Available news; must contain 'pageId'.
    users : pd.DataFrame
        Full browsing history; must contain 'userId' and 'pageId'.

    Returns
    -------
    pd.DataFrame
        'userId' followed by all columns of ``news``, one row per unread pair.
        The index is the position in the intermediate join (not reset).
    """
    # 1. Distinct users.
    unique_users = users[['userId']].drop_duplicates()

    # 2. Cartesian product: every user paired with every news row.
    user_news = unique_users.merge(news, how='cross')

    # 3. Left join against the history; pairs found there get _merge == "both".
    merged = user_news.merge(
        users[['userId', 'pageId']],
        on=['userId', 'pageId'],
        how='left',
        indicator=True,
    )

    # 4. Keep only the pairs that never appear in the history.
    return merged[merged['_merge'] == 'left_only'].drop(columns='_merge')

In [18]:
# Column definitions
MIX_FEATS_COLS = [
    "userId",
    "pageId",
    "issuedDate",
    "issuedTime",
    "issuedDatetime",
    "timestampHistoryDate",
    "timestampHistoryTime",
    "timestampHistoryDatetime",
    "localState",
    "localRegion",
    "themeMain",
    "themeSub",
    "coldStart",
]

# Per-user count / relative share of each category value, plus the coldStart flag.
STATE_COLS = [
    "userId",
    "localState",
    "countLocalStateUser",
    "relLocalState",
    "coldStart",
]
REGION_COLS = [
    "userId",
    "localRegion",
    "countLocalRegionUser",
    "relLocalRegion",
    "coldStart",
]
THEME_MAIN_COLS = [
    "userId",
    "themeMain",
    "countThemeMainUser",
    "relThemeMain",
    "coldStart",
]
THEME_SUB_COLS = [
    "userId",
    "themeSub",
    "countThemeSubUser",
    "relThemeSub",
    "coldStart",
]

# Time elapsed between publication and consumption (no coldStart column here).
GAP_COLS = [
    "userId",
    "pageId",
    "timeGapDays",
    "timeGapHours",
    "timeGapMinutes",
    "timeGapLessThanOneDay",
]

def _process_datetime(df_news: pd.DataFrame, df_users: pd.DataFrame):
    """Converte datas e horários e cria os timestamps completos."""
    df_news['issuedDate'] = pd.to_datetime(df_news['issuedDate'], format='%Y-%m-%d')
    df_users['timestampHistoryDate'] = pd.to_datetime(df_users['timestampHistoryDate'], format='%Y-%m-%d')
    
    df_news['issuedTime'] = pd.to_datetime(
        df_news['issuedTime'], format='%H:%M:%S', errors='coerce'
    ).dt.time
    df_users['timestampHistoryTime'] = pd.to_datetime(
        df_users['timestampHistoryTime'], format='%H:%M:%S', errors='coerce'
    ).dt.time

    df_news['issuedDatetime'] = df_news['issuedDate'] + df_news['issuedTime'].apply(
        lambda t: pd.Timedelta(hours=t.hour, minutes=t.minute, seconds=t.second) if pd.notnull(t) else pd.Timedelta(0)
    )
    df_users['timestampHistoryDatetime'] = df_users['timestampHistoryDate'] + df_users['timestampHistoryTime'].apply(
        lambda t: pd.Timedelta(hours=t.hour, minutes=t.minute, seconds=t.second) if pd.notnull(t) else pd.Timedelta(0)
    )
    return df_news, df_users

def _compute_time_gap(df_mix: pd.DataFrame):
    gap = df_mix['timestampHistoryDatetime'] - df_mix['issuedDatetime']
    df_mix['timeGapDays'] = gap.dt.days
    df_mix['timeGapHours'] = gap / pd.Timedelta(hours=1)
    df_mix['timeGapMinutes'] = gap / pd.Timedelta(minutes=1)
    df_mix['timeGapLessThanOneDay'] = df_mix['timeGapHours'] <= 24
    return df_mix

def _compute_category_counts(df_mix: pd.DataFrame, category_columns=None):
    if category_columns is None:
        category_columns = ['localState', 'localRegion', 'themeMain', 'themeSub']
    for col in category_columns:
        col_title = col[0].upper() + col[1:]
        count_col = f'count{col_title}User'
        df_mix[count_col] = df_mix.groupby(['userId', col])['pageId'].transform('count')
    df_mix['totalUserNews'] = df_mix.groupby('userId')['pageId'].transform('count')
    for col in category_columns:
        col_title = col[0].upper() + col[1:]
        count_col = f'count{col_title}User'
        rel_col = f'rel{col_title}'
        df_mix[rel_col] = df_mix[count_col] / df_mix['totalUserNews']
    return df_mix

def _split_dataframes(df_mix: pd.DataFrame):
    """
    Slice the enriched frame into one frame per feature group.

    * gap_df keeps ``GAP_COLS`` for rows with ``timeGapDays >= 0``.
    * state_df, region_df, tm_df and ts_df keep their column group for rows
      whose per-user count column is > 0 (rows with a missing count fail the
      comparison and are dropped).

    Every returned frame is a new object with a fresh 0..n-1 index.

    Returns
    -------
    tuple of pd.DataFrame
        (gap_df, state_df, region_df, tm_df, ts_df)
    """

    def _with_positive_count(columns, count_column):
        # Column subset restricted to rows where the count is strictly positive.
        subset = df_mix[columns]
        return subset[subset[count_column] > 0].reset_index(drop=True)

    gap_subset = df_mix[GAP_COLS]
    gap_df = gap_subset[gap_subset["timeGapDays"] >= 0].reset_index(drop=True)

    state_df = _with_positive_count(STATE_COLS, "countLocalStateUser")
    region_df = _with_positive_count(REGION_COLS, "countLocalRegionUser")
    tm_df = _with_positive_count(THEME_MAIN_COLS, "countThemeMainUser")
    ts_df = _with_positive_count(THEME_SUB_COLS, "countThemeSubUser")

    return gap_df, state_df, region_df, tm_df, ts_df

def _get_unread_news_for_user(news: pd.DataFrame, users: pd.DataFrame, user_id: str) -> pd.DataFrame:
    """
    Retorna um DataFrame com as notícias não lidas para um único usuário.

    Parâmetros:
    -----------
    news : pd.DataFrame
        DataFrame com as notícias disponíveis.
    users : pd.DataFrame
        DataFrame com o histórico completo dos usuários.
    user_id : str
        Identificador do usuário para o qual se deseja obter as notícias não lidas.

    Retorna:
    --------
    pd.DataFrame
        DataFrame com as notícias que o usuário ainda não leu, com a coluna 'userId' adicionada.
    """
    read_pages = users.loc[users['userId'] == user_id, 'pageId'].unique()
    unread = news[~news['pageId'].isin(read_pages)].copy()
    unread['userId'] = user_id
    unread = unread[['userId', 'pageId']].reset_index(drop=True)
    return unread

def preprocess_mix_feats(df_news: pd.DataFrame, df_users: pd.DataFrame):
    """
    Run the full feature pipeline over the news and user history frames.

    Steps: build full timestamps, inner-join users with news on 'pageId'
    (keeping ``MIX_FEATS_COLS``), add the time-gap columns, add per-user
    category counts and shares, then split the result per feature group.

    Parameters
    ----------
    df_news : pd.DataFrame
        News data (e.g. 'issuedDate', 'issuedTime', 'localState', ...).
    df_users : pd.DataFrame
        User history (e.g. 'timestampHistoryDate', 'timestampHistoryTime',
        'userId', ...).

    Returns
    -------
    tuple
        (gap_df, state_df, region_df, tm_df, ts_df)
        - gap_df: time-gap information.
        - state_df: 'localState' information.
        - region_df: 'localRegion' information.
        - tm_df: 'themeMain' information.
        - ts_df: 'themeSub' information.
    """
    news_ts, users_ts = _process_datetime(df_news, df_users)

    # Keep only interactions whose pageId exists in the news table.
    interactions = pd.merge(users_ts, news_ts, on='pageId', how='inner')
    interactions = interactions[MIX_FEATS_COLS]

    interactions = _compute_time_gap(interactions)
    interactions = _compute_category_counts(interactions)

    gap_df, state_df, region_df, tm_df, ts_df = _split_dataframes(interactions)
    return gap_df, state_df, region_df, tm_df, ts_df

# Run the full pipeline on the loaded feature tables.
(gap_df, state_df, region_df, tm_df, ts_df) = preprocess_mix_feats(news, users)
# Example of an API-style call for a single user:
unread_news_for_user = _get_unread_news_for_user(news, users, user_id='ffe133162533bd67689c667be6c302b7342f8a682d28d7')

In [19]:
# Unread (userId, pageId) pairs computed above for the example user.
unread_news_for_user

Unnamed: 0,userId,pageId
0,ffe133162533bd67689c667be6c302b7342f8a682d28d7,d0d625d8-a157-4fe6-b59f-cb5ed2617f55
1,ffe133162533bd67689c667be6c302b7342f8a682d28d7,57afae65-8687-4806-bc75-0386de3041dd
2,ffe133162533bd67689c667be6c302b7342f8a682d28d7,097410ee-2282-4be7-b7df-5f2bbe4b8450
3,ffe133162533bd67689c667be6c302b7342f8a682d28d7,c631079e-a313-4dca-9302-d9a5d5f7f920
4,ffe133162533bd67689c667be6c302b7342f8a682d28d7,425f1934-e202-4937-b49c-27ab42671c77
...,...,...
25555,ffe133162533bd67689c667be6c302b7342f8a682d28d7,289b46c8-0df7-4a50-86ac-233598fdd479
25556,ffe133162533bd67689c667be6c302b7342f8a682d28d7,c8a4bf31-b968-4cb3-a1de-81d2c3255cbc
25557,ffe133162533bd67689c667be6c302b7342f8a682d28d7,d4ffd0bf-bb72-4577-83cf-0b87f3d89729
25558,ffe133162533bd67689c667be6c302b7342f8a682d28d7,39cd30b3-2a9c-4086-877d-6df0c97e734c


In [13]:
# Per-user localRegion counts and shares; the stored output includes the coldStart column.
region_df

Unnamed: 0,userId,localRegion,countLocalRegionUser,relLocalRegion,coldStart
0,0007faff9dc5595d8bc98d13619c483c8ed73d5bf93822...,pernambuco,1.0,1.000000,True
1,000daf04a745c346dc2c43d55a00a1d0361826c5a3d939...,sul-do-rio-costa-verde,1.0,0.500000,False
2,000daf04a745c346dc2c43d55a00a1d0361826c5a3d939...,piaui,1.0,0.500000,False
3,00124e39dfc38f8ef58500e19812f2ed5592022dc31629...,sao-paulo,1.0,1.000000,True
4,001a6375507d3128ddbdca3038952e943aa740a4013fa7...,sao-paulo,2.0,0.666667,False
...,...,...,...,...,...
53513,ffe133162533bd67689c667be6c302b7342f8a682d28d7...,tocantins,2.0,0.064516,False
53514,ffe133162533bd67689c667be6c302b7342f8a682d28d7...,piaui,2.0,0.064516,False
53515,ffe133162533bd67689c667be6c302b7342f8a682d28d7...,bauru-marilia,2.0,0.064516,False
53516,ffe16673da1661f159e8b8e4e5234513ae383a3f67c02f...,bauru-marilia,1.0,1.000000,True


In [None]:
# Per-user themeSub counts and shares.
# NOTE(review): the stored output has no coldStart column although THEME_SUB_COLS
# in the pipeline cell lists it, so it appears to come from an earlier run - re-execute.
ts_df

Unnamed: 0,userId,themeSub,countThemeSubUser,relThemeSub
0,0006ea19bc2de7e755b09f3a5a8f8d677f0860e4b8f64e...,coronavirus,1.0,1.000000
1,001f0e200640102d46755cdb87bced740ef1d4a4ee353e...,prefeitura-de-goiania,1.0,0.058824
2,002d54131f16dcc589e0630d1d4563dae7e947ffd6d8ee...,2022,1.0,1.000000
3,003506e8858582121df42d576784fba6473b21e42692c9...,musica,1.0,1.000000
4,003a1f9224dbcf66de78c9f1044de810fb9d53e1b1b30e...,ucrania-russia,2.0,0.500000
...,...,...,...,...
8423,ffbe67664a9f0867cebadc44e6dbead29ec59021bb4d98...,cinema,1.0,0.166667
8424,ffd8a476ee9846b6bcc47696001615cd75b756f7cb2ccf...,pets,1.0,0.076923
8425,ffd8a476ee9846b6bcc47696001615cd75b756f7cb2ccf...,coronavirus,1.0,0.076923
8426,ffe133162533bd67689c667be6c302b7342f8a682d28d7...,eleicoes,2.0,0.064516


In [43]:
# Per-user themeMain counts and shares.
# NOTE(review): the stored output has no coldStart column although THEME_MAIN_COLS
# in the pipeline cell lists it, so it appears to come from an earlier run - re-execute.
tm_df

Unnamed: 0,userId,themeMain,countThemeMainUser,relThemeMain
0,0006ea19bc2de7e755b09f3a5a8f8d677f0860e4b8f64e...,bemestar,1.0,1.000000
1,0007faff9dc5595d8bc98d13619c483c8ed73d5bf93822...,educacao,1.0,1.000000
2,001a6375507d3128ddbdca3038952e943aa740a4013fa7...,o-que-fazer-em-sao-paulo,1.0,0.333333
3,001f0e200640102d46755cdb87bced740ef1d4a4ee353e...,politica,1.0,0.058824
4,001f0e200640102d46755cdb87bced740ef1d4a4ee353e...,mundo,1.0,0.058824
...,...,...,...,...
30957,ffe553cc357d794c744defb60886eac9f257128ee1852c...,pop-arte,1.0,1.000000
30958,fff3ab996fd24e7ea351fcec4f4dd18cd159684a8d3ae8...,economia,1.0,1.000000
30959,fff8b83a17df3c162182b2f4a4ed6093f446fc869e0e2d...,meio-ambiente,1.0,1.000000
30960,ffff2c95e9c668ba32f163d1b2573ae67c95ea49e7b380...,pop-arte,1.0,0.500000


In [42]:
# Publication-to-consumption gap per (userId, pageId).
gap_df

Unnamed: 0,userId,pageId,timeGapDays,timeGapHours,timeGapMinutes,timeGapLessThanOneDay
0,0006ea19bc2de7e755b09f3a5a8f8d677f0860e4b8f64e...,fc1d78dd-8bcd-4b58-8564-078c024e9a28,722,17329.842500,1.039791e+06,False
1,0007faff9dc5595d8bc98d13619c483c8ed73d5bf93822...,2775a8d1-78bc-4ad7-89bd-b5b6131e0ffe,3,73.677778,4.420667e+03,False
2,001a6375507d3128ddbdca3038952e943aa740a4013fa7...,26135428-2804-4db4-aef5-f9fcf756b0a4,12,288.616667,1.731700e+04,False
3,001a6375507d3128ddbdca3038952e943aa740a4013fa7...,eb5879d5-af89-46dd-b06c-2df5a0d60af3,373,8954.940000,5.372964e+05,False
4,001a6375507d3128ddbdca3038952e943aa740a4013fa7...,e0b8e742-f94f-4361-b662-9e9d24c99803,3,93.891111,5.633467e+03,False
...,...,...,...,...,...,...
44902,ffe712e6ed65e6957dd8cc4d9bd99852fcdc2a652312ea...,79acc216-7fa2-44d7-94c2-04df106d45e7,549,13186.777222,7.912066e+05,False
44903,fff3ab996fd24e7ea351fcec4f4dd18cd159684a8d3ae8...,dec897ab-ba10-4083-903d-de674ea6cfbe,10,250.335556,1.502013e+04,False
44904,fff8b83a17df3c162182b2f4a4ed6093f446fc869e0e2d...,d7b34225-e694-4564-9825-833e76ca13d0,31,754.817222,4.528903e+04,False
44905,ffff2c95e9c668ba32f163d1b2573ae67c95ea49e7b380...,a1344ecc-8aff-42ad-abfc-08b84462e545,493,11837.485833,7.102492e+05,False


In [41]:
# NOTE(review): second display of region_df; the stored output lacks the coldStart
# column shown in the earlier region_df cell, so it appears stale. Consider removing.
region_df

Unnamed: 0,userId,localRegion,countLocalRegionUser,relLocalRegion
0,0007faff9dc5595d8bc98d13619c483c8ed73d5bf93822...,pernambuco,1.0,1.000000
1,000daf04a745c346dc2c43d55a00a1d0361826c5a3d939...,sul-do-rio-costa-verde,1.0,0.500000
2,000daf04a745c346dc2c43d55a00a1d0361826c5a3d939...,piaui,1.0,0.500000
3,00124e39dfc38f8ef58500e19812f2ed5592022dc31629...,sao-paulo,1.0,1.000000
4,001a6375507d3128ddbdca3038952e943aa740a4013fa7...,sao-paulo,2.0,0.666667
...,...,...,...,...
53513,ffe133162533bd67689c667be6c302b7342f8a682d28d7...,tocantins,2.0,0.064516
53514,ffe133162533bd67689c667be6c302b7342f8a682d28d7...,piaui,2.0,0.064516
53515,ffe133162533bd67689c667be6c302b7342f8a682d28d7...,bauru-marilia,2.0,0.064516
53516,ffe16673da1661f159e8b8e4e5234513ae383a3f67c02f...,bauru-marilia,1.0,1.000000


In [40]:
# Per-user localState counts and shares.
# NOTE(review): the stored output has no coldStart column although STATE_COLS
# in the pipeline cell lists it, so it appears to come from an earlier run - re-execute.
state_df

Unnamed: 0,userId,localState,countLocalStateUser,relLocalState
0,0007faff9dc5595d8bc98d13619c483c8ed73d5bf93822...,pe,1.0,1.000000
1,000daf04a745c346dc2c43d55a00a1d0361826c5a3d939...,rj,1.0,0.500000
2,000daf04a745c346dc2c43d55a00a1d0361826c5a3d939...,pi,1.0,0.500000
3,00124e39dfc38f8ef58500e19812f2ed5592022dc31629...,sp,1.0,1.000000
4,001a6375507d3128ddbdca3038952e943aa740a4013fa7...,sp,2.0,0.666667
...,...,...,...,...
53513,ffe133162533bd67689c667be6c302b7342f8a682d28d7...,to,2.0,0.064516
53514,ffe133162533bd67689c667be6c302b7342f8a682d28d7...,pi,2.0,0.064516
53515,ffe133162533bd67689c667be6c302b7342f8a682d28d7...,sp,5.0,0.161290
53516,ffe16673da1661f159e8b8e4e5234513ae383a3f67c02f...,sp,1.0,1.000000


In [30]:
# NOTE(review): second display of gap_df; the stored output shows a non-contiguous
# index (0, 1, 5, ...), unlike the reset index _split_dataframes produces, so it
# appears to come from an earlier run. Consider removing.
gap_df

Unnamed: 0,userId,pageId,timeGapDays,timeGapHours,timeGapMinutes,timeGapLessThanOneDay
0,0006ea19bc2de7e755b09f3a5a8f8d677f0860e4b8f64e...,fc1d78dd-8bcd-4b58-8564-078c024e9a28,722,17329.842500,1.039791e+06,False
1,0007faff9dc5595d8bc98d13619c483c8ed73d5bf93822...,2775a8d1-78bc-4ad7-89bd-b5b6131e0ffe,3,73.677778,4.420667e+03,False
5,001a6375507d3128ddbdca3038952e943aa740a4013fa7...,26135428-2804-4db4-aef5-f9fcf756b0a4,12,288.616667,1.731700e+04,False
6,001a6375507d3128ddbdca3038952e943aa740a4013fa7...,eb5879d5-af89-46dd-b06c-2df5a0d60af3,373,8954.940000,5.372964e+05,False
7,001a6375507d3128ddbdca3038952e943aa740a4013fa7...,e0b8e742-f94f-4361-b662-9e9d24c99803,3,93.891111,5.633467e+03,False
...,...,...,...,...,...,...
84400,ffe712e6ed65e6957dd8cc4d9bd99852fcdc2a652312ea...,79acc216-7fa2-44d7-94c2-04df106d45e7,549,13186.777222,7.912066e+05,False
84401,fff3ab996fd24e7ea351fcec4f4dd18cd159684a8d3ae8...,dec897ab-ba10-4083-903d-de674ea6cfbe,10,250.335556,1.502013e+04,False
84402,fff8b83a17df3c162182b2f4a4ed6093f446fc869e0e2d...,d7b34225-e694-4564-9825-833e76ca13d0,31,754.817222,4.528903e+04,False
84403,ffff2c95e9c668ba32f163d1b2573ae67c95ea49e7b380...,a1344ecc-8aff-42ad-abfc-08b84462e545,493,11837.485833,7.102492e+05,False


In [None]:
# FINAL FEATURE DF


In [10]:
# Distribution of the per-user localState share.
# NOTE(review): this needs a df_mix that carries relLocalState, but the only visible
# module-level assignment (df_mix = preprocess_mix_feats(news, users)) returns just
# MIX_FEATS_COLS - this likely depends on kernel state from an earlier run.
df_mix["relLocalState"].value_counts()

relLocalState
1.000000    13741
0.500000     6492
0.000000     5242
0.666667     4587
0.750000     2968
            ...  
0.350000       20
0.263158       19
0.294118       17
0.062500       16
0.133333       15
Name: count, Length: 329, dtype: int64