# Features generation

## Importing packages

In [1]:
import pandas as pd
import numpy as np
import time
import datetime
import calendar

## Loading the original dataframes for testing

In [2]:
# Raw event tables; the FEC_EVENT column is parsed into datetimes at load time.
df_pageviews = pd.read_csv('data/pageviews.csv', parse_dates=['FEC_EVENT'])
df_devicedata = pd.read_csv("./data/device_data.csv", parse_dates=["FEC_EVENT"])
# Snapshot period (calendar year and month number) shared by the cells below.
year = 2018
mes_snapshot = 9

def universo_train(df):
    """Build the user universe from an event table.

    Returns a new DataFrame with one row per distinct ``USER_ID`` found in
    ``df`` (column ``user_id``) and a ``snapshot_mes`` column holding the
    notebook-level ``mes_snapshot`` value on every row.
    """
    usuarios = df['USER_ID'].unique()
    return pd.DataFrame({'user_id': usuarios}).assign(snapshot_mes=mes_snapshot)

# User universe: the distinct USER_ID values found in the pageviews table.
universo = universo_train(df_pageviews)

In [3]:
# Conversions table with exact duplicate rows removed and a fresh 0..n-1 index.
df_conversiones = pd.read_csv("./data/conversiones.csv").drop_duplicates().reset_index(drop=True)

## Features generation per se

Método que calcula la cantidad de adopciones que tuvo cada cliente en las siguientes ventanas temporales (relativas a `snapshot_mes`):
 - último mes (`qty_adopciones_1M`)
 - últimos tres meses (`qty_adopciones_3M`)
 - últimos seis meses (`qty_adopciones_6M`)
 - últimos nueve meses (`qty_adopciones_9M`)

In [4]:
def qty_adoptions(universo, df_conversiones):
    """Count each user's adoptions over trailing 1, 3, 6 and 9 month windows.

    The snapshot month is read from the first row of ``universo['snapshot_mes']``.
    A window of ``n`` months covers the months ``snapshot - n + 1 .. snapshot``.

    Parameters
    ----------
    universo : pandas.DataFrame
        Must be non-empty and contain ``user_id`` and ``snapshot_mes`` columns.
    df_conversiones : pandas.DataFrame
        Must contain ``USER_ID`` and ``mes`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``universo`` plus ``qty_adopciones_1M``, ``qty_adopciones_3M``,
        ``qty_adopciones_6M`` and ``qty_adopciones_9M``. Every missing value in
        the resulting frame is replaced by 0.
    """
    # Positional access: a label lookup ([0]) raises KeyError whenever the
    # universe does not carry index label 0 (e.g. after filtering).
    mes_snapshot = universo['snapshot_mes'].iloc[0]

    # NOTE(review): the window is built from the month number only; `anio` is
    # not consulted, so it does not wrap into the previous year - confirm that
    # the data and the snapshot month make this acceptable.
    en_ventana = (df_conversiones['mes'] <= mes_snapshot) & (df_conversiones['mes'] > mes_snapshot - 9)
    conversiones = (
        df_conversiones.loc[en_ventana, ['USER_ID', 'mes']]
        .astype({'mes': 'int32', 'USER_ID': 'int64'})
    )

    for n_meses in (1, 3, 6, 9):
        columna = 'qty_adopciones_{}M'.format(n_meses)
        recientes = conversiones[conversiones['mes'] > mes_snapshot - n_meses]
        # Counting a single column keeps unrelated conversion columns out of the merge.
        conteo = (
            recientes.groupby('USER_ID')['mes']
            .count()
            .rename(columna)
            .rename_axis('user_id')
            .reset_index()
        )
        universo = universo.merge(conteo, how='left', on='user_id')

    # Users without adoptions in a window get 0 instead of NaN.
    return universo.fillna(0)

Método que calcula la cantidad de días transcurridos desde la última adopción.

¡Ojo! Las adopciones las tenemos a nivel de mes, por lo que se tomó el día de cierre de dicho mes como fecha de la adopción.

In [5]:
def qty_days_last_adoption(universo, df_conversiones, snapshot_year=None):
    """Add ``qty_dias_ultima_adopcion``: days since each user's last adoption.

    Conversions are only known at month level, so each one is dated on the
    last calendar day of its (``anio``, ``mes``). The reference date is the
    first day of the month that follows the snapshot month. Only conversions
    whose month number lies in the nine months ending at the snapshot month
    are considered.

    Parameters
    ----------
    universo : pandas.DataFrame
        Must be non-empty and contain ``user_id`` and ``snapshot_mes``; the
        snapshot month is read from its first row.
    df_conversiones : pandas.DataFrame
        Must contain ``USER_ID``, ``mes`` and ``anio``. It is not modified.
    snapshot_year : int, optional
        Calendar year of the snapshot. Defaults to the notebook-level ``year``.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``universo`` plus ``qty_dias_ultima_adopcion``. Every
        missing value in the resulting frame is replaced by 0, so users
        without adoptions in the window get 0.
    """
    if snapshot_year is None:
        snapshot_year = year
    mes_snapshot = int(universo['snapshot_mes'].iloc[0])
    # First day of the following month; a December snapshot rolls over to
    # January of the next year instead of requesting month 13.
    today = pd.Timestamp(year=int(snapshot_year) + mes_snapshot // 12,
                         month=mes_snapshot % 12 + 1,
                         day=1)

    # NOTE(review): the window is built from the month number only; `anio` is
    # not used for filtering - confirm the data holds a single relevant year.
    en_ventana = (df_conversiones['mes'] <= mes_snapshot) & (df_conversiones['mes'] > mes_snapshot - 9)
    conversiones = (
        df_conversiones.loc[en_ventana, ['USER_ID', 'mes', 'anio']]
        .astype({'mes': 'int32', 'anio': 'int32', 'USER_ID': 'int64'})
    )

    # Month-end date of every conversion, computed column-wise.
    partes = conversiones.rename(columns={'anio': 'year', 'mes': 'month'})[['year', 'month']].assign(day=1)
    cierre_mes = pd.to_datetime(partes) + pd.offsets.MonthEnd(0)
    ultima_adopcion = cierre_mes.groupby(conversiones['USER_ID']).max()

    # Only the feature itself is merged back, so helper columns
    # (USER_ID / mes / anio) do not leak into the universe.
    dias = (
        (today - ultima_adopcion).dt.days
        .rename('qty_dias_ultima_adopcion')
        .rename_axis('user_id')
        .reset_index()
    )
    universo = universo.merge(dias, how='left', on='user_id')
    return universo.fillna(0)

Método que calcula la cantidad de días transcurridos desde la última visita.
Los NaNs se rellenan con cero.

In [6]:
def qty_days_last_visit(universo, df_pageviews):
    today = datetime.datetime(year=year, month=mes_snapshot+1, day=1)
    
    df_pageviews['mes'] = pd.DatetimeIndex(df_pageviews['FEC_EVENT']).month
    df_pageviews = df_pageviews[df_pageviews.mes <= mes_snapshot]
    df_pageviews = df_pageviews[df_pageviews.mes > mes_snapshot - 9]
    
    df_ = df_pageviews.groupby(['USER_ID']).max()
    df_.drop(df_.columns.difference(['USER_ID', 'FEC_EVENT']), axis=1, inplace=True)
    df_['qty_dias_ultimo_page_ingreso'] = df_.apply(lambda row: (today - row['FEC_EVENT']).days, axis=1)
    df_.drop(['FEC_EVENT'], axis='columns', inplace=True)

    universo = pd.merge(universo, df_, how='left', left_on=['user_id'], right_on=['USER_ID'])
    universo = universo.fillna(0)
    return universo

In [7]:
# This function should not be used: it would also include the cases where the adoption value is NaN (replaced by zero).

# def qty_days_last_visit_vs_adoption_bis(universo, df_pageviews, df_conversiones):
#     universo = qty_days_last_adoption(universo, df_conversiones)
#     universo = qty_days_last_visit(universo, df_pageviews)
#     universo['qty_days_last_visit_vs_adoption'] = universo.apply(lambda row: abs(row['qty_days_last_adoption'] - row['qty_days_last_visit']), axis=1)
    
#     return universo

Método que calcula la cantidad de días entre la última adopción y la última visita. Se toma la diferencia en valor absoluto.

Los NaNs se rellenan con cero.

In [8]:
def qty_days_last_visit_vs_adoption(universo, df_pageviews, df_conversiones):
    """Add the absolute number of days between last adoption and last visit.

    Conversions are dated on the last calendar day of their (``anio``, ``mes``).
    Both page views and conversions are restricted to month numbers in the
    nine months ending at the snapshot month. The feature is computed only for
    users that have at least one conversion in that window.

    Parameters
    ----------
    universo : pandas.DataFrame
        Must be non-empty and contain ``user_id`` and ``snapshot_mes``; the
        snapshot month is read from its first row.
    df_pageviews : pandas.DataFrame
        Must contain ``USER_ID`` and ``FEC_EVENT``. It is not modified
        (no helper column is added to the caller's frame).
    df_conversiones : pandas.DataFrame
        Must contain ``USER_ID``, ``mes`` and ``anio``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``universo`` plus ``qty_días_adoption_ultimo_page_ingreso``.
        Every missing value in the resulting frame is replaced by 0, so users
        lacking either a conversion or a visit in the window get 0.
    """
    mes_snapshot = int(universo['snapshot_mes'].iloc[0])

    # Last visit per user, computed on a two-column copy of the page views.
    visitas = df_pageviews[['USER_ID', 'FEC_EVENT']].assign(
        FEC_EVENT=lambda d: pd.to_datetime(d['FEC_EVENT'])
    )
    # NOTE(review): both windows are built from the month number only; the
    # year is not used for filtering - confirm this is intended.
    meses = visitas['FEC_EVENT'].dt.month
    visitas_en_ventana = (meses <= mes_snapshot) & (meses > mes_snapshot - 9)
    ultima_visita = visitas[visitas_en_ventana].groupby('USER_ID')['FEC_EVENT'].max()

    # Last adoption per user, dated on the closing day of its month.
    conv_en_ventana = (df_conversiones['mes'] <= mes_snapshot) & (df_conversiones['mes'] > mes_snapshot - 9)
    conversiones = (
        df_conversiones.loc[conv_en_ventana, ['USER_ID', 'mes', 'anio']]
        .astype({'mes': 'int32', 'anio': 'int32', 'USER_ID': 'int64'})
    )
    partes = conversiones.rename(columns={'anio': 'year', 'mes': 'month'})[['year', 'month']].assign(day=1)
    cierre_mes = pd.to_datetime(partes) + pd.offsets.MonthEnd(0)
    ultima_adopcion = cierre_mes.groupby(conversiones['USER_ID']).max()

    # Align visits to the users that converted; converters without visits
    # yield NaT and therefore NaN days, which the final fillna turns into 0.
    diferencia = ultima_adopcion - ultima_visita.reindex(ultima_adopcion.index)
    dias = (
        diferencia.dt.days.abs()
        .rename('qty_días_adoption_ultimo_page_ingreso')
        .rename_axis('user_id')
        .reset_index()
    )
    universo = universo.merge(dias, how='left', on='user_id')
    return universo.fillna(0)