In [None]:
from datetime import date
import calendar as c
import pandas as pd
import numpy as np

In [None]:
# Widen the pandas display limits so large frames and long cell values are
# shown in full when inspected in the notebook.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 1500
pd.options.display.max_colwidth = 100000

#### Para testear las funciones, levanto los dataframes originales

In [None]:
# Load the raw inputs; FEC_EVENT is parsed into datetimes at read time.
df_pageviews = pd.read_csv('data/pageviews.csv', parse_dates=['FEC_EVENT'])
df_adoption = pd.read_csv("./data/conversiones.csv")
# Module-level snapshot settings. `year` is read as a global by
# _get_valid_pageviews_last_month and `mes_snapshot` by universo_train.
year = 2018
mes_snapshot = 9

def universo_train(df, snapshot_mes=None):
    """Build the user universe: one row per distinct USER_ID found in `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``USER_ID`` column.
    snapshot_mes : optional
        Value stored in the ``snapshot_mes`` column of every row. When omitted
        (``None``) the module-level ``mes_snapshot`` is used, which is the
        behaviour the function had before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id`` (distinct values in order of first appearance) and
        ``snapshot_mes``.
    """
    if snapshot_mes is None:
        # Backward-compatible fallback to the notebook-level setting.
        snapshot_mes = mes_snapshot
    universo = pd.DataFrame()
    universo['user_id'] = df['USER_ID'].unique()
    universo['snapshot_mes'] = snapshot_mes
    return universo

# Module-level user universe built from the pageviews; _df_refactor reads this
# name as a global when it re-aligns per-user frames.
universo = universo_train(df_pageviews)

In [None]:
def _sum_campo_user(df_pageviews, campo):
    """Per-user frequency table of the values found in column `campo`.

    Cross-tabulates ``df_pageviews.user_id`` against ``df_pageviews[campo]``,
    names each resulting column ``"<campo>_<value>"``, turns ``user_id`` back
    into a regular column and re-aligns the result through ``_df_refactor``.
    """
    counts = pd.crosstab(df_pageviews.user_id, df_pageviews[campo])
    counts.columns = ["_".join((campo, str(value))) for value in counts.columns]
    return _df_refactor(counts.reset_index())

In [None]:
def _df_refactor(df_temp):
    """Re-align a per-user frame to the module-level ``universo``.

    Left-joins `df_temp` onto ``universo`` by ``user_id`` (so every user of the
    universe gets a row, with NaN where `df_temp` has none), removes the
    ``snapshot_mes`` column and returns the rows sorted by ``user_id``.
    """
    aligned = universo.merge(df_temp, how='left', left_on=['user_id'], right_on=['user_id'])
    return aligned.drop(columns=['snapshot_mes']).sort_values('user_id')

In [None]:
def _sum_total_user(df_pageviews, campo):
    """Per-user sum of column `campo`, re-aligned through ``_df_refactor``.

    Groups `df_pageviews` by ``user_id``, sums the values of `campo` inside
    each group and returns a frame with ``user_id`` and `campo` columns.
    NOTE(review): this adds up the *values* of `campo`, it does not count
    rows - confirm that is the intended "total".
    """
    per_user = df_pageviews.groupby("user_id")[campo].sum()
    totals = per_user.to_frame().reset_index()
    return _df_refactor(totals)

In [None]:
def _df_complete_columns(df_complete_original, df_incomplete):
    df_complete = df_complete_original.copy()
    df_complete[:] = 0
    df = pd.DataFrame()
    for a_column in df_complete:
        if a_column in df_incomplete.columns:
            df[a_column] = df_incomplete[a_column]
        else:
            df[a_column] = df_complete[a_column]
    return df

In [None]:
def _df_repeat_columns(df_complete_original, df_incomplete):
    users = df_complete_original['user_id']
    df_complete = df_complete_original
    df = df_incomplete[[df_incomplete.columns[1] for i in range(df_complete.shape[1]-1)]]
    df['user_id'] = users
    return df

In [None]:
def _universo_last_adoption_month(universo, df_adoption):
    df_last_adoption = df_adoption.groupby('USER_ID')[('mes','anio')].max().reset_index()
    return df_last_adoption

In [None]:
def _get_valid_pageviews_last_adoption(universo, df_pageviews, df_adoption):
    """Pageviews from the calendar month preceding each user's last adoption month.

    Joins the per-user last adoption (``mes``/``anio``) with `df_pageviews` on
    ``USER_ID`` and keeps the rows whose ``FEC_EVENT`` falls in the half-open
    window ``[first day of adoption month - 1 month, first day of adoption month)``.
    The helper date columns, ``FEC_EVENT``, ``anio`` and ``mes`` are removed and
    ``USER_ID`` is renamed to ``user_id`` in the returned frame.
    """
    last_adoption = _universo_last_adoption_month(universo, df_adoption)
    joined = last_adoption.merge(df_pageviews, how='left', left_on=['USER_ID'], right_on=['USER_ID'])

    # Window bounds, anchored on the first day of the adoption month.
    joined['last_adoption_month'] = pd.to_datetime(dict(year=joined.anio, month=joined.mes, day=1))
    joined['valid_data_from'] = joined.last_adoption_month + pd.DateOffset(months=-1)
    joined['valid_data_to'] = joined.last_adoption_month

    on_or_after_start = joined['FEC_EVENT'] >= joined['valid_data_from']
    before_end = joined['FEC_EVENT'] < joined['valid_data_to']
    in_window = joined[on_or_after_start & before_end]

    helper_columns = [
        'last_adoption_month',
        'valid_data_from',
        'valid_data_to',
        'FEC_EVENT',
        'anio',
        'mes',
    ]
    return in_window.drop(helper_columns, axis=1).rename(columns={"USER_ID": "user_id"})

In [None]:
def _get_valid_pageviews_last_month(universo, df_pageviews, mes_snapshot):
    df_last_month = pd.merge(universo, df_pageviews, how='left', left_on=['user_id'], right_on=['USER_ID'])
    
    df_last_month['valid_data_to'] = pd.to_datetime(dict(year=year, month=df_last_month.snapshot_mes, day=1))
    df_last_month['valid_data_from'] = df_last_month.valid_data_to + pd.DateOffset(months=-1)
    
    df_last_month = df_last_month[df_last_month['FEC_EVENT'] >= df_last_month['valid_data_from']]
    df_last_month = df_last_month[df_last_month['FEC_EVENT'] < df_last_month['valid_data_to']]

    
    return df_last_month

In [None]:
def _all_pageview_last_adoption_and_last_month(universo, df_pageviews, df_adoption, mes_snapshot, campo):
    """Aggregate `campo` per user over the pre-adoption and pre-snapshot windows.

    Returns a 4-tuple, in this order:
    per-value table for the last-adoption window, per-user total for the
    last-adoption window, per-value table for the last-month window, per-user
    total for the last-month window.
    """
    adoption_views = _get_valid_pageviews_last_adoption(universo, df_pageviews, df_adoption)
    adoption_by_value = _sum_campo_user(adoption_views, campo)
    adoption_total = _sum_total_user(adoption_views, campo)

    month_views = _get_valid_pageviews_last_month(universo, df_pageviews, mes_snapshot)
    month_by_value = _sum_campo_user(month_views, campo)
    month_total = _sum_total_user(month_views, campo)

    return (adoption_by_value, adoption_total, month_by_value, month_total)


In [None]:
def _ratio_campo_df(df_numerador, df_denominador, name):
    #Eliminar columnar user_id para que no la divida
    users = df_numerador['user_id']
    df_numerador = df_numerador.drop(['user_id'], axis = 1)
    df_denominador = df_denominador.drop(['user_id'], axis = 1)
    
    #División element-wise
    temp = df_numerador.div(df_denominador, axis=0, fill_value = 0)
    temp.columns = [name + "_" + str(v) for v in temp.columns]

    #Agrega columna user_id
    temp['user_id'] = users

    #Completa Nan
    temp = temp.fillna(-999)
    return temp

In [None]:
def _ratio_campo_all(universo, df_pageviews, df_adoption, mes_snapshot, campo):
    """Attach the adoption-vs-last-month ratio features for `campo` to `universo`.

    Computes the per-value ratios (prefixed ``ratio_adoption_preferencia``) and
    the per-user total ratios (prefixed ``ratio_adoption_interes``) between the
    pre-adoption window and the pre-snapshot window, then left-joins both onto
    `universo` by ``user_id``.
    """
    (
        adoption_by_value,
        adoption_total,
        month_by_value,
        month_total,
    ) = _all_pageview_last_adoption_and_last_month(universo, df_pageviews, df_adoption, mes_snapshot, campo)

    preference_ratio = _ratio_campo_df(adoption_by_value, month_by_value, 'ratio_adoption_preferencia')
    interest_ratio = _ratio_campo_df(adoption_total, month_total, 'ratio_adoption_interes')

    features = universo
    for ratio_frame in (preference_ratio, interest_ratio):
        features = pd.merge(features, ratio_frame, how='left', left_on=['user_id'], right_on=['user_id'])
    return features


In [None]:
def ratio_adoption_page_all(universo, df_pageviews, df_adoption, mes_snapshot):
    """Ratio features for the ``PAGE`` column; delegates to ``_ratio_campo_all``."""
    return _ratio_campo_all(universo, df_pageviews, df_adoption, mes_snapshot, 'PAGE')

In [None]:
def ratio_adoption_ccategory_all(universo, df_pageviews, df_adoption, mes_snapshot):
    """Ratio features for the ``CONTENT_CATEGORY`` column; delegates to ``_ratio_campo_all``."""
    return _ratio_campo_all(universo, df_pageviews, df_adoption, mes_snapshot, 'CONTENT_CATEGORY')

In [None]:
def ratio_adoption_cctop_all(universo, df_pageviews, df_adoption, mes_snapshot):
    """Ratio features for the ``CONTENT_CATEGORY_TOP`` column; delegates to ``_ratio_campo_all``."""
    return _ratio_campo_all(universo, df_pageviews, df_adoption, mes_snapshot, 'CONTENT_CATEGORY_TOP')

In [None]:
def ratio_adoption_ccbottom_all(universo, df_pageviews, df_adoption, mes_snapshot):
    """Ratio features for the ``CONTENT_CATEGORY_BOTTOM`` column; delegates to ``_ratio_campo_all``."""
    return _ratio_campo_all(universo, df_pageviews, df_adoption, mes_snapshot, 'CONTENT_CATEGORY_BOTTOM')

In [None]:
def ratio_adoption_site_id_all(universo, df_pageviews, df_adoption, mes_snapshot):
    """Ratio features for the ``SITE_ID`` column; delegates to ``_ratio_campo_all``."""
    return _ratio_campo_all(universo, df_pageviews, df_adoption, mes_snapshot, 'SITE_ID')

In [None]:
def ratio_adoption_mobile_device_all(universo, df_devicedata, df_adoption, mes_snapshot):
    """Ratio features for the ``IS_MOBILE_DEVICE`` column of `df_devicedata`; delegates to ``_ratio_campo_all``."""
    return _ratio_campo_all(universo, df_devicedata, df_adoption, mes_snapshot, 'IS_MOBILE_DEVICE')

In [None]:
def ratio_adoption_connection_speed_all(universo, df_devicedata, df_adoption, mes_snapshot):
    """Ratio features for the ``CONNECTION_SPEED`` column of `df_devicedata`; delegates to ``_ratio_campo_all``."""
    return _ratio_campo_all(universo, df_devicedata, df_adoption, mes_snapshot, 'CONNECTION_SPEED')

In [None]:
def ratio_adoption_on_site_search_term_all(universo, df_devicedata, df_adoption, mes_snapshot):
    """Ratio features for the ``ON_SITE_SEARCH_TERM`` column of `df_devicedata`.

    Delegates to ``_ratio_campo_all``. `df_adoption` is now forwarded: the
    previous call omitted it, so ``_ratio_campo_all`` received four arguments
    instead of five and raised ``TypeError``.
    """
    campo = 'ON_SITE_SEARCH_TERM'
    return _ratio_campo_all(universo, df_devicedata, df_adoption, mes_snapshot, campo)