In [26]:
import pandas as pd
import numpy as np
import datetime as dt

from sklearn.preprocessing import LabelEncoder 

In [3]:
# Load the two input datasets: the training labels and the full event log.
# low_memory=False makes pandas parse the events file in one pass rather than
# in chunks.
labels = pd.read_csv("../labels_training_set.csv")
eventos = pd.read_csv("../events_up_to_01062018.csv", low_memory=False)

In [4]:
# Show the column names available in the events frame.
eventos.columns.tolist()

['timestamp',
 'event',
 'person',
 'url',
 'sku',
 'model',
 'condition',
 'storage',
 'color',
 'skus',
 'search_term',
 'staticpage',
 'campaign_source',
 'search_engine',
 'channel',
 'new_vs_returning',
 'city',
 'region',
 'country',
 'device_type',
 'screen_resolution',
 'operating_system_version',
 'browser_version']

In [11]:
# Create the features dataframe, seeded with the labels.
# Take an explicit copy instead of aliasing `labels`, so that any in-place
# change made to `features` can never leak back into the raw labels frame and
# re-running this cell always restarts from the untouched labels.
features = labels.copy()

In [31]:
# Display the (rows, columns) shape of the features frame.
# NOTE(review): the recorded output below was produced at execution count 31,
# i.e. after the later cells had already run, so it is not the shape a fresh
# top-to-bottom run would show at this point.
features.shape

(19414, 65)

In [13]:
# Feature: did the user ever come back to the site?
#
# The previous version counted the non-null `new_vs_returning` values per
# person and flagged `count > 0`. That is true for anyone with at least one
# recorded visit, new or returning, so the flag did not answer the question.
# Flag only users with at least one visit recorded as a returning one.
# NOTE(review): assumes returning visits are stored as the string 'Returning';
# confirm with eventos['new_vs_returning'].unique().
es_retorno = eventos['new_vs_returning'].eq('Returning')
df = (
    es_retorno.groupby(eventos['person'])
    .any()
    .rename('retornoAlSitio')
    .reset_index()
)
features = pd.merge(features, df, on='person', how='left')


In [14]:
# Feature: number of events per device condition, one column per condition.
nombres_condicion = {
    'Bom': 'cantidad_visualizaciones_buen_estado',
    'Bom - Sem Touch ID': 'cantidad_visualizaciones_buen_estado_sin_touch',
    'Novo': 'cantidad_visualizaciones_nuevo',
    'Muito Bom': 'cantidad_visualizaciones_muy_buen_estado',
    'Excelente': 'cantidad_visualizaciones_excelente',
}

# Count each condition per person, pivot the conditions into columns, give
# them descriptive names and treat "condition never seen" as a count of 0.
df = (
    eventos.groupby('person')['condition']
    .value_counts()
    .unstack()
    .rename(columns=nombres_condicion)
    .fillna(0)
)
features = features.merge(df, on='person', how='left')

In [16]:
# Feature: total number of events generated by each person.
df = (
    eventos.groupby('person')['event']
    .count()
    .rename('cantidad_eventos_generados')
    .reset_index()
)
features = features.merge(df, on='person', how='left')


In [17]:
# Features: number of events per event type, one column per type.
nombres_evento = {
    'ad campaign hit': 'cantidadIngresosPorCampania',
    'brand listing': 'cantidadDeVisualizacionesPorMarca',
    'checkout': 'cantidadDeCheckouts',
    'conversion': 'cantidadConversiones',
    'generic listing': 'cantidadDeVisualizacionesDeLaHome',
    'lead': 'cantidadDeGeneracionDeNotificaciones',
    'search engine hit': 'cantidadDeIngresosPorBuscador',
    'searched products': 'cantidadDeProductosBuscados',
    'staticpage': 'cantidadDeVisitas',
    'viewed product': 'cantidadDeVisualizacionesDeProducto',
    'visited site': 'cantidadDeIngresosPorUrl',
}

# Per-person counts of each event type, pivoted to columns; a type the person
# never triggered becomes 0 before the columns are renamed.
df = (
    eventos.groupby('person')['event']
    .value_counts()
    .unstack()
    .fillna(0)
    .rename(columns=nombres_evento)
)
features = features.merge(df, on='person', how='left')

In [18]:
# Features: number of events per brand, one column per brand.
#
# The brand is taken as the first space-separated token of `model`.
# `assign` returns a new frame, so the raw `eventos` frame is left untouched.
# The previous `eventos_con_marca = eventos` was only an alias, so adding the
# 'marca' column silently mutated `eventos` for every other cell.
eventos_con_marca = eventos.assign(
    marca=eventos['model'].str.split(' ').str[0]
)

nombres_marca = {
    'Asus': 'cantidadDeVisualizacionesAsus',
    'LG': 'cantidadVisualizacionesLG',
    'Lenovo': 'cantidadVisualizacionesLenovo',
    'Outros': 'cantidadVisualizacionesOtros',
    'Quantum': 'cantidadVisualizacionesQuantum',
    'Samsung': 'cantidadVisualizacionesSamsung',
    'Sony': 'cantidadVisualizacionesSony',
    'Xiaomi': 'cantidadVisualizacionesXiaomi',
    'iPad': 'cantidadVisualizacionesiPad',
    'iPhone': 'cantidadDeVisualizacionesiPhone',
}

# Per-person counts of each brand, pivoted to columns; a brand the person
# never saw becomes 0.
df = eventos_con_marca.groupby('person')['marca'].value_counts().unstack()
df = df.fillna(0)
df = df.rename(columns=nombres_marca)
features = pd.merge(features, df, on='person', how='left')

In [19]:
# Feature: region of the user, taken from the first non-null `region` value
# among the person's events whose type contains 'visited site'.
# Persons whose matching events carry no region at all get the string 'NA'.
es_visita = eventos['event'].str.contains('visited site')
df = eventos[es_visita].groupby('person')[['region']].first().fillna('NA')
features = features.merge(df, on='person', how='left')

In [20]:
# Feature: city of the user, taken from the first non-null `city` value among
# the person's events whose type contains 'visited site'.
# Persons whose matching events carry no city at all get the string 'NA'.
es_visita = eventos['event'].str.contains('visited site')
df = eventos[es_visita].groupby('person')[['city']].first().fillna('NA')
features = features.merge(df, on='person', how='left')

In [21]:
# Feature: country of the user, taken from the first non-null `country` value
# among the person's events whose type contains 'visited site'.
# Persons whose matching events carry no country at all get the string 'NA'.
es_visita = eventos['event'].str.contains('visited site')
df = eventos[es_visita].groupby('person')[['country']].first().fillna('NA')
features = features.merge(df, on='person', how='left')

In [22]:
# Feature: device type of the user, taken from the first non-null
# `device_type` value among the person's events whose type contains
# 'visited site'.
# Persons whose matching events carry no device type at all get the string 'NA'.
es_visita = eventos['event'].str.contains('visited site')
df = eventos[es_visita].groupby('person')[['device_type']].first().fillna('NA')
features = features.merge(df, on='person', how='left')

In [23]:
# Features: number of events per incoming channel, one column per channel.
nombres_canal = {
    'Direct': 'cantidadDeIngresosPorCanalDirecto',
    'Email': 'cantidadDeIngresosPorCanalEmail',
    'Organic': 'cantidadDeIngresosPorCanalOrganico',
    'Paid': 'cantidadDeIngresosPorCanalOrganicoPago',
    'Referral': 'cantidadDeIngresosPorCanalReferido',
    'Social': 'cantidadDeIngresosPorCanalRedSocial',
    'Unknown': 'cantidadDeIngresosPorCanalDesconocido',
}

# Per-person counts of each channel, pivoted to columns; a channel the person
# never used becomes 0 before the columns are renamed.
df = (
    eventos.groupby('person')['channel']
    .value_counts()
    .unstack()
    .fillna(0)
    .rename(columns=nombres_canal)
)
features = features.merge(df, on='person', how='left')

In [24]:
# Features: number of events per campaign source, one column per source.
# The columns keep the raw campaign source values as their names.
df = (
    eventos.groupby('person')['campaign_source']
    .value_counts()
    .unstack()
    .fillna(0)
)
features = features.merge(df, on='person', how='left')


In [27]:
# Encode the non-numeric features as integer labels.
# The same encoder object is re-fitted on every column, so each column gets
# its own independent code mapping. Values are cast to str first, so missing
# entries are encoded as their string form rather than dropped.
lb_make = LabelEncoder()

for columna in ('device_type', 'city', 'region', 'country'):
    features[columna] = lb_make.fit_transform(features[columna].astype(str))

In [28]:
# Fill every remaining missing value with 0.
features = features.fillna(value=0)

In [29]:
# Persist the assembled feature table as a CSV file.
# NOTE(review): `index` is left at its default, so the row index is written as
# an extra unnamed first column; consider index=False once it is confirmed that
# no downstream reader depends on that column.
features.to_csv("../set_a_entrenar.csv")

In [30]:
# Preview the first rows of the feature table instead of rendering the whole
# frame, which is wide and long enough to bloat the notebook output.
features.head(10)

Unnamed: 0,person,label,retornoAlSitio,cantidad_visualizaciones_buen_estado,cantidad_visualizaciones_buen_estado_sin_touch,cantidad_visualizaciones_excelente,cantidad_visualizaciones_muy_buen_estado,cantidad_visualizaciones_nuevo,cantidad_eventos_generados,cantidadIngresosPorCampania,...,google,indexa,manifest,mercadopago,onsite,rakuten,rtbhouse,voxus,yotpo,zanox
0,0566e9c1,0,True,16.0,2.0,1.0,6.0,0.0,68,6.0,...,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6ec7ee77,0,True,0.0,0.0,0.0,0.0,0.0,2,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,abe7a2fb,0,True,19.0,0.0,3.0,10.0,0.0,96,9.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,34728364,0,True,13.0,0.0,7.0,4.0,0.0,37,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,87ed62de,0,True,6.0,0.0,5.0,0.0,0.0,17,5.0,...,4.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,db2c4d27,1,True,29.0,4.0,36.0,56.0,0.0,564,3.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,cde431db,0,True,9.0,0.0,0.0,4.0,0.0,24,2.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,be65035b,0,True,0.0,0.0,0.0,0.0,0.0,8,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,a4178891,0,True,6.0,0.0,0.0,0.0,0.0,11,4.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,d066f64c,0,True,1.0,0.0,11.0,1.0,0.0,22,2.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
