In [2]:
import pandas as pd
import numpy as np
import datetime as dt

from sklearn.preprocessing import LabelEncoder 

In [3]:
#Importamos los set de datos.
eventos = pd.read_csv("../events_up_to_01062018.csv", low_memory=False)
labels = pd.read_csv("../trocafone_kaggle_test.csv")

In [4]:
list(eventos)

['timestamp',
 'event',
 'person',
 'url',
 'sku',
 'model',
 'condition',
 'storage',
 'color',
 'skus',
 'search_term',
 'staticpage',
 'campaign_source',
 'search_engine',
 'channel',
 'new_vs_returning',
 'city',
 'region',
 'country',
 'device_type',
 'screen_resolution',
 'operating_system_version',
 'browser_version']

In [21]:
#Creamos un dataframe de features
features = labels

In [23]:
features.shape

(19415, 1)

In [24]:
#Feature: Es un usuario que retorno alguna vez?

df = eventos.groupby(['person'])['new_vs_returning'].agg('count').reset_index()
df['retornoAlSitio'] = df["new_vs_returning"].apply(lambda x: True if x>0 else False)
del df['new_vs_returning']
features = pd.merge(features, df, on='person', how= 'left')


In [25]:
#Feature: Cantidad de tipo de estado de dispositivo que busco.

df = eventos.groupby('person')['condition'].value_counts().unstack()
df = df.rename(columns={'Bom': 'cantidad_visualizaciones_buen_estado', 
                 'Bom - Sem Touch ID': 'cantidad_visualizaciones_buen_estado_sin_touch', 
                 'Novo': 'cantidad_visualizaciones_nuevo',
                 'Muito Bom': 'cantidad_visualizaciones_muy_buen_estado',
                 'Excelente': 'cantidad_visualizaciones_excelente'})

df = df.fillna(0)
features = pd.merge(features, df, on='person', how= 'left')

In [26]:
#Feature: Cantidad de eventos generados totales

df = eventos.groupby(['person'])['event'].agg('count').to_frame('cantidad_eventos_generados').reset_index()
features = pd.merge(features, df, on='person', how= 'left')


In [27]:
#Features: Cantidad por tipo de evento generado
df = eventos.groupby('person')['event'].value_counts().unstack()
df = df.fillna(0)
df = df.rename(columns={'ad campaign hit': 'cantidadIngresosPorCampania', 
                 'brand listing': 'cantidadDeVisualizacionesPorMarca', 
                 'checkout': 'cantidadDeCheckouts',
                 'conversion': 'cantidadConversiones',
                 'generic listing': 'cantidadDeVisualizacionesDeLaHome',
                 'lead': 'cantidadDeGeneracionDeNotificaciones',
                 'search engine hit': 'cantidadDeIngresosPorBuscador',
                 'searched products': 'cantidadDeProductosBuscados',
                 'staticpage': 'cantidadDeVisitas',
                 'viewed product': 'cantidadDeVisualizacionesDeProducto',
                 'visited site': 'cantidadDeIngresosPorUrl',
                })
features = pd.merge(features, df, on='person', how= 'left')

In [28]:
#Features: Cantidad de visualizaciones por marca
eventos_con_marca = eventos
eventos_con_marca['marca'] = eventos_con_marca['model'].str.split(' ').str[0]
df = eventos_con_marca.groupby('person')['marca'].value_counts().unstack()
df = df.fillna(0)
df.columns = ['cantidadDeVisualizacionesPor' + str(col) for col in df.columns]
features = pd.merge(features, df, on='person', how= 'left')

In [29]:
#Features: Region del usuario
df = eventos.loc[eventos.event.str.contains('visited site'),]
df = df.groupby(['person']).agg({'region': 'first'})
df = df.fillna('NA')
features = pd.merge(features, df, on='person', how= 'left')

In [30]:
#Features: Ciudad del usuario
df = eventos.loc[eventos.event.str.contains('visited site'),]
df = df.groupby(['person']).agg({'city': 'first'})
df = df.fillna('NA')
features = pd.merge(features, df, on='person', how= 'left')

In [31]:
#Features: Pais del usuario
df = eventos.loc[eventos.event.str.contains('visited site'),]
df = df.groupby(['person']).agg({'country': 'first'})
df = df.fillna('NA')
features = pd.merge(features, df, on='person', how= 'left')

In [32]:
#Features: Region del usuario
df = eventos.loc[eventos.event.str.contains('visited site'),]
df = df.groupby(['person']).agg({'device_type': 'first'})
df = df.fillna('NA')
features = pd.merge(features, df, on='person', how= 'left')

In [33]:
#Features: Cantidad por tipo de canal proveniente  generado
df = eventos.groupby('person')['channel'].value_counts().unstack()
df = df.fillna(0)
df.columns = ['cantidadDeIngresosPorCanal' + str(col) for col in df.columns]
features = pd.merge(features, df, on='person', how= 'left')

In [34]:
#Features: Cantidad por tipo de campania proveniente  generado
df = eventos.groupby('person')['campaign_source'].value_counts().unstack()
df = df.fillna(0)
df.columns = ['cantidadDeIngresosPorCamapania' + str(col) for col in df.columns]
features = pd.merge(features, df, on='person', how= 'left')


In [35]:
#Feature: Cantidad de visualizaciones realizadas por linea del equipo. Ejemplo: Cuantos Galaxy vizualizo, Cuantos Xperia visualizo
eventos_con_modelo = eventos
eventos_con_modelo['linea'] = eventos_con_modelo['model'].str.split(' ').str[0] + '' + eventos_con_modelo['model'].str.split(' ').str[1].fillna('')
df = eventos_con_marca.groupby('person')['linea'].value_counts().unstack()
df.columns = ['cantidadDeVisualizacionesDeLinea' + str(col) for col in df.columns]
df = df.fillna(0)
features = pd.merge(features, df, on='person', how= 'left')


In [36]:
#Feature: Similar al feature en el cual contamos por visualizaciones por marca, contamos por cantidad de marca por conversion
eventos_conversion_con_marca = eventos.loc[eventos.event == 'conversion',]
eventos_conversion_con_marca['marca'] = eventos_conversion_con_marca['model'].str.split(' ').str[0]
df = eventos_conversion_con_marca.groupby('person')['marca'].value_counts().unstack()
df.columns = ['cantidadDeConversionDe' + str(col) for col in df.columns]
df = df.fillna(0)
features = pd.merge(features, df, on='person', how= 'left')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [37]:
#Feature: Similar al feature en el cual contamos por visualizaciones por marca, contamos por cantidad de liena por conversion
eventos_conversion_con_marca = eventos.loc[eventos.event == 'conversion',]
eventos_conversion_con_marca['linea'] = eventos_conversion_con_marca['model'].str.split(' ').str[0] + '' + eventos_conversion_con_marca['model'].str.split(' ').str[1].fillna('')
df = eventos_conversion_con_marca.groupby('person')['linea'].value_counts().unstack()
df.columns = ['cantidadDeConversionDeLinea' + str(col) for col in df.columns]
df = df.fillna(0)
features = pd.merge(features, df, on='person', how= 'left')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [38]:
#Encodeamos las features que nos numeros
lb_make = LabelEncoder()

features['device_type'] = lb_make.fit_transform(features['device_type'].astype(str))
features['city'] = lb_make.fit_transform(features['city'].astype(str))
features['region'] = lb_make.fit_transform(features['region'].astype(str))
features['country'] = lb_make.fit_transform(features['country'].astype(str))

In [39]:
#Compleamos los campos NA con 0
features = features.fillna(0)

In [40]:
list(features)

['person',
 'retornoAlSitio',
 'cantidad_visualizaciones_buen_estado',
 'cantidad_visualizaciones_buen_estado_sin_touch',
 'cantidad_visualizaciones_excelente',
 'cantidad_visualizaciones_muy_buen_estado',
 'cantidad_visualizaciones_nuevo',
 'cantidad_eventos_generados',
 'cantidadIngresosPorCampania',
 'cantidadDeVisualizacionesPorMarca',
 'cantidadDeCheckouts',
 'cantidadConversiones',
 'cantidadDeVisualizacionesDeLaHome',
 'cantidadDeGeneracionDeNotificaciones',
 'cantidadDeIngresosPorBuscador',
 'cantidadDeProductosBuscados',
 'cantidadDeVisitas',
 'cantidadDeVisualizacionesDeProducto',
 'cantidadDeIngresosPorUrl',
 'cantidadDeVisualizacionesPorAsus',
 'cantidadDeVisualizacionesPorLG',
 'cantidadDeVisualizacionesPorLenovo',
 'cantidadDeVisualizacionesPorMotorola',
 'cantidadDeVisualizacionesPorOutros',
 'cantidadDeVisualizacionesPorQuantum',
 'cantidadDeVisualizacionesPorSamsung',
 'cantidadDeVisualizacionesPorSony',
 'cantidadDeVisualizacionesPorXiaomi',
 'cantidadDeVisualizacione

In [41]:
features.to_csv("../set_a_predecir.csv")

In [42]:
features

Unnamed: 0,person,retornoAlSitio,cantidad_visualizaciones_buen_estado,cantidad_visualizaciones_buen_estado_sin_touch,cantidad_visualizaciones_excelente,cantidad_visualizaciones_muy_buen_estado,cantidad_visualizaciones_nuevo,cantidad_eventos_generados,cantidadIngresosPorCampania,cantidadDeVisualizacionesPorMarca,...,cantidadDeConversionDeLineaiPhone4S,cantidadDeConversionDeLineaiPhone5,cantidadDeConversionDeLineaiPhone5c,cantidadDeConversionDeLineaiPhone5s,cantidadDeConversionDeLineaiPhone6,cantidadDeConversionDeLineaiPhone6S,cantidadDeConversionDeLineaiPhone7,cantidadDeConversionDeLineaiPhone8,cantidadDeConversionDeLineaiPhoneSE,cantidadDeConversionDeLineaiPhoneX
0,4886f805,True,0.0,0.0,4.0,1.0,0.0,9,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0297fc1e,True,244.0,2.0,46.0,119.0,0.0,567,29.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2d681dd8,True,14.0,0.0,0.0,0.0,0.0,26,1.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,cccea85e,True,200.0,1.0,191.0,347.0,1.0,836,15.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4c8a8b93,True,56.0,6.0,76.0,39.0,2.0,257,14.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,29ebb414,True,2.0,0.0,4.0,3.0,0.0,35,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,3dc1950f,True,167.0,2.0,157.0,266.0,6.0,672,3.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,8ea4c165,True,24.0,0.0,52.0,24.0,0.0,159,13.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,d8cfe234,True,19.0,0.0,12.0,12.0,0.0,62,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,d6bc64df,True,88.0,10.0,20.0,39.0,0.0,213,10.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
