In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Intenciones:
1. Cargar los datos de entrada
1. Acondicionar los datos. Separar datos de entrenamiento de validación
1. Explorar distintos features.
1. Armar un DataFrame de entrenamiento y otro de validación (poner nombre acá)
1. Explorar distintos features
1. Elegir varios features a mano
1. Armar algoritmo para elegir features (random forests?) => ver esto
1. Entrenar diferentes modelos y de diferentes formas.
1. Iterar sobre los puntos anteriores
1. Registrar los cambios y evoluciones
1. Sacar conclusiones
## Modificar esta lista acorde a lo que se vaya haciendo.
### Nota: La intención es dejar comentados varios campos a modo de debug que generan salidas muy verbosas pero pueden resultar útiles en alún momento (ejemplo, .head() )

## 1) Cargar los datos de entrada

In [2]:
#_PATH_DATASET = '../input/all/' # No borrar esto, por compatibilidad para correrlo con datasets en otros directorios
_PATH_DATASET = '/home/diego/Cursos/FIUBA/git/fiuba-trocafone-tp2-final-set/' # No borrar esto, por compatibilidad para correrlo con datasets en otros directorios

In [3]:
input_data_raw = pd.read_csv(_PATH_DATASET + 'events_up_to_01062018.csv')
input_data_raw.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,...,,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,...,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,...,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,...,,,,,,,,,,


In [4]:
# Se cargan los labels
knownLabels = pd.read_csv(_PATH_DATASET + 'labels_training_set.csv')
knownLabels['resultKnown'] = 1
knownLabels.head()

Unnamed: 0,person,label,resultKnown
0,0566e9c1,0,1
1,6ec7ee77,0,1
2,abe7a2fb,0,1
3,34728364,0,1
4,87ed62de,0,1


In [5]:
knownLabels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19414 entries, 0 to 19413
Data columns (total 3 columns):
person         19414 non-null object
label          19414 non-null int64
resultKnown    19414 non-null int64
dtypes: int64(2), object(1)
memory usage: 455.1+ KB


# 2) Acondicionar los datos (previo al armado del DataFrame final)

In [6]:
# Acondicionar los datos antes de convertirlos
input_data_raw['timestamp'] = pd.to_datetime(input_data_raw['timestamp']) # Convierto a timestamp

In [7]:
print('Total de datos: ' + str(input_data_raw.shape))
print('Total de labels: ' + str(knownLabels.shape))

Total de datos: (2341681, 23)
Total de labels: (19414, 3)


In [8]:
# Hubo un cambio grande, a todos los trato por igual y luego en algún momento los separo por label 0 o 1 o
# en testing, -1
input_train = input_data_raw# [input_data_raw.person.isin(knownLabels.person)] # Datos etiquetados
input_to_be_validated = input_data_raw[(~input_data_raw.person.isin(knownLabels.person))] # Datos para predecir

In [9]:
# Para simplificar el preprocesamiento, hago un array único en donde las labels desconocidas son las -1
unknownLabels = pd.DataFrame(input_to_be_validated['person'])

In [10]:
unknownLabels['label'] = 0

In [11]:
unknownLabels = unknownLabels.groupby('person')

In [12]:
unknownLabels = unknownLabels.count()
unknownLabels['resultKnown'] = 0
unknownLabels['label'].values[:] = 0
#unknownLabels.label = 0 # Si seteo a 0, no queda como un DataFrame

In [13]:
unknownLabels.head()

Unnamed: 0_level_0,label,resultKnown
person,Unnamed: 1_level_1,Unnamed: 2_level_1
00091926,0,0
00091a7a,0,0
000ba417,0,0
000e4d9e,0,0
000e619d,0,0


In [14]:
print('Total de labels desconocidas: ' + str(unknownLabels.shape))

Total de labels desconocidas: (19415, 2)


In [15]:
knownLabels = knownLabels.set_index('person')
#unknownLabels = unknownLabels.set_index('person')

joinedLabels = [knownLabels, unknownLabels]

In [16]:
joinedLabels = pd.concat(joinedLabels)
joinedLabels.head()

Unnamed: 0_level_0,label,resultKnown
person,Unnamed: 1_level_1,Unnamed: 2_level_1
0566e9c1,0,1
6ec7ee77,0,1
abe7a2fb,0,1
34728364,0,1
87ed62de,0,1


In [17]:
print('Total de labels: ' + str(joinedLabels.shape))

Total de labels: (38829, 2)


In [18]:
print('Total de datos con label conocidos: ' + str(input_train.shape[0])) # Datos 
print('Total de datos a predecir ' + str(input_to_be_validated.shape[0]))
print('Total de datos (debe coincidir) ' + str(input_train.shape[0] + input_to_be_validated.shape[0]))

Total de datos con label conocidos: 2341681
Total de datos a predecir 1169795
Total de datos (debe coincidir) 3511476


# Acondiciono los datos por persona (ahora tengo una entrada por evento y no por persona)

In [19]:
# input_train.describe()
# input_train.info()

### Nota: SKU: Stock-keeping unit o SKU, sin traducción literal al español podría denominarse «código de artículo» o «número de referencia» o incluso un código de barras. Asignado a un elemento para poder identificarlo en el inventario físico y/o financiero.

# ACLARACIÓN:
## Borrar lo que no necesito de acá luego, en esto hay mucho borrador, lo importante es llegar al DataFrame de features...

In [20]:
input_train = input_train.sort_values(['person']) # ordeno por persona, no es necesario si voy a stackear

In [21]:
# Me aseguro de que todos los eventos tienen el campo persona de 8 caracteres
input_train['person'].apply(lambda x: len(x)).value_counts() # => OK

8    2341681
Name: person, dtype: int64

In [22]:
runTests = False

In [23]:
# Desprolijo, después aprovechar esto.
if runTests == True:
    in2 = input_train.set_index('person')
    in2.head()

In [24]:
if runTests == True:
    # input_train.sort_values(['person', 'timestamp'])
    # input_train.groupby('person').count()
    input_train_features = input_train.groupby('person').count() # Este no será el dato final

In [25]:
#input_train_features.columns

In [26]:
if runTests == True:
    total = input_train_features.join(knownLabels, on = 'person', how = 'left')

In [27]:
# total.shape
# total.label.head()

In [28]:
if runTests == True:
    x_data = total.drop('label', axis = 1)
    y_labels = total.label
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(x_data,y_labels,test_size=0.3,random_state=101)
    x_data.columns

## Esto también es una prueba

In [29]:
if runTests == True:
    import tensorflow as tf

In [30]:
if runTests == True:
    #feat_cols = tf.convert_to_tensor(x_data) # No anda, da errores al momento de entrenar (model.train)
    ts = tf.feature_column.numeric_column("timestamp")
    event = tf.feature_column.numeric_column("event")
    url = tf.feature_column.numeric_column("url")
    model = tf.feature_column.numeric_column("model")
    condition = tf.feature_column.numeric_column("condition")
    storage = tf.feature_column.numeric_column("storage")
    color = tf.feature_column.numeric_column("color")
    search_term = tf.feature_column.numeric_column("search_term")

    feat_cols = [ts, event, url]

In [31]:
if runTests == True:
    type(feat_cols)

In [32]:
if runTests == True:
    input_func=tf.estimator.inputs.pandas_input_fn(x=X_train,y=y_train,batch_size=100,num_epochs=None,shuffle=True)

In [33]:
if runTests == True:
    model = tf.estimator.LinearClassifier(feature_columns=feat_cols)

In [34]:
if runTests == True:
    model.train(input_fn=input_func,steps=5000)

In [35]:
if runTests == True:
    pred_fn = tf.estimator.inputs.pandas_input_fn(x=X_test,batch_size=len(X_test),shuffle=False)

In [36]:
if runTests == True:
    predictions = list(model.predict(input_fn=pred_fn))

In [37]:
if runTests == True:
    final_preds = []
    for pred in predictions:
        final_preds.append(pred['class_ids'][0])

In [38]:
if runTests == True:
    from sklearn.metrics import classification_report

In [39]:
if runTests == True:
    print(classification_report(y_test,final_preds))

In [40]:
if runTests == True:
    input_train[input_train['person'] == '0008ed71'].sort_values(['timestamp'])

In [41]:
if runTests == True:
    input_train.loc[1505383]['skus']

In [42]:
if runTests == True:
    input_train[input_train['skus'].isnull()]

In [43]:
if runTests == True:
    input_train.loc[2122051].skus

## FIN DE PRUEBAS. Sigo con el espacio de features nuevamente

# 3) Explorar distintos features

## Creo el nuevo vector de features.
### El mismo consiste en un DataFrame del que se evaluarán luego cuáles usar

In [44]:
new_vector_features = pd.DataFrame(joinedLabels.index) # Creo un vector con una entrada por persona
#new_vector_features['_'] = 0
new_vector_features = new_vector_features.set_index('person')
# new_vector_features.head()

In [45]:
new_vector_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 38829 entries, 0566e9c1 to fffd1246
Empty DataFrame

In [46]:
# Agrego los labels
new_vector_features = new_vector_features.join(joinedLabels, on = 'person', how = 'left')

In [47]:
new_vector_features[new_vector_features['resultKnown']!=1].shape

(19415, 2)

## Estudio los features útiles. Crear lista de ideas

In [48]:
input_train.columns.values # Lista de features crudos disponibles. Desde ellos debo realizar los diferentes

array(['timestamp', 'event', 'person', 'url', 'sku', 'model', 'condition',
       'storage', 'color', 'skus', 'search_term', 'staticpage',
       'campaign_source', 'search_engine', 'channel', 'new_vs_returning',
       'city', 'region', 'country', 'device_type', 'screen_resolution',
       'operating_system_version', 'browser_version'], dtype=object)

## Demo de groupby, por alguna duda en particular

In [49]:
df = pd.DataFrame({'A': 'a a b a'.split(), 'B': [1,2,3,1.5], 'C': [4,6, 5, 100]})
g = df.groupby('A')
g.head()

Unnamed: 0,A,B,C
0,a,1.0,4
1,a,2.0,6
2,b,3.0,5
3,a,1.5,100


In [50]:
g.apply(lambda x: x.sum())

Unnamed: 0_level_0,A,B,C
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,aaa,4.5,110
b,b,3.0,5


## Features de timestamp

In [51]:
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.core.groupby.GroupBy.apply.html
# Ejecuto una función para cada grupo:
grouped_input_train = input_train.groupby('person')
grouped_input_train.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
1507286,2018-05-17 12:27:47,checkout,0008ed71,,3372.0,Samsung Galaxy S6 Flat,Muito Bom,32GB,Dourado,,...,,,,,,,,,,
2336761,2018-05-17 16:21:54,visited site,0008ed71,,,,,,,,...,,Referral,Returning,Unknown,Unknown,Brazil,Computer,1920x1080,Windows 10,Chrome 66.0
2122051,2018-05-17 16:22:06,generic listing,0008ed71,,,,,,,"6594,6651,6664,7253,2820,6706,6721,12606,480,1...",...,,,,,,,,,,
1505383,2018-05-17 16:28:37,checkout,0008ed71,,7505.0,LG G4 H818P,Bom,32GB,Preto,,...,,,,,,,,,,
2336760,2018-05-17 13:44:59,visited site,0008ed71,,,,,,,,...,,Referral,New,Unknown,Unknown,Brazil,Computer,1920x1080,Windows 10,Chrome 66.0
227610,2018-05-23 01:09:18,ad campaign hit,00091926,/comprar/iphone/6s,,,,,,,...,,,,,,,,,,
244622,2018-05-17 00:56:34,viewed product,00091926,,10309.0,iPhone 7 Plus,Muito Bom,256GB,Ouro Rosa,,...,,,,,,,,,,
175432,2018-05-31 02:33:58,viewed product,00091926,,6846.0,iPhone 6S,Muito Bom,16GB,Dourado,,...,,,,,,,,,,
244443,2018-05-17 01:13:22,viewed product,00091926,,1389.0,iPhone 6 Plus,Excelente,16GB,Dourado,,...,,,,,,,,,,
252233,2018-05-07 00:37:12,viewed product,00091926,,10910.0,Samsung Galaxy A7 2017,Bom,32GB,Rosa,,...,,,,,,,,,,


In [52]:
input_train['model'].unique()[0:10]

array(['Samsung Galaxy S6 Flat', nan, 'LG G4 H818P', 'iPhone SE',
       'iPhone 7 Plus', 'iPhone 6S', 'iPhone 6 Plus',
       'Samsung Galaxy A7 2017', 'iPhone 6S Plus', 'iPhone 7'], dtype=object)

In [53]:
input_train['url'].unique()[0:10]

array([nan, '/comprar/iphone/6s', '/comprar/samsung/galaxy-s6-edge-plus',
       '/comprar/iphone', '/comprar/iphone/7-plus',
       '/comprar/iphone/6-plus', '/comprar/iphone/iphone-6s-plus',
       '/comprar/samsung/galaxy-s7-edge',
       '/comprar/samsung/galaxy-s6-flat', '/comprar/samsung'], dtype=object)

In [54]:
input_train['storage'].unique()

array(['32GB', nan, '64GB', '256GB', '16GB', '128GB', '8GB', '4GB', '512MB'], dtype=object)

In [55]:
input_train['color'].unique()[0:5]

array(['Dourado', nan, 'Preto', 'Cinza espacial', 'Ouro Rosa'], dtype=object)

In [56]:
#input_train['skus'].unique()
#input_train['sku'].unique()

In [57]:
input_train['search_term'].unique()

array([nan, 'Iphone 7s', 'Galaxy s8', ..., 'iPhone 6 Rose', '5S Mini',
       'J5 Golden'], dtype=object)

In [58]:
input_train['staticpage'].unique()

array([nan, 'Conditions', 'CustomerService', 'Quiosks', 'FaqEcommerce',
       'galaxy-s8', 'how-to-buy', 'TermsAndConditionsReturnEcommerce',
       'AboutUs', 'TermsAndConditionsEcommerce', 'trust-trocafone',
       'club-trocafone', 'how-to-sell', 'black_friday', 'PrivacyEcommerce'], dtype=object)

In [59]:
input_train['campaign_source'].unique()

array([nan, 'google', 'rtbhouse', 'criteo', 'afilio', 'zanox', 'rakuten',
       'voxus', 'buscape', 'indexa', 'emblue', 'Facebook', 'FacebookAds',
       'FacebookSocial', 'bing', 'manifest', 'onsite', 'datacrush',
       'yotpo', 'blog', 'mercadopago', 'afiliado', 'MARKETING SOCIAL',
       'gizmodo'], dtype=object)

In [60]:
input_train['search_engine'].unique()

array([nan, 'Google', 'Yahoo', 'Bing', 'Ask'], dtype=object)

In [61]:
input_train['channel'].unique()

array([nan, 'Referral', 'Direct', 'Paid', 'Organic', 'Social', 'Email',
       'Unknown'], dtype=object)

In [62]:
input_train['new_vs_returning'].unique()

array([nan, 'Returning', 'New'], dtype=object)

In [63]:
input_train['city'].unique()

array([nan, 'Unknown', 'Carlos Barbosa', ..., 'Marcos Parente',
       'Cha Grande', 'Barra dos Coqueiros'], dtype=object)

In [64]:
input_train['region'].unique()[0:5] # Sólo muestro un par de elementos

array([nan, 'Unknown', 'Rio Grande do Sul', 'Minas Gerais', 'Sao Paulo'], dtype=object)

In [65]:
input_train['country'].unique()[0:5]

array([nan, 'Brazil', 'Unknown', 'United States', 'Bolivia'], dtype=object)

In [66]:
input_train['device_type'].unique()

array([nan, 'Computer', 'Smartphone', 'Tablet', 'Unknown'], dtype=object)

In [67]:
input_train['screen_resolution'].unique()[0:10]

array([nan, '1920x1080', '1024x768', '360x640', '1536x864', '1366x768',
       '320x534', '412x846', '1360x768', '1242x698'], dtype=object)

In [68]:
input_train['operating_system_version'].unique()[0:10]

array([nan, 'Windows 10 ', 'Windows 7 ', 'Android 5.1.1', 'Android 7',
       'Android 6', 'Android 7.1.1', 'Android 6.0.1', 'Windows 8 ',
       'iOS 11.3'], dtype=object)

In [69]:
input_train['browser_version'].unique()[0:10]

array([nan, 'Chrome 66.0', 'Chrome Mobile 43.0', 'Chrome Mobile 66.0',
       'Chrome 67.0', 'Chrome Mobile 39', 'Chrome 64.0', 'Chrome 65.0',
       'Chrome 63.0', 'Chrome Mobile 57.0'], dtype=object)

In [70]:
input_train.columns.values

array(['timestamp', 'event', 'person', 'url', 'sku', 'model', 'condition',
       'storage', 'color', 'skus', 'search_term', 'staticpage',
       'campaign_source', 'search_engine', 'channel', 'new_vs_returning',
       'city', 'region', 'country', 'device_type', 'screen_resolution',
       'operating_system_version', 'browser_version'], dtype=object)

> ## a) Features del timestamp

In [71]:
#input_train['day'] = pd.DataFrame(input_train['timestamp'].apply(lambda x: x.day))#, dummy_na=False) # Convierto a One Hot encoding, el último evita NaN
input_train['day'] = input_train['timestamp'].dt.day
input_train['weekDay']=input_train['timestamp'].dt.weekday
input_train['month'] =input_train['timestamp'].dt.month

In [72]:
input_train['isweekEnd']=input_train['weekDay']<=4 # 0 es lunes, con esto verifico de lunes a viernes
input_train['isFortnite']=input_train['day']<=15 # Verifico quincena

In [73]:
grouped_input_train = input_train.groupby('person')

In [74]:
#Paso a segundos la interacción. Al final no lo usé, pero era la forma correcta parece ser
df_timeDelta = pd.DataFrame((grouped_input_train['timestamp'].max() - grouped_input_train['timestamp'].min()).rename('timeDelta').dt.total_seconds())
df_timeDelta.head()
new_vector_features = new_vector_features.join(df_timeDelta, on = 'person', how = 'right')

In [75]:
# Tiempo más cercano al timestamp al primero de junio
last_inter_to_june = pd.DataFrame((pd.to_datetime('06/01/2018') - grouped_input_train['timestamp'].max()).rename('last_inter_to_june').dt.total_seconds())
last_inter_to_june.head()
new_vector_features = new_vector_features.join(last_inter_to_june, on = 'person', how = 'right')

In [77]:
# Tiempo más lejano al timestamp al primero de junio
first_inter_to_june = pd.DataFrame((pd.to_datetime('06/01/2018') - grouped_input_train['timestamp'].min()).rename('first_inter_to_june').dt.total_seconds())
first_inter_to_june.head()
new_vector_features = new_vector_features.join(first_inter_to_june, on = 'person', how = 'right')

In [78]:
# Cantidad de interacciones en el tiempo especificado
new_vector_features['interactions'] = grouped_input_train['timestamp'].count()

In [80]:
new_vector_features.head() # Features hasta ahora

Unnamed: 0_level_0,label,resultKnown,timeDelta,last_inter_to_june,first_inter_to_june,interactions
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0566e9c1,0,1,772618.0,26916.0,799534.0,68
6ec7ee77,0,1,0.0,336876.0,336876.0,2
abe7a2fb,0,1,9335392.0,174046.0,9509438.0,96
34728364,0,1,742549.0,388950.0,1131499.0,37
87ed62de,0,1,1409.0,1123199.0,1124608.0,17


In [81]:
#one_hot_weekday['isWeekend'] = one_hot_weekday[True]

In [82]:
#one_hot_weekday['isNotWeekend'] = one_hot_weekday[False]

In [83]:
#one_hot_weekday = one_hot_weekday.drop[True]


In [84]:
# Básico: Creo un histograma de eventos
one_hot_weekday = pd.get_dummies(input_train['isweekEnd'], dummy_na=False) # Convierto a One Hot encoding, el último evita NaN
one_hot_weekday['person'] = input_train['person'] # Agrego una columna
one_hot_weekday = one_hot_weekday.set_index('person')
#one_hot_events.groupby('person').hist(bins = 3)
one_hot_weekday = one_hot_weekday.groupby('person') 

one_hot_weekday = one_hot_weekday.apply(lambda x: x.sum())

one_hot_weekday['isNotWeekend'] = one_hot_weekday[False]
one_hot_weekday['isWeekend'] = one_hot_weekday[True]
one_hot_weekday = one_hot_weekday.drop(columns = [False, True])

#one_hot_weekday['weekDay' = one_hot_weekday[False]
#Por las dudas, si quiero seleccionar la columna NaN, tengo que poner one_hot_events[np.nan]

# Agregar al vector de features los que crea necesarios
new_vector_features = new_vector_features.join(one_hot_weekday, on = 'person', how = 'left')

In [85]:
# Básico: Creo un histograma de eventos
one_hot_fortnite = pd.get_dummies(input_train['isFortnite'], dummy_na=False) # Convierto a One Hot encoding, el último evita NaN
one_hot_fortnite['person'] = input_train['person'] # Agrego una columna
one_hot_fortnite = one_hot_fortnite.set_index('person')
#one_hot_events.groupby('person').hist(bins = 3)
one_hot_fortnite = one_hot_fortnite.groupby('person') 

one_hot_fortnite = one_hot_fortnite.apply(lambda x: x.sum())

one_hot_fortnite['isFortnite'] = one_hot_fortnite[False]
one_hot_fortnite['isNotFortnite'] = one_hot_fortnite[True]
one_hot_fortnite = one_hot_fortnite.drop(columns = [False, True])

# Agregar al vector de features los que crea necesarios
new_vector_features = new_vector_features.join(one_hot_fortnite, on = 'person', how = 'left')

In [86]:
one_hot_month = pd.get_dummies(input_train['month'], dummy_na=False) # Convierto a One Hot encoding, el último evita NaN


In [87]:
# Básico: Creo un histograma de eventos
one_hot_month = pd.get_dummies(input_train['month'], dummy_na=False) # Convierto a One Hot encoding, el último evita NaN
one_hot_month['person'] = input_train['person'] # Agrego una columna
one_hot_month = one_hot_month.set_index('person')
#one_hot_events.groupby('person').hist(bins = 3)
one_hot_month = one_hot_month.groupby('person') 

one_hot_month = one_hot_month.apply(lambda x: x.sum())

one_hot_month = one_hot_month.rename(columns={1: 'month1', 2: 'month2', 3: 'month3', 4: 'month4', 5:'month5'}) 

#one_hot_weekday['isNotWeekend'] = one_hot_weekday[False]
#one_hot_weekday['isWeekend'] = one_hot_weekday[True]
#one_hot_weekday = one_hot_weekday.drop(columns = [False, True])

#one_hot_weekday['weekDay' = one_hot_weekday[False]
#Por las dudas, si quiero seleccionar la columna NaN, tengo que poner one_hot_events[np.nan]

# Agregar al vector de features los que crea necesarios
new_vector_features = new_vector_features.join(one_hot_month, on = 'person', how = 'left')

In [88]:
new_vector_features.head()

Unnamed: 0_level_0,label,resultKnown,timeDelta,last_inter_to_june,first_inter_to_june,interactions,isNotWeekend,isWeekend,isFortnite,isNotFortnite,month1,month2,month3,month4,month5
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0566e9c1,0,1,772618.0,26916.0,799534.0,68,5,63,68,0,0,0,0,0,68
6ec7ee77,0,1,0.0,336876.0,336876.0,2,0,2,2,0,0,0,0,0,2
abe7a2fb,0,1,9335392.0,174046.0,9509438.0,96,32,64,32,64,0,12,16,38,30
34728364,0,1,742549.0,388950.0,1131499.0,37,4,33,37,0,0,0,0,0,37
87ed62de,0,1,1409.0,1123199.0,1124608.0,17,1,16,17,0,0,0,0,0,17


In [89]:
# Ver si le agrego info del día, y tiempo al próximo feriado, al último o cosas así
# Ideas: Por mes? Por día? Cantidad de eventos en fines de semana? Cantidad de eventos en días de semana?
# Cuenta de algún evento particular?
# Histograma de eventos de días de semana vs fines de semana?
# media y varianza de acuerdo al día del mes?
# Algo relacionado a la hora?

> ## b) Features de events

In [90]:
# Diferentes eventos
input_train['event'].unique()

array(['checkout', 'visited site', 'generic listing', 'ad campaign hit',
       'viewed product', 'brand listing', 'search engine hit',
       'conversion', 'searched products', 'staticpage', 'lead'], dtype=object)

In [91]:
# Básico: Creo un histograma de eventos
one_hot_events = pd.get_dummies(input_train['event'], dummy_na=False) # Convierto a One Hot encoding, el último evita NaN
one_hot_events['person'] = input_train['person'] # Agrego una columna
one_hot_events = one_hot_events.set_index('person')
#one_hot_events.groupby('person').hist(bins = 3)
one_hot_events = one_hot_events.groupby('person') 

one_hot_events = one_hot_events.apply(lambda x: x.sum())
#Por las dudas, si quiero seleccionar la columna NaN, tengo que poner one_hot_events[np.nan]

# Agregar al vector de features los que crea necesarios
new_vector_features = new_vector_features.join(one_hot_events, on = 'person', how = 'left')

In [92]:
# Recordar agregar url, cuando aparece es porque tiene ad_campain_hit
# Algo relacionado a profundidad del hit?
# Algo relacionado a la varianza del hit?

> ## c) Features de Condition

In [93]:
input_train['condition'].unique() # Sem touch ID es sin lector de huellas, normalmente los IPhone|

array(['Muito Bom', nan, 'Bom', 'Excelente', 'Novo', 'Bom - Sem Touch ID'], dtype=object)

In [94]:
# Función que retorna un valor de acuerdo a la calidad del celular:
# nan, 'Muito Bom', 'Bom', 'Excelente', 'Bom - Sem Touch ID', 'Novo'
def getCondition(str):
    if not str:
        return 0
    elif str == 'Bom - Sem Touch ID': # Ojo que en la lista del TP1 no aparecen pero acá si
        return 1
    elif str == 'Bom':
        return 2
    elif str == 'Muito Bom':
        return 3
    elif str == 'Excelente':
        return 4
    elif str == 'Novo':
        return 5
    else:
        assert('Entrada desconocida')

getCondition('Bom') # Test

2

In [95]:
cond_temp = input_train['condition'].apply(lambda x: getCondition(x))
one_hot_condition = pd.get_dummies(cond_temp, dummy_na=False) # Si pongo true me genera una columna más con los nan
one_hot_condition['person'] = input_train['person'] # Agrego una columna
one_hot_condition = one_hot_condition.set_index('person')
one_hot_condition = one_hot_condition.groupby('person') 
one_hot_condition = one_hot_condition.apply(lambda x: x.sum())
new_vector_features = new_vector_features.join(one_hot_condition, on = 'person', how = 'left')

# Normalizar?
# Varianza y valor medio?

> ## Feature de ...

In [96]:
# Ver que hacer con model: marca, modelo, rango de precios estimado?
# Marca del sistema operativo
def getBrand(str):
    return str.partition(' ')[0]
getBrand('Asus zenpad')

'Asus'

In [97]:
# Storage
# TODO: Ver si conviene ir logarítmicamente, o cómo. Que hago con el 512? Hay que probar de las dos formas
# La otra es asignarle un número entero
# No debería tener un 0?
def getCapacity(str):
    my_list = [ '32GB', '64GB', '128GB', '256GB', '8GB', '16GB', '4GB', '512MB']
    to_replace = [  32,     64,     128,     256,     8,     16,     4,     0.512 ]
    if str in my_list:
        return to_replace[my_list.index(str)]
    else:
        return 0
getCapacity('32GB') # Test

32

In [98]:
# Qué hacer con color?

In [99]:
# Qué hacer con respecto a la campaña:
# Qué hacer con search_term, staticpage, campaign_source, search_engine
# Channel

In [100]:
# New vs Returning
# No hay un New por cada persona, pero anda por ahí (19414 vs 19126) y puede haber varios returning
#input_train[input_train['new_vs_returning'] =='Returning']['person'].value_counts()
#input_train[input_train['event'] =='visited site']['person'].value_counts()

In [101]:
# Relativo a la ubicación: 
#   Qué hacer con city, region, country

In [102]:
# Relativo al sistema operativo:
#   Qué hacer con device_type, screen_resolution, operative_system_version, browser_version

In [103]:
new_vector_features.head(100)

Unnamed: 0_level_0,label,resultKnown,timeDelta,last_inter_to_june,first_inter_to_june,interactions,isNotWeekend,isWeekend,isFortnite,isNotFortnite,...,search engine hit,searched products,staticpage,viewed product,visited site,1.0,2.0,3.0,4.0,5.0
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0566e9c1,0,1,772618.0,26916.0,799534.0,68,5,63,68,0,...,1,0,1,23,17,2,16,6,1,0
6ec7ee77,0,1,0.0,336876.0,336876.0,2,0,2,2,0,...,0,0,0,0,1,0,0,0,0,0
abe7a2fb,0,1,9335392.0,174046.0,9509438.0,96,32,64,32,64,...,4,6,0,31,22,0,19,10,3,0
34728364,0,1,742549.0,388950.0,1131499.0,37,4,33,37,0,...,1,0,0,24,4,0,13,4,7,0
87ed62de,0,1,1409.0,1123199.0,1124608.0,17,1,16,17,0,...,0,0,0,9,1,0,6,0,5,0
db2c4d27,1,1,7089888.0,2538923.0,9628811.0,564,78,486,429,135,...,92,13,20,121,114,4,29,56,36,0
cde431db,0,1,513185.0,49898.0,563083.0,24,0,24,24,0,...,1,2,2,8,3,0,9,4,0,0
be65035b,0,1,165497.0,4679833.0,4845330.0,8,2,6,0,8,...,0,3,0,0,3,0,0,0,0,0
a4178891,0,1,871.0,806878.0,807749.0,11,0,11,11,0,...,0,0,0,4,1,0,6,0,0,0
d066f64c,0,1,1031.0,1146913.0,1147944.0,22,0,22,22,0,...,2,0,0,12,1,0,1,1,11,0


In [104]:
input_train.iloc[:,0:2].head()

Unnamed: 0,timestamp,event
1507286,2018-05-17 12:27:47,checkout
2336761,2018-05-17 16:21:54,visited site
2122051,2018-05-17 16:22:06,generic listing
1505383,2018-05-17 16:28:37,checkout
2336760,2018-05-17 13:44:59,visited site


In [105]:
input_train[['event', 'url', 'timestamp']][~input_train.url.isnull()].head(10)

Unnamed: 0,event,url,timestamp
227610,ad campaign hit,/comprar/iphone/6s,2018-05-23 01:09:18
100351,ad campaign hit,/comprar/samsung/galaxy-s6-edge-plus,2018-05-08 23:30:44
172395,ad campaign hit,/comprar/iphone,2018-05-31 19:51:49
62056,ad campaign hit,/comprar/iphone,2018-05-29 01:58:14
208388,ad campaign hit,/comprar/iphone/7-plus,2018-05-20 01:10:39
182822,ad campaign hit,/comprar/iphone/6-plus,2018-05-23 01:09:25
208774,ad campaign hit,/comprar/iphone/iphone-6s-plus,2018-05-20 01:01:27
114267,ad campaign hit,/comprar/samsung/galaxy-s7-edge,2018-05-09 00:25:05
210999,ad campaign hit,/comprar/samsung/galaxy-s6-edge-plus,2018-05-22 00:55:28
67536,ad campaign hit,/comprar/samsung/galaxy-s6-flat,2018-05-11 02:23:38


In [106]:
input_train['event'][~input_train.url.isnull()].unique()

array(['ad campaign hit'], dtype=object)

In [107]:
input_train['url'][~input_train.url.isnull()].unique()

array(['/comprar/iphone/6s', '/comprar/samsung/galaxy-s6-edge-plus',
       '/comprar/iphone', '/comprar/iphone/7-plus',
       '/comprar/iphone/6-plus', '/comprar/iphone/iphone-6s-plus',
       '/comprar/samsung/galaxy-s7-edge',
       '/comprar/samsung/galaxy-s6-flat', '/comprar/samsung',
       '/comprar/samsung/galaxy-s8-plus', '/comprar/samsung/galaxy-s8',
       '/comprar/iphone/', '/comprar/samsung/a3-duos', '/',
       '/comprar/samsung/galaxy-s5', '/comprar/lg/g3-d855',
       '/comprar/iphone/iphone-se',
       '/comprar/samsung/samsung-gran-prime-duos-tv',
       '/comprar/samsung/galaxy-s7', '/comprar/samsung/galaxy-win-duos',
       '/comprar/motorola/moto-g-2a-geracao', '/comprar/iphone/iphone-5c',
       '/comprar/samsung/galaxy-j1-mini',
       '/comprar/samsung/galaxy-s4-mini-duos', '/comprar/motorola',
       '/comprar/samsung/galaxy-s5-duos', '/comprar/motorola/',
       '/comprar/samsung/galaxy-j7-prime', '/comprar/iphone/6',
       '/comprar/iphone/iphone-5s',
    

# https://blog.datadive.net/selecting-good-features-part-iii-random-forests/

# Uso de varias técnicas: https://www.kaggle.com/niklasdonges/end-to-end-project-with-python

# Lo usa mal, copié partes, modifiqué otras. Copiar de nuevo y reemplazar COMO VA!

In [108]:
new_vector_features_to_predict = new_vector_features.drop('resultKnown', axis = 1)[new_vector_features['resultKnown']==0] # Datos para predecir
new_vector_features_to_predict.shape

(19415, 30)

In [109]:
new_vector_features_known = new_vector_features.drop('resultKnown', axis = 1)[new_vector_features['resultKnown']==1] # Datos para predecir
new_vector_features_known.shape

(19414, 30)

In [110]:
new_vector_features_known.to_csv(_PATH_DATASET + 'out_features_processed_train.csv', sep=',', encoding='utf-8')
new_vector_features_to_predict.to_csv(_PATH_DATASET + 'out_features_processed_to_predict.csv', sep=',', encoding='utf-8')

In [113]:
#X_train = new_vector_features.drop("label", axis=1)
#Y_train = new_vector_features["label"]
#X_test  = test_df.drop("PassengerId", axis=1).copy()
dasdfs desde acá me fuí a la otra notebook, fuerzo error. sirve para tener referencias

SyntaxError: invalid syntax (<ipython-input-113-26245e3454a6>, line 4)

In [None]:
x_data = new_vector_features.drop('label', axis = 1)
y_labels = new_vector_features.label
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x_data,y_labels,test_size=0.3,random_state=101)
x_data.columns

In [None]:
# Algorithms
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB

In [None]:
'''
This estimator implements regularized linear models with stochastic
gradient descent (SGD) learning: the gradient of the loss is estimated
each sample at a time and the model is updated along the way with a
decreasing strength schedule (aka learning rate). SGD allows minibatch
(online/out-of-core) learning, see the partial_fit method.
For best results using the default learning rate schedule, the data should
have zero mean and unit variance.
'''
sgd = linear_model.SGDClassifier(max_iter=5, tol=None)

In [None]:
'''
Ajustar el clasificador
Fit linear model with Stochastic Gradient Descent.
'''
sgd.fit(X_train, y_train)

In [None]:
'''
Predict class labels for samples in X.
'''
Y_pred = sgd.predict(X_test)
sgd.score(X_train, y_train)



In [None]:
'''
Returns the mean accuracy on the given test data and labels.
'''
acc_sgd = round(sgd.score(X_test, y_test) * 100, 2)

acc_sgdPred = round(sgd.score(X_test, Y_pred) * 100, 2)

print(round(acc_sgd,2,), "%")
print(round(acc_sgdPred,2,), "%")

In [None]:
# Random Forest
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, y_train)

Y_prediction = random_forest.predict(X_test)

random_forest.score(X_test, y_test)
acc_random_forest = round(random_forest.score(X_test, y_test) * 100, 2)
print(round(acc_random_forest,2,), "%")

In [None]:
# Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

Y_pred = logreg.predict(X_test)

acc_log = round(logreg.score(X_test, y_test) * 100, 2)
print(round(acc_log,2,), "%")



In [None]:
# KNN
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, y_train)

Y_pred = knn.predict(X_test)

acc_knn = round(knn.score(X_train, y_train) * 100, 2)
print(round(acc_knn,2,), "%")

In [None]:
# Gaussian Naive Bayes
gaussian = GaussianNB()
gaussian.fit(X_train, y_train)

Y_pred = gaussian.predict(X_test)

acc_gaussian = round(gaussian.score(X_train, y_train) * 100, 2)
print(round(acc_gaussian,2,), "%")

In [None]:


# Perceptron
perceptron = Perceptron(max_iter=5)
perceptron.fit(X_train, y_train)

Y_pred = perceptron.predict(X_test)

acc_perceptron = round(perceptron.score(X_train, y_train) * 100, 2)
print(round(acc_perceptron,2,), "%")



In [None]:
# Linear SVC
linear_svc = LinearSVC()
linear_svc.fit(X_train, y_train)

Y_pred = linear_svc.predict(X_test)

acc_linear_svc = round(linear_svc.score(X_train, y_train) * 100, 2)
print(round(acc_linear_svc,2,), "%")

In [None]:


# Decision Tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)

Y_pred = decision_tree.predict(X_test)

acc_decision_tree = round(decision_tree.score(X_train, y_train) * 100, 2)
print(round(acc_decision_tree,2,), "%")



In [None]:
results = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest', 'Naive Bayes', 'Perceptron', 
              'Stochastic Gradient Decent', 
              'Decision Tree'],
    'Score': [acc_linear_svc, acc_knn, acc_log, 
              acc_random_forest, acc_gaussian, acc_perceptron, 
              acc_sgd, acc_decision_tree]})
result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df.head(9)

In [None]:
y_test.values

In [None]:
sum(abs(Y_pred - y_test))

In [None]:
Y_pred.shape

In [None]:
296/5835

In [None]:


from sklearn.model_selection import cross_val_score
rf = RandomForestClassifier(n_estimators=100)
scores = cross_val_score(rf, X_train, y_train, cv=10, scoring = "accuracy")



In [None]:
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard Deviation:", scores.std())

# Feature importance

In [None]:
importances = pd.DataFrame({'feature':X_train.columns,'importance':np.round(random_forest.feature_importances_,3)})
importances = importances.sort_values('importance',ascending=False).set_index('feature')

In [None]:
importances.head(20)

In [None]:
importances.plot.bar()

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
predictions = cross_val_predict(random_forest, X_train, y_train, cv=3)
confusion_matrix(y_train, predictions)

In [None]:
from sklearn.metrics import precision_recall_curve

# getting the probabilities of our predictions
y_scores = random_forest.predict_proba(X_test)
y_scores = y_scores[:,1]

precision, recall, threshold = precision_recall_curve(y_test, y_scores)

In [None]:
import matplotlib.pyplot as plt
def plot_precision_and_recall(precision, recall, threshold):
    plt.plot(threshold, precision[:-1], "r-", label="precision", linewidth=5)
    plt.plot(threshold, recall[:-1], "b", label="recall", linewidth=5)
    plt.xlabel("threshold", fontsize=19)
    plt.legend(loc="upper right", fontsize=19)
    plt.ylim([0, 1])

plt.figure(figsize=(14, 7))
plot_precision_and_recall(precision, recall, threshold)
plt.show()

In [None]:
knownLabels.head()