In [1]:
import pandas as pd
import matplotlib as mp
import numpy as np
from datetime import timedelta
import datetime as dt

%matplotlib inline

#Root de archivos de datos
root_dir = '../data/'

#Lee menos data si es testing
is_testing = False

# Cantidad minima de aparicion de equipos en un dataframe
min_devices = 5

# Identificador unico de equipos
device_uuid = ['ref_hash']

# Ventanas de tiempo
windows = pd.DataFrame({
    'begin_date': [dt.datetime(2019,4,18),dt.datetime(2019,4,21),dt.datetime(2019,4,24)],
    'window_nr':[1,2,3]
})

# Armado de Features

Se utilizan 3 ventanas de tiempo y se procesan los dataframes para obtener features de esas 3 ventanas por separado. 

Como resultado se obtiene un dataset de features mas grande cuyos datos no se solapan en tiempo. Por lo que son validos para la prediccion. 

Se realiza la lectura y limpiado de los dataframes principales, con los mismos se realiza el filtrado y armado de los features. 

---
## 1. Lectura de DataFrames

In [2]:
#Optimizado para menos memoria
auction_dtypes = {
    'device_id': np.int64,
    'source_id': np.int8
}

auctions = pd.read_csv(root_dir + 'auctions.csv.gzip',
                       compression = 'gzip',
                       dtype = auction_dtypes,
                       usecols=list(auction_dtypes.keys()) + ['date'],
                       parse_dates = ['date'])
auctions.rename({'device_id':'ref_hash',
                 'date':'created'}, inplace=True, axis='columns')
auctions['n'] = 1

# Para hacer pruebas ocupando menos memoria se hace un sampleo aleatorio de la mitad del dataframe y se elimina el resto
if is_testing:
    auctions = auctions.sample(frac=.30)
    print("Is Testing, #records:", auctions['n'].sum())
else:
    print("Not Testing, #records:", auctions['n'].sum())

Not Testing, #records: 47409528


In [3]:
# Labels a submitir con las predicciones
to_predict = pd.read_csv(root_dir + 'target_competencia_ids.csv')
to_predict = to_predict.apply(lambda x: np.int64(x['ref_hash'][0:x['ref_hash'].find('_')]), axis='columns').drop_duplicates().to_frame()
to_predict.columns = ['ref_hash']

In [4]:
unique_hashes = pd.read_csv(root_dir+'unique_hashes.csv')

---
## 2. Arreglo de los datos

### Separo las semanas de entrenamiento

Se utiliza el siguiente metodo: 

1. Ventana del 18 al 20 inclusive (1) -> Predice valores entre el 21 y 24 (2)
2. Ventana del 21 al 23 inclusive (2) -> Predice valores entre el 24 y 26 (3)

In [5]:
#Agrego la ventana de tiempo
auctions = auctions.loc[auctions['ref_hash'].isin(unique_hashes['ref_hash'])]\
                   .sort_values(by='created')
auctions = pd.merge_asof(auctions,windows,left_on='created',right_on='begin_date').drop('begin_date', axis='columns')

### 2.1 Arreglo de Auctions

In [6]:
%%time
#Elimino los registros con menos de un minimo de entradas, ya que no hay mucho que predecir en estos casos
group = ['ref_hash','window_nr']
orig_count = auctions['n'].sum()
auctions = auctions.groupby(group, sort = False)\
                   .filter(lambda x: x['n'].sum() >= min_devices)\
                   .sort_values(by=group+['created'])
last_count = auctions['n'].sum()
print('Eliminados:', orig_count-last_count)

Eliminados: 736875
CPU times: user 10min 24s, sys: 16.8 s, total: 10min 40s
Wall time: 10min 40s


In [7]:
%%time
auctions['next_date'] = auctions.groupby(group, as_index = False, sort=False)['created'].transform(lambda x: x.shift(-1))
auctions = auctions.loc[(~auctions['next_date'].isnull())]
auctions['secs_to_next'] = (auctions['next_date'] - auctions['created']).transform(lambda x: round(x.total_seconds()))

CPU times: user 18min 4s, sys: 19.6 s, total: 18min 23s
Wall time: 25min 14s


In [8]:
auctions.head()

Unnamed: 0,created,ref_hash,source_id,n,window_nr,next_date,secs_to_next
9146132,2019-04-19 19:40:28.465866,41863526108385,8,1,1,2019-04-20 02:52:26.892880,25918
11162626,2019-04-20 02:52:26.892880,41863526108385,3,1,1,2019-04-20 02:59:02.509230,396
11194917,2019-04-20 02:59:02.509230,41863526108385,3,1,1,2019-04-20 03:06:01.675788,419
11229465,2019-04-20 03:06:01.675788,41863526108385,3,1,1,2019-04-20 03:08:57.388160,176
11244486,2019-04-20 03:08:57.388160,41863526108385,3,1,1,2019-04-20 03:11:26.463903,149


## Lectura y limpieza de otros dataframes

Esto se hace despues pues preparar Auctions requiere de toda la memoria posible libre

In [9]:
events_dtypes = {
    'event_id': np.int64,
    'ref_hash': np.int64,
    'application_id': np.int64,
    'attributed': np.bool,
    'device_os_version': 'object',
    'device_brand': 'object',
    'device_model': 'object',
    'device_city': 'object',
    'session_user_agent': 'object',
    'trans_id': 'category',
    'user_agent': 'object',
    'event_uuid': 'object',
    'carrier': 'object',
    'kind': 'object',
    'device_os': 'object',
    'wifi': np.bool,
    'connection_type': 'object',
    #'ip_address': np.int64,
    #'device_language': 'category'
}
events_cols = list(events_dtypes.keys()) + ['date']
events = pd.read_csv(root_dir + 'events.csv.gzip', 
                     compression='gzip',
                     dtype=events_dtypes,
                     usecols=events_cols,
                     parse_dates=['date'])

events.rename({'date':'created'}, inplace=True, axis='columns')
events['n'] = 1
# Para hacer pruebas ocupando menos memoria se hace un sampleo aleatorio de la mitad del dataframe y se elimina el resto
if is_testing:
    events = events.sample(frac=.30)
    print("Is Testing, #records:", events['n'].sum())
else:
    print("Not Testing, #records:", events['n'].sum())

Not Testing, #records: 7744581


In [10]:
installs_dtypes = {
    'application_id': np.int32,
    'ref_hash': np.int64, 
    #'click_hash':'category',
    'attributed': 'category',
    'implicit': 'category',
    #'device_countrycode': 'object', 
    'device_brand': 'object',
    'device_model': 'object', 
    'session_user_agent': 'object', 
    'user_agent': 'object', 
    'event_uuid':'object',
    'kind': 'object',
    'wifi': 'object', 
    'trans_id': 'object', 
    #'ip_address':'object', 
    'device_language': 'object'
}
install_cols = list(installs_dtypes.keys()) + ['created']
installs = pd.read_csv(root_dir + 'installs.csv.gzip', 
                       compression='gzip', 
                       usecols=install_cols,
                       dtype= installs_dtypes,
                      parse_dates=['created'])

installs['n'] = 1
print('#records:', installs['n'].sum())

#records: 481511


In [11]:
clicks_dtypes = {
    'advertiser_id': np.int64, 
    'action_id': np.float, 
    'source_id': np.int8, 
    'latitude' : np.float64, 
    'longitude': np.float64, 
    'wifi_connection': np.bool, 
    'carrier_id': 'object', 
    'trans_id': 'object',
    'os_minor':'object', 
    'agent_device' : 'object', 
    'os_major': 'object', 
    'specs_brand': 'category', 
    'brand': 'object',
    'timeToClick': np.float64, 
    'touchX': 'object', 
    'touchY': 'object', 
    'ref_hash':np.int64,
    'created' : 'object'
}
clicks = pd.read_csv(root_dir + 'clicks.csv.gzip', 
                     compression='gzip',
                     usecols=list(clicks_dtypes.keys()) + ['created'],
                     dtype=clicks_dtypes,
                     parse_dates=['created'])
clicks['n'] = 1
print('#records:', clicks['n'].sum())

#records: 64296


In [12]:
installs = installs.loc[installs['ref_hash'].isin(unique_hashes['ref_hash'])]
installs.sort_values(by='created',inplace=True)
installs = pd.merge_asof(installs,windows,left_on='created',right_on='begin_date').drop('begin_date', axis='columns')

In [13]:
clicks = clicks.loc[clicks['ref_hash'].isin(unique_hashes['ref_hash'])]
clicks['created'] = clicks['created'].dt.tz_localize(None)
clicks.sort_values(by='created',inplace=True)
clicks = pd.merge_asof(clicks,windows,left_on='created',right_on='begin_date').drop('begin_date', axis='columns')

In [14]:
events = events.loc[events['ref_hash'].isin(unique_hashes['ref_hash'])]
events.sort_values(by='created',inplace=True)
events = pd.merge_asof(events,windows,left_on='created',right_on='begin_date').drop('begin_date', axis='columns')

### Arreglo de installs

In [15]:
installs['kind'] = installs['kind'].str.replace(' ','_')
installs['kind'] = installs['kind'].str.replace('af_app_open ','af_app_opened')
installs['kind'] = installs['kind'].str.replace('af_app_opend','af_app_opened')
installs['kind'] = installs['kind'].str.lower()

In [16]:
installs.isnull().sum()


created                    0
application_id             0
ref_hash                   0
attributed                 0
implicit                   0
device_brand          204813
device_model           26871
session_user_agent     14828
user_agent            150111
event_uuid            377704
kind                  377704
wifi                  186016
trans_id              472140
device_language        27552
n                          0
window_nr                  0
dtype: int64

In [17]:
for i in ['device_brand','device_model','session_user_agent','user_agent','kind','wifi','device_language']:
    installs[i] = installs[i].fillna('unknown')
installs['device_brand'] = installs['device_brand'].astype('category')
installs['device_model'] = installs['device_model'].astype('category')
installs['session_user_agent'] = installs['session_user_agent'].astype('category')
installs['user_agent'] = installs['user_agent'].astype('category')
installs['kind'] = installs['kind'].astype('category')
installs['wifi'] = installs['wifi'].astype('category')
installs['device_language'] = installs['device_language'].astype('category')

In [18]:
%%time
installs.sort_values(by=group+['created'],inplace=True)

installs['next_date'] = installs.groupby(group, as_index = False, sort=False)['created'].transform(lambda x: x.shift(-1))
installs = installs.loc[(~installs['next_date'].isnull())]
installs['secs_to_next'] = (installs['next_date'] - installs['created']).transform(lambda x: round(x.total_seconds()))

CPU times: user 4min 3s, sys: 3.82 s, total: 4min 7s
Wall time: 4min 5s


In [19]:
%%time
events.sort_values(by=group+['created'],inplace=True)

CPU times: user 13.5 s, sys: 1.31 s, total: 14.8 s
Wall time: 14.8 s


In [20]:
installs.head()


Unnamed: 0,created,application_id,ref_hash,attributed,implicit,device_brand,device_model,session_user_agent,user_agent,event_uuid,kind,wifi,trans_id,device_language,n,window_nr,next_date,secs_to_next
47506,2019-04-18 21:11:50.326,121,41863526108385,False,True,6.115025880051902e+18,1.658417010837625e+18,http-kit/2.0,Dalvik/2.1.0 (Linux; U; Android 8.0.0; LDN-LX3...,2f8be0cc-297e-4c9c-a097-1096aa5824b5,open,False,,6.977049253562486e+18,1,1,2019-04-18 21:11:51.966,2
47508,2019-04-18 21:11:51.966,121,41863526108385,False,False,6.115025880051902e+18,1.658417010837625e+18,http-kit/2.0,Dalvik/2.1.0 (Linux; U; Android 8.0.0; LDN-LX3...,,unknown,False,,6.977049253562486e+18,1,1,2019-04-18 21:17:11.946,320
47813,2019-04-18 21:17:11.946,65,41863526108385,False,False,6.115025880051902e+18,1.658417010837625e+18,http-kit/2.0,Dalvik/2.1.0 (Linux; U; Android 8.0.0; LDN-LX3...,,unknown,False,,6.977049253562486e+18,1,1,2019-04-18 21:17:16.531,5
362730,2019-04-24 20:39:14.142,145,448610188195811,False,True,2.987569314309514e+18,4.898310199028851e+18,http-kit/2.0,Dalvik/2.1.0 (Linux; U; Android 9; Redmi Note ...,7cbdf64a-a8cf-4a55-af4a-8b7181403feb,af_app_opened,False,,6.977049253562486e+18,1,3,2019-04-24 20:39:55.008,41
208551,2019-04-21 22:31:11.662,65,475635010681369,False,True,4.533745055360187e+18,1.7261492746485394e+18,http-kit/2.0,Dalvik/2.1.0 (Linux; U; Android 8.1.0; Hisense...,56caed98-1785-43e7-8209-bbeb747f9268,af_app_opened,True,,6.977049253562486e+18,1,2,2019-04-21 22:31:18.581,7


### Arreglo de Clicks

In [21]:
clicks.touchX = clicks.touchX.apply(lambda x: np.float64(x))
clicks.touchY = clicks.touchY.apply(lambda x: np.float64(x))
clicks['created'] = clicks['created'].dt.tz_localize(None)

In [22]:
for i in ['carrier_id','os_minor', 'agent_device', 'brand', 'os_major']:
    clicks[i] = clicks[i].fillna('unknown').astype('category')

In [23]:
clicks.isnull().sum()


advertiser_id          0
action_id          63835
source_id              0
created                0
latitude               0
longitude              0
wifi_connection        0
carrier_id             0
trans_id               0
os_minor               0
agent_device           0
os_major               0
specs_brand            0
brand                  0
timeToClick        25885
touchX             20428
touchY             20428
ref_hash               0
n                      0
window_nr          19730
dtype: int64

In [24]:
clicks.head()

Unnamed: 0,advertiser_id,action_id,source_id,created,latitude,longitude,wifi_connection,carrier_id,trans_id,os_minor,agent_device,os_major,specs_brand,brand,timeToClick,touchX,touchY,ref_hash,n,window_nr
0,1,,1,2019-04-12 00:00:01.981,1.714386,0.871408,False,1.0,OHtntFaZBW28OcZvxtGT0rLBLqmccrM,1.5176438893491397e+18,unknown,5.131615556736863e+18,3576558787748411622,0.0,19.296,0.952,0.07,3455656761384645032,1,
1,2,,1,2019-04-12 00:00:29.662,1.712736,0.869157,True,24.0,XZZOyg3ZOEFw9MFeBJWF7dNUl7Dl8AQ,3.575963029724781e+18,unknown,5.754947116114108e+18,3576558787748411622,0.0,625.859,0.474,11.684,2751776655120553475,1,
2,1,,0,2019-04-12 00:00:53.624,1.714547,0.871535,False,13.0,ptDi6mZepLORp2kxILbv_g5WxOVgqHI,1.2885781261232225e+18,unknown,3.908390200756879e+18,71913840936116953,unknown,29.908,0.13,0.179,6931142988092289392,1,
3,1,,1,2019-04-12 00:01:07.438,1.837799,0.829634,True,23.0,MhX94IlfeUwwWATXfHlLHKGPwSepPv4,2.2387361390161664e+18,unknown,3.581232574980917e+18,3576558787748411622,unknown,377.566,0.446,0.51,2114792698570967449,1,
4,1,,1,2019-04-12 00:01:08.535,1.733375,0.90813,True,1.0,BubcDEi6KKiYNUoN3XZmHioWNYna5N0,5.310344816890522e+18,unknown,3.581232574980917e+18,3576558787748411622,unknown,141.725,0.747,0.022,3949216411894914793,1,


### Arreglo de Events

In [25]:
for i in ['device_os_version','device_brand','device_model','device_city','session_user_agent','user_agent','carrier','kind','device_os','connection_type']:
    events[i] = events[i].fillna('unknown').astype('category')

In [26]:
events.isnull().sum()


created                     0
event_id                    0
ref_hash                    0
application_id              0
attributed                  0
device_os_version           0
device_brand                0
device_model                0
device_city                 0
session_user_agent          0
trans_id              7700153
user_agent                  0
event_uuid              29631
carrier                     0
kind                        0
device_os                   0
wifi                        0
connection_type             0
n                           0
window_nr                   0
dtype: int64

---
## 3. Armado de Features

A continuacion se comienzan a extraer los distintos features que formaran el set de entrenamiento. 
Se cruza cada ventana con los labels que se desean predecir. 

Para entrenar el set debe decidirse como utilizar los datos de las distintas ventanas. 

Se tomaran los datos en 2 ventanas diferentes y se validaran con dos ventanas diferentes, de esta forma no deberia haber problemas de solapamiento y se maximiza la cantidad de datos.

Para el set final puede armarse un set de labels con los valores de los ultimos 3 dias.

In [27]:
max_dates = pd.DataFrame({'w_max_date': [dt.datetime(2019,4,21),dt.datetime(2019,4,24),dt.datetime(2019,4,27)], 'window_nr' : [1,2,3]})

In [28]:
temp = unique_hashes
temp['window_nr'] = 1
devices = temp.copy()
temp['window_nr'] = 2
devices = devices.append(temp)
temp['window_nr'] = 3
devices = devices.append(temp)

In [29]:
devices['window_nr'].value_counts()

3    662110
2    662110
1    662110
Name: window_nr, dtype: int64

In [30]:
devices = devices.set_index(group)

In [31]:
sample = auctions.head(100)

In [32]:
sample.head()

Unnamed: 0,created,ref_hash,source_id,n,window_nr,next_date,secs_to_next
9146132,2019-04-19 19:40:28.465866,41863526108385,8,1,1,2019-04-20 02:52:26.892880,25918
11162626,2019-04-20 02:52:26.892880,41863526108385,3,1,1,2019-04-20 02:59:02.509230,396
11194917,2019-04-20 02:59:02.509230,41863526108385,3,1,1,2019-04-20 03:06:01.675788,419
11229465,2019-04-20 03:06:01.675788,41863526108385,3,1,1,2019-04-20 03:08:57.388160,176
11244486,2019-04-20 03:08:57.388160,41863526108385,3,1,1,2019-04-20 03:11:26.463903,149


### Tiempo promedio de arribos

Se desea saber el tiempo promedio entre arribos de los dispositivos a las encuestas.

In [33]:
%%time
time = auctions.groupby(group,sort=False)['secs_to_next'].mean()\
               .rename('secs_to_next_mean')\
               .transform(lambda x: round(x))

CPU times: user 6.05 s, sys: 3.05 s, total: 9.1 s
Wall time: 9.35 s


In [34]:
devices = devices.merge(time, how='left', left_index=True, right_index=True)

In [35]:
devices.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,secs_to_next_mean
ref_hash,window_nr,Unnamed: 2_level_1
2564673204772915246,1,765.0
4441121667607578179,1,111.0
7721769811471055264,1,281.0
6416039086842158968,1,
1258642015983312729,1,8365.0


### Tiempo desde ultima aparicion

In [36]:
%%time
time = auctions.groupby(group,sort=False)\
               .tail(1)\
               .merge(max_dates, on='window_nr', how='left')\
               .set_index(group)

time = (time['w_max_date'] - time['created']).transform(lambda x: round(x.total_seconds())).rename('secs_since_last_arrival')

CPU times: user 14.3 s, sys: 2.58 s, total: 16.9 s
Wall time: 23.8 s


In [37]:
time.head()

ref_hash         window_nr
41863526108385   1             70206
135153013040192  1             70659
161514654074162  1            248758
186034136943920  1             74393
                 2             37583
Name: secs_since_last_arrival, dtype: int64

In [38]:
devices = devices.merge(time, how='left', left_index=True, right_index=True)

In [39]:
devices.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,secs_to_next_mean,secs_since_last_arrival
ref_hash,window_nr,Unnamed: 2_level_1,Unnamed: 3_level_1
2564673204772915246,1,765.0,88625.0
4441121667607578179,1,111.0,29052.0
7721769811471055264,1,281.0,333.0
6416039086842158968,1,,
1258642015983312729,1,8365.0,121224.0


### Cantidad de apariciones en encuestas en la ventana

In [40]:
%%time
time = auctions.groupby(group ,sort=False)['n'].sum().rename('auctions_total')

CPU times: user 2.29 s, sys: 984 ms, total: 3.27 s
Wall time: 3.32 s


In [41]:
time.head()

ref_hash         window_nr
41863526108385   1            34
135153013040192  1             7
161514654074162  1             5
186034136943920  1             6
                 2            58
Name: auctions_total, dtype: int64

In [42]:
devices = devices.merge(time, how='left', left_index=True, right_index=True)

In [43]:
devices.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,secs_to_next_mean,secs_since_last_arrival,auctions_total
ref_hash,window_nr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2564673204772915246,1,765.0,88625.0,216.0
4441121667607578179,1,111.0,29052.0,1188.0
7721769811471055264,1,281.0,333.0,908.0
6416039086842158968,1,,,
1258642015983312729,1,8365.0,121224.0,12.0


### Cantidad de apariciones en encuestas en la ultima hora

In [44]:
%%time
time = auctions.merge(max_dates, on='window_nr', how='left')
time = time.loc[time['created'] >= (time['w_max_date'] - timedelta(hours=1))]\
           .groupby(group, sort=False)['n'].sum()\
           .rename('auctions_last_hour')

CPU times: user 14.5 s, sys: 10.4 s, total: 24.9 s
Wall time: 25 s


In [45]:
time.head()

ref_hash         window_nr
345999128501141  2             3
360710529886978  3             2
416301579449694  1             2
686608884458246  3            11
717556230663455  1            74
Name: auctions_last_hour, dtype: int64

In [46]:
devices = devices.merge(time, how='left', left_index=True, right_index=True)

In [47]:
devices.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,secs_to_next_mean,secs_since_last_arrival,auctions_total,auctions_last_hour
ref_hash,window_nr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2564673204772915246,1,765.0,88625.0,216.0,
4441121667607578179,1,111.0,29052.0,1188.0,
7721769811471055264,1,281.0,333.0,908.0,23.0
6416039086842158968,1,,,,
1258642015983312729,1,8365.0,121224.0,12.0,


### Cantidad de apariciones en encuestas en el fin de semana

In [48]:
%%time
weekend = [20,21]
time = auctions.loc[(auctions['created'].dt.day).isin(weekend)]\
               .groupby(group,sort=False)['n'].sum()\
               .rename('amount_auctions_in_weekend')

CPU times: user 3.58 s, sys: 824 ms, total: 4.41 s
Wall time: 4.69 s


In [49]:
time.head()

ref_hash         window_nr
41863526108385   1            33
135153013040192  1             7
186034136943920  1             4
                 2            11
360710529886978  1             2
Name: amount_auctions_in_weekend, dtype: int64

In [50]:
devices = devices.merge(time, how='left', left_index=True, right_index=True)

In [51]:
devices.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,secs_to_next_mean,secs_since_last_arrival,auctions_total,auctions_last_hour,amount_auctions_in_weekend
ref_hash,window_nr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2564673204772915246,1,765.0,88625.0,216.0,,
4441121667607578179,1,111.0,29052.0,1188.0,,640.0
7721769811471055264,1,281.0,333.0,908.0,23.0,218.0
6416039086842158968,1,,,,,
1258642015983312729,1,8365.0,121224.0,12.0,,


### Ultima aparicion es weekend

In [52]:
weekend = [20,21]
time = auctions.groupby(group, sort=False)\
               .tail(1)\
               .set_index(group)['created'].transform(lambda x: x.day in weekend)\
               .rename('is_last_weekend')

In [53]:
time.head()

ref_hash         window_nr
41863526108385   1             True
135153013040192  1             True
161514654074162  1            False
186034136943920  1             True
                 2            False
Name: is_last_weekend, dtype: bool

In [54]:
devices = devices.merge(time, how='left', left_index=True, right_index=True)

In [55]:
devices.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,secs_to_next_mean,secs_since_last_arrival,auctions_total,auctions_last_hour,amount_auctions_in_weekend,is_last_weekend
ref_hash,window_nr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2564673204772915246,1,765.0,88625.0,216.0,,,False
4441121667607578179,1,111.0,29052.0,1188.0,,640.0,True
7721769811471055264,1,281.0,333.0,908.0,23.0,218.0,True
6416039086842158968,1,,,,,,
1258642015983312729,1,8365.0,121224.0,12.0,,,False


### Cantidad de sources distintos

In [56]:
temp = auctions.groupby(by = group +['source_id'], sort=False).agg({'n':'sum'}).reset_index()
temp['n'] = 1
temp = temp.groupby(by = group).agg({'n' : 'sum'})
temp.rename(columns = {'n': 'amount_dif_src' },inplace = True)

In [57]:
temp.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,amount_dif_src
ref_hash,window_nr,Unnamed: 2_level_1
41863526108385,1,3
135153013040192,1,1
161514654074162,1,2
186034136943920,1,1
186034136943920,2,1


In [58]:
devices = devices.merge(temp, how = 'left', left_index=True, right_index=True)

In [59]:
devices.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,secs_to_next_mean,secs_since_last_arrival,auctions_total,auctions_last_hour,amount_auctions_in_weekend,is_last_weekend,amount_dif_src
ref_hash,window_nr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2564673204772915246,1,765.0,88625.0,216.0,,,False,3.0
4441121667607578179,1,111.0,29052.0,1188.0,,640.0,True,5.0
7721769811471055264,1,281.0,333.0,908.0,23.0,218.0,True,3.0
6416039086842158968,1,,,,,,,
1258642015983312729,1,8365.0,121224.0,12.0,,,False,1.0


## Cantidad de eventos

In [60]:
temp = events.groupby(group, sort=False)\
             .agg({'n':'sum'})\
             .rename(columns = {'n': 'amount_events' })

In [61]:
temp.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,amount_events
ref_hash,window_nr,Unnamed: 2_level_1
40621409780134,2,9
41863526108385,1,88
41863526108385,2,8
41863526108385,3,57
69039685746313,2,4


In [62]:
devices = devices.merge(temp, how = 'left', left_index=True, right_index=True)

In [63]:
devices.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,secs_to_next_mean,secs_since_last_arrival,auctions_total,auctions_last_hour,amount_auctions_in_weekend,is_last_weekend,amount_dif_src,amount_events
ref_hash,window_nr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2564673204772915246,1,765.0,88625.0,216.0,,,False,3.0,
4441121667607578179,1,111.0,29052.0,1188.0,,640.0,True,5.0,
7721769811471055264,1,281.0,333.0,908.0,23.0,218.0,True,3.0,99.0
6416039086842158968,1,,,,,,,,
1258642015983312729,1,8365.0,121224.0,12.0,,,False,1.0,


### Tiempo desde ultimo evento

In [64]:
temp = events.groupby(group,sort=False)\
             .agg({'created':'max'})\
             .reset_index()\
             .merge(max_dates, on='window_nr', how='left')\
             .set_index(group)
temp = (temp['w_max_date'] - temp['created']).transform(lambda x: round(x.total_seconds())).rename('secs_since_last_event')

In [65]:
temp.head()

ref_hash        window_nr
40621409780134  2             32279
41863526108385  1             66453
                2            250014
                3             65123
69039685746313  2             23093
Name: secs_since_last_event, dtype: int64

In [66]:
devices = devices.merge(temp, how = 'left', left_index=True, right_index=True)

In [67]:
devices.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,secs_to_next_mean,secs_since_last_arrival,auctions_total,auctions_last_hour,amount_auctions_in_weekend,is_last_weekend,amount_dif_src,amount_events,secs_since_last_event
ref_hash,window_nr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2564673204772915246,1,765.0,88625.0,216.0,,,False,3.0,,
4441121667607578179,1,111.0,29052.0,1188.0,,640.0,True,5.0,,
7721769811471055264,1,281.0,333.0,908.0,23.0,218.0,True,3.0,99.0,196475.0
6416039086842158968,1,,,,,,,,,
1258642015983312729,1,8365.0,121224.0,12.0,,,False,1.0,,


### Alguno de los eventos fueron con wifi?

In [68]:
temp = events.groupby(group, sort=False).agg({'wifi':'sum'})
temp['wifi'] = temp['wifi'].astype(bool)

In [None]:
temp.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,wifi
ref_hash,window_nr,Unnamed: 2_level_1
40621409780134,2,False
41863526108385,1,False
41863526108385,2,False
41863526108385,3,False
69039685746313,2,True


In [None]:
devices = devices.merge(temp, how = 'left', left_index=True, right_index=True)
#devices['wifi'].fillna(?)

In [None]:
devices.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,secs_to_next_mean,secs_since_last_arrival,auctions_total,auctions_last_hour,amount_auctions_in_weekend,is_last_weekend,amount_dif_src,amount_events,secs_since_last_event,wifi
ref_hash,window_nr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2564673204772915246,1,765.0,88625.0,216.0,,,False,3.0,,,
4441121667607578179,1,111.0,29052.0,1188.0,,640.0,True,5.0,,,
7721769811471055264,1,281.0,333.0,908.0,23.0,218.0,True,3.0,99.0,196475.0,False
6416039086842158968,1,,,,,,,,,,
1258642015983312729,1,8365.0,121224.0,12.0,,,False,1.0,,,


### Tiempo promedio de clicks

In [None]:
temp = clicks.groupby(by = ['ref_hash', 'window_nr'])\
             .agg({'timeToClick' : 'mean'})\
             .rename(columns = {'timeToClick' : 'timeToClick_mean'})

In [None]:
devices = devices.merge(temp, how = 'left', left_index=True, right_index=True)
#devices['timeToClick_mean'].fillna(?)

### Cantidad de advertiser que tiene cada ref_hash

In [None]:
temp = clicks.groupby(group + ['advertiser_id'],sort=False).agg({'n':'sum'}).reset_index()
temp['n'] = 1
temp = temp.groupby(group)\
           .agg({'n' : 'sum'})\
           .rename(columns = {'n': 'amount_dif_advertisers' })

In [None]:
temp.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,amount_dif_advertisers
ref_hash,window_nr,Unnamed: 2_level_1
693609737448534,2.0,1
1461247282174365,1.0,1
2204225481747532,2.0,1
5470466329076030,1.0,1
5534899846690585,1.0,1


In [None]:
devices = devices.merge(temp, how = 'left', left_index=True, right_index=True)

### Cantidad de Installs

In [None]:
temp = installs.groupby(group,sort=False)['n'].sum().rename('amount_installs')

In [None]:
temp.head()

ref_hash         window_nr
41863526108385   1            3
448610188195811  3            1
475635010681369  2            1
622102439689666  2            1
955103142671534  2            1
Name: amount_installs, dtype: int64

In [None]:
devices = devices.merge(temp, how = 'left', left_index=True, right_index=True)

In [None]:
devices.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,secs_to_next_mean,secs_since_last_arrival,auctions_total,auctions_last_hour,amount_auctions_in_weekend,is_last_weekend,amount_dif_src,amount_events,secs_since_last_event,wifi,timeToClick_mean,amount_dif_advertisers,amount_installs
ref_hash,window_nr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2564673204772915246,1,765.0,88625.0,216.0,,,False,3.0,,,,,,
4441121667607578179,1,111.0,29052.0,1188.0,,640.0,True,5.0,,,,,,
7721769811471055264,1,281.0,333.0,908.0,23.0,218.0,True,3.0,99.0,196475.0,False,,,
6416039086842158968,1,,,,,,,,,,,,,
1258642015983312729,1,8365.0,121224.0,12.0,,,False,1.0,,,,,,


### Tiempo medio entre Installs

In [None]:
temp = installs.groupby(group,sort=False)['secs_to_next']\
               .apply(lambda x: round(x.mean()))\
               .rename('secs_to_next_install_mean')

In [None]:
temp.head()

ref_hash         window_nr
41863526108385   1            109.0
448610188195811  3             41.0
475635010681369  2              7.0
622102439689666  2              0.0
955103142671534  2              2.0
Name: secs_to_next_install_mean, dtype: float64

In [None]:
devices = devices.merge(temp, how = 'left', left_index=True, right_index=True)

In [None]:
devices.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,secs_to_next_mean,secs_since_last_arrival,auctions_total,auctions_last_hour,amount_auctions_in_weekend,is_last_weekend,amount_dif_src,amount_events,secs_since_last_event,wifi,timeToClick_mean,amount_dif_advertisers,amount_installs,secs_to_next_install_mean
ref_hash,window_nr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2564673204772915246,1,765.0,88625.0,216.0,,,False,3.0,,,,,,,
4441121667607578179,1,111.0,29052.0,1188.0,,640.0,True,5.0,,,,,,,
7721769811471055264,1,281.0,333.0,908.0,23.0,218.0,True,3.0,99.0,196475.0,False,,,,
6416039086842158968,1,,,,,,,,,,,,,,
1258642015983312729,1,8365.0,121224.0,12.0,,,False,1.0,,,,,,,


### Ultimo evento

In [None]:
temp = events.groupby(group,sort=False)\
             .tail(1)\
             .set_index(group)['event_id']\
             .rename('last_event')

In [None]:
temp.head()

ref_hash        window_nr
40621409780134  2            363
41863526108385  1              1
                2              2
                3              2
69039685746313  2            287
Name: last_event, dtype: int64

In [None]:
devices = devices.merge(temp, how = 'left', left_index=True, right_index=True)

In [None]:
devices.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,secs_to_next_mean,secs_since_last_arrival,auctions_total,auctions_last_hour,amount_auctions_in_weekend,is_last_weekend,amount_dif_src,amount_events,secs_since_last_event,wifi,timeToClick_mean,amount_dif_advertisers,amount_installs,secs_to_next_install_mean,last_event
ref_hash,window_nr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2564673204772915246,1,765.0,88625.0,216.0,,,False,3.0,,,,,,,,
4441121667607578179,1,111.0,29052.0,1188.0,,640.0,True,5.0,,,,,,,,
7721769811471055264,1,281.0,333.0,908.0,23.0,218.0,True,3.0,99.0,196475.0,False,,,,,2.0
6416039086842158968,1,,,,,,,,,,,,,,,
1258642015983312729,1,8365.0,121224.0,12.0,,,False,1.0,,,,,,,,


### Anteultimo evento

In [None]:
temp = events.groupby(group,sort=False)\
             .nth(-2)['event_id']\
             .rename('before_last_event')

In [None]:
temp.head()

ref_hash        window_nr
40621409780134  2            364
41863526108385  1              1
                2              2
                3              2
69039685746313  2            287
Name: before_last_event, dtype: int64

In [None]:
devices = devices.merge(temp, how = 'left', left_index=True, right_index=True)

In [None]:
devices.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,secs_to_next_mean,secs_since_last_arrival,auctions_total,auctions_last_hour,amount_auctions_in_weekend,is_last_weekend,amount_dif_src,amount_events,secs_since_last_event,wifi,timeToClick_mean,amount_dif_advertisers,amount_installs,secs_to_next_install_mean,last_event,before_last_event
ref_hash,window_nr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2564673204772915246,1,765.0,88625.0,216.0,,,False,3.0,,,,,,,,,
4441121667607578179,1,111.0,29052.0,1188.0,,640.0,True,5.0,,,,,,,,,
7721769811471055264,1,281.0,333.0,908.0,23.0,218.0,True,3.0,99.0,196475.0,False,,,,,2.0,2.0
6416039086842158968,1,,,,,,,,,,,,,,,,
1258642015983312729,1,8365.0,121224.0,12.0,,,False,1.0,,,,,,,,,


### Ante anteultimo evento

In [None]:
temp = events.groupby(group,sort=False)\
             .nth(-3)['event_id']\
             .rename('before_before_last_event')

In [None]:
temp.head()

ref_hash        window_nr
40621409780134  2              7
41863526108385  1              1
                2              2
                3              0
69039685746313  2            287
Name: before_before_last_event, dtype: int64

In [None]:
devices = devices.merge(temp, how = 'left', left_index=True, right_index=True)

In [None]:
devices.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,secs_to_next_mean,secs_since_last_arrival,auctions_total,auctions_last_hour,amount_auctions_in_weekend,is_last_weekend,amount_dif_src,amount_events,secs_since_last_event,wifi,timeToClick_mean,amount_dif_advertisers,amount_installs,secs_to_next_install_mean,last_event,before_last_event,before_before_last_event
ref_hash,window_nr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2564673204772915246,1,765.0,88625.0,216.0,,,False,3.0,,,,,,,,,,
4441121667607578179,1,111.0,29052.0,1188.0,,640.0,True,5.0,,,,,,,,,,
7721769811471055264,1,281.0,333.0,908.0,23.0,218.0,True,3.0,99.0,196475.0,False,,,,,2.0,2.0,2.0
6416039086842158968,1,,,,,,,,,,,,,,,,,
1258642015983312729,1,8365.0,121224.0,12.0,,,False,1.0,,,,,,,,,,


### El evento es día Martes o Miércoles ?
#### La cantidad de eventos se mantiene constante excepto por un pico producido el martes y el miércoles.

In [None]:
devices.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,secs_to_next_mean,secs_since_last_arrival,auctions_total,auctions_last_hour,amount_auctions_in_weekend,is_last_weekend,amount_dif_src,amount_events,secs_since_last_event,wifi,timeToClick_mean,amount_dif_advertisers,amount_installs,secs_to_next_install_mean,last_event,before_last_event,before_before_last_event
ref_hash,window_nr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2564673204772915246,1,765.0,88625.0,216.0,,,False,3.0,,,,,,,,,,
4441121667607578179,1,111.0,29052.0,1188.0,,640.0,True,5.0,,,,,,,,,,
7721769811471055264,1,281.0,333.0,908.0,23.0,218.0,True,3.0,99.0,196475.0,False,,,,,2.0,2.0,2.0
6416039086842158968,1,,,,,,,,,,,,,,,,,
1258642015983312729,1,8365.0,121224.0,12.0,,,False,1.0,,,,,,,,,,


In [None]:
def pond_M_X_ev(date):
    dn = date.day_name()
    return (dn == 'Tuesday') | (dn == 'Wednesday')

In [None]:
e1 = events
print("Cantidad ref_hash"+ str(e1['ref_hash'].count()))

dev = devices.copy()

e1['pond_M_X_ev'] = e1['created'].apply(pond_M_X_ev)

dev2 = e1[group + ['pond_M_X_ev']]
temp = dev2.set_index(group)
temp.head(2)

devices = devices.merge(temp, how = 'left', left_index=True, right_index=True)

#devices.drop('created',axis=1,inplace=True)

Cantidad ref_hash7737150


In [None]:
devices.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,secs_to_next_mean,secs_since_last_arrival,auctions_total,auctions_last_hour,amount_auctions_in_weekend,is_last_weekend,amount_dif_src,amount_events,secs_since_last_event,wifi,timeToClick_mean,amount_dif_advertisers,amount_installs,secs_to_next_install_mean,last_event,before_last_event,before_before_last_event,pond_M_X_ev
ref_hash,window_nr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
40621409780134,1,,,,,,,,,,,,,,,,,,
40621409780134,2,,,,,,,,9.0,32279.0,False,,,,,363.0,364.0,7.0,False
40621409780134,2,,,,,,,,9.0,32279.0,False,,,,,363.0,364.0,7.0,False
40621409780134,2,,,,,,,,9.0,32279.0,False,,,,,363.0,364.0,7.0,False
40621409780134,2,,,,,,,,9.0,32279.0,False,,,,,363.0,364.0,7.0,False


### Ponderacion por horario de evento

In [None]:
devices.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,secs_to_next_mean,secs_since_last_arrival,auctions_total,auctions_last_hour,amount_auctions_in_weekend,is_last_weekend,amount_dif_src,amount_events,secs_since_last_event,wifi,timeToClick_mean,amount_dif_advertisers,amount_installs,secs_to_next_install_mean,last_event,before_last_event,before_before_last_event,pond_M_X_ev
ref_hash,window_nr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
40621409780134,1,,,,,,,,,,,,,,,,,,
40621409780134,2,,,,,,,,9.0,32279.0,False,,,,,363.0,364.0,7.0,False
40621409780134,2,,,,,,,,9.0,32279.0,False,,,,,363.0,364.0,7.0,False
40621409780134,2,,,,,,,,9.0,32279.0,False,,,,,363.0,364.0,7.0,False
40621409780134,2,,,,,,,,9.0,32279.0,False,,,,,363.0,364.0,7.0,False


In [None]:
def pond_hour_ev(ts):
    date = ts.to_pydatetime()
    hs = date.hour
    r0 = range(0,4)
    r1 = range(4,10)
    r2 = range(11,13)
    r3 = range(13,24)
    if(hs in r0) |( hs in r2):
        return 0.5
    else:
        if(hs in r1):
            return 0.2
        else:
            if(hs in r3):
                return 0.9

In [None]:
temp = events[group + ['created']]
temp['pond_hour_ev'] = temp['created'].apply(pond_hour_ev)
temp = temp[group + ['pond_hour_ev']]
temp = temp.set_index(group)
temp.head(2)

devices = devices.merge(temp, how = 'left', left_index=True, right_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [None]:
devices.head()

---
## 4. Guardo Features 
Aplico arreglo: 
* Si el dispositivo no aparecio en la ventana, se elimina la fila
* Si el dispositivo no tiene valores en los eventos de count agrego count 0

In [None]:
#Chequear casos que queden con medias en null
devices = devices.dropna(how='all')

In [None]:
devices.isnull().sum()

In [None]:
devices.dtypes

In [None]:
#Fix means
means = [x for x in devices.columns if 'mean' in x]
for i in means:
    print("Filling ",i, " with window mean")
    devices[i] = devices.groupby('window_nr')[i].transform(lambda x: x.fillna(round(x.mean())))

#Fix Strings
obj = list(devices.dtypes[devices.dtypes == 'object'].index)
for i in obj:
    print("Filling ",i, " with False")
    devices[i] = devices[i].fillna(False)

print('Filling the rest with 0')
devices = devices.fillna(0)

In [None]:
devices.isnull().sum()

In [None]:
devices = devices.drop_duplicates()

In [None]:
devices.to_csv('training_set5.csv',header=True,index=True)

In [None]:
devices.head(20)