In [1]:
import pandas as pd
import matplotlib as mp
import numpy as np
from datetime import timedelta
import datetime as dt

%matplotlib inline

# Directory that holds the input data files
root_dir = '../data/'

# When True, the large frames are down-sampled after loading to use less memory
is_testing = False

# Minimum number of rows a device needs within a window to be kept
min_devices = 5

# Column(s) that uniquely identify a device
device_uuid = ['ref_hash']

# Time windows: one row per window with its first instant and its number
windows = pd.DataFrame(
    {
        'begin_date': [dt.datetime(2019, 4, day) for day in (18, 21, 24)],
        'window_nr': [1, 2, 3],
    }
)

# Armado de Features

Se utilizan 3 ventanas de tiempo y se procesan los dataframes para obtener features de esas 3 ventanas por separado. 

Como resultado se obtiene un dataset de features más grande cuyos datos no se solapan en el tiempo, por lo que son válidos para la predicción.

Se realiza la lectura y limpiado de los dataframes principales, con los mismos se realiza el filtrado y armado de los features. 

---
## 1. Lectura de DataFrames

In [2]:
# Read only the needed columns, with narrow dtypes, to keep memory usage down
auction_dtypes = {
    'device_id': np.int64,
    'source_id': np.int8
}

auctions = pd.read_csv(root_dir + 'auctions.csv.gzip',
                       compression = 'gzip',
                       dtype = auction_dtypes,
                       usecols=list(auction_dtypes.keys()) + ['date'],
                       parse_dates = ['date'])
# Use the same key / timestamp column names as the installs frame
auctions = auctions.rename(columns={'device_id': 'ref_hash', 'date': 'created'})
# Column of ones, summed later to count rows
auctions['n'] = 1

# In testing mode keep a random 30% of the rows to use less memory.
# The sample is seeded so repeated test runs see the same rows.
if is_testing:
    auctions = auctions.sample(frac=.30, random_state=42)
    print("Is Testing, #records:", auctions['n'].sum())
else:
    print("Not Testing, #records:", auctions['n'].sum())

Not Testing, #records: 47409528


In [3]:
# Column -> dtype for the installs file; commented-out entries are columns left unread
installs_dtypes = {
    'application_id': 'int32',
    'ref_hash': 'int64',
    #'click_hash':'category',
    'attributed': 'category',
    'implicit': 'category',
    #'device_countrycode': 'object',
    'device_brand': 'object',
    'device_model': 'object',
    'session_user_agent': 'object',
    'user_agent': 'object',
    'event_uuid': 'object',
    'kind': 'object',
    'wifi': 'object',
    'trans_id': 'object',
    #'ip_address':'object',
    'device_language': 'object',
}
# Typed columns plus the timestamp, which is parsed as a date instead of typed
install_cols = [*installs_dtypes, 'created']
installs = pd.read_csv(
    root_dir + 'installs.csv.gzip',
    compression='gzip',
    usecols=install_cols,
    dtype=installs_dtypes,
    parse_dates=['created'],
)

# Column of ones, summed to count rows
installs['n'] = 1
print('#records:', installs['n'].sum())

#records: 481511


In [24]:
# Column -> dtype for the clicks file.
# `np.float` / `np.bool` were deprecated aliases of the builtins and no longer exist
# in recent NumPy releases, so explicit types are used instead.
clicks_dtypes = {
    'advertiser_id': np.int64,
    'action_id': np.float64,
    'source_id': np.int8,
    'latitude' : np.float64,
    'longitude': np.float64,
    'wifi_connection': bool,
    'carrier_id': 'object',
    'trans_id': 'object',
    'os_minor':'object',
    'agent_device' : 'object',
    'os_major': 'object',
    'specs_brand': 'category',
    'brand': 'object',
    'timeToClick': np.float64,
    'touchX': 'object',
    'touchY': 'object',
    'ref_hash':np.int64,
    'created' : 'object'
}
# 'created' is already a key of clicks_dtypes, so it is not appended to usecols again
clicks = pd.read_csv(root_dir + 'clicks.csv.gzip',
                     compression='gzip',
                     usecols=list(clicks_dtypes.keys()),
                     dtype=clicks_dtypes,
                     parse_dates=['created'])
# Column of ones, summed to count rows
clicks['n'] = 1
print('#records:', clicks['n'].sum())

#records: 64296


In [5]:
# Column -> dtype for the events file; commented-out entries are columns left unread.
# The builtin `bool` replaces `np.bool`, a deprecated alias that no longer exists
# in recent NumPy releases.
events_dtypes = {
    'event_id': np.int64,
    'ref_hash': np.int64,
    'application_id': np.int64,
    'attributed': bool,
    'device_os_version': 'object',
    'device_brand': 'object',
    'device_model': 'object',
    'device_city': 'object',
    'session_user_agent': 'object',
    'trans_id': 'category',
    'user_agent': 'object',
    'event_uuid': 'object',
    'carrier': 'object',
    'kind': 'object',
    'device_os': 'object',
    'wifi': bool,
    'connection_type': 'object',
    #'ip_address': np.int64,
    #'device_language': 'category'
}
events_cols = list(events_dtypes.keys()) + ['date']
events = pd.read_csv(root_dir + 'events.csv.gzip',
                     compression='gzip',
                     dtype=events_dtypes,
                     usecols=events_cols,
                     parse_dates=['date'])

# Use the same timestamp column name as the installs frame
events = events.rename(columns={'date': 'created'})
# Column of ones, summed to count rows
events['n'] = 1
# In testing mode keep a random 30% of the rows to use less memory.
# The sample is seeded so repeated test runs see the same rows.
if is_testing:
    events = events.sample(frac=.30, random_state=42)
    print("Is Testing, #records:", events['n'].sum())
else:
    print("Not Testing, #records:", events['n'].sum())

Not Testing, #records: 7744581


In [6]:
# Ids to submit predictions for. Each 'ref_hash' value is reduced to the text before
# its first underscore and cast to int64; duplicates are then dropped.
# A vectorized split is used instead of slicing up to `find('_')`, which returns -1
# when there is no underscore and would silently cut off the last digit.
to_predict = pd.read_csv(root_dir + 'target_competencia_ids.csv')
to_predict = (
    to_predict['ref_hash']
    .str.split('_', n=1).str[0]
    .astype(np.int64)
    .drop_duplicates()
    .to_frame('ref_hash')
)

In [7]:
# Load the hashes file; its 'ref_hash' column is used below to filter every frame
unique_hashes = pd.read_csv(root_dir+'unique_hashes.csv')

---
## 2. Arreglo de los datos

### Separo las semanas de entrenamiento

Se utiliza el siguiente metodo: 

1. Ventana del 18 al 20 inclusive (1) -> Predice valores entre el 21 y 23 (2)
2. Ventana del 21 al 23 inclusive (2) -> Predice valores entre el 24 y 26 (3)

In [8]:
# Keep only devices listed in unique_hashes, sort by timestamp (merge_asof needs a
# sorted key) and attach the window_nr of the matching row of `windows`
auctions = pd.merge_asof(
    auctions.loc[auctions['ref_hash'].isin(unique_hashes['ref_hash'])].sort_values(by='created'),
    windows,
    left_on='created',
    right_on='begin_date',
).drop(columns='begin_date')

In [None]:
# Keep only devices listed in unique_hashes, sort by timestamp (merge_asof needs a
# sorted key) and attach the window_nr of the matching row of `windows`
installs = pd.merge_asof(
    installs.loc[installs['ref_hash'].isin(unique_hashes['ref_hash'])].sort_values(by='created'),
    windows,
    left_on='created',
    right_on='begin_date',
).drop(columns='begin_date')

In [25]:
# Keep only devices listed in unique_hashes, sort by timestamp (merge_asof needs a
# sorted key) and attach the window_nr of the matching row of `windows`
clicks = pd.merge_asof(
    clicks.loc[clicks['ref_hash'].isin(unique_hashes['ref_hash'])].sort_values(by='created'),
    windows,
    left_on='created',
    right_on='begin_date',
).drop(columns='begin_date')

In [None]:
# Keep only devices listed in unique_hashes, sort by timestamp (merge_asof needs a
# sorted key) and attach the window_nr of the matching row of `windows`
events = pd.merge_asof(
    events.loc[events['ref_hash'].isin(unique_hashes['ref_hash'])].sort_values(by='created'),
    windows,
    left_on='created',
    right_on='begin_date',
).drop(columns='begin_date')

### 2.1 Arreglo de Auctions

In [9]:
# Drop (device, window) groups with fewer than min_devices rows: there is little to
# predict for them. A group-wise transform gives every row its group's total, so the
# filter is a single vectorized mask instead of one Python call per group.
group = ['ref_hash','window_nr']
orig_count = auctions['n'].sum()
auctions = auctions.loc[
    auctions.groupby(group, sort=False)['n'].transform('sum') >= min_devices
].copy()
last_count = auctions['n'].sum()
print('Eliminados:', orig_count-last_count)

Eliminados: 736875


In [10]:
# Order rows by device, window and then timestamp
auctions = auctions.sort_values(by=group + ['created'])

In [11]:
# Timestamp of the following row within the same (device, window) group
auctions['next_date'] = auctions.groupby(group, sort=False)['created'].shift(-1)
# The last row of each group has no successor; drop it. The copy makes the result an
# independent frame so the column assignment below does not write into a slice.
auctions = auctions.loc[auctions['next_date'].notnull()].copy()
# Whole seconds until the next row, computed on the full column at once
auctions['secs_to_next'] = (
    (auctions['next_date'] - auctions['created']).dt.total_seconds().round().astype('int64')
)

In [12]:
# Preview the first rows, including the next_date / secs_to_next columns
auctions.head()

Unnamed: 0,created,ref_hash,source_id,n,window_nr,next_date,secs_to_next
9146132,2019-04-19 19:40:28.465866,41863526108385,8,1,1,2019-04-20 02:52:26.892880,25918
11162626,2019-04-20 02:52:26.892880,41863526108385,3,1,1,2019-04-20 02:59:02.509230,396
11194917,2019-04-20 02:59:02.509230,41863526108385,3,1,1,2019-04-20 03:06:01.675788,419
11229465,2019-04-20 03:06:01.675788,41863526108385,3,1,1,2019-04-20 03:08:57.388160,176
11244486,2019-04-20 03:08:57.388160,41863526108385,3,1,1,2019-04-20 03:11:26.463903,149


### 2.2 Arreglo de datos de Installs

In [13]:
# Normalize the 'kind' labels.
# Order matters: the trailing-space variant 'af_app_open ' has to be fixed BEFORE
# spaces are turned into underscores, otherwise that pattern can never match.
# Lower-casing first lets the typo fixes also catch upper/mixed-case spellings.
installs['kind'] = (
    installs['kind']
    .str.lower()
    .str.replace('af_app_open ', 'af_app_opened', regex=False)
    .str.replace(' ', '_', regex=False)
    .str.replace('af_app_opend', 'af_app_opened', regex=False)
)

In [14]:
# Missing-value count per column of installs
installs.isnull().sum()

created                    0
application_id             0
ref_hash                   0
attributed                 0
implicit                   0
device_brand          204813
device_model           26871
session_user_agent     14828
user_agent            150111
event_uuid            377704
kind                  377704
wifi                  186016
trans_id              472140
device_language        27552
n                          0
window_nr                  0
dtype: int64

In [15]:
# Replace missing values with the 'unknown' label and store each column as categorical
for col in ['device_brand', 'device_model', 'session_user_agent', 'user_agent',
            'kind', 'wifi', 'device_language']:
    installs[col] = installs[col].fillna('unknown').astype('category')

In [16]:
# First rows, transposed so that each column is displayed as a row
installs.head().transpose()

Unnamed: 0,0,1,2,3,4
created,2019-04-18 00:00:01.560000,2019-04-18 00:00:01.851000,2019-04-18 00:00:05.152000,2019-04-18 00:00:05.589000,2019-04-18 00:00:06.795000
application_id,70,70,65,27,339
ref_hash,4432995619177048534,5904733559638204455,896373747754111825,3399210824535017892,1541425881979513687
attributed,False,False,False,False,False
implicit,False,False,True,False,False
device_brand,unknown,unknown,3.083058605577787e+17,unknown,unknown
device_model,unknown,unknown,5.274185305862703e+18,6.794880020077885e+18,6.794880020077885e+18
session_user_agent,Apsalar-Postback,Apsalar-Postback,http-kit/2.0,http-kit/2.0,http-kit/2.0
user_agent,unknown,unknown,Dalvik/2.1.0 (Linux; U; Android 9; SM-G9650 Bu...,trivago/216 CFNetwork/978.0.7 Darwin/18.5.0,TikTok/109005 CFNetwork/758.5.3 Darwin/15.6.0
event_uuid,,,8c8af5e3-96e7-4a49-9f17-cafa7f300f2c,,


### 2.3. Arreglo de datos de Clicks

In [26]:
# touchX / touchY are read as object columns; convert them to float64 in one
# vectorized cast instead of calling np.float64 on every element
clicks['touchX'] = clicks['touchX'].astype(np.float64)
clicks['touchY'] = clicks['touchY'].astype(np.float64)
# Drop any timezone information from the timestamp
clicks['created'] = clicks['created'].dt.tz_localize(None)

In [27]:
# Replace missing values with the 'unknown' label and store each column as categorical
for col in ('carrier_id', 'os_minor', 'agent_device', 'brand', 'os_major'):
    filled = clicks[col].fillna('unknown')
    clicks[col] = filled.astype('category')

In [28]:
# Missing-value count per column of clicks
clicks.isnull().sum()

advertiser_id          0
action_id          63835
source_id              0
created                0
latitude               0
longitude              0
wifi_connection        0
carrier_id             0
trans_id               0
os_minor               0
agent_device           0
os_major               0
specs_brand            0
brand                  0
timeToClick        25885
touchX             20428
touchY             20428
ref_hash               0
n                      0
window_nr          19730
dtype: int64

### 2.4 Arreglo de datos de Events

In [29]:
# Replace missing values with the 'unknown' label and store each column as categorical
for col in ('device_os_version', 'device_brand', 'device_model', 'device_city',
            'session_user_agent', 'user_agent', 'carrier', 'kind', 'device_os',
            'connection_type'):
    filled = events[col].fillna('unknown')
    events[col] = filled.astype('category')

In [30]:
# Missing-value count per column of events
events.isnull().sum()

created                     0
event_id                    0
ref_hash                    0
application_id              0
attributed                  0
device_os_version           0
device_brand                0
device_model                0
device_city                 0
session_user_agent          0
trans_id              7700153
user_agent                  0
event_uuid              29631
carrier                     0
kind                        0
device_os                   0
wifi                        0
connection_type             0
n                           0
window_nr                   0
dtype: int64

---
## 3. Armado de Features

A continuacion se comienzan a extraer los distintos features que formaran el set de entrenamiento. 
Se cruza cada ventana con los labels que se desean predecir. 

Para entrenar el set debe decidirse como utilizar los datos de las distintas ventanas. 

Se tomaran los datos en 2 ventanas diferentes y se validaran con dos ventanas diferentes, de esta forma no deberia haber problemas de solapamiento y se maximiza la cantidad de datos.

Para el set final puede armarse un set de labels con los valores de los ultimos 3 dias.

### Armado de tiempo entre arribos

Se desea saber el tiempo promedio entre arribos de los dispositivos a las subastas (auctions).

In [None]:
#filtered = auctions.loc[auctions['secs_to_next'] < 120]

In [68]:
# One row per (device, window): repeat the device list once for every window number.
# `assign` returns a new frame each time, so unique_hashes itself is not modified
# (a plain `temp = unique_hashes` would only alias it), and `pd.concat` stacks the
# pieces in one step while keeping each piece's original index.
devices = pd.concat(
    [unique_hashes.assign(window_nr=nr) for nr in windows['window_nr'].tolist()]
)

In [73]:
# Number of rows of devices per window number
devices['window_nr'].value_counts()

3    662110
2    662110
1    662110
Name: window_nr, dtype: int64

### Tiempo desde ultima aparicion

In [32]:
# One date per window number (original line did not parse: missing comma between the
# dates, `window_nr = [...]` inside the dict literal and an unclosed parenthesis)
max_date = pd.DataFrame({
    'max_date': [dt.datetime(2019,4,21), dt.datetime(2019,4,24), dt.datetime(2019,4,27)],
    'window_nr': [1,2,3]
})

# For every row, the latest 'created' within its (ref_hash, window_nr) group
auctions.groupby(group, as_index=False, sort=False)['created'].transform(lambda x: x.max())

Unnamed: 0,created
9146132,2019-04-20 04:29:54.298498
11162626,2019-04-20 04:29:54.298498
11194917,2019-04-20 04:29:54.298498
11229465,2019-04-20 04:29:54.298498
11244486,2019-04-20 04:29:54.298498
11256689,2019-04-20 04:29:54.298498
11257926,2019-04-20 04:29:54.298498
11261785,2019-04-20 04:29:54.298498
11264301,2019-04-20 04:29:54.298498
11274776,2019-04-20 04:29:54.298498


In [75]:
# For each (ref_hash, window_nr): whole seconds between the device's latest auction in
# that window and the window's w_max_date.
# The smallest "seconds to w_max_date" over a group belongs to its latest row, so the
# latest timestamp per group is taken first and the subtraction is done once per group
# instead of once per auction row.
max_dates = pd.DataFrame({'w_max_date': [dt.datetime(2019,4,21),dt.datetime(2019,4,24),dt.datetime(2019,4,27)], 'window_nr' : [1,2,3]})
time = auctions.groupby(group)['created'].max()\
               .reset_index()\
               .merge(max_dates, on='window_nr', how='left')\
               .assign(secs_since_last_arrival=lambda d: (d['w_max_date'] - d['created']).dt.total_seconds().round().astype('int64'))\
               .set_index(group)['secs_since_last_arrival']

In [76]:
# Preview the first values of the per-(ref_hash, window_nr) result
time.head()

ref_hash         window_nr
41863526108385   1             70206
135153013040192  1             70659
161514654074162  1            248758
186034136943920  1             74393
                 2             71628
Name: secs_since_last_arrival, dtype: int64

In [None]:
# Per group: whole seconds between max_date and the group's latest 'date'.
# NOTE(review): `train_auctions` is not defined anywhere above in this notebook, and the
# auctions frame renames 'date' to 'created' on load. This cell looks like a leftover
# from an earlier version — confirm before running.
# NOTE(review): `max_date` is used here as if it were a single timestamp — verify what
# it holds at this point.
time = train_auctions.groupby(group).apply(lambda x: round((max_date - x['date'].max()).total_seconds())).to_frame()
# Name the single resulting column
time.columns = ['secs_since_last_arrival']

In [None]:
# Attach secs_since_last_arrival to every (device, window) row.
# devices keeps ref_hash / window_nr as columns while time is indexed by them, so the
# index of time is turned back into columns and the join is done on those keys
# (an index-on-index join would compare devices' row numbers with time's keys).
devices = devices.merge(time.reset_index(), how='left', on=group)

In [None]:
# Preview devices after the merge
devices.head()

### Cantidad de apariciones en subastas (auctions)

In [None]:
# Count of 'n' values per device_uuid group in train_auctions.
# NOTE(review): `train_auctions` is not defined in the visible part of the notebook —
# confirm where it comes from.
amount_auctions = train_auctions.groupby(device_uuid)['n'].count().to_frame()
amount_auctions.columns = ['auctions_total']
# Index-on-index left join.
# NOTE(review): this assumes devices is indexed by the device id; the devices frame
# built earlier keeps ref_hash as a column — verify the join keys.
devices = devices.merge(amount_auctions,how = 'left', left_index=True, right_index=True)
devices.head()

In [None]:
# Per device_uuid group: count of rows whose 'date' is later than max_date minus one hour.
# NOTE(review): `train_auctions` is not defined in the visible part of the notebook, the
# auctions frame renames 'date' to 'created' on load, and `max_date` is used as if it
# were a single timestamp — confirm all three before running.
amount_last_auctions = train_auctions.groupby(device_uuid).apply(lambda x: x.loc[x['date'] > (max_date - timedelta(hours=1)),'n'].count()).to_frame()
amount_last_auctions.columns = ['auctions_last_hour']
# Index-on-index left join.
# NOTE(review): this assumes devices is indexed by the device id — verify the join keys.
devices = devices.merge(amount_last_auctions, how='left', left_index=True, right_index=True)
devices.head()