In [1]:
import pandas as pd
import matplotlib as mp
import numpy as np
from datetime import timedelta
import datetime as dt
import gc

%matplotlib inline

#Lee menos data si es testing
is_testing = True

# Cantidad minima de aparicion de equipos en un dataframe
min_devices = 10

# Identificador unico de equipos
device_uuid = ['ref_type','ref_hash']

# Ventanas de tiempo
windows = pd.DataFrame({
'window_nr':[1,2,3,4],
'n': 1,
'min_date': dt.datetime(2019,4,18),
'max_date': [dt.datetime(2019,4,21),dt.datetime(2019,4,22),dt.datetime(2019,4,23),dt.datetime(2019,4,24)]
})

Prediccion de Tiempos de Arribo
===============================

Se utilizan los distintos dataframes para armar un set de features que sirvan para predecir el tiempo hasta la aparicion de un dispositivo nuevamente. 

Este notebook tiene la siguiente estructura: 

1. Lectura de los Dataframes
2. Arreglo de los datos
3. Armado de Features
4. Armado de labels
5. Armado del set de pruebas
6. Training del Modelo Predictivo
7. Predicciones
8. Evaluacion 

------------------------------
## 1. Lectura de los Dataframes

Se realiza la carga de los dataframes en memoria para el armado del modelo predictivo. La lectura se hace optimizando los tipos de datos a fin de utilizar la menor cantidad de memoria posible.

In [2]:
#Optimizado para menos memoria
auction_dtypes = {
    'ref_type_id': np.int8,
    'device_id': np.int64,
    'source_id': np.int8
}

auctions = pd.read_csv('../data/auctions.csv.gzip',
                       compression = 'gzip',
                       dtype = auction_dtypes,
                       parse_dates = ['date'])
auctions.rename({'device_id':'ref_hash',
                 'ref_type_id':'ref_type'}, inplace=True, axis='columns')
auctions['n'] = 1

# Para hacer pruebas ocupando menos memoria se hace un sampleo aleatorio de la mitad del dataframe y se elimina el resto
if is_testing:
    auctions = auctions.sample(frac=.30)
    print("Is Testing, #records:", auctions['n'].sum())
else:
    print("Not Testing, #records:", auctions['n'].sum())


Is Testing, #records: 14222858


In [3]:
installs_dtypes = {
    'application_id': np.int32,
    'ref_type': np.int64,
    'ref_hash': np.int64, 
    #'click_hash':'category',
    'attributed': 'category',
    'implicit': 'category',
    #'device_countrycode': 'object', 
    'device_brand': 'object',
    'device_model': 'object', 
    'session_user_agent': 'object', 
    'user_agent': 'object', 
    'event_uuid':'object',
    'kind': 'object',
    'wifi': 'object', 
    'trans_id': 'object', 
    #'ip_address':'object', 
    'device_language': 'object'
}
install_cols = list(installs_dtypes.keys()) + ['created']
installs = pd.read_csv('../data/installs.csv.gzip', 
                       compression='gzip', 
                       usecols=install_cols,
                       dtype= installs_dtypes,
                      parse_dates=['created'])

installs['n'] = 1
print('#records:', installs['n'].sum())

#records: 481511


In [4]:
clicks_dtypes = {
    'advertiser_id': np.int8, 
    'action_id': np.int32, 
    'source_id': np.int8, 
    'latitude' : np.float64, 
    'longitude': np.float64, 
    'wifi_connection': np.bool, 
    'carrier_id': np.int16, 
    'trans_id': 'object',
    'os_minor':'object', 
    #'agent_device' : 'category', 
    #'os_major': 'category', 
    'specs_brand': 'category', 
    #'brand': np.int8,
    'timeToClick': np.float64, 
    #'touchX': np.float64, 
    #'touchY': np.float64, 
    'ref_type':np.int64, 
    'ref_hash':np.int64
}
clicks = pd.read_csv('../data/clicks.csv.gzip', 
                     compression='gzip',
                     low_memory = False,
                     parse_dates=['created'])
clicks['n'] = 1
print('#records:', clicks['n'].sum())

#records: 64296


In [5]:
events_dtypes = {
    'event_id': np.int64,
    'ref_type': np.int64,
    'ref_hash': np.int64,
    'application_id': np.int64,
    'attributed': np.bool,
    'device_os_version': 'object',
    'device_brand': 'object',
    'device_model': 'object',
    'device_city': 'object',
    'session_user_agent': 'object',
    'trans_id': 'category',
    'user_agent': 'object',
    'event_uuid': 'object',
    'carrier': 'object',
    'kind': 'object',
    'device_os': 'object',
    'wifi': np.bool,
    'connection_type': 'object',
    #'ip_address': np.int64,
    #'device_language': 'category'
}
events_cols = list(events_dtypes.keys()) + ['date']
events = pd.read_csv('../data/events.csv.gzip', 
                     compression='gzip',
                     dtype=events_dtypes,
                     usecols=events_cols,
                     parse_dates=['date'])
events['n'] = 1
# Para hacer pruebas ocupando menos memoria se hace un sampleo aleatorio de la mitad del dataframe y se elimina el resto
if is_testing:
    events = events.sample(frac=.30)
    print("Is Testing, #records:", events['n'].sum())
else:
    print("Not Testing, #records:", events['n'].sum())

Is Testing, #records: 2323374


In [6]:
# Labels a submitir con las predicciones

to_predict = pd.read_csv('../data/target_final_competencia_revamped.csv')
to_predict = to_predict.apply(lambda x: np.int64(x['ref_hash'][0:x['ref_hash'].find('_')]), axis='columns').drop_duplicates().to_frame()
to_predict.columns = ['ref_hash']

---
## 2. Arreglo de los Datos

### 2.1 Arreglo de datos de Auctions


In [7]:
#Elimino los registros con menos de un minimo de entradas, ya que no hay mucho que predecir en estos casos
orig_count = auctions['n'].sum()
auctions = auctions.groupby(device_uuid, sort = False).filter(lambda x: x['n'].sum() >= min_devices) 
last_count = auctions['n'].sum()
print('Eliminados:', orig_count-last_count)

Eliminados: 789832


In [8]:
auctions['n'].sum()

13433026

In [9]:
auctions['date'].min()

Timestamp('2019-04-18 00:00:00.231719')

In [10]:
auctions['date'].max()

Timestamp('2019-04-26 23:59:59.799169')

In [11]:
auctions.head()

Unnamed: 0,date,ref_hash,ref_type,source_id,n
6004034,2019-04-25 00:18:34.880414,3818348047161713811,1,3,1
36697206,2019-04-23 21:34:29.804415,8026666206742025170,7,0,1
9976728,2019-04-24 01:53:28.060788,7489173645677729124,1,1,1
45059496,2019-04-22 21:55:48.303558,6984568849717659916,1,1,1
1187643,2019-04-23 00:50:24.886514,9208922270656484956,1,1,1


### 2.2 Arreglo de datos de Installs

In [12]:
installs['kind'] = installs['kind'].str.replace(' ','_')
installs['kind'] = installs['kind'].str.replace('af_app_open ','af_app_opened')
installs['kind'] = installs['kind'].str.replace('af_app_opend','af_app_opened')
installs['kind'] = installs['kind'].str.lower()

In [13]:
installs.isnull().sum()

created                    0
application_id             0
ref_type                   0
ref_hash                   0
attributed                 0
implicit                   0
device_brand          205068
device_model           26892
session_user_agent     14839
user_agent            150743
event_uuid            378343
kind                  378343
wifi                  186682
trans_id              472578
device_language        27577
n                          0
dtype: int64

In [14]:
for i in ['device_brand','device_model','session_user_agent','user_agent','kind','wifi','device_language']:
    installs[i] = installs[i].fillna('unknown')
installs['device_brand'] = installs['device_brand'].astype('category')
installs['device_model'] = installs['device_model'].astype('category')
installs['session_user_agent'] = installs['session_user_agent'].astype('category')
installs['user_agent'] = installs['user_agent'].astype('category')
installs['kind'] = installs['kind'].astype('category')
installs['wifi'] = installs['wifi'].astype('category')
installs['device_language'] = installs['device_language'].astype('category')

In [15]:
installs.head().transpose()

Unnamed: 0,0,1,2,3,4
created,2019-04-24 06:23:29.495000,2019-04-24 02:06:01.032000,2019-04-20 10:15:36.274000,2019-04-20 21:56:47.151000,2019-04-20 22:40:41.239000
application_id,1,1,1,1,1
ref_type,1494519392962156891,1494519392962156891,1494519392962156891,1494519392962156891,1494519392962156891
ref_hash,4716708407362582887,7143568733100935872,5230323462636548010,5097163995161606833,6328027616411983332
attributed,False,False,False,False,False
implicit,True,False,True,True,False
device_brand,unknown,unknown,unknown,unknown,unknown
device_model,3.739127126472163e+17,7.80553892759877e+18,8.355495513718673e+18,2.3557720913769155e+18,6.156971151807135e+18
session_user_agent,adjust.com,adjust.com,adjust.com,adjust.com,adjust.com
user_agent,unknown,unknown,unknown,unknown,unknown


### 2.3 Arreglo de datos de Clicks

In [16]:
clicks.touchX = clicks.touchX.apply(lambda x: np.float64(x))
clicks.touchY = clicks.touchY.apply(lambda x: np.float64(x))

In [17]:
for i in ['carrier_id','os_minor', 'agent_device', 'brand', 'os_major']:
    clicks[i] = clicks[i].fillna('unknown').astype('category')

In [18]:
clicks.isnull().sum()

advertiser_id          0
action_id          64289
source_id              0
created                0
country_code           0
latitude               0
longitude              0
wifi_connection        0
carrier_id             0
trans_id               0
os_minor               0
agent_device           0
os_major               0
specs_brand            0
brand                  0
timeToClick        26118
touchX             20618
touchY             20618
ref_type               0
ref_hash               0
n                      0
dtype: int64

### 2.4 Arreglo de datos de Events

In [19]:
for i in ['device_os_version','device_brand','device_model','device_city','session_user_agent','user_agent','carrier','kind','device_os','connection_type']:
    events[i] = events[i].fillna('unknown').astype('category')

In [20]:
events.isnull().sum()

date                        0
event_id                    0
ref_type                    0
ref_hash                    0
application_id              0
attributed                  0
device_os_version           0
device_brand                0
device_model                0
device_city                 0
session_user_agent          0
trans_id              2312225
user_agent                  0
event_uuid               8996
carrier                     0
kind                        0
device_os                   0
wifi                        0
connection_type             0
n                           0
dtype: int64

---
Separado de las ventanas de tiempo
----------------------------------

### Separo las semanas de entrenamiento

Se pueden utilizar distintos metodos: 

- Se arman ventanas de 3 días para predecir 3 dias
- Se arman ventanas de n-1 dias para predecir n a n+2 dias.

Para el caso de estudio se utiliza la opcion de maximizar la cantidad de datos para predecir cada ventana.

Los datos pueden entonces estar en las siguientes ventanas y deben solo usarse para predecir la ventana correspondiente en los sets de training.

Las ventanas de los sets de entrenamiento son: 

1. 21 al 23
2. 22 al 24
3. 23 al 25
4. 24 al 26

Entonces las ventanas armadas serán:
1. 18 al 20
2. 18 al 21
3. 18 al 22
4. 18 al 23

---
## 3. Armado de Features

A continuacion se comienzan a extraer los distintos features que formaran el set de entrenamiento. 
Se cruza cada ventana con los labels que se desean predecir. 

Para entrenar el set debe decidirse como utilizar los datos de las distintas ventanas. 

- Una opcion será mezclar todos los datos pero hay que decidir que hacer con los equipos que aparecen mas de una vez. 

- La segunda opcion es entrenar 4 modelos distintos y verificar que haya una mejora en todos ellos.

In [35]:
devices = auctions[device_uuid].drop_duplicates()
devices['ref_hash'].count()

177162

### Armado de tiempo entre arribos

Se desea saber el tiempo promedio entre arribos de los dispositivos a las encuestas.

Se elminan usuarios que aparecen 1 sola vez y usuarios que aparecen mas de 150 veces

In [34]:
devices['ref_hash'].count()

177162

In [36]:
auctions.sort_values(by=(device_uuid+['date']), inplace=True)

In [37]:
auctions['next_date'] = auctions.groupby(device_uuid, as_index = False, sort=False)['date'].transform(lambda x: x.shift(-1))
auctions = auctions.loc[(~auctions['next_date'].isnull())]
auctions['secs_to_next'] = (auctions['next_date'] - auctions['date']).transform(lambda x: round(x.total_seconds()))

In [None]:
#filtered = auctions.loc[auctions['secs_to_next'] < 120]

In [38]:
temp = auctions.groupby(device_uuid, as_index='False')['secs_to_next'].mean().to_frame()
temp.columns = ['secs_to_next_mean']

devices.merge(temp, how='left', on=device_uuid)

Unnamed: 0,ref_type,ref_hash,secs_to_next_mean
0,1,41863526108385,551.888889
1,1,186034136943920,22656.444444
2,1,345999128501141,32477.909091
3,1,360710529886978,47766.400000
4,1,416301579449694,14969.416667
5,1,473668258229864,11876.250000
6,1,501790157110512,11284.875000
7,1,622102439689666,35178.700000
8,1,686608884458246,23044.482759
9,1,693609737448534,32815.611111


In [39]:
devices.head()

Unnamed: 0,ref_type,ref_hash
32062481,1,41863526108385
4859451,1,186034136943920
24453051,1,345999128501141
31219167,1,360710529886978
9100988,1,416301579449694


### Tiempo desde ultima aparicion

In [None]:
max_date = auctions['date'].max()

In [None]:
time = auctions.groupby(device_uuid).apply(lambda x: round((max_date - x['date'].max()).total_seconds())).to_frame()
time.columns = ['secs_since_last_arrival']

In [None]:
devices = devices.merge(time, how='outer', left_index=True, right_index=True)

In [None]:
devices.head()

### Cantidad de apariciones en encuestas 

In [None]:
amount_auctions = auctions.groupby(grp)['n'].count().to_frame()
amount_auctions.columns = ['auctions_total']
devices = devices.merge(amount_auctions,how = 'outer', left_index=True, right_index=True)
devices.head()

In [None]:
amount_last_auctions = auctions.groupby(grp).apply(lambda x: x.loc[x['date'] > (max_date - timedelta(hours=1)),'n'].count()).to_frame()
amount_last_auctions.columns = ['auctions_last_hour']
devices = devices.merge(amount_last_auctions, how='outer', left_index=True, right_index=True)
devices.head()

### Secuencia de ultimos 5 eventos del dispositivo

---
## 4. Armado de Labels 

### 4.1. Prediccion de Installs

In [None]:
all_ref_hash = devices.reset_index()['ref_hash'].drop_duplicates().to_frame()
all_ref_hash.columns = ['ref_hash']


In [None]:
all_ref_hash.append(to_predict).drop_duplicates().count()

In [None]:
window = 3
j=1
installs_training = pd.DataFrame()

for i in range(21,25): #[19, 20, 21, 22, 23, 24]
    print("Ventana: ", j)
    temp = pd.DataFrame()
    
    temp[['ref_hash','date_install']] = installs.loc[(installs['created'].dt.day >= i) & (installs['created'].dt.day < (i+window))].groupby('ref_hash', as_index=False)['created'].min()
    temp['window_nr'] = j
    temp['window_date_start'] = dt.datetime(2019,4,i)
    temp['secs_to_install'] = (temp['date_install']-temp['window_date_start']).transform(lambda x: x.total_seconds())
    print("  Encontrados: ", len(temp))
    installs_training = installs_training.append(temp,sort=True)
    
    #Agrego los que no se encontraron en esta ventana pero si estan en installs (aparecen en alguna)
    temp = all_ref_hash.merge(installs_training.loc[installs_training['window_nr'] == j], how='left',on='ref_hash')
    temp = temp.loc[temp['window_nr'].isnull()]
    temp['window_nr'] = j
    temp['window_date_start'] = dt.datetime(2019,4,i)
    temp['date_install'] = dt.datetime(2019,4,i+window)
    temp['secs_to_install'] = (temp['date_install']-temp['window_date_start']).transform(lambda x: x.total_seconds())
    print("  Agregados sin installs en esta ventana: ", len(temp))
    installs_training = installs_training.append(temp[['ref_hash','window_nr','window_date_start','date_install','secs_to_install']],sort=True)
    
    #Agrego los que no se encontraron en install pero se deben predecir conversiones
    temp = to_predict.merge(installs_training.loc[installs_training['window_nr'] == j], how='left', on='ref_hash')
    temp = temp.loc[temp['window_nr'].isnull()]
    print("  Agregados sin installs pero en lista a predecir: ", len(temp))
    temp['window_nr'] = j
    temp['window_date_start'] = dt.datetime(2019,4,i)
    temp['date_install'] = dt.datetime(2019,4,i+window)
    temp['secs_to_install'] = (temp['date_install']-temp['window_date_start']).transform(lambda x: x.total_seconds())
    installs_training = installs_training.append(temp,sort=True)
    
    j = j+1

sorted_installs = installs_training.sort_values(['ref_hash','window_nr'])[['ref_hash','window_nr','window_date_start','date_install','secs_to_install']]

In [None]:
sorted_installs['window_nr'].value_counts()

In [None]:
sorted_installs.head(20)