In [1]:
import pandas as pd
import matplotlib as mp
import numpy as np
from datetime import timedelta
import datetime as dt

%matplotlib inline

In [2]:
#Optimizado para menos memoria
auction_dtypes = {
    'device_id': np.int64,
    'ref_type_id': np.int8,
    'source_id': np.int8
}

auctions = pd.read_csv('../data/auctions.csv.gzip',
                       compression = 'gzip',
                       dtype = auction_dtypes,
                       parse_dates = ['date'])
auctions['n'] = 1

windows = pd.DataFrame({
'window_nr':[1,2,3,4],
'n': 1,
'min_date': dt.datetime(2019,4,18),
'max_date': [dt.datetime(2019,4,21),dt.datetime(2019,4,22),dt.datetime(2019,4,23),dt.datetime(2019,4,24)]
})

In [3]:
auctions['date'].min()

Timestamp('2019-04-18 00:00:00.015050')

In [4]:
auctions['date'].max()

Timestamp('2019-04-26 23:59:59.969518')

---
Separado de las ventanas de tiempo
----------------------------------

### Separo las semanas de entrenamiento

Se pueden utilizar distintos metodos: 

- Se arman ventanas de 3 días para predecir 3 dias
- Se arman ventanas de n-1 dias para predecir n a n+2 dias.

Para el caso de estudio se utiliza la opcion de maximizar la cantidad de datos para predecir cada ventana.

Los datos pueden entonces estar en las siguientes ventanas y deben solo usarse para predecir la ventana correspondiente en los sets de training.

Las ventanas de los sets de entrenamiento son: 

1. 21 al 23
2. 22 al 24
3. 23 al 25
4. 24 al 26

Entonces las ventanas armadas serán:
1. 18 al 20
2. 18 al 21
3. 18 al 22
4. 18 al 23

---
Armado del set de entrenamiento
-------------------------------

A continuacion se comienzan a extraer los distintos features que formaran el set de entrenamiento. 
Se cruza cada ventana con los labels que se desean predecir. 

Para entrenar el set debe decidirse como utilizar los datos de las distintas ventanas. 

- Una opcion será mezclar todos los datos pero hay que decidir que hacer con los equipos que aparecen mas de una vez. 

- La segunda opcion es entrenar 4 modelos distintos y verificar que haya una mejora en todos ellos.

### Armado de tiempo entre arribos

Se desea saber el tiempo promedio entre arribos de los dispositivos a las encuestas.

Se elminan usuarios que aparecen 1 sola vez y usuarios que aparecen mas de 150 veces

In [5]:
grp = ['device_id']
#Ordeno por grp y fecha
auctions.sort_values(by=['device_id', 'date'], inplace=True)

In [6]:
min_value = 2
max_value = 150
auctions = auctions.groupby(grp, sort=False).filter(lambda data: (min_value < len(data) < max_value))

In [7]:
auctions['next_date'] = auctions.groupby(grp, as_index = False, sort=False)['date']\
                                              .transform(lambda x: x.shift(-1))
auctions = auctions.loc[(~auctions['next_date'].isnull())]
auctions['secs_to_next'] = (auctions['next_date'] - auctions['date'])\
                                        .transform(lambda x: round(x.total_seconds()))

In [8]:
filtered = auctions.loc[auctions['secs_to_next'] < 120]

In [9]:
devices = filtered.groupby(grp, as_index='False')['secs_to_next'].mean().to_frame()
devices.columns = ['secs_to_next_mean']

In [10]:
devices.head()

Unnamed: 0_level_0,secs_to_next_mean
device_id,Unnamed: 1_level_1
41863526108385,37.136364
69039685746313,104.0
135153013040192,53.0
161514654074162,24.6
186034136943920,32.40625


### Tiempo desde ultima aparicion

In [11]:
max_date = auctions['date'].max()

In [12]:
time = auctions.groupby(grp).apply(lambda x: round((max_date - x['date'].max()).total_seconds())).to_frame()
time.columns = ['secs_since_last_arrival']

In [13]:
devices = devices.merge(time, how='outer', left_index=True, right_index=True)

In [14]:
devices.head()

Unnamed: 0_level_0,secs_to_next_mean,secs_since_last_arrival
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1
41863526108385,37.136364,588605
69039685746313,104.0,132716
135153013040192,53.0,589058
161514654074162,24.6,767157
186034136943920,32.40625,296782


### Cantidad de apariciones en encuestas 

In [15]:
amount_auctions = auctions.groupby(grp)['n'].count().to_frame()
amount_auctions.columns = ['auctions_total']
devices = devices.merge(amount_auctions,how = 'outer', left_index=True, right_index=True)
devices.head()

Unnamed: 0_level_0,secs_to_next_mean,secs_since_last_arrival,auctions_total
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
41863526108385,37.136364,588605,34
69039685746313,104.0,132716,3
135153013040192,53.0,589058,7
161514654074162,24.6,767157,5
186034136943920,32.40625,296782,65


In [16]:
amount_last_auctions = auctions.groupby(grp).apply(lambda x: x.loc[x['date'] > (max_date - timedelta(hours=1)),'n'].count()).to_frame()
amount_last_auctions.columns = ['auctions_last_hour']
devices = devices.merge(amount_last_auctions, how='outer', left_index=True, right_index=True)
devices.head()

Unnamed: 0_level_0,secs_to_next_mean,secs_since_last_arrival,auctions_total,auctions_last_hour
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
41863526108385,37.136364,588605,34,0
69039685746313,104.0,132716,3,0
135153013040192,53.0,589058,7,0
161514654074162,24.6,767157,5,0
186034136943920,32.40625,296782,65,0


### Secuencia de ultimos 5 eventos del dispositivo

---
Prediccion de Installs
----------------------

Se utiliza el archivo prearmado y se completa los que no aparecen 

In [2]:
installs_dtypes = {
    'application_id': np.int32,
    'ref_type': np.int64,
    'ref_hash': np.int64, 
    'click_hash':'category',
    'attributed': 'category',
    'implicit': 'category',
    #'device_countrycode': 'object', 
    'device_brand': 'category',
    'device_model': 'category', 
    'session_user_agent': 'category', 
    'user_agent': 'category', 
    'event_uuid':'object',
    'kind': 'category',
    'wifi': 'category', 
    'trans_id': 'object', 
    #'ip_address':'object', 
    'device_language': 'category'
}
install_cols = list(installs_dtypes.keys()) + ['created']
installs = pd.read_csv('../data/installs.csv.gzip', 
                       compression='gzip', 
                       usecols=install_cols,
                       dtype= installs_dtypes,
                      parse_dates=['created'])

to_predict = pd.read_csv('../data/target_final_competencia_revamped.csv')
to_predict = to_predict.apply(lambda x: np.int64(x['ref_hash'][0:x['ref_hash'].find('_')]), axis='columns').drop_duplicates().to_frame()
to_predict.columns = ['ref_hash']

In [3]:
all_ref_hash = devices.reset_index()['device_id'].drop_duplicates().to_frame()
all_ref_hash.columns = ['ref_hash']

In [4]:
all_ref_hash.append(to_predict).drop_duplicates().count()

ref_hash    396509
dtype: int64

In [5]:
window = 3
j=1
installs_training = pd.DataFrame()

for i in range(21,25): #[19, 20, 21, 22, 23, 24]
    print("Ventana: ", j)
    temp = pd.DataFrame()
    
    temp[['ref_hash','date_install']] = installs.loc[(installs['created'].dt.day >= i) & (installs['created'].dt.day < (i+window))].groupby('ref_hash', as_index=False)['created'].min()
    temp['window_nr'] = j
    temp['window_date_start'] = dt.datetime(2019,4,i)
    temp['secs_to_install'] = (temp['date_install']-temp['window_date_start']).transform(lambda x: x.total_seconds())
    print("  Encontrados: ", len(temp))
    installs_training = installs_training.append(temp,sort=True)
    
    #Agrego los que no se encontraron en esta ventana pero si estan en installs (aparecen en alguna)
    temp = all_ref_hash.merge(installs_training.loc[installs_training['window_nr'] == j], how='left',on='ref_hash')
    temp = temp.loc[temp['window_nr'].isnull()]
    temp['window_nr'] = j
    temp['window_date_start'] = dt.datetime(2019,4,i)
    temp['date_install'] = dt.datetime(2019,4,i+window)
    temp['secs_to_install'] = (temp['date_install']-temp['window_date_start']).transform(lambda x: x.total_seconds())
    print("  Agregados sin installs en esta ventana: ", len(temp))
    installs_training = installs_training.append(temp[['ref_hash','window_nr','window_date_start','date_install','secs_to_install']],sort=True)
    
    #Agrego los que no se encontraron en install pero se deben predecir conversiones
    temp = to_predict.merge(installs_training.loc[installs_training['window_nr'] == j], how='left', on='ref_hash')
    temp = temp.loc[temp['window_nr'].isnull()]
    print("  Agregados sin installs pero en lista a predecir: ", len(temp))
    temp['window_nr'] = j
    temp['window_date_start'] = dt.datetime(2019,4,i)
    temp['date_install'] = dt.datetime(2019,4,i+window)
    temp['secs_to_install'] = (temp['date_install']-temp['window_date_start']).transform(lambda x: x.total_seconds())
    installs_training = installs_training.append(temp,sort=True)
    
    j = j+1

sorted_installs = installs_training.sort_values(['ref_hash','window_nr'])[['ref_hash','window_nr','window_date_start','date_install','secs_to_install']]

Ventana:  1
  Encontrados:  133278
  Agregados sin installs en esta ventana:  260287
  Agregados sin installs pero en lista a predecir:  2944
Ventana:  2
  Encontrados:  132834
  Agregados sin installs en esta ventana:  260731
  Agregados sin installs pero en lista a predecir:  2944
Ventana:  3
  Encontrados:  133885
  Agregados sin installs en esta ventana:  259680
  Agregados sin installs pero en lista a predecir:  2944
Ventana:  4
  Encontrados:  138317
  Agregados sin installs en esta ventana:  255248
  Agregados sin installs pero en lista a predecir:  2944


In [6]:
sorted_installs['window_nr'].value_counts()

4    396509
3    396509
2    396509
1    396509
Name: window_nr, dtype: int64

In [7]:
sorted_installs.head(20)

Unnamed: 0,ref_hash,window_nr,window_date_start,date_install,secs_to_install
0,40621409780134,1,2019-04-21,2019-04-21 19:17:47.657,69467.657
41317,40621409780134,2,2019-04-22,2019-04-25 00:00:00.000,259200.0
41317,40621409780134,3,2019-04-23,2019-04-26 00:00:00.000,259200.0
41317,40621409780134,4,2019-04-24,2019-04-27 00:00:00.000,259200.0
100266,41863526108385,1,2019-04-21,2019-04-24 00:00:00.000,259200.0
100266,41863526108385,2,2019-04-22,2019-04-25 00:00:00.000,259200.0
100266,41863526108385,3,2019-04-23,2019-04-26 00:00:00.000,259200.0
100266,41863526108385,4,2019-04-24,2019-04-27 00:00:00.000,259200.0
182806,90072729247980,1,2019-04-21,2019-04-24 00:00:00.000,259200.0
0,90072729247980,2,2019-04-22,2019-04-24 18:30:50.199,239450.199
