In [1]:
import pandas as pd
import matplotlib as mp
import numpy as np
from datetime import timedelta
import datetime as dt

%matplotlib inline

# Root directory of the data files
root_dir = '../data/'

# When True, less data is read (testing mode)
is_testing = False

# Minimum number of appearances of a device in a dataframe
min_devices = 5

# Time windows: start date of each window and its ordinal number
windows = pd.DataFrame({
    'begin_date': [dt.datetime(2019, 4, day) for day in (18, 21, 24)],
    'window_nr': [1, 2, 3],
})

In [2]:
# Narrow dtypes to reduce memory usage
auction_dtypes = {
    'device_id': np.int64,
    'source_id': np.int8
}

# Load only the typed columns plus the timestamp, then rename them so that
# auctions shares the 'ref_hash' / 'created' column names used by installs.
auctions = pd.read_csv(root_dir + 'auctions.csv.gzip',
                       compression='gzip',
                       dtype=auction_dtypes,
                       usecols=list(auction_dtypes.keys()) + ['date'],
                       parse_dates=['date'])
auctions = auctions.rename({'device_id': 'ref_hash',
                            'date': 'created'}, axis='columns')
# Constant helper column: summing it yields a record count
auctions['n'] = 1

# In testing mode keep a random 30% sample of the frame and drop the rest,
# so experiments need less memory. The seed is fixed so that test runs are
# reproducible across kernel restarts.
if is_testing:
    auctions = auctions.sample(frac=.30, random_state=42)
    print("Is Testing, #records:", auctions['n'].sum())
else:
    print("Not Testing, #records:", auctions['n'].sum())

Not Testing, #records: 47409528


In [3]:
# Column dtypes for the installs file. The columns click_hash,
# device_countrycode and ip_address are left out and therefore not loaded.
installs_dtypes = {
    'application_id': np.int32,
    'ref_hash': np.int64,
    'attributed': 'category',
    'implicit': 'category',
    # Free-text / identifier columns are read as plain objects
    **dict.fromkeys(
        [
            'device_brand',
            'device_model',
            'session_user_agent',
            'user_agent',
            'event_uuid',
            'kind',
            'wifi',
            'trans_id',
            'device_language',
        ],
        'object',
    ),
}
# Columns to read: every typed column plus the event timestamp
install_cols = [*installs_dtypes, 'created']
installs = pd.read_csv(
    root_dir + 'installs.csv.gzip',
    compression='gzip',
    usecols=install_cols,
    dtype=installs_dtypes,
    parse_dates=['created'],
)

# Constant helper column: summing it yields a record count
installs['n'] = 1
print('#records:', installs['n'].sum())

#records: 481511


In [4]:
# Device hashes used below to filter auctions/installs and to build the label skeleton
unique_hashes = pd.read_csv(f'{root_dir}unique_hashes.csv')

---
## 2. Arreglo de los datos

### 2.1 Separo las ventanas de entrenamiento

Se utiliza el siguiente metodo: 

1. Ventana del 18 al 20 inclusive (1) -> Predice valores entre el 21 y 23 (2)
2. Ventana del 21 al 23 inclusive (2) -> Predice valores entre el 24 y 26 (3)

In [5]:
# Keep only auctions whose device appears in unique_hashes, ordered by
# timestamp (merge_asof needs its keys sorted).
in_unique_hashes = auctions['ref_hash'].isin(unique_hashes['ref_hash'])
auctions = auctions.loc[in_unique_hashes].sort_values(by='created')

# Attach the time window: each auction gets the last window whose
# begin_date is not after its 'created' timestamp.
auctions = pd.merge_asof(
    auctions,
    windows,
    left_on='created',
    right_on='begin_date',
)

In [6]:
# Keep only installs whose device appears in unique_hashes, ordered by
# timestamp (merge_asof needs its keys sorted). The sort is chained instead
# of done in place, so it never operates on a slice of the original frame
# and matches how the auctions frame is prepared.
installs = installs.loc[installs['ref_hash'].isin(unique_hashes['ref_hash'])]\
                   .sort_values(by='created')
# Attach the time window: each install gets the last window whose
# begin_date is not after its 'created' timestamp.
installs = pd.merge_asof(installs, windows, left_on='created', right_on='begin_date')

### 2.2 Arreglo de datos de Installs

In [7]:
# Normalise the event-kind labels.
# Order matters:
#   1. lower-case first, so the typo fixes match regardless of capitalisation;
#   2. fix 'af_app_open ' (trailing space) BEFORE spaces are turned into
#      underscores, otherwise that pattern can never match;
#   3. fix the 'af_app_opend' misspelling;
#   4. finally replace the remaining spaces with underscores.
# regex=False makes every pattern a literal string on any pandas version.
installs['kind'] = (
    installs['kind']
    .str.lower()
    .str.replace('af_app_open ', 'af_app_opened', regex=False)
    .str.replace('af_app_opend', 'af_app_opened', regex=False)
    .str.replace(' ', '_', regex=False)
)

In [8]:
# Missing values per column
installs.isna().sum()

created                    0
application_id             0
ref_hash                   0
attributed                 0
implicit                   0
device_brand          204813
device_model           26871
session_user_agent     14828
user_agent            150111
event_uuid            377704
kind                  377704
wifi                  186016
trans_id              472140
device_language        27552
n                          0
begin_date                 0
window_nr                  0
dtype: int64

In [9]:
# Descriptive columns: missing values become the literal 'unknown' and the
# column is then stored as a categorical.
unknown_fill_cols = (
    'device_brand',
    'device_model',
    'session_user_agent',
    'user_agent',
    'kind',
    'wifi',
    'device_language',
)
for col in unknown_fill_cols:
    installs[col] = installs[col].fillna('unknown')
for col in unknown_fill_cols:
    installs[col] = installs[col].astype('category')

In [10]:
# First rows shown transposed (one column per record) because the frame is wide
installs.head().T

Unnamed: 0,0,1,2,3,4
created,2019-04-18 00:00:01.560000,2019-04-18 00:00:01.851000,2019-04-18 00:00:05.152000,2019-04-18 00:00:05.589000,2019-04-18 00:00:06.795000
application_id,70,70,65,27,339
ref_hash,4432995619177048534,5904733559638204455,896373747754111825,3399210824535017892,1541425881979513687
attributed,False,False,False,False,False
implicit,False,False,True,False,False
device_brand,unknown,unknown,3.083058605577787e+17,unknown,unknown
device_model,unknown,unknown,5.274185305862703e+18,6.794880020077885e+18,6.794880020077885e+18
session_user_agent,Apsalar-Postback,Apsalar-Postback,http-kit/2.0,http-kit/2.0,http-kit/2.0
user_agent,unknown,unknown,Dalvik/2.1.0 (Linux; U; Android 9; SM-G9650 Bu...,trivago/216 CFNetwork/978.0.7 Darwin/18.5.0,TikTok/109005 CFNetwork/758.5.3 Darwin/15.6.0
event_uuid,,,8c8af5e3-96e7-4a49-9f17-cafa7f300f2c,,


---
## 3. Armado de Labels

In [11]:
# Composite key: one device inside one time window
group = ['ref_hash', 'window_nr']

# Upper date bound per window number (each is three days after the matching
# begin_date defined in `windows`)
max_dates = pd.DataFrame({
    'w_max_date': [dt.datetime(2019, 4, day) for day in (21, 24, 27)],
    'window_nr': [1, 2, 3],
})

In [12]:
# Last 100 auction rows, kept for quick inspection
sample = auctions.iloc[-100:]

In [13]:
# Peek at the first five rows of the inspection sample
sample.head(5)

Unnamed: 0,created,ref_hash,source_id,n,begin_date,window_nr
46587817,2019-04-26 23:59:58.341529,4398508384670976399,0,1,2019-04-24,3
46587818,2019-04-26 23:59:58.351963,2883468145566915144,0,1,2019-04-24,3
46587819,2019-04-26 23:59:58.421174,6249640533610682060,0,1,2019-04-24,3
46587820,2019-04-26 23:59:58.438705,6731471049110499914,0,1,2019-04-24,3
46587821,2019-04-26 23:59:58.445829,3724654214762970380,1,1,2019-04-24,3


In [14]:
# Label skeleton: every device in unique_hashes tagged with window 1.
# assign() returns a new frame, so unique_hashes itself is not modified
# (the previous alias-based version added the column to unique_hashes too).
# No rows are generated for window 2.
devices = unique_hashes.assign(window_nr=1)

In [15]:
# Number of skeleton rows per window
devices.loc[:, 'window_nr'].value_counts()

1    662110
Name: window_nr, dtype: int64

In [16]:
%%time
# Order auctions by device, window and then time, so the first row of each
# (device, window) group is its earliest auction.
auctions = auctions.sort_values(by=[*group, 'created'])

CPU times: user 1min 15s, sys: 11.7 s, total: 1min 27s
Wall time: 1min 27s


In [17]:
%%time
# Order installs by device, window and then time, so the first row of each
# (device, window) group is its earliest install.
installs = installs.sort_values(by=[*group, 'created'])

CPU times: user 545 ms, sys: 231 µs, total: 546 ms
Wall time: 544 ms


### 3.1 Label Tiempo hasta reaparicion (St)

In [18]:
# Value used when a device has no auction to measure: three days, in seconds.
max_secs = 3 * 24 * 60 * 60

# First auction row of every (device, window) pair outside window 1.
# head(1) takes the first row in the current row order, so this relies on
# auctions having been sorted by group + 'created' beforehand.
not_first_window = auctions['window_nr'] != 1
temp = auctions.loc[not_first_window].groupby(group, sort=False).head(1).copy()
# Key each event by the previous window: the label of window k is measured
# on the first auction seen in window k + 1.
temp = temp.assign(window_nr=temp['window_nr'] - 1).set_index(group)

# Seconds elapsed between the start of the window and that first auction
elapsed = temp['created'] - temp['begin_date']
st = elapsed.transform(lambda delta: round(delta.total_seconds())).rename('target_st')

# Left-join onto the device skeleton; devices without a match get max_secs
target_st = devices.set_index(group).merge(
    st,
    how='left',
    left_index=True,
    right_index=True,
).fillna(max_secs)

In [19]:
# First five rows of the time-to-reappearance label
target_st.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,target_st
ref_hash,window_nr,Unnamed: 2_level_1
2564673204772915246,1,63743.0
4441121667607578179,1,74414.0
7721769811471055264,1,565.0
6416039086842158968,1,13864.0
1258642015983312729,1,148073.0


### 3.2 Label Tiempo hasta install (Sc)

In [20]:
# First install row of every (device, window) pair outside window 1.
# head(1) takes the first row in the current row order, so this relies on
# installs having been sorted by group + 'created' beforehand.
temp = (
    installs.loc[installs['window_nr'] != 1]
    .groupby(group, sort=False)
    .head(1)
    .copy()
)
# Key each event by the previous window: the label of window k is measured
# on the first install seen in window k + 1.
temp = temp.assign(window_nr=temp['window_nr'] - 1).set_index(group)

# Seconds elapsed between the start of the window and that first install
sc = (
    (temp['created'] - temp['begin_date'])
    .transform(lambda delta: round(delta.total_seconds()))
    .rename('target_sc')
)

# Left-join onto the device skeleton; devices without a match get max_secs
target_sc = devices.set_index(group).merge(
    sc,
    how='left',
    left_index=True,
    right_index=True,
).fillna(max_secs)

In [21]:
# Non-null labels per column (sanity check against the number of devices)
target_sc.notna().sum()

target_sc    662110
dtype: int64

### 3.3. Merge y guardo a file

In [22]:
# Both label frames share the (ref_hash, window_nr) index; combine them side by side
targets = target_st.join(target_sc, how='left')

In [23]:
# Persist the labels; header row and index columns are written (pandas defaults)
targets.to_csv("targets.csv")