In [1]:
import pandas as pd
import matplotlib as mp
import numpy as np
from datetime import timedelta
import datetime as dt
import gc

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import xgboost as xgb

%matplotlib inline

  from numpy.core.umath_tests import inner1d


# Levanto los installs ya procesados
....

In [2]:
#Root de archivos de datos
root_dir = '../data/'

#Lee menos data si es testing
is_testing = False

# Cantidad minima de aparicion de equipos en un dataframe
#min_devices = 5
min_devices = 20

# Identificador unico de equipos
device_uuid = ['ref_hash']

# Ventanas de tiempo
windows = pd.DataFrame({
    'begin_date': [dt.datetime(2019,4,18),dt.datetime(2019,4,21),dt.datetime(2019,4,24)],
    'window_nr':[1,2,3]
})

In [3]:
installs_dtypes = {
    'application_id': np.int32,
    'ref_hash': np.int64, 
    #'click_hash':'category',
    'attributed': 'category',
    'implicit': 'category',
    #'device_countrycode': 'object', 
    'device_brand': 'object',
    'device_model': 'object', 
    'session_user_agent': 'object', 
    'user_agent': 'object', 
    'event_uuid':'object',
    'kind': 'object',
    'wifi': 'object', 
    'trans_id': 'object', 
    #'ip_address':'object', 
    'device_language': 'object'
}
install_cols = list(installs_dtypes.keys()) + ['created']
installs = pd.read_csv(root_dir + 'installs.csv.gzip', 
                       compression='gzip', 
                       usecols=install_cols,
                       dtype= installs_dtypes,
                      parse_dates=['created'])

installs['n'] = 1
print('#records install:', installs['n'].sum())

('#records install:', 481511)


In [4]:
# Labels a submitir con las predicciones
to_predict = pd.read_csv(root_dir + 'target_competencia_ids.csv')
to_predict = to_predict.apply(lambda x: np.int64(x['ref_hash'][0:x['ref_hash'].find('_')]), axis='columns').drop_duplicates().to_frame()
to_predict.columns = ['ref_hash']

In [5]:
unique_hashes = pd.read_csv(root_dir+'unique_hashes.csv')

---
## 2. Arreglo de los datos

### Separo las semanas de entrenamiento

Se utiliza el siguiente metodo: 

1. Ventana del 18 al 20 inclusive (1) -> Predice valores entre el 21 y 24 (2)
2. Ventana del 21 al 23 inclusive (2) -> Predice valores entre el 24 y 26 (3)

In [6]:
print("Tiempo init: "+str(pd.Timestamp.now()))

Tiempo init: 2019-06-22 13:30:33.939140


In [7]:
installs = installs.loc[installs['ref_hash'].isin(unique_hashes['ref_hash'])]
installs.sort_values(by='created',inplace=True)
installs = pd.merge_asof(installs,windows,left_on='created',right_on='begin_date').drop('begin_date', axis='columns')

### 2.2 Arreglo de datos de Installs

In [8]:
installs['kind'] = installs['kind'].str.replace(' ','_')
installs['kind'] = installs['kind'].str.replace('af_app_open ','af_app_opened')
installs['kind'] = installs['kind'].str.replace('af_app_opend','af_app_opened')
installs['kind'] = installs['kind'].str.lower()

In [9]:
installs.isnull().sum()

created                    0
application_id             0
ref_hash                   0
attributed                 0
implicit                   0
device_brand          204813
device_model           26871
session_user_agent     14828
user_agent            150111
event_uuid            377704
kind                  377704
wifi                  186016
trans_id              472140
device_language        27552
n                          0
window_nr                  0
dtype: int64

In [10]:
for i in ['device_brand','device_model','session_user_agent','user_agent','kind','wifi','device_language']:
    installs[i] = installs[i].fillna('unknown')
installs['device_brand'] = installs['device_brand'].astype('category')
installs['device_model'] = installs['device_model'].astype('category')
installs['session_user_agent'] = installs['session_user_agent'].astype('category')
installs['user_agent'] = installs['user_agent'].astype('category')
installs['kind'] = installs['kind'].astype('category')
installs['wifi'] = installs['wifi'].astype('category')
installs['device_language'] = installs['device_language'].astype('category')

In [11]:
installs.head().transpose()

Unnamed: 0,0,1,2,3,4
created,2019-04-18 00:00:01.560000,2019-04-18 00:00:01.851000,2019-04-18 00:00:05.152000,2019-04-18 00:00:05.589000,2019-04-18 00:00:06.795000
application_id,70,70,65,27,339
ref_hash,4432995619177048534,5904733559638204455,896373747754111825,3399210824535017892,1541425881979513687
attributed,False,False,False,False,False
implicit,False,False,True,False,False
device_brand,unknown,unknown,3.083058605577787e+17,unknown,unknown
device_model,unknown,unknown,5.274185305862703e+18,6.794880020077885e+18,6.794880020077885e+18
session_user_agent,Apsalar-Postback,Apsalar-Postback,http-kit/2.0,http-kit/2.0,http-kit/2.0
user_agent,unknown,unknown,Dalvik/2.1.0 (Linux; U; Android 9; SM-G9650 Bu...,trivago/216 CFNetwork/978.0.7 Darwin/18.5.0,TikTok/109005 CFNetwork/758.5.3 Darwin/15.6.0
event_uuid,,,8c8af5e3-96e7-4a49-9f17-cafa7f300f2c,,


In [12]:
print("Tiempo end: "+str(pd.Timestamp.now()))

Tiempo end: 2019-06-22 13:30:51.493998


In [13]:
installs.head(1)

Unnamed: 0,created,application_id,ref_hash,attributed,implicit,device_brand,device_model,session_user_agent,user_agent,event_uuid,kind,wifi,trans_id,device_language,n,window_nr
0,2019-04-18 00:00:01.560,70,4432995619177048534,False,False,unknown,unknown,Apsalar-Postback,unknown,,unknown,unknown,,unknown,1,1


### Cant. de Apps distintas instaladas	por ref_hash

In [151]:
#created: fecha de instalacion de app
#application_id: id interno de instalacion de app
installs['n'] = 1
inst_app_count = installs.groupby(by=['ref_hash','application_id']).sum()
inst_app_count = inst_app_count.reset_index()

In [152]:
inst_app_count.head()

Unnamed: 0,ref_hash,application_id,n,window_nr
0,40621409780134,77,1,2
1,41863526108385,65,2,2
2,41863526108385,121,2,2
3,90072729247980,210,1,3
4,135153013040192,155,1,1


In [153]:
inst_app_count['m'] = 1

In [154]:
inst_app_count.head(5)

Unnamed: 0,ref_hash,application_id,n,window_nr,m
0,40621409780134,77,1,2,1
1,41863526108385,65,2,2,1
2,41863526108385,121,2,2,1
3,90072729247980,210,1,3,1
4,135153013040192,155,1,1,1


In [155]:
temp = inst_app_count.groupby('ref_hash').sum()['m'].reset_index()
temp.head()

Unnamed: 0,ref_hash,m
0,40621409780134,1
1,41863526108385,2
2,90072729247980,1
3,135153013040192,1
4,161514654074162,1


In [156]:
inst_app_count = inst_app_count.merge(temp,how='inner',on='ref_hash')
inst_app_count = inst_app_count[['ref_hash','m_y','window_nr']]
inst_app_count = inst_app_count.rename(columns={"m_y":"count_app_dif"})
inst_app_count = inst_app_count.drop_duplicates()

inst_app_count = inst_app_count.sort_values(by='count_app_dif',ascending=False)
inst_app_count = inst_app_count.drop_duplicates()
inst_app_count.head()

Unnamed: 0,ref_hash,count_app_dif,window_nr
247023,5446085605337844584,10,2
247027,5446085605337844584,10,4
41177,906973248467925335,9,4
41178,906973248467925335,9,1
41171,906973248467925335,9,2


In [157]:
inst_app_count.head(2)

Unnamed: 0,ref_hash,count_app_dif,window_nr
247023,5446085605337844584,10,2
247027,5446085605337844584,10,4


### Feature: Instaló en la ventana anterior ?

In [290]:
installs.loc[installs['window_nr'] > 3]

Unnamed: 0,created,application_id,ref_hash,attributed,implicit,device_brand,device_model,session_user_agent,user_agent,event_uuid,kind,wifi,trans_id,device_language,n,window_nr


In [201]:
inst_app_count.head(6)

Unnamed: 0,ref_hash,count_app_dif,window_nr
247023,5446085605337844584,10,2
247027,5446085605337844584,10,4
41177,906973248467925335,9,4
41178,906973248467925335,9,1
41171,906973248467925335,9,2
287444,6337720443035762926,9,1


In [280]:
in_sort2 = in_sort.merge(win_ok,how='inner',on='ref_hash')
in_sort2 = in_sort2[['ref_hash','count_app_dif','window_nr_y']]
in_sort2 = in_sort2.rename(columns={'window_nr_y':'window_nr'})
in_sort2.head(3)

Unnamed: 0,ref_hash,count_app_dif,window_nr
0,5446085605337844584,10,2
1,5446085605337844584,10,2
2,5446085605337844584,10,2


In [288]:
in_sort2.loc[in_sort2['window_nr'] > 3]

Unnamed: 0,ref_hash,count_app_dif,window_nr


In [289]:
in_sort.loc[in_sort['window_nr'] > 3]

Unnamed: 0,ref_hash,count_app_dif,window_nr
247027,5446085605337844584,10,4
41177,906973248467925335,9,4
297583,6561273576987834801,7,6
248272,5472947665337192726,7,4
71825,1581855818518364041,7,6
408472,9010591822021924075,6,6
329245,7266425636812329249,6,6
287183,6332251117034865118,6,4
231018,5088289517478304879,6,4
227192,5003101052612746791,6,6


In [282]:
in_sort = in_sort2

In [283]:
in_sort = inst_app_count.sort_values(by=['count_app_dif','ref_hash','window_nr'],ascending=False)
in_sort.head(4)

Unnamed: 0,ref_hash,count_app_dif,window_nr
247027,5446085605337844584,10,4
247023,5446085605337844584,10,2
287447,6337720443035762926,9,2
287444,6337720443035762926,9,1


# !

In [293]:
installs.loc[installs['window_nr'] > 3]

Unnamed: 0,created,application_id,ref_hash,attributed,implicit,device_brand,device_model,session_user_agent,user_agent,event_uuid,kind,wifi,trans_id,device_language,n,window_nr


In [296]:
in_next_prev = installs[['ref_hash','window_nr']]
in_next_prev.head(5)
print("Count: "+str(in_next_prev.count()))

Count: ref_hash     480566
window_nr    480566
dtype: int64


In [323]:
ref_win_1 = in_next_prev.loc[in_next_prev['window_nr'] == 1][['ref_hash']]
ref_win_2 = in_next_prev.loc[in_next_prev['window_nr'] == 2][['ref_hash']]
ref_win_3 = in_next_prev.loc[in_next_prev['window_nr'] == 3][['ref_hash']]
ref_win_2['win_2_in_1'] = ref_win_2['ref_hash'].isin(ref_win_1['ref_hash'])
ref_win_3['win_3_in_2'] = ref_win_3['ref_hash'].isin(ref_win_2['ref_hash'])
in_win = in_next_prev.merge(ref_win_2,how="left",on="ref_hash")
in_win = in_win.merge(ref_win_3,how="inner",on="ref_hash")
in_win_d = in_win.drop_duplicates()
in_win_d.head(10)

Unnamed: 0,ref_hash,window_nr,win_2_in_1,win_3_in_2
0,8509930510698276342,1,,False
2,8509930510698276342,3,,False
3,5072620966954264118,1,True,True
5,5072620966954264118,2,True,True
7,5072620966954264118,3,True,True
11,3664457589760889308,1,,False
12,3664457589760889308,3,,False
13,5894856328487209314,1,True,True
14,5894856328487209314,2,True,True
15,5894856328487209314,3,True,True


In [347]:
in_win_e = in_win_d.fillna(False)
in_win_e.head(10)

Unnamed: 0,ref_hash,window_nr,win_2_in_1,win_3_in_2
0,8509930510698276342,1,False,False
2,8509930510698276342,3,False,False
3,5072620966954264118,1,True,True
5,5072620966954264118,2,True,True
7,5072620966954264118,3,True,True
11,3664457589760889308,1,False,False
12,3664457589760889308,3,False,False
13,5894856328487209314,1,True,True
14,5894856328487209314,2,True,True
15,5894856328487209314,3,True,True


### Feature: Cantidad de eventos por tipo 


In [352]:
installs.head()

Unnamed: 0,created,application_id,ref_hash,attributed,implicit,device_brand,device_model,session_user_agent,user_agent,event_uuid,kind,wifi,trans_id,device_language,n,window_nr
0,2019-04-18 00:00:01.560,70,4432995619177048534,False,False,unknown,unknown,Apsalar-Postback,unknown,,unknown,unknown,,unknown,1,1
1,2019-04-18 00:00:01.851,70,5904733559638204455,False,False,unknown,unknown,Apsalar-Postback,unknown,,unknown,unknown,,unknown,1,1
2,2019-04-18 00:00:05.152,65,896373747754111825,False,True,3.083058605577787e+17,5.274185305862703e+18,http-kit/2.0,Dalvik/2.1.0 (Linux; U; Android 9; SM-G9650 Bu...,8c8af5e3-96e7-4a49-9f17-cafa7f300f2c,af_app_opened,false,,6.977049253562486e+18,1,1
3,2019-04-18 00:00:05.589,27,3399210824535017892,False,False,unknown,6.794880020077885e+18,http-kit/2.0,trivago/216 CFNetwork/978.0.7 Darwin/18.5.0,,unknown,true,,5.221862722669226e+18,1,1
4,2019-04-18 00:00:06.795,339,1541425881979513687,False,False,unknown,6.794880020077885e+18,http-kit/2.0,TikTok/109005 CFNetwork/758.5.3 Darwin/15.6.0,,unknown,true,,7.528973756559112e+18,1,1


In [356]:
kind_count = installs.groupby('kind').sum().reset_index()[['kind','n']]

In [359]:
inst_kind = installs

In [360]:
inst_kind2 = inst_kind.merge(kind_count,how='inner',on='kind')
inst_kind2.head()

Unnamed: 0,created,application_id,ref_hash,attributed,implicit,device_brand,device_model,session_user_agent,user_agent,event_uuid,kind,wifi,trans_id,device_language,n_x,window_nr,n_y
0,2019-04-18 00:00:01.560,70,4432995619177048534,False,False,unknown,unknown,Apsalar-Postback,unknown,,unknown,unknown,,unknown,1,1,377704.0
1,2019-04-18 00:00:01.851,70,5904733559638204455,False,False,unknown,unknown,Apsalar-Postback,unknown,,unknown,unknown,,unknown,1,1,377704.0
2,2019-04-18 00:00:05.589,27,3399210824535017892,False,False,unknown,6.794880020077885e+18,http-kit/2.0,trivago/216 CFNetwork/978.0.7 Darwin/18.5.0,,unknown,true,,5.221862722669226e+18,1,1,377704.0
3,2019-04-18 00:00:06.795,339,1541425881979513687,False,False,unknown,6.794880020077885e+18,http-kit/2.0,TikTok/109005 CFNetwork/758.5.3 Darwin/15.6.0,,unknown,true,,7.528973756559112e+18,1,1,377704.0
4,2019-04-18 00:00:08.994,210,8942039642364169230,False,False,unknown,2.0193221952374024e+18,http-kit/2.0,Dalvik/2.1.0 (Linux; U; Android 8.0.0; FIG-LX3...,,unknown,unknown,,3.3013777759776993e+18,1,1,377704.0


In [363]:
inst_kind = inst_kind2.rename(columns={"n_y":"count_for_kind"})
inst_kind.head(2)

Unnamed: 0,created,application_id,ref_hash,attributed,implicit,device_brand,device_model,session_user_agent,user_agent,event_uuid,kind,wifi,trans_id,device_language,n_x,window_nr,count_for_kind
0,2019-04-18 00:00:01.560,70,4432995619177048534,False,False,unknown,unknown,Apsalar-Postback,unknown,,unknown,unknown,,unknown,1,1,377704.0
1,2019-04-18 00:00:01.851,70,5904733559638204455,False,False,unknown,unknown,Apsalar-Postback,unknown,,unknown,unknown,,unknown,1,1,377704.0


In [366]:
inst_kind_train = inst_kind[[]]

Unnamed: 0_level_0,application_id,ref_hash,n_x,window_nr
count_for_kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,8072,2.203972e+20,45.0,101.0
2.0,4624,1.230110e+20,26.0,54.0
3.0,5307,1.310087e+20,30.0,59.0
4.0,3312,9.916405e+19,24.0,59.0
5.0,1655,7.955901e+19,20.0,45.0
6.0,3537,9.961691e+19,24.0,53.0
7.0,2206,6.952328e+19,14.0,33.0
8.0,3760,1.115951e+20,24.0,47.0
9.0,2787,1.322098e+20,27.0,68.0
10.0,1319,4.974349e+19,10.0,24.0


In [None]:
#Agregamos features
columns_to_xgboost = ['ref_hash','source_id'\
                      ,'ref_type','secs_to_next_mean'\
                      ,'date_int','next_date_int','secs_to_next','alta_demanda','pico_demanda','is_week']