In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta
import datetime as dt
import seaborn as sns

window = 3 #tamaño de ventana
primeros_dias_de_cada_ventana = [18,21,24]

## Cargamos los datos

In [2]:
#Optimizado para menos memoria
auction_dtypes = {
    'device_id': np.int64,
    'ref_type_id': np.int8,
    'source_id': np.int8
}

auctions = pd.read_csv('auctions.csv.gzip',
                       compression = 'gzip',
                       dtype = auction_dtypes,
                       parse_dates = ['date'])
auctions.columns = ['date','ref_hash','ref_type_id','source_id']


In [3]:
clicks_dtypes = {
    'advertiser_id': np.int8, 
    'action_id': np.int32, 
    'source_id': np.int8, 
    'latitude' : np.float64, 
    'longitude': np.float64, 
    'wifi_connection': np.bool, 
    'carrier_id': np.int16, 
    'trans_id': 'object',
    'os_minor':'category', 
    'agent_device' : 'category', 
    'os_major': 'category', 
    'specs_brand': 'category', 
    'brand': np.int8,
    'timeToClick': np.float64, 
    #'touchX': np.float64, 
    #'touchY': np.float64, 
    'ref_type':np.int64, 
    'ref_hash':np.int64
}
clicks = pd.read_csv('clicks.csv.gzip', 
                     compression='gzip',
                     low_memory = False,
                     parse_dates=['created'])

clicks.touchX = clicks.touchX.apply(lambda x: np.float64(x))
clicks.touchY = clicks.touchY.apply(lambda x: np.float64(x))
clicks.head()

Unnamed: 0,advertiser_id,action_id,source_id,created,country_code,latitude,longitude,wifi_connection,carrier_id,trans_id,os_minor,agent_device,os_major,specs_brand,brand,timeToClick,touchX,touchY,ref_type,ref_hash
0,1,,2,2019-04-18 05:27:42.197,6287817205707153877,1.714547,0.871535,False,3.0,9JMAfrb-b9cSEVCJb0P9JfihGthaS7E,1.517644e+18,,5.131616e+18,71913840936116953,0.0,2.317,0.968,0.503,1891515180541284343,1293710398598742392
1,1,,1,2019-04-18 05:27:03.164,6287817205707153877,1.714512,0.871062,True,2.0,r3xtTRv2lInfiXG8JI3NQsNcBo8GyFQ,1.288578e+18,,3.90839e+18,3576558787748411622,1.0,7.653,0.712,1.689,1891515180541284343,1663930990551616564
2,1,,1,2019-04-18 05:42:07.926,6287817205707153877,1.714547,0.871535,True,4.0,WOnHFqQtY48z_ygKZ-030U_g0TMGVMw,2.238736e+18,,3.581233e+18,3576558787748411622,,464.796,0.227,0.251,1891515180541284343,8488038938665586188
3,1,,1,2019-04-18 05:26:04.446,6287817205707153877,1.708041,0.870772,True,1.0,wQMLLmYqiFhSuha9p9B13PMtcyBW_vM,2.41164e+18,,3.90839e+18,3576558787748411622,,225.311,0.696,6.587,1891515180541284343,6488361690105189959
4,1,,1,2019-04-18 05:23:37.764,6287817205707153877,1.715514,0.870772,True,2.0,GeFoyBzMA7taylMxxjzlNPTU-n4FXFs,1.517644e+18,,5.131616e+18,3576558787748411622,0.0,84.736,0.059,0.142,1891515180541284343,1348993302102753419


In [4]:
events_dtypes = {
    'event_id': np.int64,
    'ref_type': np.int64,
    'ref_hash': np.int64,
    'application_id': np.int64,
    'attributed': np.bool,
    'device_os_version': 'category',
    'device_brand': 'category',
    'device_model': 'category',
    'device_city': 'category',
    'session_user_agent': 'category',
    'trans_id': 'category',
    'user_agent': 'category',
    'event_uuid': 'object',
    'carrier': 'category',
    'kind': 'category',
    'device_os': 'category',
    'wifi': np.bool,
    'connection_type': 'category',
    #'ip_address': np.int64,
    #'device_language': 'category'
}
events_cols = list(events_dtypes.keys()) + ['date']
events = pd.read_csv('events.csv.gzip', 
                     compression='gzip',
                     dtype=events_dtypes,
                     usecols=events_cols,
                     parse_dates=['date'])

In [5]:
installs_dtypes = {
    'application_id': np.int32,
    'ref_type': np.int64,
    'ref_hash': np.int64, 
    'click_hash':'category',
    'attributed': 'category',
    'implicit': 'category',
    #'device_countrycode': 'object', 
    'device_brand': 'category',
    'device_model': 'category', 
    'session_user_agent': 'category', 
    'user_agent': 'category', 
    'event_uuid':'object',
    'kind': 'category',
    'wifi': 'category', 
    'trans_id': 'object', 
    #'ip_address':'object', 
    'device_language': 'category'
}
install_cols = list(installs_dtypes.keys()) + ['created']
installs = pd.read_csv('installs.csv.gzip', 
                       compression='gzip', 
                       usecols=install_cols,
                       dtype= installs_dtypes,
                      parse_dates=['created'])


## Armamos el ref_hash global

In [6]:
all_ref_hash = clicks['ref_hash'].append( events['ref_hash']).append(auctions['ref_hash']).append(installs['ref_hash']).drop_duplicates().to_frame()
all_ref_hash.columns = ['ref_hash']


Con esto tenemos todos los dispositivos que aparecen en por lo menos algun csv (sea auctions, clicks o events).

## Features posibles en base a eventos

In [18]:
events.head()

Unnamed: 0,date,event_id,ref_type,ref_hash,application_id,attributed,device_os_version,device_brand,device_model,device_city,session_user_agent,trans_id,user_agent,event_uuid,carrier,kind,device_os,wifi,connection_type
0,2019-04-20 01:42:49.120,0,1891515180541284343,5857744372586891366,210,False,,,4.318294190479584e+18,,3.819516403548394e+18,,5.046185273150854e+18,5b506964-5f47-4b28-a8c2-8a92d6c23379,,5.882882097123621e+18,,False,
1,2019-04-20 01:42:49.340,1,1891515180541284343,7642521036780133571,210,False,,,,,3.819516403548394e+18,,,f1fb9d15-1a7b-4116-8d3b-c4c403e197e2,,4.017674184041173e+18,,False,
2,2019-04-20 01:42:49.365,1,1891515180541284343,2548841562898283198,210,False,,,,,3.819516403548394e+18,,,c85a0b15-a5d7-472e-8116-6bfa3db19687,,4.017674184041173e+18,,False,
3,2019-04-20 01:42:51.438,2,1891515180541284343,609402887625919085,210,False,,,,,3.819516403548394e+18,,,f4aa0a97-2de6-4f22-95c6-1b3150112cb9,,6.168308581888314e+18,,False,
4,2019-04-20 01:42:51.838,1,1891515180541284343,9114651763556439823,210,False,,,,,3.819516403548394e+18,,,08e2f7f7-875f-4aa0-b337-b9b87b0d83ea,,4.017674184041173e+18,,False,


### Cantidad de eventos

In [19]:
df = pd.DataFrame()
df = events.loc[:,['ref_hash','ref_type','date']]
df['date'] = df['date'].dt.date
df['cant_eventos'] = pd.Series([1 for i in range(len(df))])
df = df.groupby(by=['date','ref_hash']).agg({'cant_eventos':'sum'}).reset_index()
df['date'] = pd.to_datetime(df['date'])
df.head()

Unnamed: 0,date,ref_hash,cant_eventos
0,2019-04-18,41863526108385,24
1,2019-04-18,186034136943920,13
2,2019-04-18,501790157110512,1
3,2019-04-18,558877640599287,1
4,2019-04-18,643594200494946,3


In [20]:
eventos_por_ventana = pd.DataFrame()

for i in primeros_dias_de_cada_ventana:
    temp = pd.DataFrame()
    
    temp[['ref_hash','cant_eventos']] = df.loc[(df['date'].dt.day >= i) & (df['date'].dt.day < (i+window-1))].groupby('ref_hash', as_index=False)['cant_eventos'].sum()
    temp['window_date_start'] = dt.datetime(2019,4,i)
    temp['window_date_end'] = dt.datetime(2019,4, i + window -1)
    eventos_por_ventana = eventos_por_ventana.append(temp,sort=True)
    
    #Agrego los que no se encontraron en esta ventana pero si estan en events, auctions o installs (aparecen en alguna)
    temp = all_ref_hash.merge(eventos_por_ventana.loc[eventos_por_ventana['window_date_start'].dt.day == i], how='left',on='ref_hash')
    temp = temp.loc[temp['cant_eventos'].isnull()]
    temp['cant_eventos'] = 0
    temp['window_date_start'] = dt.datetime(2019,4,i)
    temp['window_date_end'] = dt.datetime(2019,4,i+window-1)
    eventos_por_ventana = eventos_por_ventana.append(temp,sort=True)

    
eventos_por_ventana = eventos_por_ventana[['window_date_start', 'window_date_end', 'ref_hash', 'cant_eventos']]
eventos_por_ventana.drop_duplicates(inplace = True)
eventos_por_ventana.head()

Unnamed: 0,window_date_start,window_date_end,ref_hash,cant_eventos
0,2019-04-18,2019-04-20,41863526108385,45
1,2019-04-18,2019-04-20,161514654074162,8
2,2019-04-18,2019-04-20,186034136943920,13
3,2019-04-18,2019-04-20,360710529886978,1
4,2019-04-18,2019-04-20,365882020742330,36


### Ultimos eventos

Tenemos como problema en "ultimo evento" porque para los que no tienen eventos en determinada ventana, quedaría NULL.
Hay que decidir que se hace en esos casos... Porque sino no se puede calcular la distancia para el KNN.

In [21]:
ultimos_eventos = pd.DataFrame()
ultimos_eventos = events.loc[:,['ref_hash','ref_type','date']]

ultimos_eventos = ultimos_eventos.groupby(by=['ref_hash']).agg({'date':'max'}).reset_index()
ultimos_eventos.head()

Unnamed: 0,ref_hash,date
0,40621409780134,2019-04-23 15:02:00.589
1,41863526108385,2019-04-26 05:54:36.530
2,69039685746313,2019-04-26 23:21:49.082
3,90072729247980,2019-04-24 18:31:16.624
4,161514654074162,2019-04-25 18:50:33.535


In [22]:
ultimo_evento_por_ventana = pd.DataFrame()

for i in primeros_dias_de_cada_ventana:
    temp = pd.DataFrame()
    
    temp[['ref_hash','last_event_date']] = ultimos_eventos.loc[(df['date'].dt.day >= i) & (df['date'].dt.day < (i+window-1))].groupby('ref_hash', as_index=False)['date'].max()
    temp['window_date_start'] = dt.datetime(2019,4,i)
    temp['window_date_end'] = dt.datetime(2019,4, i + window -1)
    ultimo_evento_por_ventana  = ultimo_evento_por_ventana.append(temp,sort=True)
    
    #Agrego los que no se encontraron en esta ventana pero si estan en events, auctions o installs (aparecen en alguna)
    temp = all_ref_hash.merge(ultimo_evento_por_ventana.loc[ultimo_evento_por_ventana['window_date_start'].dt.day == i], how='left',on='ref_hash')
    temp['window_date_start'] = dt.datetime(2019,4,i)
    temp['window_date_end'] = dt.datetime(2019,4,i+window-1)
    ultimo_evento_por_ventana = ultimo_evento_por_ventana.append(temp,sort=True)

ultimo_evento_por_ventana = ultimo_evento_por_ventana[['window_date_start', 'window_date_end', 'ref_hash', 'last_event_date']]
ultimo_evento_por_ventana.drop_duplicates(inplace = True)
ultimo_evento_por_ventana.head()    

Unnamed: 0,window_date_start,window_date_end,ref_hash,last_event_date
0,2019-04-18,2019-04-20,40621409780134,2019-04-23 15:02:00.589
1,2019-04-18,2019-04-20,41863526108385,2019-04-26 05:54:36.530
2,2019-04-18,2019-04-20,69039685746313,2019-04-26 23:21:49.082
3,2019-04-18,2019-04-20,90072729247980,2019-04-24 18:31:16.624
4,2019-04-18,2019-04-20,161514654074162,2019-04-25 18:50:33.535


### Secuencia ultimos n eventos

In [23]:
n = 5
secuencia_n = pd.DataFrame()
secuencia_n = events.loc[:,['ref_hash','ref_type','date']]
secuencia_n = secuencia_n.groupby(by = ['ref_hash'])['date'].nlargest(n)
type(secuencia_n)#.head(20) #Esta bien esto?

pandas.core.series.Series

Falta hacer...

## Features posibles en base a clicks

### Cantidad de clicks

In [24]:
cant_clicks = pd.DataFrame()
cant_clicks = clicks.loc[:,['ref_hash','ref_type','created']]
cant_clicks['created'] = cant_clicks['created'].dt.date
cant_clicks['cant_clicks'] = 1
cant_clicks = cant_clicks.groupby(by=['created','ref_hash']).agg({'cant_clicks':'sum'}).reset_index()
cant_clicks['created'] = pd.to_datetime(cant_clicks['created'])
cant_clicks.head()

Unnamed: 0,created,ref_hash,cant_clicks
0,2019-04-12,10151602175762247,1
1,2019-04-12,25016781820813419,1
2,2019-04-12,30208233531284574,2
3,2019-04-12,31455207219712322,1
4,2019-04-12,31852821981696986,3


In [25]:
cantidad_de_clicks_por_ventana = pd.DataFrame()

for i in primeros_dias_de_cada_ventana:
    temp = pd.DataFrame()
    
    temp[['ref_hash','cant_clicks']] = cant_clicks.loc[(cant_clicks['created'].dt.day >= i) & (cant_clicks['created'].dt.day < (i+window-1))].groupby('ref_hash', as_index=False)['cant_clicks'].sum()
    temp['window_date_start'] = dt.datetime(2019,4,i)
    temp['window_date_end'] = dt.datetime(2019,4, i + window -1)
    cantidad_de_clicks_por_ventana  = cantidad_de_clicks_por_ventana.append(temp,sort=True)
    
    #Agrego los que no se encontraron en esta ventana pero si estan en events, auctions o installs (aparecen en alguna)
    temp = all_ref_hash.merge(cantidad_de_clicks_por_ventana.loc[cantidad_de_clicks_por_ventana['window_date_start'].dt.day == i], how='left',on='ref_hash')
    temp = temp.loc[temp['cant_clicks'].isnull()]
    temp['cant_clicks'] = 0
    temp['window_date_start'] = dt.datetime(2019,4,i)
    temp['window_date_end'] = dt.datetime(2019,4,i+window-1)
    cantidad_de_clicks_por_ventana = cantidad_de_clicks_por_ventana.append(temp,sort=True)

cantidad_de_clicks_por_ventana = cantidad_de_clicks_por_ventana[['window_date_start', 'window_date_end', 'ref_hash', 'cant_clicks']]
cantidad_de_clicks_por_ventana.drop_duplicates(inplace = True)
cantidad_de_clicks_por_ventana.head()    

Unnamed: 0,window_date_start,window_date_end,ref_hash,cant_clicks
0,2019-04-18,2019-04-20,1461247282174365,1
1,2019-04-18,2019-04-20,5534899846690585,1
2,2019-04-18,2019-04-20,7429113196145773,1
3,2019-04-18,2019-04-20,10745840231746185,1
4,2019-04-18,2019-04-20,11447172392196885,3


## Features posibles en base a Auctions

### Cantidad de sources distintos

In [26]:
cant_src_distintos = auctions.loc[:,['ref_hash','ref_type','date','source_id']]
cant_src_distintos['date'] = cant_src_distintos['date'].dt.date
cant_src_distintos['src_distintos'] = pd.Series([1 for i in range(len(cant_src_distintos))])
cant_src_distintos = cant_src_distintos.groupby( by = ['date','ref_hash','source_id']).agg({'src_distintos':'sum'}).reset_index()
cant_src_distintos['date'] = pd.to_datetime(cant_src_distintos['date'])
cant_src_distintos.head()

Unnamed: 0,date,ref_hash,source_id,src_distintos
0,2019-04-18,161514654074162,0,4
1,2019-04-18,161514654074162,1,1
2,2019-04-18,161514654074162,8,1
3,2019-04-18,186034136943920,1,2
4,2019-04-18,283297668933729,3,1


In [27]:
cantidad_src_distintos_por_ventana = pd.DataFrame()

for i in primeros_dias_de_cada_ventana:
    temp = pd.DataFrame()
    
    temp[['ref_hash','src_distintos']] = cant_src_distintos.loc[(cant_src_distintos['date'].dt.day >= i) & (cant_src_distintos['date'].dt.day < (i+window-1))].groupby('ref_hash', as_index=False)['src_distintos'].sum()
    temp['window_date_start'] = dt.datetime(2019,4,i)
    temp['window_date_end'] = dt.datetime(2019,4, i + window -1)
    cantidad_src_distintos_por_ventana  = cantidad_src_distintos_por_ventana.append(temp,sort=True)
    
   #Agrego los que no se encontraron en esta ventana pero si estan en events, auctions o installs (aparecen en alguna)
    temp = all_ref_hash.merge(cantidad_src_distintos_por_ventana.loc[cantidad_src_distintos_por_ventana['window_date_start'].dt.day == i], how='left',on='ref_hash')
    temp = temp.loc[temp['src_distintos'].isnull()]
    temp['src_distintos'] = 0
    temp['window_date_start'] = dt.datetime(2019,4,i)
    temp['window_date_end'] = dt.datetime(2019,4,i+window-1)
    cantidad_src_distintos_por_ventana = cantidad_src_distintos_por_ventana.append(temp,sort=True)

cantidad_src_distintos_por_ventana = cantidad_src_distintos_por_ventana[['window_date_start', 'window_date_end', 'ref_hash', 'src_distintos']]
cantidad_src_distintos_por_ventana.drop_duplicates(inplace = True)
cantidad_src_distintos_por_ventana.head()    

Unnamed: 0,window_date_start,window_date_end,ref_hash,src_distintos
0,2019-04-18,2019-04-20,41863526108385,1
1,2019-04-18,2019-04-20,161514654074162,6
2,2019-04-18,2019-04-20,186034136943920,2
3,2019-04-18,2019-04-20,283297668933729,1
4,2019-04-18,2019-04-20,345999128501141,47


### Cantidad de apariciones

In [28]:
cant_apariciones = auctions.loc[:,['ref_hash','ref_type','date','source_id']]
cant_apariciones['date'] = cant_apariciones['date'].dt.date
cant_apariciones['cant_auctions'] = pd.Series([1 for i in range(len(cant_apariciones))])
cant_apariciones = cant_apariciones.groupby( by = ['date','ref_hash']).agg({'cant_auctions':'sum'}).reset_index()
cant_apariciones['date'] = pd.to_datetime(cant_apariciones['date'])
cant_apariciones.head()

Unnamed: 0,date,ref_hash,cant_auctions
0,2019-04-18,161514654074162,6
1,2019-04-18,186034136943920,2
2,2019-04-18,283297668933729,1
3,2019-04-18,345999128501141,45
4,2019-04-18,360710529886978,9


In [29]:
cant_apariciones_por_ventana = pd.DataFrame()

for i in primeros_dias_de_cada_ventana:
    temp = pd.DataFrame()
    
    temp[['ref_hash','cant_auctions']] = cant_apariciones.loc[(cant_apariciones['date'].dt.day >= i) & (cant_apariciones['date'].dt.day < (i+window-1))].groupby('ref_hash', as_index=False)['cant_auctions'].sum()
    temp['window_date_start'] = dt.datetime(2019,4,i)
    temp['window_date_end'] = dt.datetime(2019,4, i + window -1)
    cant_apariciones_por_ventana  = cant_apariciones_por_ventana.append(temp,sort=True)
    
   #Agrego los que no se encontraron en esta ventana pero si estan en events, auctions o installs (aparecen en alguna)
    temp = all_ref_hash.merge(cant_apariciones_por_ventana.loc[cant_apariciones_por_ventana['window_date_start'].dt.day == i], how='left',on='ref_hash')
    temp = temp.loc[temp['cant_auctions'].isnull()]
    temp['cant_auctions'] = 0
    temp['window_date_start'] = dt.datetime(2019,4,i)
    temp['window_date_end'] = dt.datetime(2019,4,i+window-1)
    cant_apariciones_por_ventana = cant_apariciones_por_ventana.append(temp,sort=True)

cant_apariciones_por_ventana = cant_apariciones_por_ventana[['window_date_start', 'window_date_end', 'ref_hash', 'cant_auctions']]
cant_apariciones_por_ventana.drop_duplicates(inplace = True)
cant_apariciones_por_ventana.head()

Unnamed: 0,window_date_start,window_date_end,ref_hash,cant_auctions
0,2019-04-18,2019-04-20,41863526108385,1
1,2019-04-18,2019-04-20,161514654074162,6
2,2019-04-18,2019-04-20,186034136943920,2
3,2019-04-18,2019-04-20,283297668933729,1
4,2019-04-18,2019-04-20,345999128501141,47


## Features posibles en base a installs

### Cantidad de installs

In [30]:
cant_installs = installs.loc[:,['ref_hash','ref_type','created','source_id']]
cant_installs['created'] = cant_installs['created'].dt.date
cant_installs['cant_installs'] = pd.Series([1 for i in range(len(cant_apariciones))])
cant_installs = cant_installs.groupby( by = ['created','ref_hash']).agg({'cant_installs':'sum'}).reset_index()
cant_installs['created'] = pd.to_datetime(cant_installs['created'])
cant_installs.head()

Unnamed: 0,created,ref_hash,cant_installs
0,2019-04-18,41863526108385,4
1,2019-04-18,186034136943920,1
2,2019-04-18,530786270564316,1
3,2019-04-18,558877640599287,1
4,2019-04-18,655267966876774,1


In [31]:
cant_installs_por_ventana = pd.DataFrame()

for i in primeros_dias_de_cada_ventana:
    temp = pd.DataFrame()
    
    temp[['ref_hash','cant_installs']] = cant_installs.loc[(cant_installs['created'].dt.day >= i) & (cant_installs['created'].dt.day < (i+window-1))].groupby('ref_hash', as_index=False)['cant_installs'].sum()
    temp['window_date_start'] = dt.datetime(2019,4,i)
    temp['window_date_end'] = dt.datetime(2019,4, i + window -1)
    cant_installs_por_ventana  = cant_installs_por_ventana.append(temp,sort=True)
    
   #Agrego los que no se encontraron en esta ventana pero si estan en events, auctions, installs o clicks (aparecen en alguna)
    temp = all_ref_hash.merge(cant_installs_por_ventana.loc[cant_installs_por_ventana['window_date_start'].dt.day == i], how='left',on='ref_hash')
    temp = temp.loc[temp['cant_installs'].isnull()]
    temp['cant_installs'] = 0
    temp['window_date_start'] = dt.datetime(2019,4,i)
    temp['window_date_end'] = dt.datetime(2019,4,i+window-1)
    cant_installs_por_ventana = cant_installs_por_ventana.append(temp,sort=True)

cant_installs_por_ventana = cant_installs_por_ventana[['window_date_start', 'window_date_end', 'ref_hash', 'cant_installs']]
cant_installs_por_ventana.drop_duplicates(inplace = True)
cant_installs_por_ventana.head()

Unnamed: 0,window_date_start,window_date_end,ref_hash,cant_installs
0,2019-04-18,2019-04-20,41863526108385,4
1,2019-04-18,2019-04-20,186034136943920,1
2,2019-04-18,2019-04-20,365882020742330,1
3,2019-04-18,2019-04-20,519199987760489,1
4,2019-04-18,2019-04-20,530786270564316,1


## KNN probabilidades

## Dataframe final con todos los features (hasta el momento):

In [32]:
features = eventos_por_ventana.merge(ultimo_evento_por_ventana, how = 'inner', on = ['window_date_start', 'window_date_end', 'ref_hash'])\
            .merge(cantidad_de_clicks_por_ventana, how = 'inner', on = ['window_date_start', 'window_date_end', 'ref_hash'])\
            .merge(cantidad_src_distintos_por_ventana, how = 'inner', on = ['window_date_start', 'window_date_end', 'ref_hash'])\
            .merge(cant_apariciones_por_ventana, how = 'inner', on = ['window_date_start', 'window_date_end', 'ref_hash'])\
            .merge(cant_installs_por_ventana, how = 'inner', on = ['window_date_start','window_date_end','ref_hash'])

features.head()

Unnamed: 0,window_date_start,window_date_end,ref_hash,cant_eventos,last_event_date,cant_clicks,src_distintos,cant_auctions,cant_installs
0,2019-04-18,2019-04-20,41863526108385,45,2019-04-26 05:54:36.530,0,1,1,4
1,2019-04-18,2019-04-20,161514654074162,8,2019-04-25 18:50:33.535,0,6,6,0
2,2019-04-18,2019-04-20,186034136943920,13,2019-04-24 06:30:25.319,0,2,2,1
3,2019-04-18,2019-04-20,360710529886978,1,2019-04-22 19:50:03.504,0,10,10,0
4,2019-04-18,2019-04-20,365882020742330,36,2019-04-26 04:25:31.977,0,4,4,1


In [33]:
features.to_csv('features.csv')

In [34]:
features = pd.read_csv('features.csv',parse_dates = ['window_date_start','window_date_end', 'last_event_date'])

In [35]:
features.head()

Unnamed: 0.1,Unnamed: 0,window_date_start,window_date_end,ref_hash,cant_eventos,last_event_date,cant_clicks,src_distintos,cant_auctions,cant_installs
0,0,2019-04-18,2019-04-20,41863526108385,45,2019-04-26 05:54:36.530,0,1,1,4
1,1,2019-04-18,2019-04-20,161514654074162,8,2019-04-25 18:50:33.535,0,6,6,0
2,2,2019-04-18,2019-04-20,186034136943920,13,2019-04-24 06:30:25.319,0,2,2,1
3,3,2019-04-18,2019-04-20,360710529886978,1,2019-04-22 19:50:03.504,0,10,10,0
4,4,2019-04-18,2019-04-20,365882020742330,36,2019-04-26 04:25:31.977,0,4,4,1


In [None]:
def distancia_cos(x,y):
    return (x.values * y.values).sum() / (np.linalg.norm(x.values) * np.linalg.norm(y.values))


In [None]:
def buscar_k_vecinos(distancia,k,datos,q):
    '''Busca los k vecinos de q en los datos pasados y segun la distancia pasada'''
    return datos.apply(lambda x: distancia(x,q), axis = 1).nlargest(k, keep = 'first')

Agregamos la columna KNN a los features que será la probabilidad de convertir de cada ref_hash en la ventana siguiente...

Como label de KNN tomamos la cantidad de instalaciones y de subastas, tomando como booleano esa cantidad.

In [None]:
k = 5

for i in primeros_dias_de_cada_ventana:
    temp = pd.DataFrame()
    temp = features.loc[(features['window_date_start'].dt.day == i)
                        
    for index, row in temp.iterrows():
        k_vecinos = buscar_k_vecinos(distancia_cos, k, temp, row)
        #Despues tendria que buscar cada K vecino de la serie, en el la siguiente ventana. Obtener si convirtio o no, y despues
        #calcular la prob de que esta fila convierta (o aparezca en subasta) y listo.... Tiene pienta de ser horrible asi...
        #demasiados for anidados...