In [12]:
import pandas as pd
import matplotlib as mp
import numpy as np
from datetime import timedelta
import datetime as dt

%matplotlib inline
    
#Root de archivos de datos
root_dir = ''

#Lee menos data si es testing
is_testing = False

# Cantidad minima de aparicion de equipos en un dataframe
min_devices = 5

# Identificador unico de equipos
device_uuid = ['ref_hash']

# Ventanas de tiempo
windows = pd.DataFrame({
    'begin_date': [dt.datetime(2019,4,18),dt.datetime(2019,4,21),dt.datetime(2019,4,24)],
    'window_nr':[1,2,3]
})
group = ['ref_hash','window_nr']

In [2]:
#Optimizado para menos memoria
auction_dtypes = {
    'device_id': np.int64,
    'source_id': np.int8
}

auctions = pd.read_csv(root_dir + 'auctions.csv.gzip',
                       compression = 'gzip',
                       dtype = auction_dtypes,
                       usecols=list(auction_dtypes.keys()) + ['date'],
                       parse_dates = ['date'])
auctions.rename({'device_id':'ref_hash',
                 'date':'created'}, inplace=True, axis='columns')
auctions['n'] = 1

# Para hacer pruebas ocupando menos memoria se hace un sampleo aleatorio de la mitad del dataframe y se elimina el resto
if is_testing:
    auctions = auctions.sample(frac=.30)
    print("Is Testing, #records:", auctions['n'].sum())
else:
    print("Not Testing, #records:", auctions['n'].sum())

Not Testing, #records: 47409528


In [3]:
installs_dtypes = {
    'application_id': np.int32,
    'ref_hash': np.int64, 
    #'click_hash':'category',
    'attributed': 'category',
    'implicit': 'category',
    #'device_countrycode': 'object', 
    'device_brand': 'object',
    'device_model': 'object', 
    'session_user_agent': 'object', 
    'user_agent': 'object', 
    'event_uuid':'object',
    'kind': 'object',
    'wifi': 'object', 
    'trans_id': 'object', 
    #'ip_address':'object', 
    'device_language': 'object'
}
install_cols = list(installs_dtypes.keys()) + ['created']
installs = pd.read_csv(root_dir + 'installs.csv.gzip', 
                       compression='gzip', 
                       usecols=install_cols,
                       dtype= installs_dtypes,
                      parse_dates=['created'])

installs['n'] = 1
print('#records:', installs['n'].sum())

#records: 481511


In [4]:
clicks_dtypes = {
    'advertiser_id': np.int64, 
    'action_id': np.float, 
    'source_id': np.int8, 
    'latitude' : np.float64, 
    'longitude': np.float64, 
    'wifi_connection': np.bool, 
    'carrier_id': 'object', 
    'trans_id': 'object',
    'os_minor':'object', 
    'agent_device' : 'object', 
    'os_major': 'object', 
    'specs_brand': 'category', 
    'brand': 'object',
    'timeToClick': np.float64, 
    'touchX': 'object', 
    'touchY': 'object', 
    'ref_hash':np.int64,
    'created' : 'object'
}
clicks = pd.read_csv(root_dir + 'clicks.csv.gzip', 
                     compression='gzip',
                     usecols=list(clicks_dtypes.keys()) + ['created'],
                     dtype=clicks_dtypes,
                     parse_dates=['created'])
clicks['n'] = 1
print('#records:', clicks['n'].sum())

#records: 64296


In [2]:
events_dtypes = {
    'event_id': np.int64,
    'ref_hash': np.int64,
    'application_id': np.int64,
    'attributed': np.bool,
    'device_os_version': 'object',
    'device_brand': 'object',
    'device_model': 'object',
    'device_city': 'object',
    'session_user_agent': 'object',
    'trans_id': 'category',
    'user_agent': 'object',
    'event_uuid': 'object',
    'carrier': 'object',
    'kind': 'object',
    'device_os': 'object',
    'wifi': np.bool,
    'connection_type': 'object',
    #'ip_address': np.int64,
    #'device_language': 'category'
}
events_cols = list(events_dtypes.keys()) + ['date']
events = pd.read_csv(root_dir + 'events.csv.gzip', 
                     compression='gzip',
                     dtype=events_dtypes,
                     usecols=events_cols,
                     parse_dates=['date'])

events.rename({'date':'created'}, inplace=True, axis='columns')
events['n'] = 1
# Para hacer pruebas ocupando menos memoria se hace un sampleo aleatorio de la mitad del dataframe y se elimina el resto
if is_testing:
    events = events.sample(frac=.30)
    print("Is Testing, #records:", events['n'].sum())
else:
    print("Not Testing, #records:", events['n'].sum())

Not Testing, #records: 7744581


In [4]:
# Labels a submitir con las predicciones
to_predict = pd.read_csv(root_dir + 'target_competencia_ids.csv')
to_predict = to_predict.apply(lambda x: np.int64(x['ref_hash'][0:x['ref_hash'].find('_')]), axis='columns').drop_duplicates().to_frame()
to_predict.columns = ['ref_hash']

In [5]:
unique_hashes = pd.read_csv(root_dir+'unique_hashes.csv')

In [5]:

#Agrego la ventana de tiempo
auctions = auctions.loc[auctions['ref_hash'].isin(unique_hashes['ref_hash'])]
auctions.sort_values(by='created',inplace=True)
auctions = pd.merge_asof(auctions,windows,left_on='created',right_on='begin_date').drop('begin_date', axis='columns')

In [None]:
installs = installs.loc[installs['ref_hash'].isin(unique_hashes['ref_hash'])]
installs.sort_values(by='created',inplace=True)
installs = pd.merge_asof(installs,windows,left_on='created',right_on='begin_date').drop('begin_date', axis='columns')

In [None]:
clicks = clicks.loc[clicks['ref_hash'].isin(unique_hashes['ref_hash'])]
clicks.sort_values(by='created',inplace=True)
clicks = pd.merge_asof(clicks,windows,left_on='created',right_on='begin_date').drop('begin_date', axis='columns')

In [6]:
events = events.loc[events['ref_hash'].isin(unique_hashes['ref_hash'])]
events.sort_values(by='created',inplace=True)
events = pd.merge_asof(events,windows,left_on='created',right_on='begin_date').drop('begin_date', axis='columns')

## Arreglo de Auctions

In [6]:
#Elimino los registros con menos de un minimo de entradas, ya que no hay mucho que predecir en estos casos
group = ['ref_hash','window_nr']
orig_count = auctions['n'].sum()
#auctions = auctions.groupby(group, sort = False).filter(lambda x: x['n'].sum() >= min_devices) 
last_count = auctions['n'].sum()
print('Eliminados:', orig_count-last_count)

Eliminados: 0


In [7]:
auctions.sort_values(by=group+['created'], inplace=True)


In [8]:
auctions['next_date'] = auctions.groupby(group, as_index = False, sort=False)['created'].transform(lambda x: x.shift(-1))
auctions = auctions.loc[(~auctions['next_date'].isnull())]
auctions['secs_to_next'] = (auctions['next_date'] - auctions['created']).transform(lambda x: round(x.total_seconds()))

In [9]:
auctions.head()

Unnamed: 0,created,ref_hash,source_id,n,window_nr,next_date,secs_to_next
9146132,2019-04-19 19:40:28.465866,41863526108385,8,1,1,2019-04-20 02:52:26.892880,25918
11162626,2019-04-20 02:52:26.892880,41863526108385,3,1,1,2019-04-20 02:59:02.509230,396
11194917,2019-04-20 02:59:02.509230,41863526108385,3,1,1,2019-04-20 03:06:01.675788,419
11229465,2019-04-20 03:06:01.675788,41863526108385,3,1,1,2019-04-20 03:08:57.388160,176
11244486,2019-04-20 03:08:57.388160,41863526108385,3,1,1,2019-04-20 03:11:26.463903,149


## Arreglo de installs

In [None]:
installs['kind'] = installs['kind'].str.replace(' ','_')
installs['kind'] = installs['kind'].str.replace('af_app_open ','af_app_opened')
installs['kind'] = installs['kind'].str.replace('af_app_opend','af_app_opened')
installs['kind'] = installs['kind'].str.lower()

In [None]:
installs.isnull().sum()


In [None]:
for i in ['device_brand','device_model','session_user_agent','user_agent','kind','wifi','device_language']:
    installs[i] = installs[i].fillna('unknown')
installs['device_brand'] = installs['device_brand'].astype('category')
installs['device_model'] = installs['device_model'].astype('category')
installs['session_user_agent'] = installs['session_user_agent'].astype('category')
installs['user_agent'] = installs['user_agent'].astype('category')
installs['kind'] = installs['kind'].astype('category')
installs['wifi'] = installs['wifi'].astype('category')
installs['device_language'] = installs['device_language'].astype('category')

In [None]:
installs.head()


## Arreglo de Clicks

In [None]:
clicks.touchX = clicks.touchX.apply(lambda x: np.float64(x))
clicks.touchY = clicks.touchY.apply(lambda x: np.float64(x))
clicks['created'] = clicks['created'].dt.tz_localize(None)

In [None]:
for i in ['carrier_id','os_minor', 'agent_device', 'brand', 'os_major']:
    clicks[i] = clicks[i].fillna('unknown').astype('category')

In [None]:
clicks.isnull().sum()


## Arreglo de Events

In [7]:
for i in ['device_os_version','device_brand','device_model','device_city','session_user_agent','user_agent','carrier','kind','device_os','connection_type']:
    events[i] = events[i].fillna('unknown').astype('category')

In [8]:
events.isnull().sum()


created                     0
event_id                    0
ref_hash                    0
application_id              0
attributed                  0
device_os_version           0
device_brand                0
device_model                0
device_city                 0
session_user_agent          0
trans_id              7700153
user_agent                  0
event_uuid              29631
carrier                     0
kind                        0
device_os                   0
wifi                        0
connection_type             0
n                           0
window_nr                   0
dtype: int64

## Armado de Features

In [38]:
temp = unique_hashes
temp['window_nr'] = 1
devices = temp.copy()
temp['window_nr'] = 2
devices = devices.append(temp)
temp['window_nr'] = 3
devices = devices.append(temp)

In [39]:
devices['window_nr'].value_counts()


3    662110
2    662110
1    662110
Name: window_nr, dtype: int64

In [40]:
devices.set_index(group)

ref_hash,window_nr
2564673204772915246,1
4441121667607578179,1
7721769811471055264,1
6416039086842158968,1
1258642015983312729,1
6707090658317158573,1
8869722088125970841,1
7445213948764639634,1
2932617030932207332,1
6405811806780450397,1


## Cantidad de sources distintos

In [34]:
amount_dif_src = auctions.groupby(by = ['ref_hash', 'window_nr','source_id']).agg({'n':'sum'}).reset_index()
amount_dif_src = auctions.groupby(by = ['ref_hash', 'window_nr']).agg({'n' : 'sum'}).reset_index()
amount_dif_src.rename(index = str, columns = {'n': 'amount_dif_src' },inplace = True)
amount_dif_src.head()

Unnamed: 0,ref_hash,window_nr,amount_dif_src
0,41863526108385,1,34
1,69039685746313,3,3
2,135153013040192,1,7
3,161514654074162,1,5
4,186034136943920,1,6


In [35]:
devices = devices.merge(amount_dif_src, on = ['ref_hash','window_nr'], how = 'left')
devices['amount_dif_src'].fillna(0,inplace=True)
devices.head()

Unnamed: 0,ref_hash,window_nr,amount_dif_src
0,2564673204772915246,1,216.0
1,4441121667607578179,1,1188.0
2,7721769811471055264,1,908.0
3,6416039086842158968,1,0.0
4,1258642015983312729,1,12.0


## Aparecio el fin de semana?

In [36]:
auctions['is_weekend'] = auctions.groupby(by = ['ref_hash'])['created'].transform(lambda x: (x.dt.weekday == 5) | (x.dt.weekday == 6))
auctions.head()

Unnamed: 0,created,ref_hash,source_id,n,window_nr,next_date,secs_to_next,is_weekend
9146132,2019-04-19 19:40:28.465866,41863526108385,8,1,1,2019-04-20 02:52:26.892880,25918,False
11162626,2019-04-20 02:52:26.892880,41863526108385,3,1,1,2019-04-20 02:59:02.509230,396,True
11194917,2019-04-20 02:59:02.509230,41863526108385,3,1,1,2019-04-20 03:06:01.675788,419,True
11229465,2019-04-20 03:06:01.675788,41863526108385,3,1,1,2019-04-20 03:08:57.388160,176,True
11244486,2019-04-20 03:08:57.388160,41863526108385,3,1,1,2019-04-20 03:11:26.463903,149,True


In [37]:
temp = auctions.groupby(by = ['ref_hash','window_nr']).agg({'is_weekend': 'sum'}).reset_index()
temp.rename(index = str, columns = {'is_weekend': 'appears_on_weekend' },inplace = True)
temp['appears_on_weekend'] = temp['appears_on_weekend'].astype(bool)
temp.head()

Unnamed: 0,ref_hash,window_nr,appears_on_weekend
0,41863526108385,1,True
1,69039685746313,3,False
2,135153013040192,1,True
3,161514654074162,1,False
4,186034136943920,1,True


In [38]:
devices = devices.merge(temp, on = ['ref_hash','window_nr'], how = 'left')
devices['appears_on_weekend'].fillna(False, inplace =True)
devices.head()

Unnamed: 0,ref_hash,window_nr,amount_dif_src,appears_on_weekend
0,2564673204772915246,1,216.0,False
1,4441121667607578179,1,1188.0,True
2,7721769811471055264,1,908.0,True
3,6416039086842158968,1,0.0,False
4,1258642015983312729,1,12.0,False


## Cantidad de eventos

In [41]:
events['n'] = 1
temp = events.groupby(by = ['ref_hash','window_nr']).agg({'n':'sum'}).reset_index()
temp.rename(index = str, columns = {'n': 'amount_events' },inplace = True)
temp.head()

Unnamed: 0,ref_hash,window_nr,amount_events
0,40621409780134,2,9
1,41863526108385,1,88
2,41863526108385,2,8
3,41863526108385,3,57
4,69039685746313,2,4


In [42]:
devices = devices.merge(temp, on = ['ref_hash', 'window_nr'], how = 'left')
devices['amount_events'].fillna(0, inplace =True)
devices.head()

Unnamed: 0,ref_hash,window_nr,amount_events
0,2564673204772915246,1,0.0
1,4441121667607578179,1,0.0
2,7721769811471055264,1,99.0
3,6416039086842158968,1,0.0
4,1258642015983312729,1,0.0


## Ultimo evento

In [43]:
temp = events.groupby(by = ['ref_hash','window_nr']).agg({'created':'max'}).reset_index()
temp.rename(columns = {'created': 'last_event'}, inplace = True)
temp.head()

Unnamed: 0,ref_hash,window_nr,last_event
0,40621409780134,2,2019-04-23 15:02:00.589
1,41863526108385,1,2019-04-20 05:32:26.512
2,41863526108385,2,2019-04-21 02:33:06.322
3,41863526108385,3,2019-04-26 05:54:36.530
4,69039685746313,2,2019-04-23 17:35:07.083


In [44]:
devices =devices.merge(temp, on = ['ref_hash', 'window_nr'], how = 'left')
#devices['last_event'].fillna(?)
devices.head()

Unnamed: 0,ref_hash,window_nr,amount_events,last_event
0,2564673204772915246,1,0.0,NaT
1,4441121667607578179,1,0.0,NaT
2,7721769811471055264,1,99.0,2019-04-18 17:25:25.479
3,6416039086842158968,1,0.0,NaT
4,1258642015983312729,1,0.0,NaT


Con que completo los ultimos eventos que no sucedieron?

## Ultimos eventos fueron con wifi?

In [47]:
temp = events.groupby(by = ['ref_hash','window_nr']).agg({'wifi':'sum'}).reset_index()
temp['wifi'] = temp['wifi'].astype(bool)
temp.head()

Unnamed: 0,ref_hash,window_nr,wifi
0,40621409780134,2,False
1,41863526108385,1,False
2,41863526108385,2,False
3,41863526108385,3,False
4,69039685746313,2,True


In [48]:
devices =devices.merge(temp, on = ['ref_hash', 'window_nr'], how = 'left')
#devices['wifi'].fillna(?)
devices.head()

Unnamed: 0,ref_hash,window_nr,amount_events,last_event,wifi
0,2564673204772915246,1,0.0,NaT,
1,4441121667607578179,1,0.0,NaT,
2,7721769811471055264,1,99.0,2019-04-18 17:25:25.479,False
3,6416039086842158968,1,0.0,NaT,
4,1258642015983312729,1,0.0,NaT,
