In [1]:
import pandas as pd
import matplotlib as mp
import numpy as np
from datetime import timedelta
import datetime as dt
import gc

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import xgboost as xgb

%matplotlib inline

# Root directory of the data files
root_dir = '../data/'

# When testing, only a fraction (`testing_frac`) of the large frames is kept
is_testing = True
testing_frac = 0.30

# Minimum number of rows a device must have in a dataframe to be kept
min_devices = 10

# Columns that uniquely identify a device
device_uuid = ['ref_hash']

# Time windows: all share the same start date and end on consecutive days
window_start = dt.datetime(2019, 4, 18)
window_ends = [dt.datetime(2019, 4, day) for day in range(21, 25)]
windows = pd.DataFrame({
    'window_nr': list(range(1, len(window_ends) + 1)),
    'n': 1,
    'min_date': window_start,
    'max_date': window_ends,
})

Prediccion de Tiempos de Arribo
===============================

Se utilizan los distintos dataframes para armar un set de features que sirvan para predecir el tiempo hasta la aparicion de un dispositivo nuevamente. 

Este notebook tiene la siguiente estructura: 

1. Lectura de los Dataframes
2. Arreglo de los datos
3. Armado de Features
4. Armado de labels
5. Armado del set de pruebas
6. Training del Modelo Predictivo
7. Predicciones
8. Evaluacion 

------------------------------
## 1. Lectura de los Dataframes

Se realiza la carga de los dataframes en memoria para el armado del modelo predictivo. La lectura se hace optimizando los tipos de datos a fin de utilizar la menor cantidad de memoria posible.

In [6]:
# Device ids for which predictions have to be submitted.
# Each id in the file has the form '<ref_hash>_<suffix>'; only the numeric part
# before the first underscore is kept, once per device.
to_predict = pd.read_csv(root_dir + 'target_competencia_ids.csv')
to_predict = (
    to_predict['ref_hash']
    # Vectorised split instead of a row-wise apply; unlike slicing with
    # str.find, an id without '_' is kept whole instead of losing its last digit.
    .str.split('_', n=1).str[0]
    .astype(np.int64)
    .drop_duplicates()
    .to_frame('ref_hash')
)

In [7]:
# Load the ref_hash values to exclude; the auctions frame is filtered against this list further down
hash_to_drop = pd.read_csv(root_dir+'hash_to_drop.csv')

### 1.1.1. Lectura de Auctions

In [8]:
# Narrow dtypes so the frame takes less memory ('ref_type_id' is not loaded)
auction_dtypes = {
    'device_id': np.int64,
    'source_id': np.int8,
}
auction_cols = [*auction_dtypes, 'date']

# Read only the needed columns, rename the device column to the shared
# 'ref_hash' key and add a constant 'n' column used for counting rows.
auctions = (
    pd.read_csv(
        root_dir + 'auctions.csv.gzip',
        compression='gzip',
        dtype=auction_dtypes,
        usecols=auction_cols,
        parse_dates=['date'],
    )
    .rename(columns={'device_id': 'ref_hash'})
    .assign(n=1)
)

# When testing, keep only a random `testing_frac` fraction of the rows to use less memory
if is_testing:
    auctions = auctions.sample(frac=testing_frac)
records_label = "Is Testing, #records:" if is_testing else "Not Testing, #records:"
print(records_label, auctions['n'].sum())


Is Testing, #records: 14222858


### 1.1.2. Arreglo de datos de Auctions


In [9]:
# Remove the devices listed in hash_to_drop (ref_hash values duplicated across more than one ref_type)
is_dropped_hash = auctions['ref_hash'].isin(hash_to_drop['ref_hash'])
auctions = auctions.loc[~is_dropped_hash]

In [10]:
# Drop devices with fewer than `min_devices` auctions, since there is little to
# predict for them. The per-device total is broadcast back to every row with
# transform('sum') and used as a boolean mask; this keeps the same rows as
# groupby().filter() without calling a Python lambda once per device.
orig_count = auctions['n'].sum()
auctions_per_device = auctions.groupby(device_uuid, sort=False)['n'].transform('sum')
auctions = auctions.loc[auctions_per_device >= min_devices]
last_count = auctions['n'].sum()
print('Eliminados:', orig_count-last_count)

Eliminados: 790528


In [11]:
# Order each device's auctions chronologically so that "next row" means
# "next auction of the same device". A new frame is built instead of sorting
# in place, because `auctions` is the result of earlier row filters.
auctions = auctions.sort_values(by=device_uuid + ['date'])

# Timestamp of the following auction of the same device (NaT on each device's
# last auction). GroupBy.shift is vectorised, unlike transform(lambda ...).
next_date = auctions.groupby(device_uuid, sort=False)['date'].shift(-1)
has_next = next_date.notnull()

# Keep only rows that have a following auction and add, in one step, the next
# timestamp and the gap to it rounded to whole seconds (int64).
auctions = auctions.loc[has_next].assign(
    next_date=next_date.loc[has_next],
    secs_to_next=lambda df: (df['next_date'] - df['date']).dt.total_seconds().round().astype(np.int64),
)

In [12]:
# Number of auction rows left ('n' is 1 on every row, so the sum is the row count)
auctions['n'].sum()

13017264

In [13]:
# Earliest auction timestamp in the remaining rows
auctions['date'].min()

Timestamp('2019-04-18 00:00:00.015050')

In [14]:
# Latest auction timestamp in the remaining rows
auctions['date'].max()

Timestamp('2019-04-26 23:59:57.829179')

In [15]:
# Preview the first rows, including the derived next_date / secs_to_next columns
auctions.head()

Unnamed: 0,date,ref_hash,source_id,n,next_date,secs_to_next
14875535,2019-04-20 02:52:26.892880,41863526108385,3,1,2019-04-20 03:12:26.681062,1200
42215538,2019-04-20 03:12:26.681062,41863526108385,5,1,2019-04-20 03:14:56.144108,149
40811049,2019-04-20 03:14:56.144108,41863526108385,3,1,2019-04-20 03:15:25.579598,29
12654048,2019-04-20 03:15:25.579598,41863526108385,3,1,2019-04-20 03:17:35.306737,130
21547286,2019-04-20 03:17:35.306737,41863526108385,3,1,2019-04-20 03:17:49.278956,14


### 1.2.1. Lectura de Installs

In [None]:
# Text columns of installs that are read as plain Python objects
installs_object_cols = [
    'device_brand',
    'device_model',
    'session_user_agent',
    'user_agent',
    'event_uuid',
    'kind',
    'wifi',
    'trans_id',
    'device_language',
]

# dtype of every column loaded from the installs file
# (ref_type, click_hash, device_countrycode and ip_address are not loaded)
installs_dtypes = {
    'application_id': np.int32,
    'ref_hash': np.int64,
    'attributed': 'category',
    'implicit': 'category',
    **{col: 'object' for col in installs_object_cols},
}
install_cols = [*installs_dtypes, 'created']

# Read the selected columns, parse the creation timestamp and add a constant
# 'n' column used for counting rows.
installs = pd.read_csv(
    root_dir + 'installs.csv.gzip',
    compression='gzip',
    usecols=install_cols,
    dtype=installs_dtypes,
    parse_dates=['created'],
).assign(n=1)

print('#records:', installs['n'].sum())

### 1.2.2. Arreglo de datos de Installs


In [16]:
#installs.loc[installs['ref_type'] == 1891515180541284343, 'ref_type'] = 1
#installs.loc[installs['ref_type'] == 1494519392962156891, 'ref_type'] = 7

# Normalise the `kind` labels.
# Order matters: the 'af_app_open ' variant (with a trailing space) has to be
# fixed BEFORE spaces are turned into underscores, otherwise the pattern can
# never match. Lower-casing first lets the two corrections also catch
# differently-cased spellings. regex=False makes the replacements literal
# regardless of the pandas version's default.
kind_clean = installs['kind'].str.lower()
kind_clean = kind_clean.str.replace('af_app_open ', 'af_app_opened', regex=False)
kind_clean = kind_clean.str.replace('af_app_opend', 'af_app_opened', regex=False)
installs['kind'] = kind_clean.str.replace(' ', '_', regex=False)

NameError: name 'installs' is not defined

In [None]:
# Number of missing values in each installs column
installs.isnull().sum()

In [None]:
# Text columns where missing values become the explicit label 'unknown' and
# which are then stored as categoricals.
installs_category_cols = [
    'device_brand',
    'device_model',
    'session_user_agent',
    'user_agent',
    'kind',
    'wifi',
    'device_language',
]
for col in installs_category_cols:
    installs[col] = installs[col].fillna('unknown').astype('category')

In [None]:
# First rows, transposed so that each column is displayed as a row
installs.head().transpose()

### 1.3.1. Lectura de Clicks

In [None]:
# Intended dtypes for the clicks columns.
# The builtin `bool` is used instead of `np.bool`: that alias was deprecated in
# NumPy 1.20 and removed in 1.24, where merely evaluating it raises AttributeError.
# NOTE(review): this mapping is not passed to read_csv below, so pandas infers
# the dtypes itself — confirm whether that is intentional before wiring it in.
clicks_dtypes = {
    'advertiser_id': np.int8, 
    'action_id': np.int32, 
    'source_id': np.int8, 
    'latitude' : np.float64, 
    'longitude': np.float64, 
    'wifi_connection': bool, 
    'carrier_id': np.int16, 
    'trans_id': 'object',
    'os_minor':'object', 
    #'agent_device' : 'category', 
    #'os_major': 'category', 
    'specs_brand': 'category', 
    #'brand': np.int8,
    'timeToClick': np.float64, 
    #'touchX': np.float64, 
    #'touchY': np.float64, 
    #'ref_type':np.int64, 
    'ref_hash':np.int64,
    'created' : 'object'
}
# Read every column of the clicks file, parsing 'created' as a timestamp
clicks = pd.read_csv(root_dir + 'clicks.csv.gzip', 
                     compression='gzip',
                     low_memory=False,
                     parse_dates=['created'])
# Constant column used for counting rows
clicks['n'] = 1
print('#records:', clicks['n'].sum())

In [None]:
# dtype of every column loaded from the events file.
# The builtin `bool` is used instead of `np.bool`: that alias was deprecated in
# NumPy 1.20 and removed in 1.24, where evaluating it raises AttributeError.
events_dtypes = {
    'event_id': np.int64,
    'ref_type': np.int64,
    'ref_hash': np.int64,
    'application_id': np.int64,
    'attributed': bool,
    'device_os_version': 'object',
    'device_brand': 'object',
    'device_model': 'object',
    'device_city': 'object',
    'session_user_agent': 'object',
    'trans_id': 'category',
    'user_agent': 'object',
    'event_uuid': 'object',
    'carrier': 'object',
    'kind': 'object',
    'device_os': 'object',
    'wifi': bool,
    'connection_type': 'object',
    #'ip_address': np.int64,
    #'device_language': 'category'
}
events_cols = list(events_dtypes.keys()) + ['date']
# Read only the selected columns, parsing 'date' as a timestamp
events = pd.read_csv(root_dir + 'events.csv.gzip', 
                     compression='gzip',
                     dtype=events_dtypes,
                     usecols=events_cols,
                     parse_dates=['date'])
# Constant column used for counting rows
events['n'] = 1
# When testing, keep only a random `testing_frac` fraction of the rows to use less memory
if is_testing:
    events = events.sample(frac=testing_frac)
    print("Is Testing, #records:", events['n'].sum())
else:
    print("Not Testing, #records:", events['n'].sum())

Not Testing, #records: 7744581
