In [1]:
import pandas as pd
import matplotlib as mp
import numpy as np
from datetime import timedelta
import datetime as dt
import gc

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import xgboost as xgb

%matplotlib inline

# Base directory that holds the input data files
root_dir = '../data/'

# When True, the large dataframes are down-sampled right after loading
is_testing = True

# Minimum number of appearances a device needs in a dataframe to be kept
min_devices = 10

# Columns that uniquely identify a device
device_uuid = ['ref_hash']

# Time windows: all of them share min_date (2019-04-18) and differ in max_date
windows = pd.DataFrame(
    {
        'window_nr': [1, 2, 3, 4],
        'n': 1,
        'min_date': dt.datetime(2019, 4, 18),
        'max_date': [dt.datetime(2019, 4, last_day) for last_day in (21, 22, 23, 24)],
    }
)

Prediccion de Tiempos de Arribo
===============================

Se utilizan los distintos dataframes para armar un set de features que sirvan para predecir el tiempo que transcurre hasta que un dispositivo vuelve a aparecer.

Este notebook tiene la siguiente estructura: 

1. Lectura de los Dataframes
2. Arreglo de los datos
3. Armado de Features
4. Armado de labels
5. Armado del set de pruebas
6. Training del Modelo Predictivo
7. Predicciones
8. Evaluacion 

------------------------------
## 1. Lectura de los Dataframes

Se realiza la carga de los dataframes en memoria para el armado del modelo predictivo. La lectura se hace optimizando los tipos de datos a fin de utilizar la menor cantidad de memoria posible.

In [2]:
# Narrow dtypes so the auctions file takes as little memory as possible
auction_dtypes = {
    'ref_type_id': np.int8,
    'device_id': np.int64,
    'source_id': np.int8
}

auctions = pd.read_csv(root_dir + 'auctions.csv.gzip',
                       compression='gzip',
                       dtype=auction_dtypes,
                       parse_dates=['date'])
# Use the same column names as the installs / clicks / events dataframes
auctions = auctions.rename(columns={'device_id': 'ref_hash',
                                    'ref_type_id': 'ref_type'})
# Helper column of ones; later cells sum/count it to obtain row counts
auctions['n'] = 1

# In testing mode keep a random 30% sample of the rows to reduce memory usage.
# The fixed random_state makes the sample identical on every re-run.
if is_testing:
    auctions = auctions.sample(frac=.30, random_state=0)
    print("Is Testing, #records:", auctions['n'].sum())
else:
    print("Not Testing, #records:", auctions['n'].sum())


Not Testing, #records: 47409528


In [None]:
# Columns to load from the installs file and the dtype requested for each.
# click_hash, device_countrycode and ip_address are intentionally not loaded.
installs_dtypes = dict(
    application_id=np.int32,
    ref_type=np.int64,
    ref_hash=np.int64,
    attributed='category',
    implicit='category',
    device_brand='object',
    device_model='object',
    session_user_agent='object',
    user_agent='object',
    event_uuid='object',
    kind='object',
    wifi='object',
    trans_id='object',
    device_language='object',
)
# The timestamp column has no entry in the dtype map; it is parsed as a date
install_cols = [*installs_dtypes, 'created']
installs = pd.read_csv(
    root_dir + 'installs.csv.gzip',
    compression='gzip',
    usecols=install_cols,
    dtype=installs_dtypes,
    parse_dates=['created'],
)

# Helper column of ones, used to count rows
installs['n'] = 1
print('#records:', installs['n'].sum())

#records: 481511


In [None]:
# Map the two long ref_type identifiers found in installs to the small codes 1 and 7
# NOTE(review): presumably these match the ref_type ids used in auctions - confirm
installs['ref_type'] = installs['ref_type'].replace({
    1891515180541284343: 1,
    1494519392962156891: 7,
})

In [None]:
# Intended dtypes for the clicks file.
# NOTE(review): this mapping is NOT passed to read_csv below, so pandas infers
# every dtype. Later cells read touchX, touchY, agent_device, brand and
# os_major, which are commented out here, and fill missing carrier_id values,
# so applying the mapping as-is (e.g. carrier_id as int16) needs checking first.
# The builtin bool replaces the np.bool alias, which was removed from NumPy and
# raised AttributeError as soon as this dict was built.
clicks_dtypes = {
    'advertiser_id': np.int8, 
    'action_id': np.int32, 
    'source_id': np.int8, 
    'latitude' : np.float64, 
    'longitude': np.float64, 
    'wifi_connection': bool, 
    'carrier_id': np.int16, 
    'trans_id': 'object',
    'os_minor':'object', 
    #'agent_device' : 'category', 
    #'os_major': 'category', 
    'specs_brand': 'category', 
    #'brand': np.int8,
    'timeToClick': np.float64, 
    #'touchX': np.float64, 
    #'touchY': np.float64, 
    'ref_type':np.int64, 
    'ref_hash':np.int64,
    'created' : 'object'
}
clicks = pd.read_csv(root_dir + 'clicks.csv.gzip', 
                     compression='gzip',
                     low_memory=False,
                     parse_dates=['created'])
# Helper column of ones, used to count rows
clicks['n'] = 1
print('#records:', clicks['n'].sum())

#records: 64296


In [None]:
# Columns to load from the events file and the dtype requested for each.
# The builtin bool replaces the np.bool alias, which was removed from NumPy
# and raised AttributeError as soon as this dict was built.
events_dtypes = {
    'event_id': np.int64,
    'ref_type': np.int64,
    'ref_hash': np.int64,
    'application_id': np.int64,
    'attributed': bool,
    'device_os_version': 'object',
    'device_brand': 'object',
    'device_model': 'object',
    'device_city': 'object',
    'session_user_agent': 'object',
    'trans_id': 'category',
    'user_agent': 'object',
    'event_uuid': 'object',
    'carrier': 'object',
    'kind': 'object',
    'device_os': 'object',
    'wifi': bool,
    'connection_type': 'object',
    #'ip_address': np.int64,
    #'device_language': 'category'
}
# The timestamp column has no entry in the dtype map; it is parsed as a date
events_cols = list(events_dtypes.keys()) + ['date']
events = pd.read_csv(root_dir + 'events.csv.gzip', 
                     compression='gzip',
                     dtype=events_dtypes,
                     usecols=events_cols,
                     parse_dates=['date'])
# Helper column of ones, used to count rows
events['n'] = 1
# In testing mode keep a random 30% sample of the rows to reduce memory usage.
# The fixed random_state makes the sample identical on every re-run.
if is_testing:
    events = events.sample(frac=.30, random_state=0)
    print("Is Testing, #records:", events['n'].sum())
else:
    print("Not Testing, #records:", events['n'].sum())

Not Testing, #records: 7744581


In [None]:
# Devices whose predictions have to be submitted.
# The 'ref_hash' column of the target file holds '<device id>_<suffix>' strings;
# only the numeric device id before the first underscore is kept, once per device.
to_predict = pd.read_csv(root_dir + 'target_final_competencia_revamped.csv')
to_predict = (
    to_predict['ref_hash']
    # Vectorised split; a value without '_' is kept whole instead of losing
    # its last character (which is what slicing up to find('_') == -1 did)
    .str.split('_', n=1).str[0]
    .astype(np.int64)
    .drop_duplicates()
    .to_frame(name='ref_hash')
)

---
## 2. Arreglo de los Datos

### 2.1 Arreglo de datos de Auctions


In [None]:
# Drop devices with fewer than min_devices auction rows: there is little to
# predict for them. The per-device total is computed with a vectorised
# transform instead of calling a Python lambda once per device.
orig_count = auctions['n'].sum()
auctions = auctions.loc[
    auctions.groupby(device_uuid, sort=False)['n'].transform('sum') >= min_devices
]
last_count = auctions['n'].sum()
print('Eliminados:', orig_count-last_count)

Eliminados: 699013


In [None]:
# Order each device's auctions chronologically so "next row" means "next auction"
auctions = auctions.sort_values(by=device_uuid + ['date'])
# Timestamp of the following auction of the same device (NaT on its last row)
auctions['next_date'] = auctions.groupby(device_uuid, sort=False)['date'].shift(-1)
# The last auction of every device has no successor and is dropped
auctions = auctions.loc[auctions['next_date'].notnull()]
# Whole seconds until the device's next auction, computed without per-row lambdas
auctions['secs_to_next'] = (
    (auctions['next_date'] - auctions['date']).dt.total_seconds().round().astype(np.int64)
)

In [None]:
# Sum of the helper column 'n' over the remaining auction rows
auctions['n'].sum()

In [None]:
# Earliest auction timestamp left in the dataframe
auctions['date'].min()

In [None]:
# Latest auction timestamp left in the dataframe
auctions['date'].max()

In [None]:
# Preview the first rows of the auctions dataframe
auctions.head()

### 2.2 Arreglo de datos de Installs

In [None]:
# Normalise the event kind labels.
# Order matters: the 'af_app_open ' rule ends in a space, so it must run before
# spaces become underscores (otherwise it can never match). Lower-casing first
# lets both typo rules also catch upper/mixed-case variants.
installs['kind'] = (
    installs['kind']
    .str.lower()
    .str.replace('af_app_open ', 'af_app_opened', regex=False)
    .str.replace('af_app_opend', 'af_app_opened', regex=False)
    .str.replace(' ', '_', regex=False)
)

In [None]:
# Number of missing values per installs column
installs.isnull().sum()

In [None]:
# Text columns of installs: missing values become 'unknown' and the column is
# then stored as a categorical
for column in ['device_brand', 'device_model', 'session_user_agent', 'user_agent',
               'kind', 'wifi', 'device_language']:
    installs[column] = installs[column].fillna('unknown').astype('category')

In [None]:
# Preview the first installs rows, transposed so each column shows as a row
installs.head().transpose()

### 2.3 Arreglo de datos de Clicks

In [None]:
# Convert the touch coordinates to float64, value by value
for axis_column in ('touchX', 'touchY'):
    clicks[axis_column] = clicks[axis_column].map(np.float64)
# Drop the timezone information from the click timestamps
clicks['created'] = clicks['created'].dt.tz_localize(None)

In [None]:
# Clicks columns treated as categoricals: missing values become 'unknown' first
clicks_categorical = ['carrier_id', 'os_minor', 'agent_device', 'brand', 'os_major']
for column in clicks_categorical:
    filled = clicks[column].fillna('unknown')
    clicks[column] = filled.astype('category')

In [None]:
# Number of missing values per clicks column
clicks.isnull().sum()

### 2.4 Arreglo de datos de Events

In [None]:
# Events text columns: missing values become 'unknown', then stored as categoricals
events_categorical = [
    'device_os_version', 'device_brand', 'device_model', 'device_city',
    'session_user_agent', 'user_agent', 'carrier', 'kind', 'device_os',
    'connection_type',
]
for column in events_categorical:
    filled = events[column].fillna('unknown')
    events[column] = filled.astype('category')

In [None]:
# Number of missing values per events column
events.isnull().sum()

---
Separado de las ventanas de tiempo
----------------------------------

### Separo las semanas de entrenamiento

Se pueden utilizar distintos metodos: 

- Se arman ventanas de 3 días para predecir 3 dias
- Se arman ventanas de n-1 dias para predecir n a n+2 dias.

Para el caso de estudio se utiliza la opcion de maximizar la cantidad de datos para predecir cada ventana.

Los datos pueden entonces estar en las siguientes ventanas y deben solo usarse para predecir la ventana correspondiente en los sets de training.

Las ventanas de los sets de entrenamiento son: 

1. 21 al 23
2. 22 al 24
3. 23 al 25
4. 24 al 26

Entonces las ventanas armadas serán:
1. 18 al 20
2. 18 al 21
3. 18 al 22
4. 18 al 23

---
## 3. Armado de Features

A continuacion se comienzan a extraer los distintos features que formaran el set de entrenamiento. 
Se cruza cada ventana con los labels que se desean predecir. 

Para entrenar el set debe decidirse como utilizar los datos de las distintas ventanas. 

- Una opcion será mezclar todos los datos pero hay que decidir que hacer con los equipos que aparecen mas de una vez. 

- La segunda opcion es entrenar 4 modelos distintos y verificar que haya una mejora en todos ellos.

In [None]:
# Train on the 18th-23rd window and validate on the 24th-26th window:
# the cutoff is the max_date of window number 4
cutoff_date = windows.loc[windows['window_nr'] == 4, 'max_date'].iloc[0]

# Keep only the rows strictly before the cutoff in every source
train_auctions = auctions[auctions['date'].lt(cutoff_date)]
train_installs = installs[installs['created'].lt(cutoff_date)]
train_events = events[events['date'].lt(cutoff_date)]
train_clicks = clicks[clicks['created'].lt(cutoff_date)]

In [None]:
# Installs created on or after cutoff_date, i.e. the rows excluded from train_installs
labels_installs = installs.loc[installs['created'] >= cutoff_date]

In [None]:
# One row per device present in the training auctions
devices = train_auctions.loc[:, device_uuid].drop_duplicates(keep='first')
# Number of distinct devices
devices['ref_hash'].count()

### Armado de tiempo entre arribos

Se desea conocer el tiempo promedio entre arribos de cada dispositivo a las subastas (auctions).

In [None]:
#filtered = auctions.loc[auctions['secs_to_next'] < 120]

In [None]:
# Mean number of seconds between consecutive auctions of each device.
# The previous as_index='False' argument was a non-empty string, i.e. truthy,
# so the grouping keys were already kept as the index; it is dropped here to
# say what actually happens.
temp = (
    train_auctions.groupby(device_uuid)['secs_to_next']
    .mean()
    .to_frame(name='secs_to_next_mean')
)

# `on` matches the device column of `devices` against the index level of `temp`;
# from here on `devices` is indexed by the device identifier
devices = devices.merge(temp, how='left', on=device_uuid).set_index(device_uuid)

In [None]:
# Preview the device features built so far
devices.head()

### Tiempo desde ultima aparicion

In [None]:
# Latest auction timestamp in the training data; later cells measure recency against it
max_date = train_auctions['date'].max()

In [None]:
# Whole seconds between each device's last training auction and max_date.
# Computed on the per-device maximum in one vectorised pass instead of calling
# a Python lambda once per device.
time = (
    (max_date - train_auctions.groupby(device_uuid)['date'].max())
    .dt.total_seconds()
    .round()
    .astype(np.int64)
    .to_frame(name='secs_since_last_arrival')
)

In [None]:
# Attach secs_since_last_arrival to the device features, matching on the index
devices = devices.merge(time, how='left', left_index=True, right_index=True)

In [None]:
# Preview the device features after adding the recency column
devices.head()

### Cantidad de apariciones en subastas

In [None]:
# Number of training auction rows per device
amount_auctions = (
    train_auctions.groupby(device_uuid)['n']
    .count()
    .to_frame(name='auctions_total')
)
# Attach the count to the device features, matching on the index
devices = devices.merge(amount_auctions, left_index=True, right_index=True, how='left')
devices.head()

In [None]:
# Number of auctions each device had during the hour before max_date.
# A boolean "in the last hour" flag is summed per device, so devices with no
# recent auction get 0, without calling a Python lambda once per device.
amount_last_auctions = (
    train_auctions['date'].gt(max_date - timedelta(hours=1))
    .groupby([train_auctions[key] for key in device_uuid])
    .sum()
    .astype(np.int64)
    .to_frame(name='auctions_last_hour')
)
# Attach the count to the device features, matching on the index
devices = devices.merge(amount_last_auctions, how='left', left_index=True, right_index=True)
devices.head()

### Secuencia de ultimos 5 eventos del dispositivo

---
## 4. Armado de Labels 

### 4.1. Prediccion de Installs

In [None]:
# Device identifiers of every device that has features, as a one-column frame
all_ref_hash = devices.reset_index().loc[:, device_uuid].drop_duplicates()
len(all_ref_hash)

In [None]:
# Distribution of ref_type values in installs
installs['ref_type'].value_counts()

In [None]:
# Length of every label window, in days
window = 3
# Label given to a device with no install inside a window: the full window length
window_secs = timedelta(days=window).total_seconds()
# One or two frames per window, concatenated once at the end
# (DataFrame.append was removed in pandas 2.0 and re-copied the data on every call)
window_frames = []

for j, i in enumerate(range(21, 25), start=1):  # window j starts on day i of April 2019
    print("Ventana: ", j)
    window_date_start = dt.datetime(2019, 4, i)
    window_date_end = window_date_start + timedelta(days=window)

    # First install of every device inside [window_date_start, window_date_end).
    # Full timestamps are compared, not the day of the month, so rows from any
    # other month or year can never fall into the window by accident.
    in_window = (installs['created'] >= window_date_start) & (installs['created'] < window_date_end)
    found = (
        installs.loc[in_window]
        .groupby(device_uuid, as_index=False)['created'].min()
        .rename(columns={'created': 'date_install'})
    )
    found['window_nr'] = j
    found['window_date_start'] = window_date_start
    found['secs_to_install'] = (found['date_install'] - found['window_date_start']).dt.total_seconds()
    print("  Encontrados: ", len(found))
    window_frames.append(found)

    # Devices with features but without an install in this window: their
    # install date is set to the end of the window.
    missing = all_ref_hash.merge(found[device_uuid], how='left', on=device_uuid, indicator=True)
    missing = missing.loc[missing['_merge'] == 'left_only', device_uuid].copy()
    missing['window_nr'] = j
    missing['window_date_start'] = window_date_start
    missing['date_install'] = window_date_end
    missing['secs_to_install'] = window_secs
    print("  Agregados sin installs en esta ventana: ", len(missing))
    window_frames.append(missing)

    # TODO: devices that only appear in `to_predict` are not added here.

installs_training = pd.concat(window_frames)
# Keep the alphabetical column order the labels frame has always had
installs_training = installs_training[sorted(installs_training.columns)]

sorted_installs = installs_training.sort_values(device_uuid + ['window_nr'])

In [None]:
# Number of label rows per window
sorted_installs['window_nr'].value_counts()

---
## 5. Training 

In [None]:
# Device features plus the secs_to_install label of window 4, matched on the device index
training_set = devices.merge(sorted_installs.loc[sorted_installs['window_nr'] == 4, device_uuid+['secs_to_install']].set_index(device_uuid), left_index=True,right_index=True, how='left')

In [None]:
# Number of missing values per training-set column
training_set.isnull().sum()

In [None]:
# Features and label are both taken from training_set so their rows are
# guaranteed to line up, even if the label merge ever changed the row count
x = training_set.drop(columns=['secs_to_install'])
y = training_set['secs_to_install']

# Fixed random_state (same value as the model below) so the split, and hence
# the reported error, is reproducible across runs
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

In [None]:
# Random forest with 100 trees and a fixed seed, fitted on the training split
regr = RandomForestRegressor(random_state=0, n_estimators=100)
regr.fit(x_train,y_train)

---
## 6. Prediccion

In [None]:
# Predicted secs_to_install for the held-out split
y_predicted = regr.predict(x_test)

---
## 7. Validacion

In [None]:
# Mean squared error of the predictions on the held-out split
mean_squared_error(y_test,y_predicted)

In [None]:
# Importance of each feature in the fitted forest, in the column order shown by x_test.columns
regr.feature_importances_

In [None]:
# Feature names, in the same order as the importances
x_test.columns