In [1]:
import pandas as pd
import matplotlib as mp
import numpy as np
from datetime import timedelta

%matplotlib inline

In [3]:
# Explicit narrow integer dtypes to reduce the memory used by the frame.
auction_dtypes = dict(
    device_id=np.int64,
    ref_type_id=np.int8,
    source_id=np.int8,
)

# Compression is passed explicitly rather than inferred from the file name;
# the 'date' column is parsed into timestamps while reading.
auctions = pd.read_csv(
    '../data/auctions.csv.gzip',
    dtype=auction_dtypes,
    parse_dates=['date'],
    compression='gzip',
)
# Constant helper column; later cells count it per device.
auctions['n'] = 1

In [4]:
# Earliest auction timestamp in the loaded data.
auctions['date'].min()

Timestamp('2019-04-18 00:00:00.015050')

In [5]:
# Latest auction timestamp in the loaded data.
auctions['date'].max()

Timestamp('2019-04-26 23:59:59.969518')

## Armado de tiempo entre arribos

Se desea saber el tiempo promedio entre arribos de los dispositivos a las subastas.

In [6]:
# Key column(s) identifying a device; reused by the groupby calls in later cells.
grp = ['device_id']
# Order rows by device and then chronologically, so that consecutive rows of
# the same device are consecutive auctions. The sort keys are derived from
# `grp` (instead of repeating the column name) and the result is reassigned
# rather than sorted with inplace=True.
auctions = auctions.sort_values(by=grp + ['date'])

KeyboardInterrupt: 

In [None]:
# Keep only devices whose number of auctions lies strictly between the two
# bounds below, i.e. devices with 3 to 149 rows.
min_value = 2
max_value = 150
# Size of each row's device group, computed with the built-in 'size'
# transform instead of calling a Python lambda once per device.
auctions_per_device = auctions.groupby(grp, sort=False)['date'].transform('size')
# Boolean selection keeps the original row order and index; .copy() makes the
# result an independent frame so later column assignments do not warn.
auctions = auctions.loc[
    (auctions_per_device > min_value) & (auctions_per_device < max_value)
].copy()

In [None]:
# Timestamp of the same device's following auction (NaT on the device's last
# row). Relies on `auctions` being ordered by date within each device.
auctions['next_date'] = auctions.groupby(grp, sort=False)['date'].shift(-1)
# Gap to the next auction in whole seconds, computed vectorised.
# Rows are intentionally NOT dropped from `auctions` here: removing each
# device's last auction would make the later "time since last arrival" and
# auction-count sections work on truncated data. The last auction of each
# device simply gets NaN (so the column is float), and any `< threshold`
# comparison on it evaluates to False for those rows.
auctions['secs_to_next'] = (
    (auctions['next_date'] - auctions['date']).dt.total_seconds().round()
)

In [None]:
# Only gaps shorter than 120 seconds are kept for the per-device average.
short_gap_mask = auctions['secs_to_next'] < 120
filtered = auctions.loc[short_gap_mask]

In [None]:
# Mean gap (in seconds) between consecutive auctions, one row per device.
# The previous `as_index='False'` passed a non-empty string, which is truthy
# and therefore never meant as_index=False; the argument is dropped so the
# device key stays in the index, which the index-based merges below rely on.
devices = (
    filtered.groupby(grp)['secs_to_next']
    .mean()
    .to_frame(name='secs_to_next_mean')
)

In [None]:
# Preview the per-device feature table built so far.
devices.head()

## Tiempo desde última aparición

In [None]:
# Reference instant for the recency features: the latest auction timestamp
# present in `auctions` at this point of the notebook.
max_date = auctions['date'].max()

In [None]:
# Seconds elapsed between each device's most recent auction and `max_date`.
# Uses the built-in groupby max plus vectorised timedelta arithmetic instead
# of a Python lambda applied once per device.
last_seen = auctions.groupby(grp)['date'].max()
time = (
    (max_date - last_seen)
    .dt.total_seconds()
    .round()
    .astype('int64')
    .to_frame(name='secs_since_last_arrival')
)

In [None]:
# Attach the recency feature to the device table, matching on the device index
# and keeping devices that appear on either side.
devices = pd.merge(
    devices,
    time,
    how='outer',
    left_index=True,
    right_index=True,
)

In [None]:
# Preview the device table after adding the recency feature.
devices.head()

## Cantidad de apariciones en subastas

In [None]:
# Number of auctions recorded for each device (counts the helper column 'n').
amount_auctions = (
    auctions.groupby(grp)['n']
    .count()
    .rename('auctions_total')
    .to_frame()
)
# Index-on-index outer merge into the device feature table.
devices = pd.merge(
    devices,
    amount_auctions,
    how='outer',
    left_index=True,
    right_index=True,
)
devices.head()

In [None]:
# Number of auctions each device had during the hour preceding `max_date`.
# A boolean "is recent" mask is summed per device, which replaces the Python
# lambda previously applied once per group; devices with no recent auctions
# still get a row with 0.
last_hour_start = max_date - timedelta(hours=1)
is_recent = auctions['date'] > last_hour_start
amount_last_auctions = (
    is_recent.groupby([auctions[key] for key in grp])
    .sum()
    .astype('int64')
    .to_frame(name='auctions_last_hour')
)
devices = devices.merge(amount_last_auctions, how='outer', left_index=True, right_index=True)
devices.head()

## Secuencia de últimos 5 eventos del dispositivo