In [32]:
import pandas as pd
import matplotlib as mp
from datetime import timedelta

%matplotlib inline

In [33]:
auctions = pd.read_csv('../../data/auctions.csv.gzip',compression='gzip',\
                        usecols=['date','device_id','platform','ref_type_id','source_id'],\
                        dtype={'device_id':'int64','platform':'category','ref_type_id':'category','source_id':'category'},\
                        parse_dates=['date'])
auctions['n'] = 1

In [34]:
auctions['date'].min()

Timestamp('2019-03-05 00:52:33.352526')

In [35]:
auctions['date'].max()

Timestamp('2019-03-13 23:59:59.997032')

## Armado de tiempo entre arribos

Se desea saber el tiempo promedio entre arribos de los dispositivos a las encuestas.

In [36]:
grp = ['device_id']
#Ordeno por grp y fecha
auctions.sort_values(by=['device_id', 'date'], inplace=True)

In [37]:
min_value = 2
max_value = 150
auctions = auctions.groupby(grp, sort=False).filter(lambda data: (min_value < len(data) < max_value))

In [38]:
auctions['next_date'] = auctions.groupby(grp, as_index = False, sort=False)['date']\
                                              .transform(lambda x: x.shift(-1))
auctions = auctions.loc[(~auctions['next_date'].isnull())]
auctions['secs_to_next'] = (auctions['next_date'] - auctions['date'])\
                                        .transform(lambda x: round(x.total_seconds()))

In [39]:
filtered = auctions.loc[auctions['secs_to_next'] < 120]

In [40]:
devices = filtered.groupby(grp, as_index='False')['secs_to_next'].mean().to_frame()
devices.columns = ['secs_to_next_mean']

In [41]:
devices.head()

Unnamed: 0_level_0,secs_to_next
device_id,Unnamed: 1_level_1
113858820194433,2.285714
148049712234927,6.0
163367509015039,39.44
250378692954397,8.2
356084629798952,20.0


## Tiempo desde ultima aparicion

In [42]:
max_date = auctions['date'].max()

In [43]:
time = auctions.groupby(grp).apply(lambda x: round((max_date - x['date'].max()).total_seconds())).to_frame()
time.columns = ['secs_since_last_arrival']

In [44]:
devices = devices.merge(time, how='outer', left_index=True, right_index=True)

In [45]:
devices.head()

Unnamed: 0_level_0,secs_to_next,time_from_last_arrival
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1
113858820194433,2.285714,32593
148049712234927,6.0,760317
163367509015039,39.44,286790
250378692954397,8.2,139412
356084629798952,20.0,436175


## Cantidad de apariciones en encuestas 

In [46]:
amount_auctions = auctions.groupby(grp)['n'].count().to_frame()
amount_auctions.columns = ['auctions_total']
devices = devices.merge(amount_auctions,how = 'outer', left_index=True, right_index=True)
devices.head()

Unnamed: 0_level_0,secs_to_next,time_from_last_arrival,total_auctions
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
113858820194433,2.285714,32593,47
148049712234927,6.0,760317,3
163367509015039,39.44,286790,35
250378692954397,8.2,139412,13
356084629798952,20.0,436175,3


In [47]:
amount_last_auctions = auctions.groupby(grp).apply(lambda x: x.loc[x['date'] > (max_date - timedelta(hours=1)),'n'].count()).to_frame()
amount_last_auctions.columns = ['auctions_last_hour']
devices = devices.merge(amount_last_auctions, how='outer', left_index=True, right_index=True)
devices.head()

Unnamed: 0_level_0,secs_to_next,time_from_last_arrival,total_auctions,auctions_last_hour
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
113858820194433,2.285714,32593,47,0
148049712234927,6.0,760317,3,0
163367509015039,39.44,286790,35,0
250378692954397,8.2,139412,13,0
356084629798952,20.0,436175,3,0


## Secuencia de ultimos 5 eventos del dispositivo