In [1]:
import pandas    as pd
import numpy as np

Completamos los datos de train con el **content_id** (junto con `run_time_min` y `end_vod_date`) que obtenemos de la metadata.

In [2]:
# Load the train sessions with both session timestamps parsed as datetimes,
# discard rows that have no asset_id, and store asset_id as a nullable integer.
df_train = (
    pd.read_csv('../resources/train.csv', parse_dates=['tunein', 'tuneout'])
    .dropna(subset=['asset_id'])
    .astype({'asset_id': pd.Int64Dtype()})
)

In [3]:
# Load the semicolon-separated asset metadata, parsing its four date columns,
# and store content_id as a nullable integer.
date_columns = ['create_date', 'modify_date', 'start_vod_date', 'end_vod_date']
df_metadata = pd.read_csv(
    '../resources/metadata.csv',
    sep=';',
    parse_dates=date_columns,
).astype({'content_id': pd.Int64Dtype()})

In [4]:
# Attach the metadata columns to every train row. A left join keeps train rows
# whose asset_id has no match in the metadata.
metadata_columns = ['asset_id', 'content_id', 'run_time_min', 'end_vod_date']
df_train_full = df_train.merge(
    df_metadata[metadata_columns],
    how='left',
    on='asset_id',
)

In [5]:
# Order the rows by account, customer, content and asset, then by tunein and
# resume within each of those.
sort_keys = ['account_id', 'customer_id', 'content_id', 'asset_id', 'tunein', 'resume']
df_train_full = df_train_full.sort_values(by=sort_keys)

In [6]:
# Length of each session in whole minutes, stored as float (NaN when either
# timestamp is missing).
# `.astype('timedelta64[m]')` is not used here: pandas >= 2.0 only supports the
# 's', 'ms', 'us' and 'ns' resolutions and raises TypeError for minutes.
# Floor-dividing the total seconds by 60 gives the same whole-minute values.
session_length = df_train_full['tuneout'] - df_train_full['tunein']
df_train_full['view_time_min'] = session_length.dt.total_seconds() // 60
df_train_full.head(3)

Unnamed: 0,customer_id,account_id,device_type,asset_id,tunein,tuneout,resume,content_id,run_time_min,end_vod_date,view_time_min
3164569,94636,0,STB,15900,2021-01-18 15:21:00,2021-01-18 17:29:00,0,1503,128.0,2021-03-05 23:59:59+00:00,128.0
3164567,94636,0,STB,13056,2021-01-13 00:12:00,2021-01-13 01:17:00,1,2866,86.0,2021-12-31 23:59:59+00:00,65.0
3164566,94636,0,STB,29811,2021-01-12 21:13:00,2021-01-12 22:32:00,0,3438,79.0,2021-05-11 23:59:00+00:00,79.0


Generamos metadata útil para profiling, agrupada por:
- **customer_id**
- **account_id**
- **content_id**
- **asset_id**

In [7]:
# Display the (rows, columns) dimensions of the merged train frame.
df_train_full.shape

(3657779, 11)

In [8]:
# Collapse the individual sessions into one row per
# (account_id, customer_id, content_id, asset_id) combination.
group_keys = ['account_id', 'customer_id', 'content_id', 'asset_id']

gb = df_train_full.groupby(by=group_keys).agg({
    'tunein': 'min',          # earliest tunein in the group
    'tuneout': 'max',         # latest tuneout in the group
    'end_vod_date': 'max',
    'view_time_min': 'sum',   # total minutes watched over all sessions
    # run_time_min comes from the per-asset metadata merge, so the same value
    # is repeated on every session row of a group. Summing it would multiply
    # the runtime by the number of sessions and deflate both ratios computed
    # below, so take the value once instead.
    'run_time_min': 'max',
    'resume': 'sum',
    'asset_id': 'count'       # number of session rows in the group
})

# The session count was computed on asset_id; rename it so it does not clash
# with the asset_id index level when the index is reset below.
gb = gb.rename(columns={'asset_id': 'count'})

# Number of complete viewings: total watched minutes floor-divided by the
# asset runtime. A missing or non-positive runtime is masked to NaN first so
# the division can never produce inf, and those groups are counted as 0;
# this keeps the int64 cast from failing on non-finite values.
valid_run_time = gb['run_time_min'].where(gb['run_time_min'] > 0)
gb['times_seen'] = (gb['view_time_min'] // valid_run_time).fillna(0).astype('int64')

# Watched minutes as a fraction of the asset runtime. This is left unguarded,
# so it is inf/NaN when the runtime is zero or missing.
gb['pct_seen'] = (gb['view_time_min'] / gb['run_time_min']).astype(float)
gb = gb.reset_index()

In [9]:
# Store the new training file (row index omitted).
df_train_full.to_csv('../resources/train_full.csv', index=False)

In [10]:
# Store the grouped table (row index omitted).
gb.to_csv('../resources/train_full_v2.csv', index=False)



