# Recomender V2.1

El dataset se armará de la siguiente forma:

Por cada usuario, tenemos una lista de content ids vistos. Para cada una de estas listas se arman las combinaciones de modo que tengamos una fila con cada id usado como label. Por ejemplo:

Usuario A vio: 1,2,3,4.

Dataset:

2 3 4 __ label__1

1 3 4 __ label__2

1 2 4 __ label__3

1 2 3 __ label__4

Y asi por cada usuario.

Para predecir, al modelo se le pasaría el listado de ids visto por el usuario y se aplicaría el mismo filtro y cold start que en V1.

Esto es un **fix** de la v2.0: La lógica es la misma, pero al momento de generar predicciones, en la v2.0 se generaban duplicados en la lista. Además, las recomendaciones cold start no se estaban filtrando con la condicion que elimina lo que no estará disponible en abril.

In [1]:
from fasttext import load_model, train_supervised
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime

In [2]:
metadata = pd.read_csv("../../../data/metadata.csv", delimiter=";", parse_dates=['create_date', 'modify_date', 'start_vod_date', 'end_vod_date'])
train = pd.read_csv("../../../data/train.csv", parse_dates=['tunein', 'tuneout'])

In [3]:
train.head()

Unnamed: 0,customer_id,account_id,device_type,asset_id,tunein,tuneout,resume
0,0,90627,STATIONARY,18332.0,2021-02-18 22:52:00,2021-02-18 23:35:00,0
1,0,90627,STATIONARY,24727.0,2021-03-24 23:17:00,2021-03-25 00:01:00,0
2,1,3387,STB,895.0,2021-03-15 10:05:00,2021-03-15 10:23:00,0
3,1,3387,STB,895.0,2021-03-15 10:23:00,2021-03-15 11:18:00,1
4,1,3387,STB,26062.0,2021-03-16 09:24:00,2021-03-16 09:44:00,0


In [4]:
metadata.head()

Unnamed: 0,asset_id,content_id,title,reduced_title,episode_title,show_type,released_year,country_of_origin,category,keywords,...,language_rating,dialog_rating,fv_rating,pay_per_view,pack_premium_1,pack_premium_2,create_date,modify_date,start_vod_date,end_vod_date
0,15188,0.0,Ep:17 Tiempos Compulsivos,Tiempos_Compul_E17,Episodio 17,Serie,2012.0,AR,Drama,"Trastornos,Médicos,Tragicómica,Telenovela,Enfe...",...,N,N,N,N,N,N,2017-12-01 10:18:15+00:00,2019-01-26 06:37:18+00:00,2017-12-01 00:00:00+00:00,2020-12-01 23:59:59+00:00
1,24940,1.0,7 Cajas,7_Cajas,,Película,2012.0,PY,Suspenso/Acción,"Latinoamérica,Pobreza,Crimen,Pandillas",...,N,N,N,Y,N,N,2017-12-19 20:58:15+00:00,2019-09-17 19:02:03+00:00,2017-12-15 00:00:00+00:00,2022-12-14 23:59:59+00:00
2,21939,2.0,La Maldición de las Hormigas Gigantes,La_Maldicion_de_las,,Película,2016.0,FI,Terror/Comedia,"Criaturas,Plagas,Adolescentes,Fantasía,Video J...",...,N,N,N,N,N,N,2018-02-16 13:51:07+00:00,2020-04-28 14:16:38+00:00,2018-01-25 00:00:00+00:00,2020-12-01 23:59:59+00:00
3,9005,3.0,Una Mujer Fantástica,Una_Mujer_Fantastic,,Película,2017.0,CL,Drama,"LGBT,Mujeres,Latinoamérica",...,N,N,N,N,Y,N,2018-05-26 11:58:44+00:00,2019-11-15 03:00:23+00:00,2018-05-27 00:00:00+00:00,2021-04-30 23:59:59+00:00
4,7391,4.0,Star Trek,Star_Trek,,Película,2009.0,US,Ciencia Ficción/Aventura,"Fantasía,Galaxia,Futurismo,Aliens,Criaturas",...,N,N,N,Y,N,N,2019-05-03 20:07:24+00:00,2020-04-09 04:37:29+00:00,2019-05-02 00:00:00+00:00,2020-12-31 23:59:59+00:00


# Merge datasets

In [5]:
train_metadata = train.merge(metadata, on='asset_id', how='left')

In [6]:
train_metadata.columns

Index(['customer_id', 'account_id', 'device_type', 'asset_id', 'tunein',
       'tuneout', 'resume', 'content_id', 'title', 'reduced_title',
       'episode_title', 'show_type', 'released_year', 'country_of_origin',
       'category', 'keywords', 'description', 'reduced_desc',
       'cast_first_name', 'credits_first_name', 'run_time_min', 'audience',
       'made_for_tv', 'close_caption', 'sex_rating', 'violence_rating',
       'language_rating', 'dialog_rating', 'fv_rating', 'pay_per_view',
       'pack_premium_1', 'pack_premium_2', 'create_date', 'modify_date',
       'start_vod_date', 'end_vod_date'],
      dtype='object')

In [7]:
train_metadata = train_metadata[['account_id', 'tunein', 'tuneout', 'content_id', 'title', 'end_vod_date']]

In [8]:
train_metadata.head()

Unnamed: 0,account_id,tunein,tuneout,content_id,title,end_vod_date
0,90627,2021-02-18 22:52:00,2021-02-18 23:35:00,2040.0,T:5 Ep:08 This is Us,2021-06-30 23:59:59+00:00
1,90627,2021-03-24 23:17:00,2021-03-25 00:01:00,2040.0,T:5 Ep:10 This is Us,2021-06-30 23:59:59+00:00
2,3387,2021-03-15 10:05:00,2021-03-15 10:23:00,1983.0,T:1 Ep:02 Big Little Lies,2021-03-28 23:59:00+00:00
3,3387,2021-03-15 10:23:00,2021-03-15 11:18:00,1983.0,T:1 Ep:02 Big Little Lies,2021-03-28 23:59:00+00:00
4,3387,2021-03-16 09:24:00,2021-03-16 09:44:00,729.0,T:1 Ep:02 Dime quién soy,2021-03-28 23:59:00+00:00


Para filtrar los contenidos que no van a estar disponibles en abril

In [9]:
train_metadata['end_vod_date'] = train_metadata['end_vod_date'].dt.tz_localize(None)

In [10]:
max_end_vod_date_by_content = train_metadata[['content_id', 'end_vod_date']].groupby('content_id', as_index=False).agg({'end_vod_date': 'max'})

In [11]:
end_before_april = max_end_vod_date_by_content[max_end_vod_date_by_content.end_vod_date < datetime(year=2021, month=4, day=1)].content_id.unique()
end_before_april = [int(c) for c in end_before_april]

In [12]:
len(end_before_april)

928

In [13]:
train_metadata.content_id.nunique()

4064

# Dataset de interacciones

In [14]:
ft_train = train_metadata[['account_id', 'content_id']].copy()
ft_train.head()

Unnamed: 0,account_id,content_id
0,90627,2040.0
1,90627,2040.0
2,3387,1983.0
3,3387,1983.0
4,3387,729.0


In [15]:
ft_train.shape

(3657801, 2)

In [16]:
ft_train.isna().sum()

account_id      0
content_id    142
dtype: int64

In [17]:
ft_train.account_id.nunique()

113881

In [18]:
ft_train.content_id.nunique()

4064

In [19]:
ft_train = ft_train.dropna()
ft_train['content_id'] = ft_train['content_id'].astype(int)

In [20]:
ft_train.shape

(3657659, 2)

In [21]:
ft_train.account_id.nunique()

113875

In [22]:
ft_train.content_id.nunique()

4064

In [23]:
account_contents = ft_train.groupby(by='account_id', as_index=False).agg({'content_id': 'unique'})
account_contents.head()

Unnamed: 0,account_id,content_id
0,0,"[3438, 2866, 3498, 1503, 3845]"
1,1,"[1020, 1220, 1761]"
2,2,"[1099, 6, 183, 557, 1582, 1443, 433]"
3,3,"[1463, 3790, 3755, 3769, 2810, 842, 3206, 2230..."
4,4,"[289, 2786, 4352, 1008, 3748, 2816, 1139, 3711..."


In [24]:
account_contents.content_id.dtype

dtype('O')

# Armado de dataset de fasttext

In [25]:
def get_fasttext_dataset(views_df):
    ft_dataset_lines = []
    
    for account_id in tqdm(account_contents.account_id.unique()):
        watched_contents = views_df[views_df.account_id == account_id].content_id.values[0]
        
        if len(watched_contents) > 1:
            for content in watched_contents:
                contents = [str(c) for c in watched_contents if c != content]
                label = content
                str_contents = " ".join([c for c in contents])
                line = " ".join([str_contents, f"__label__{label}", "\n"])
                ft_dataset_lines.append(line)
    return ft_dataset_lines

In [26]:
%%time
ft_dataset_lines = get_fasttext_dataset(account_contents)

100%|██████████| 113875/113875 [02:42<00:00, 701.80it/s]

CPU times: user 2min 40s, sys: 2.48 s, total: 2min 43s
Wall time: 2min 42s





In [27]:
ft_dataset_lines[:10]

['2866 3498 1503 3845 __label__3438 \n',
 '3438 3498 1503 3845 __label__2866 \n',
 '3438 2866 1503 3845 __label__3498 \n',
 '3438 2866 3498 3845 __label__1503 \n',
 '3438 2866 3498 1503 __label__3845 \n',
 '1220 1761 __label__1020 \n',
 '1020 1761 __label__1220 \n',
 '1020 1220 __label__1761 \n',
 '6 183 557 1582 1443 433 __label__1099 \n',
 '1099 183 557 1582 1443 433 __label__6 \n']

In [28]:
%%time
with open('fastext_features.txt', 'w') as f:
    
    for line in tqdm(ft_dataset_lines):
        f.write(line)

100%|██████████| 951580/951580 [00:13<00:00, 71435.41it/s]

CPU times: user 1.94 s, sys: 1.18 s, total: 3.12 s
Wall time: 13.3 s





In [29]:
%%time
model = train_supervised(
            input='fastext_features.txt',
            epoch=25,
            lr=0.3,
            lrUpdateRate=100,
            loss='hs',
            wordNgrams=1,
            verbose=2,
            minCount=5,
            minCountLabel=10,
            dim=30
        )

CPU times: user 3min 24s, sys: 5.65 s, total: 3min 30s
Wall time: 33.9 s


In [30]:
def id2title(content_id):
    title = metadata[metadata.content_id == content_id].title.values[0]
    return title

def id2title_list(content_ids):
    titles = metadata[metadata.content_id.isin(content_ids)].drop_duplicates('content_id').title.values
    return titles

def clean_recomms(recomms):
    return [int(rec.replace("__label__", "")) for rec in recomms]

# Popularidad para cold start

In [31]:
popularity_df = ft_train[['account_id', 'content_id']].groupby('content_id', as_index=False).agg({'account_id': 'nunique'})
popularity_df.columns = ['content_id', 'accounts']
popularity_df = popularity_df.sort_values(by='accounts', ascending=False)
popularity_df.head()

Unnamed: 0,content_id,accounts
1764,2040,13629
3515,3806,9377
3609,3900,9082
3839,4133,8722
1707,1983,6695


# Recomendaciones

In [32]:
N_PREDS = 15
recomms = {
    'account_id': [],
    'recomms': []
}

for account_id in tqdm(train_metadata.account_id.unique()):
    watched_contents = account_contents[account_contents.account_id == account_id].content_id.values
    
    if len(watched_contents) > 0:
        watched_contents = watched_contents[0]
        preds = []
        str_contents = " ".join([str(c) for c in watched_contents])
        line_to_predict = " ".join([str_contents])
        preds = model.predict(line_to_predict, k=N_PREDS)[0]
        preds = clean_recomms(preds)
        preds = [p for p in preds if p not in end_before_april]
        int_watched_contents = [int(c) for c in watched_contents]
        preds = [p for p in preds if p not in int_watched_contents]
        
        no_watched_popular_contents = popularity_df[(~popularity_df.content_id.isin(int_watched_contents))
                                                     & (~popularity_df.content_id.isin(preds))
                                                     & (~popularity_df.content_id.isin(end_before_april))].sort_values(by='accounts', ascending=False).content_id.unique()
        i = 0
        while len(preds) < 20:
            preds.append(no_watched_popular_contents[i])
            i += 1
        
        recomms['account_id'].append(account_id)
        recomms['recomms'].append(preds)
    else:
        popular = popularity_df[~popularity_df.content_id.isin(end_before_april)].sort_values(by='accounts', ascending=False).content_id.unique()[:20]
        recomms['account_id'].append(account_id)
        recomms['recomms'].append(popular)

100%|██████████| 113881/113881 [12:18<00:00, 154.20it/s]


In [33]:
len(recomms['account_id'])

113881

In [34]:
%%time
recomms_df = pd.DataFrame(recomms)

CPU times: user 154 ms, sys: 3.98 ms, total: 158 ms
Wall time: 157 ms


In [35]:
recomms_df.account_id.nunique()

113881

In [36]:
recomms_df.head()

Unnamed: 0,account_id,recomms
0,90627,"[116, 1877, 1983, 3806, 1573, 3900, 1539, 774,..."
1,3387,"[2040, 4133, 3900, 3806, 2627, 1462, 2942, 187..."
2,3388,"[116, 4133, 724, 1877, 1983, 3716, 3900, 1462,..."
3,3389,"[1020, 3900, 1573, 724, 3214, 3806, 1983, 3206..."
4,3390,"[2721, 3900, 4133, 1462, 3152, 3711, 2942, 205..."


# Recomendaciones csv

In [37]:
recomms_df[['account_id', 'recomms']].sort_values(by='account_id', ascending=True).to_csv('v2_1.csv', index=None, header=None)

In [69]:
user = 1000

In [70]:
test = recomms_df[recomms_df.account_id == user].copy()
test['names'] = test['recomms'].apply(id2title_list)

In [71]:
test

Unnamed: 0,account_id,recomms,names
94714,1000,"[2160, 1139, 304, 8, 20, 2163, 491, 712, 637, ...","[T:1 Ep:05 Mickey: Aventuras sobre ruedas, T:2..."


In [72]:
for name in test.names:
    print(name)

['T:1 Ep:05 Mickey: Aventuras sobre ruedas' 'T:2 Ep:73 Puppy Dog Pals'
 'T:3 Ep:74 Doctora Juguetes' 'T:1 Ep:01 La Granja de Zenón'
 'T:1 Ep:01 La Granja de Zenón: La Serie' 'T:1 Ep:19 Fancy Nancy Clancy'
 'T:2 Ep:65 Vampirina' 'T:1 Ep:30 Mira: La detective del reino'
 'T:1 Ep:43 T.O.T.S. Tiny Ones Transfer Service'
 'T:1 Ep:03 Dime quién soy' 'T:1 Ep:36 Bluey' 'T:1 Ep:04 Big Little Lies'
 'T:1 Ep:01 This is Us' 'T:1 Ep:01 Paw Patrol, Patrulla Canina'
 'T:1 Ep:01 Gallina Pintadita Mini' 'T:1 Ep:01 The Outpost'
 'Fuga de pretoria' 'Cosa de minas' 'T:1 Ep:01 The Collapse'
 'T:1 Ep:01 El nudo']


In [73]:
id2title_list(train_metadata[train_metadata.account_id == user].content_id.unique())

array(['T:2 Ep:50 Muppet Babies', 'T:8 Ep:40 Peppa Pig',
       'T:1 Ep:01 Gallina Pintadita Collections', 'T:1 Ep:01 Devils'],
      dtype=object)