In [None]:
#!pip install lightfm

# V3 Recommender: LightFM 

Doc: https://making.lyst.com/lightfm/docs/lightfm.html

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime
from lightfm import LightFM
from scipy.sparse import csr_matrix

# Datos

In [3]:
metadata = pd.read_csv("../../data/metadata.csv", delimiter=";", parse_dates=['create_date', 'modify_date', 'start_vod_date', 'end_vod_date'])
train = pd.read_csv("../../data/train.csv", parse_dates=['tunein', 'tuneout'])

In [4]:
train.head()

Unnamed: 0,customer_id,account_id,device_type,asset_id,tunein,tuneout,resume
0,0,90627,STATIONARY,18332.0,2021-02-18 22:52:00,2021-02-18 23:35:00,0
1,0,90627,STATIONARY,24727.0,2021-03-24 23:17:00,2021-03-25 00:01:00,0
2,1,3387,STB,895.0,2021-03-15 10:05:00,2021-03-15 10:23:00,0
3,1,3387,STB,895.0,2021-03-15 10:23:00,2021-03-15 11:18:00,1
4,1,3387,STB,26062.0,2021-03-16 09:24:00,2021-03-16 09:44:00,0


In [5]:
metadata.head()

Unnamed: 0,asset_id,content_id,title,reduced_title,episode_title,show_type,released_year,country_of_origin,category,keywords,...,language_rating,dialog_rating,fv_rating,pay_per_view,pack_premium_1,pack_premium_2,create_date,modify_date,start_vod_date,end_vod_date
0,15188,0.0,Ep:17 Tiempos Compulsivos,Tiempos_Compul_E17,Episodio 17,Serie,2012.0,AR,Drama,"Trastornos,Médicos,Tragicómica,Telenovela,Enfe...",...,N,N,N,N,N,N,2017-12-01 10:18:15+00:00,2019-01-26 06:37:18+00:00,2017-12-01 00:00:00+00:00,2020-12-01 23:59:59+00:00
1,24940,1.0,7 Cajas,7_Cajas,,Película,2012.0,PY,Suspenso/Acción,"Latinoamérica,Pobreza,Crimen,Pandillas",...,N,N,N,Y,N,N,2017-12-19 20:58:15+00:00,2019-09-17 19:02:03+00:00,2017-12-15 00:00:00+00:00,2022-12-14 23:59:59+00:00
2,21939,2.0,La Maldición de las Hormigas Gigantes,La_Maldicion_de_las,,Película,2016.0,FI,Terror/Comedia,"Criaturas,Plagas,Adolescentes,Fantasía,Video J...",...,N,N,N,N,N,N,2018-02-16 13:51:07+00:00,2020-04-28 14:16:38+00:00,2018-01-25 00:00:00+00:00,2020-12-01 23:59:59+00:00
3,9005,3.0,Una Mujer Fantástica,Una_Mujer_Fantastic,,Película,2017.0,CL,Drama,"LGBT,Mujeres,Latinoamérica",...,N,N,N,N,Y,N,2018-05-26 11:58:44+00:00,2019-11-15 03:00:23+00:00,2018-05-27 00:00:00+00:00,2021-04-30 23:59:59+00:00
4,7391,4.0,Star Trek,Star_Trek,,Película,2009.0,US,Ciencia Ficción/Aventura,"Fantasía,Galaxia,Futurismo,Aliens,Criaturas",...,N,N,N,Y,N,N,2019-05-03 20:07:24+00:00,2020-04-09 04:37:29+00:00,2019-05-02 00:00:00+00:00,2020-12-31 23:59:59+00:00


In [6]:
train_metadata = train.merge(metadata, on='asset_id', how='left')

In [7]:
train_metadata.columns

Index(['customer_id', 'account_id', 'device_type', 'asset_id', 'tunein',
       'tuneout', 'resume', 'content_id', 'title', 'reduced_title',
       'episode_title', 'show_type', 'released_year', 'country_of_origin',
       'category', 'keywords', 'description', 'reduced_desc',
       'cast_first_name', 'credits_first_name', 'run_time_min', 'audience',
       'made_for_tv', 'close_caption', 'sex_rating', 'violence_rating',
       'language_rating', 'dialog_rating', 'fv_rating', 'pay_per_view',
       'pack_premium_1', 'pack_premium_2', 'create_date', 'modify_date',
       'start_vod_date', 'end_vod_date'],
      dtype='object')

In [8]:
train_metadata = train_metadata[['account_id', 'tunein', 'tuneout', 'content_id', 'title', 'end_vod_date']]

In [9]:
train_metadata.head()

Unnamed: 0,account_id,tunein,tuneout,content_id,title,end_vod_date
0,90627,2021-02-18 22:52:00,2021-02-18 23:35:00,2040.0,T:5 Ep:08 This is Us,2021-06-30 23:59:59+00:00
1,90627,2021-03-24 23:17:00,2021-03-25 00:01:00,2040.0,T:5 Ep:10 This is Us,2021-06-30 23:59:59+00:00
2,3387,2021-03-15 10:05:00,2021-03-15 10:23:00,1983.0,T:1 Ep:02 Big Little Lies,2021-03-28 23:59:00+00:00
3,3387,2021-03-15 10:23:00,2021-03-15 11:18:00,1983.0,T:1 Ep:02 Big Little Lies,2021-03-28 23:59:00+00:00
4,3387,2021-03-16 09:24:00,2021-03-16 09:44:00,729.0,T:1 Ep:02 Dime quién soy,2021-03-28 23:59:00+00:00


# Train test split

In [10]:
train_max_date = datetime(year=2021, month=3, day=1)

In [11]:
test = train_metadata[train_metadata.tunein > train_max_date]
train_metadata = train_metadata[train_metadata.tunein <= train_max_date]

# Filtros

Se filtran contenidos que no van a estar disponibles (ninguno de sus asset_ids) después de la fecha de train

In [12]:
train_metadata['end_vod_date'] = train_metadata['end_vod_date'].dt.tz_localize(None)

In [13]:
max_end_vod_date_by_content = train_metadata[['content_id', 'end_vod_date']].groupby('content_id', as_index=False).agg({'end_vod_date': 'max'})

In [14]:
end_before_april = max_end_vod_date_by_content[max_end_vod_date_by_content.end_vod_date < datetime(year=2021, month=3, day=1)].content_id.unique()
end_before_april[:10]

array([ 10., 101., 107., 114., 118., 127., 156., 171., 174., 183.])

# Interacciones

Armo un dataset con las interacciones account - content.

In [15]:
interactions = train_metadata[['account_id', 'content_id']].copy()
interactions.head()

Unnamed: 0,account_id,content_id
0,90627,2040.0
6,3388,2100.0
7,3388,2100.0
8,3388,2100.0
9,3388,2100.0


Nulos:

In [16]:
interactions.isna().sum()

account_id     0
content_id    24
dtype: int64

In [17]:
interactions.shape

(2339070, 2)

In [18]:
interactions.account_id.nunique()

99649

In [19]:
interactions.content_id.nunique()

3692

Elimino los nulos

In [20]:
interactions = interactions.dropna()

In [21]:
interactions.shape

(2339046, 2)

In [22]:
interactions.head()

Unnamed: 0,account_id,content_id
0,90627,2040.0
6,3388,2100.0
7,3388,2100.0
8,3388,2100.0
9,3388,2100.0


In [23]:
interactions = interactions.drop_duplicates()

In [24]:
interactions.shape

(672124, 2)

In [25]:
interactions.account_id.nunique()

99647

In [26]:
interactions.content_id.nunique()

3692

Agrego columna watched con el valor 1 que indica que la cuenta vio el contenido

In [27]:
interactions['watched'] = 1

In [28]:
interactions.head()

Unnamed: 0,account_id,content_id,watched
0,90627,2040.0,1
6,3388,2100.0,1
12,3388,691.0,1
30,3388,3487.0,1
38,3388,3038.0,1


# Matrix

Armo una matriz con las interacciones. Los valores van a ser 1 si el usuario vio el contenido y 0 de lo contrario

In [29]:
interactions_matrix = pd.pivot_table(interactions, index='account_id', columns='content_id', values='watched')

In [30]:
interactions_matrix.shape

(99647, 3692)

In [31]:
interactions_matrix.head()

content_id,0.0,1.0,3.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,...,4357.0,4358.0,4359.0,4360.0,4361.0,4362.0,4363.0,4364.0,4365.0,4366.0
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,1.0,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [32]:
interactions_matrix.shape

(99647, 3692)

In [33]:
interactions_matrix = interactions_matrix.fillna(0)

In [34]:
interactions_matrix.head()

content_id,0.0,1.0,3.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,...,4357.0,4358.0,4359.0,4360.0,4361.0,4362.0,4363.0,4364.0,4365.0,4366.0
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Armo diccionario de accounts que voy a usar más adelante al momento de armar las predicciones

In [35]:
acc_ids = list(interactions_matrix.index)
account_dict = {}
counter = 0 
for i in acc_ids:
    account_dict[i] = counter
    counter += 1

Convierto a csr matrix

In [36]:
account_content_interactions = csr_matrix(interactions_matrix.values)

# Modelo

In [37]:
model = LightFM(loss='warp',
                random_state=100,
                learning_rate=0.03,
                no_components=16)

Fit 

In [38]:
%%time
model = model.fit(account_content_interactions, epochs=10)

CPU times: user 11.2 s, sys: 2.47 ms, total: 11.2 s
Wall time: 11.3 s


# Popularidad para recomendaciones cold start

Para usuarios que no vieron nada les vamos a recomendar los contenidos ordenados por popularidad

In [39]:
popularity_df = train_metadata[['account_id', 'content_id']].groupby('content_id', as_index=False).agg({'account_id': 'nunique'})
popularity_df.columns = ['content_id', 'accounts']
popularity_df = popularity_df.sort_values(by='accounts', ascending=False)
popularity_df.head()

Unnamed: 0,content_id,accounts
1733,2040.0,11481
3478,3806.0,8762
3572,3900.0,7960
3274,3598.0,3772
1851,2160.0,3506


# Generación de recomendaciones

In [40]:
%%time
account_watched_contents = {}

watched_contents = interactions.groupby('account_id').agg({'content_id': 'unique'})

for account in watched_contents.index:
    watched = watched_contents.loc[account, 'content_id']
    account_watched_contents[account] = watched

CPU times: user 33.4 s, sys: 18.9 ms, total: 33.4 s
Wall time: 33.4 s


In [41]:
%%time
recomms = {
    'account_id': [],
    'recomms': []
}

n_users, n_items = interactions_matrix.shape
item_ids = np.arange(n_items)

for account in tqdm(train_metadata.account_id.unique()):
    if account in list(interactions_matrix.index):
        acc_x = account_dict[account]

        preds = model.predict(user_ids=acc_x, item_ids = item_ids)

        scores = pd.Series(preds)
        scores.index = interactions_matrix.columns
        scores = list(pd.Series(scores.sort_values(ascending=False).index))[:200] # Tomo las primeras 200 para que sea más rapido

        watched_contents = account_watched_contents[account]
        scores = [x for x in scores if x not in watched_contents]
        scores = [x for x in scores if x not in end_before_april]
        scores = scores[:20]

        recomms['account_id'].append(account)
        recomms['recomms'].append(scores)
    else:
        recomms['account_id'].append(account)
        recomms['recomms'].append(popularity_df.sort_values(by='accounts', ascending=False).content_id.unique()[:20])

100%|██████████| 99649/99649 [47:06<00:00, 35.25it/s]  

CPU times: user 42min 7s, sys: 5min 12s, total: 47min 20s
Wall time: 47min 6s





In [42]:
recomms = pd.DataFrame(recomms)
recomms = recomms.sort_values(by='account_id', ascending=True)

In [43]:
recomms.head()

Unnamed: 0,account_id,recomms
83742,0,"[3806.0, 3900.0, 2160.0, 1316.0, 3775.0, 558.0..."
83755,1,"[2040.0, 3900.0, 3806.0, 116.0, 2012.0, 1971.0..."
83857,2,"[135.0, 718.0, 3364.0, 2040.0, 1800.0, 2160.0,..."
83898,3,"[3711.0, 1008.0, 2815.0, 3210.0, 3382.0, 3806...."
84010,4,"[3900.0, 173.0, 3806.0, 3210.0, 3381.0, 3384.0..."


In [44]:
recomms['n_recomms'] = recomms.recomms.apply(len)

less_than_20 = recomms[recomms.n_recomms != 20].shape[0]

if less_than_20 > 0:
    print("FALTAN RECOMMS")

In [45]:
actual_views = test[['account_id', 'content_id']]\
                .drop_duplicates()\
                .groupby(by='account_id', as_index=False)\
                .agg({'content_id': 'unique'})\
                .sort_values(by='account_id')

# Evaluacion

In [46]:
preds = recomms.recomms.values
labels = actual_views.content_id.values

In [47]:
aps = [] # lista vacía para ir almacenando la AP de cada recomendación
for pred, label in zip(preds, labels):
    n = len(pred) 
    arange = np.arange(n, dtype=np.int32) + 1. # indexamos en base 1
    rel_k = np.in1d(pred[:n], label)
    tp = np.ones(rel_k.sum(), dtype=np.int32).cumsum() # lista con el contador de verdaderos positivos
    denom = arange[rel_k] # posiciones donde se encuentran los ítems relantes
    ap = (tp / denom).sum() / len(label) # average precision
    aps.append(ap)

In [48]:
np.mean(aps)

0.030321301741158265

----------------------------------
model = LightFM(loss='warp',
                random_state=100,
                learning_rate=0.03,
                no_components=64)
                
MAP: 0.021121823981033343  

----------------------------------                

----------------------------------

model = LightFM(loss='warp',
                random_state=100,
                learning_rate=0.03,
                no_components=96)
MAP: 0.018823548327592

----------------------------------

----------------------------------

model = LightFM(loss='warp',
                random_state=100,
                learning_rate=0.03,
                no_components=32)

MAP: 0.027996600485548712

----------------------------------

----------------------------------

model = LightFM(loss='warp',
                random_state=100,
                learning_rate=0.03,
                no_components=16)

MAP: 0.030321301741158265 -----> MEJOR

----------------------------------