In [1]:
#!pip install lightfm

# V3 Recommender: LightFM 

La idea en esta versión es utilizar la librería LightFM [docs](https://making.lyst.com/lightfm/docs/lightfm.html) que implementa algoritmos de recomendación y sirve cuando no tenemos rankings expícitos.

Tenemos la opción de agregar features de los usuarios y los items pero en esta V3.0 no se van a agregar. Al no agregar estas features, el problema se reduce a una factorización de matriz (filtro colaborativo): 
"*If no feature matrices are provided to the lightfm.LightFM.fit() or lightfm.LightFM.predict() methods, they are implicitly assumed to be identity matrices: that is, each user and item are characterised by one feature that is unique to that user (or item). In this case, LightFM reduces to a traditional collaborative filtering matrix factorization method.*"

La loss function que se va a usar es **WARP**: "*Weighted Approximate-Rank Pairwise loss. Maximises the rank of positive examples by repeatedly sampling negative examples until rank violating one is found. Useful when only positive interactions are present and optimising the top of the recommendation list (precision@k) is desired.*"

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime
from lightfm import LightFM
from scipy.sparse import csr_matrix

# Datos

In [3]:
metadata = pd.read_csv("../../data/metadata.csv", delimiter=";", parse_dates=['create_date', 'modify_date', 'start_vod_date', 'end_vod_date'])
train = pd.read_csv("../../data/train.csv", parse_dates=['tunein', 'tuneout'])

In [4]:
train.head()

Unnamed: 0,customer_id,account_id,device_type,asset_id,tunein,tuneout,resume
0,0,90627,STATIONARY,18332.0,2021-02-18 22:52:00,2021-02-18 23:35:00,0
1,0,90627,STATIONARY,24727.0,2021-03-24 23:17:00,2021-03-25 00:01:00,0
2,1,3387,STB,895.0,2021-03-15 10:05:00,2021-03-15 10:23:00,0
3,1,3387,STB,895.0,2021-03-15 10:23:00,2021-03-15 11:18:00,1
4,1,3387,STB,26062.0,2021-03-16 09:24:00,2021-03-16 09:44:00,0


In [5]:
metadata.head()

Unnamed: 0,asset_id,content_id,title,reduced_title,episode_title,show_type,released_year,country_of_origin,category,keywords,...,language_rating,dialog_rating,fv_rating,pay_per_view,pack_premium_1,pack_premium_2,create_date,modify_date,start_vod_date,end_vod_date
0,15188,0.0,Ep:17 Tiempos Compulsivos,Tiempos_Compul_E17,Episodio 17,Serie,2012.0,AR,Drama,"Trastornos,Médicos,Tragicómica,Telenovela,Enfe...",...,N,N,N,N,N,N,2017-12-01 10:18:15+00:00,2019-01-26 06:37:18+00:00,2017-12-01 00:00:00+00:00,2020-12-01 23:59:59+00:00
1,24940,1.0,7 Cajas,7_Cajas,,Película,2012.0,PY,Suspenso/Acción,"Latinoamérica,Pobreza,Crimen,Pandillas",...,N,N,N,Y,N,N,2017-12-19 20:58:15+00:00,2019-09-17 19:02:03+00:00,2017-12-15 00:00:00+00:00,2022-12-14 23:59:59+00:00
2,21939,2.0,La Maldición de las Hormigas Gigantes,La_Maldicion_de_las,,Película,2016.0,FI,Terror/Comedia,"Criaturas,Plagas,Adolescentes,Fantasía,Video J...",...,N,N,N,N,N,N,2018-02-16 13:51:07+00:00,2020-04-28 14:16:38+00:00,2018-01-25 00:00:00+00:00,2020-12-01 23:59:59+00:00
3,9005,3.0,Una Mujer Fantástica,Una_Mujer_Fantastic,,Película,2017.0,CL,Drama,"LGBT,Mujeres,Latinoamérica",...,N,N,N,N,Y,N,2018-05-26 11:58:44+00:00,2019-11-15 03:00:23+00:00,2018-05-27 00:00:00+00:00,2021-04-30 23:59:59+00:00
4,7391,4.0,Star Trek,Star_Trek,,Película,2009.0,US,Ciencia Ficción/Aventura,"Fantasía,Galaxia,Futurismo,Aliens,Criaturas",...,N,N,N,Y,N,N,2019-05-03 20:07:24+00:00,2020-04-09 04:37:29+00:00,2019-05-02 00:00:00+00:00,2020-12-31 23:59:59+00:00


In [6]:
train_metadata = train.merge(metadata, on='asset_id', how='left')

In [7]:
train_metadata.columns

Index(['customer_id', 'account_id', 'device_type', 'asset_id', 'tunein',
       'tuneout', 'resume', 'content_id', 'title', 'reduced_title',
       'episode_title', 'show_type', 'released_year', 'country_of_origin',
       'category', 'keywords', 'description', 'reduced_desc',
       'cast_first_name', 'credits_first_name', 'run_time_min', 'audience',
       'made_for_tv', 'close_caption', 'sex_rating', 'violence_rating',
       'language_rating', 'dialog_rating', 'fv_rating', 'pay_per_view',
       'pack_premium_1', 'pack_premium_2', 'create_date', 'modify_date',
       'start_vod_date', 'end_vod_date'],
      dtype='object')

In [8]:
train_metadata = train_metadata[['account_id', 'tunein', 'tuneout', 'content_id', 'title', 'end_vod_date']]

In [9]:
train_metadata.head()

Unnamed: 0,account_id,tunein,tuneout,content_id,title,end_vod_date
0,90627,2021-02-18 22:52:00,2021-02-18 23:35:00,2040.0,T:5 Ep:08 This is Us,2021-06-30 23:59:59+00:00
1,90627,2021-03-24 23:17:00,2021-03-25 00:01:00,2040.0,T:5 Ep:10 This is Us,2021-06-30 23:59:59+00:00
2,3387,2021-03-15 10:05:00,2021-03-15 10:23:00,1983.0,T:1 Ep:02 Big Little Lies,2021-03-28 23:59:00+00:00
3,3387,2021-03-15 10:23:00,2021-03-15 11:18:00,1983.0,T:1 Ep:02 Big Little Lies,2021-03-28 23:59:00+00:00
4,3387,2021-03-16 09:24:00,2021-03-16 09:44:00,729.0,T:1 Ep:02 Dime quién soy,2021-03-28 23:59:00+00:00


# Filtros

Se filtran contenidos que no van a estar disponibles (ninguno de sus asset_ids) después de el 1 de abril

In [10]:
train_metadata['end_vod_date'] = train_metadata['end_vod_date'].dt.tz_localize(None)

In [11]:
max_end_vod_date_by_content = train_metadata[['content_id', 'end_vod_date']].groupby('content_id', as_index=False).agg({'end_vod_date': 'max'})

In [12]:
end_before_april = max_end_vod_date_by_content[max_end_vod_date_by_content.end_vod_date < datetime(year=2021, month=4, day=1)].content_id.unique()
end_before_april[:10]

array([ 10.,  62.,  79.,  83., 101., 114., 118., 156., 171., 174.])

# Interacciones

Armo un dataset con las interacciones account - content.

In [13]:
interactions = train_metadata[['account_id', 'content_id']].copy()
interactions.head()

Unnamed: 0,account_id,content_id
0,90627,2040.0
1,90627,2040.0
2,3387,1983.0
3,3387,1983.0
4,3387,729.0


Nulos:

In [14]:
interactions.isna().sum()

account_id      0
content_id    142
dtype: int64

In [15]:
interactions.shape

(3657801, 2)

In [16]:
interactions.account_id.nunique()

113881

In [17]:
interactions.content_id.nunique()

4064

Elimino los nulos

In [18]:
interactions = interactions.dropna()

In [19]:
interactions.shape

(3657659, 2)

In [20]:
interactions.head()

Unnamed: 0,account_id,content_id
0,90627,2040.0
1,90627,2040.0
2,3387,1983.0
3,3387,1983.0
4,3387,729.0


In [21]:
interactions = interactions.drop_duplicates()

In [22]:
interactions.shape

(971470, 2)

In [23]:
interactions.account_id.nunique()

113875

In [24]:
interactions.content_id.nunique()

4064

Agrego columna watched con el valor 1 que indica que la cuenta vio el contenido

In [25]:
interactions['watched'] = 1

In [26]:
interactions.head()

Unnamed: 0,account_id,content_id,watched
0,90627,2040.0,1
2,3387,1983.0,1
4,3387,729.0,1
6,3388,2100.0,1
12,3388,691.0,1


# Matrix

Armo una matriz con las interacciones. Los valores van a ser 1 si el usuario vio el contenido y 0 de lo contrario

In [27]:
interactions_matrix = pd.pivot_table(interactions, index='account_id', columns='content_id', values='watched')

In [28]:
interactions_matrix.shape

(113875, 4064)

In [29]:
interactions_matrix.head()

content_id,0.0,1.0,3.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,...,4362.0,4363.0,4364.0,4365.0,4366.0,4367.0,4368.0,4369.0,4370.0,4371.0
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,1.0,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [30]:
interactions_matrix.shape

(113875, 4064)

In [31]:
interactions_matrix = interactions_matrix.fillna(0)

In [32]:
interactions_matrix.head()

content_id,0.0,1.0,3.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,...,4362.0,4363.0,4364.0,4365.0,4366.0,4367.0,4368.0,4369.0,4370.0,4371.0
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Armo diccionario de accounts que voy a usar más adelante al momento de armar las predicciones

In [33]:
acc_ids = list(interactions_matrix.index)
account_dict = {}
counter = 0 
for i in acc_ids:
    account_dict[i] = counter
    counter += 1

Convierto a csr matrix

In [34]:
account_content_interactions = csr_matrix(interactions_matrix.values)

# Modelo

In [35]:
model = LightFM(loss='warp',
                random_state=100,
                learning_rate=0.03,
                no_components=16)

Fit 

In [36]:
%%time
model = model.fit(account_content_interactions, epochs=10)

CPU times: user 17.2 s, sys: 0 ns, total: 17.2 s
Wall time: 17.3 s


# Que predice ?

Ejemplo para un usuario

In [68]:
user_id_test = 12345

In [69]:
acc_x = acc_ids[user_id_test]
preds = model.predict(user_ids=acc_x, item_ids = np.arange(interactions.content_id.nunique()))
preds

array([-0.7288568, -0.6717346, -1.5576108, ..., -1.4599506, -1.2201344,
       -1.0091008], dtype=float32)

In [70]:
def id2title_list(content_ids):
    titles = metadata[metadata.content_id.isin(content_ids)].drop_duplicates('content_id').title.values
    return titles

In [71]:
scores = pd.Series(preds)
scores.index = interactions_matrix.columns
scores = list(pd.Series(scores.sort_values(ascending=False).index))

watched_contents = interactions[interactions.account_id == user_id_test].content_id.unique()
scores = [x for x in scores if x not in watched_contents]
scores = [x for x in scores if x not in end_before_april]
scores = scores[:20]

id2title_list(scores)

array(['T:1 Ep:05 Mickey: Aventuras sobre ruedas',
       'T:2 Ep:73 Puppy Dog Pals', 'Madagascar 3: Los fugitivos',
       'Madagascar', 'Madagascar 2: Escape a África',
       'T:1 Ep:01 La Granja de Zenón',
       'T:1 Ep:01 La Granja de Zenón: La Serie',
       'T:1 Ep:30 Mira: La detective del reino',
       'T:1 Ep:43 T.O.T.S. Tiny Ones Transfer Service',
       'T:1 Ep:15 Gravity Falls: Un verano de misterios',
       'T:2 Ep:08 Mickey Mouse', '01/04 - MasterChef - Celebrity',
       'T:1 Ep:36 Bluey', 'T:8 Ep:40 Peppa Pig',
       'T:1 Ep:01 Paw Patrol, Patrulla Canina',
       'T:1 Ep:01 Gallina Pintadita Mini', 'Escandalosos: La película',
       'El árbol de los deseos', 'Cosa de minas',
       'T:1 Ep:01 The Collapse'], dtype=object)

In [72]:
id2title_list(watched_contents)

array(['T:5 Ep:110 Junior Express', 'T:2 Ep:65 Vampirina'], dtype=object)

# Popularidad para recomendaciones cold start

Para usuarios que no vieron nada les vamos a recomendar los contenidos ordenados por popularidad

In [42]:
popularity_df = train_metadata[['account_id', 'content_id']].groupby('content_id', as_index=False).agg({'account_id': 'nunique'})
popularity_df.columns = ['content_id', 'accounts']
popularity_df = popularity_df.sort_values(by='accounts', ascending=False)
popularity_df.head()

Unnamed: 0,content_id,accounts
1764,2040.0,13629
3515,3806.0,9377
3609,3900.0,9082
3839,4133.0,8722
1707,1983.0,6695


# Generación de recomendaciones

In [43]:
%%time
account_watched_contents = {}

watched_contents = interactions.groupby('account_id').agg({'content_id': 'unique'})

for account in watched_contents.index:
    watched = watched_contents.loc[account, 'content_id']
    account_watched_contents[account] = watched

CPU times: user 37.2 s, sys: 0 ns, total: 37.2 s
Wall time: 37.2 s


In [44]:
%%time
recomms = {
    'account_id': [],
    'recomms': []
}

n_users, n_items = interactions_matrix.shape
item_ids = np.arange(n_items)

for account in tqdm(train_metadata.account_id.unique()):
    
    if account in list(interactions_matrix.index):
        
        acc_x = account_dict[account]
        
        preds = model.predict(user_ids=acc_x, item_ids = item_ids)

        scores = pd.Series(preds)
        scores.index = interactions_matrix.columns
        scores = list(pd.Series(scores.sort_values(ascending=False).index))[:300]

        watched_contents = account_watched_contents[account]
        scores = [x for x in scores if x not in watched_contents]
        scores = [x for x in scores if x not in end_before_april]
        scores = scores[:20]
        
        recomms['account_id'].append(account)
        recomms['recomms'].append(scores)
    else:
        recomms['account_id'].append(account)
        recomms['recomms'].append(popularity_df.sort_values(by='accounts', ascending=False).content_id.unique()[:20])

100%|██████████| 113881/113881 [57:21<00:00, 33.09it/s] 

CPU times: user 51min 24s, sys: 6min 14s, total: 57min 39s
Wall time: 57min 21s





In [55]:
recomms = pd.DataFrame(recomms)
recomms['recomms'] = recomms['recomms'].apply(lambda x: [int(i) for i in x]) #paso a int los ids
recomms = recomms.sort_values(by='account_id', ascending=True)

In [56]:
recomms.head()

Unnamed: 0,account_id,recomms,n_recomms
95869,0,"[3806, 3381, 2160, 2040, 3384, 1020, 1316, 390...",20
95883,1,"[2040, 1573, 2012, 3806, 3026, 1971, 1877, 116...",20
95995,2,"[2040, 1020, 1800, 3806, 2160, 1877, 2178, 135...",20
96044,3,"[3210, 3711, 173, 3386, 3344, 2815, 2972, 1573...",20
96169,4,"[2040, 3806, 3900, 3210, 1983, 4133, 2815, 157...",20


In [57]:
recomms['n_recomms'] = recomms.recomms.apply(len)

less_than_20 = recomms[recomms.n_recomms != 20].shape[0]

if less_than_20 > 0:
    print("FALTAN RECOMMS")
    
else:
    recomms.drop('n_recomms', axis=1).to_csv("v3_0.csv", header=None, index=None)