### Tercer Entrega Data Science

**GRUPO 1**

- Daniel Aguilera
- Gianella Salluzzi
- Gisela Turletti
- Guillermo More
- Marcos Savy 

###Instalamos las librerías que necesitaremos a lo largo del notebook.

In [None]:
pip install lightfm



In [None]:
pip install python-dateutil



###Importamos librerías

In [None]:
import numpy as np
import pandas as pd

from datetime import datetime
from scipy.sparse import csr_matrix
from lightfm import LightFM
from dateutil import tz
from dateutil.parser import isoparse
from tqdm import tqdm

### Funciones auxiliares

In [None]:
def time_subtract(date_1, date_2):
  '''
  Description: Funciòn para sustraer dos fechas dadas en el siguiente formato -> '2021-02-18 22:52:00
  Input:
        date_1: Fecha minuendo 
        date_2: Fecha sustraendo
  Output: 
        resultado de la resta date_2 - date_1
  '''
  #format in which the time is expected
  format = '%Y-%m-%d %H:%M:%S'
  #convert input datetime to list
  val_1 = datetime.strptime(date_1, format)
  val_2 = datetime.strptime(date_2, format)
  #compute the substraction
  diff = val_2 - val_1
  #convert the value to minutesç
  #diff_min = diff.total_seconds()/60

  return diff.total_seconds() / 60

In [None]:
def rate_calculator(time_spent, asset_time):
  '''
  Description: To calculate a rating value (interaction with a contet) using the amount of time that the user spent interacting with it
  
  Calculamos un rating porcentual. También contamos con la opción de binarizarlo (diciendo si vio el contenido o no lo vio).
  '''
  #threshold = 0.5

  if time_spent >= asset_time:
    rate = 1
  else: 
    rate = time_spent / asset_time
    
  '''
  Binarize output using a predefined threshold value
  '''

  #if rate > threshold:
  #  rate = 1
  #else: 
  #  rate = 0

  return rate

###Importamos los datasets

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
data = pd.read_csv('drive/MyDrive/Data Science ICARO/Projects/train-entrega3.csv', parse_dates=['tunein', 'tuneout']) 
print(data.head())

md = pd.read_csv('drive/MyDrive/Data Science ICARO/Projects/metadata.csv', sep = ';', header = 0) 
print(md.head())

   customer_id  account_id  ...             tuneout  resume
0            0       90627  ... 2021-02-18 23:35:00       0
1            0       90627  ... 2021-03-25 00:01:00       0
2            1        3387  ... 2021-03-15 10:23:00       0
3            1        3387  ... 2021-03-15 11:18:00       1
4            1        3387  ... 2021-03-16 09:44:00       0

[5 rows x 7 columns]
   asset_id  content_id  ...          start_vod_date            end_vod_date
0     15188         0.0  ...  2017-12-01T00:00:00.0Z  2020-12-01T23:59:59.0Z
1     24940         1.0  ...  2017-12-15T00:00:00.0Z  2022-12-14T23:59:59.0Z
2     21939         2.0  ...  2018-01-25T00:00:00.0Z  2020-12-01T23:59:59.0Z
3      9005         3.0  ...  2018-05-27T00:00:00.0Z  2021-04-30T23:59:59.0Z
4      7391         4.0  ...  2019-05-02T00:00:00.0Z  2020-12-31T23:59:59.0Z

[5 rows x 30 columns]


In [None]:
data = data.dropna()

###Creamos un df temporal de prueba debido a que toma demasiado tiempo ejecutar el dataset completo.

In [None]:
data_prueba = data.head(500000)

###Creamos rating feature sobre el dataset de train usando la cantidad de tiempo que pasó el usuario mirando el contenido.

In [None]:
%%time
#calculate the amount of time that the user spent interacting with the asset
data_prueba['time_duration_min'] = data_prueba.apply(lambda x: time_subtract(str(x['tunein']),
                                                                               str(x['tuneout'])),
                                                                                axis = 1)

CPU times: user 27.8 s, sys: 264 ms, total: 28 s
Wall time: 28.1 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [None]:
%%time

#Create a new column with the asset_id duration in minutes
data_prueba['runtime_min'] = data_prueba.asset_id.apply(lambda x: md[md.asset_id == x].run_time_min.values[0])

CPU times: user 7min 4s, sys: 20.5 s, total: 7min 25s
Wall time: 7min 26s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
data_prueba.head()

Unnamed: 0,customer_id,account_id,device_type,asset_id,tunein,tuneout,resume,time_duration_min,runtime_min
0,0,90627,STATIONARY,18332.0,2021-02-18 22:52:00,2021-02-18 23:35:00,0,43.0,43.0
1,0,90627,STATIONARY,24727.0,2021-03-24 23:17:00,2021-03-25 00:01:00,0,44.0,42.0
2,1,3387,STB,895.0,2021-03-15 10:05:00,2021-03-15 10:23:00,0,18.0,54.0
3,1,3387,STB,895.0,2021-03-15 10:23:00,2021-03-15 11:18:00,1,55.0,54.0
4,1,3387,STB,26062.0,2021-03-16 09:24:00,2021-03-16 09:44:00,0,20.0,49.0


In [None]:
#select only the needed features
columns = ['account_id', 'asset_id', 'time_duration_min', 'runtime_min', 'tunein', 'tuneout']
data_prueba = data_prueba[columns]
data_prueba.head()

Unnamed: 0,account_id,asset_id,time_duration_min,runtime_min,tunein,tuneout
0,90627,18332.0,43.0,43.0,2021-02-18 22:52:00,2021-02-18 23:35:00
1,90627,24727.0,44.0,42.0,2021-03-24 23:17:00,2021-03-25 00:01:00
2,3387,895.0,18.0,54.0,2021-03-15 10:05:00,2021-03-15 10:23:00
3,3387,895.0,55.0,54.0,2021-03-15 10:23:00,2021-03-15 11:18:00
4,3387,26062.0,20.0,49.0,2021-03-16 09:24:00,2021-03-16 09:44:00


In [None]:
#Create a new dataset with the unique combination of 'account_id' and 'asset_id' to store the new 'rating' feature
columns = ['account_id', 'asset_id', 'tunein', 'tuneout']
df_filtered = data_prueba.drop_duplicates(subset = ['account_id', 'asset_id'])[columns]
df_filtered.head()

Unnamed: 0,account_id,asset_id,tunein,tuneout
0,90627,18332.0,2021-02-18 22:52:00,2021-02-18 23:35:00
1,90627,24727.0,2021-03-24 23:17:00,2021-03-25 00:01:00
2,3387,895.0,2021-03-15 10:05:00,2021-03-15 10:23:00
4,3387,26062.0,2021-03-16 09:24:00,2021-03-16 09:44:00
6,3388,30840.0,2021-01-01 02:22:00,2021-01-01 02:34:00


In [None]:
#traer content_id
df_aux_md = md[['asset_id','content_id']]
df_aux_md.set_index(keys='asset_id',inplace=True)
df_filtered['content_id'] = [x for x in df_aux_md.loc[df_filtered.asset_id].content_id.values]

In [None]:
df_filtered = df_filtered.drop_duplicates(subset=['account_id','content_id'],keep='first')

In [None]:
%%time

#unique user_id values within the dataset
user_ids = df_filtered.account_id.value_counts().index

#for every unique user_id value
for user_id in user_ids:
  #bring all the unique assets (contents) that the user interact with
  asset_ids = df_filtered[df_filtered.account_id == user_id].asset_id.value_counts().index
  #for every unique asset
  for asset_id in asset_ids:
    #create a temporal dataframe
    temp_val = data_prueba[(data_prueba.asset_id == asset_id) & (data_prueba.account_id == user_id)]
    #total amount of time the user spent watching this unique asset
    time_duration_min = np.sum(temp_val.time_duration_min.values)
    #calculate the rating using the time spent and the total duration of the asset
    rating = rate_calculator(time_duration_min,
                             temp_val.runtime_min.values[0])
    #create a mask (indexes) and assign the new rating values to the filtered dataset
    mask = df_filtered[(df_filtered.asset_id == asset_id) & (df_filtered.account_id == user_id)].index
    df_filtered.loc[mask, 'rating'] = rating 

CPU times: user 15min 37s, sys: 35.4 s, total: 16min 12s
Wall time: 15min 45s


In [None]:
df_filtered.head()

Unnamed: 0,account_id,asset_id,tunein,tuneout,content_id,rating
0,90627,18332.0,2021-02-18 22:52:00,2021-02-18 23:35:00,2040.0,1.0
2,3387,895.0,2021-03-15 10:05:00,2021-03-15 10:23:00,1983.0,1.0
4,3387,26062.0,2021-03-16 09:24:00,2021-03-16 09:44:00,729.0,0.469388
6,3388,30840.0,2021-01-01 02:22:00,2021-01-01 02:34:00,2100.0,0.418605
12,3388,2540.0,2021-01-05 16:04:00,2021-01-05 16:43:00,691.0,0.65


In [None]:
df_filtered.shape

(129618, 6)

###Separamos en train y test el dataset df_filtered

In [None]:
print(df_filtered['tunein'].min())
print(df_filtered['tunein'].max())

2021-01-01 00:00:00
2021-03-31 23:59:00


In [None]:
train = df_filtered[(df_filtered['tunein'] >= datetime(year= 2021, month= 1, day=1)) &
              (df_filtered['tunein'] < datetime(year=2021, month=3, day=1))]
train.head()

Unnamed: 0,account_id,asset_id,tunein,tuneout,content_id,rating
0,90627,18332.0,2021-02-18 22:52:00,2021-02-18 23:35:00,2040.0,1.0
6,3388,30840.0,2021-01-01 02:22:00,2021-01-01 02:34:00,2100.0,0.418605
12,3388,2540.0,2021-01-05 16:04:00,2021-01-05 16:43:00,691.0,0.65
30,3388,29743.0,2021-01-24 22:06:00,2021-01-24 22:22:00,3487.0,0.190476
38,3388,8949.0,2021-02-08 10:39:00,2021-02-08 11:11:00,3038.0,0.727273


In [None]:
#amount of rows and columns in train dataset
print(train.shape)

#amount of different account_id
print(train.account_id.nunique())

(90408, 6)
15931


In [None]:
test = df_filtered[(df_filtered['tunein'] >= datetime(year = 2021, month= 3, day =1))]
test.head()

Unnamed: 0,account_id,asset_id,tunein,tuneout,content_id,rating
2,3387,895.0,2021-03-15 10:05:00,2021-03-15 10:23:00,1983.0,1.0
4,3387,26062.0,2021-03-16 09:24:00,2021-03-16 09:44:00,729.0,0.469388
60,3388,5168.0,2021-03-29 19:59:00,2021-03-29 20:57:00,3386.0,0.53211
68,3389,20645.0,2021-03-07 02:30:00,2021-03-07 02:32:00,3273.0,0.174419
71,3389,19971.0,2021-03-14 09:49:00,2021-03-14 10:16:00,3578.0,0.6


In [None]:
#amount of rows and columns in test dataset
print(test.shape)

#amount of different account_id 
print(test.account_id.nunique())

(39210, 6)
11936


A continuación, vemos si tenemos usuarios cold start.

In [None]:
test[~test.account_id.isin(train.account_id.unique())].account_id.nunique() 

2004

Dropeamos las columnas 'tunein' y 'tuneout' de nuestros datasets ya que no los necesitaremos.

In [None]:
columns = ['account_id', 'asset_id', 'content_id', 'rating']
df_filtered = df_filtered[columns]
train = train[columns]
test = test[columns]

###Creamos matriz de interacciones.

In [None]:
matrix_col =  ['account_id','content_id','rating']
interact = df_filtered[matrix_col]
interact.head()

Unnamed: 0,account_id,content_id,rating
0,90627,2040.0,1.0
2,3387,1983.0,1.0
4,3387,729.0,0.469388
6,3388,2100.0,0.418605
12,3388,691.0,0.65


In [None]:
inter_matrix = interact.pivot_table(index="account_id", columns="content_id", values="rating")
inter_matrix.head()

content_id,0.0,1.0,3.0,6.0,7.0,8.0,9.0,18.0,19.0,20.0,22.0,23.0,24.0,26.0,33.0,36.0,37.0,38.0,39.0,40.0,43.0,45.0,46.0,47.0,48.0,49.0,50.0,51.0,52.0,53.0,54.0,55.0,56.0,57.0,58.0,59.0,60.0,61.0,62.0,63.0,...,4329.0,4330.0,4331.0,4332.0,4333.0,4334.0,4335.0,4336.0,4337.0,4338.0,4339.0,4340.0,4341.0,4342.0,4343.0,4344.0,4345.0,4346.0,4347.0,4348.0,4349.0,4350.0,4351.0,4352.0,4353.0,4354.0,4355.0,4356.0,4357.0,4358.0,4359.0,4360.0,4361.0,4362.0,4363.0,4364.0,4365.0,4366.0,4368.0,4369.0
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
140,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
141,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,0.794872,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
142,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
143,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
144,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
inter_matrix.fillna(0, inplace = True)
inter_matrix.head()

content_id,0.0,1.0,3.0,6.0,7.0,8.0,9.0,18.0,19.0,20.0,22.0,23.0,24.0,26.0,33.0,36.0,37.0,38.0,39.0,40.0,43.0,45.0,46.0,47.0,48.0,49.0,50.0,51.0,52.0,53.0,54.0,55.0,56.0,57.0,58.0,59.0,60.0,61.0,62.0,63.0,...,4329.0,4330.0,4331.0,4332.0,4333.0,4334.0,4335.0,4336.0,4337.0,4338.0,4339.0,4340.0,4341.0,4342.0,4343.0,4344.0,4345.0,4346.0,4347.0,4348.0,4349.0,4350.0,4351.0,4352.0,4353.0,4354.0,4355.0,4356.0,4357.0,4358.0,4359.0,4360.0,4361.0,4362.0,4363.0,4364.0,4365.0,4366.0,4368.0,4369.0
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.794872,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
142,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
144,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
inter_matrix = inter_matrix.drop(columns=inter_matrix.columns[0])

In [None]:
interactions_matrix_csr = csr_matrix(inter_matrix.values)

In [None]:
user_ids = list(inter_matrix.index)
user_dict = {}
counter = 0 
for i in user_ids:
    user_dict[i] = counter
    counter += 1

In [None]:
_model = LightFM(no_components=3, random_state=100, learning_rate=0.03)

In [None]:
md.end_vod_date = [isoparse(x) for x in md.end_vod_date]

In [None]:
stale_content = md[md.end_vod_date <= datetime(year=2021, month=3, day=31, tzinfo=tz.gettz('UTC'))].content_id
stale_content

0           0.0
2           2.0
4           4.0
6           6.0
7           7.0
          ...  
33128    1216.0
33133    1836.0
33134    1205.0
33141     943.0
33143    1041.0
Name: content_id, Length: 12797, dtype: float64

In [None]:
#popular content
popularity_df =df_filtered.groupby("content_id", as_index = False).agg({"account_id":"nunique"}).sort_values(by="account_id", ascending=False)

popularity_df.columns=["content_id", "popularity"]
popularity_df.head()
popular_content = [x for x in popularity_df.content_id.values if x not in stale_content][:20]

popular_content

[3900.0,
 729.0,
 3384.0,
 3386.0,
 3382.0,
 3578.0,
 3863.0,
 3572.0,
 774.0,
 3906.0,
 712.0,
 3592.0,
 3897.0,
 3343.0,
 3868.0,
 3530.0,
 3840.0,
 3402.0,
 3902.0,
 3569.0]

In [None]:
%%time
model = model.fit(interactions_matrix_csr, epochs=10)

CPU times: user 620 ms, sys: 2.18 ms, total: 622 ms
Wall time: 628 ms


In [None]:
#definimos dict donde vamos a ir almacenando las recomendaciones
recomms_dict = {
    'user_id': [],
    'recomms': []
}

#obtenemos cantidad de usuarios y cantidad de items
n_users, n_items = inter_matrix.shape
item_ids = np.arange(n_items)

#por cada usuario del dataset de test, generamos recomendaciones
for user in tqdm(test.account_id.unique()):
    #COMPLETAR: Validar si el usuario se encuentra en la matriz de interacciones (interactions_matrix.index)
    if user in list(inter_matrix.index):
      # Si el usuario esta en train, no es cold start. Usamos el modelo para recomendar
      user_x = user_dict[user] #buscamos el indice del usuario en la matriz (transformamos id a indice)

      #COMPLETAR: Generar las predicciones para el usuario x
      preds = model.predict(user_ids=user_x, item_ids = item_ids)

      #COMPLETAR: Basándose en el ejemplo anterior, ordenar las predicciones de menor a mayor y quedarse con 50.
      scores = pd.Series(preds)
      scores.index = inter_matrix.columns
      scores = list(pd.Series(scores.sort_values(ascending=False).index))[:50]

      #COMPLETAR: Obtener listado de contenidos vistos anteriormente por el usuario (en el set de train)
      watched_contents = df_filtered[df_filtered.account_id == user].content_id.unique()

      #COMPLETAR: Filtrar contenidos ya vistos y quedarse con los primeros 10
      recomms = [x for x in scores if (x not in watched_contents) & (x not in stale_content)][:20]

      # Guardamos las recomendaciones en el diccionario
      recomms_dict['user_id'].append(user)
      recomms_dict['recomms'].append(recomms)
    
    # En este else trataremos a los usuarios que no están en la matriz (cold start)
    else:
      recomms_dict['user_id'].append(user)
      # Les recomendamos contenido popular
      recomms_dict['recomms'].append(popular_content)

100%|██████████| 11936/11936 [01:42<00:00, 116.87it/s]


In [None]:
recomms_df = pd.DataFrame(recomms_dict)
recomms_df

Unnamed: 0,user_id,recomms
0,3387,"[3900.0, 3384.0, 3386.0, 3382.0, 3578.0]"
1,3388,"[3900.0, 729.0, 3384.0, 3382.0, 3578.0]"
2,3389,"[3900.0, 729.0, 3384.0, 3386.0, 3382.0]"
3,3393,"[3900.0, 729.0, 3384.0, 3386.0, 3382.0, 3578.0]"
4,3394,"[3900.0, 729.0, 3384.0, 3386.0, 3382.0, 3578.0]"
...,...,...
11931,5460,"[3900.0, 729.0, 3384.0, 3386.0, 3382.0, 3578.0]"
11932,107407,"[3900.0, 729.0, 3384.0, 3386.0, 3578.0, 3382.0]"
11933,111249,"[3900.0, 729.0, 3384.0, 3386.0, 3382.0, 3578.0]"
11934,40215,"[3900.0, 729.0, 3386.0, 3382.0, 3578.0]"


###Evaluación con MAP

Primero ordenamos los contenidos según los ratings (mayor a menor) que dieron los usuarios en el set de test.

Luego, agrupamos y armamos un listado de libros para cada usuario.

Este es el listado "ideal" contra el que vamos a comparar nuestras recomendaciones.

In [None]:
#creamos la columna is_seen en base al rating. si vio más el 50% consideramos que si fue visto el contenido.
test['is_seen'] = np.where(test['rating'] >= 0.5, True, False)
test.head()

Unnamed: 0,account_id,asset_id,content_id,rating,is_seen
2,3387,895.0,1983.0,1.0,True
4,3387,26062.0,729.0,0.469388,False
60,3388,5168.0,3386.0,0.53211,True
68,3389,20645.0,3273.0,0.174419,False
71,3389,19971.0,3578.0,0.6,True


In [None]:
ideal_recomms = test[test.is_seen]\
                  .sort_values(by=["account_id", "rating"], ascending=False)\
                  .groupby(["account_id"], as_index=False)\
                  .agg({"content_id": "unique"})\
                  .head()
ideal_recomms.head()

Unnamed: 0,account_id,content_id
0,140,[2200.0]
1,141,[4334.0]
2,142,"[3210.0, 2524.0, 3189.0, 4102.0]"
3,146,[4249.0]
4,148,"[1946.0, 4133.0, 2174.0, 3386.0, 1983.0]"


In [None]:
recomms_df.head()

Unnamed: 0,user_id,recomms
0,3387,"[3900.0, 3384.0, 3386.0, 3382.0, 3578.0]"
1,3388,"[3900.0, 729.0, 3384.0, 3382.0, 3578.0]"
2,3389,"[3900.0, 729.0, 3384.0, 3386.0, 3382.0]"
3,3393,"[3900.0, 729.0, 3384.0, 3386.0, 3382.0, 3578.0]"
4,3394,"[3900.0, 729.0, 3384.0, 3386.0, 3382.0, 3578.0]"


In [None]:
df_map = ideal_recomms.merge(recomms_df, how="left", left_on="account_id", right_on = 'user_id')[["account_id", "content_id", "recomms"]]
df_map.columns = ["account_id", "ideal", "recomms"]
df_map.head()

Unnamed: 0,account_id,ideal,recomms
0,140,[2200.0],"[3900.0, 729.0, 3384.0, 3386.0, 3382.0, 3578.0]"
1,141,[4334.0],"[3900.0, 729.0, 3384.0, 3386.0, 3382.0, 3578.0]"
2,142,"[3210.0, 2524.0, 3189.0, 4102.0]","[3900.0, 729.0, 3384.0, 3386.0, 3382.0]"
3,146,[4249.0],"[3900.0, 729.0, 3384.0, 3386.0, 3382.0, 3578.0]"
4,148,"[1946.0, 4133.0, 2174.0, 3386.0, 1983.0]","[3900.0, 729.0, 3384.0, 3382.0, 3578.0]"


In [None]:
aps = [] # lista vacía para ir almacenando la AP de cada recomendación

for pred, label in df_map[["ideal", "recomms"]].values:
  n = len(pred) # cantidad de elementos recomendados
  arange = np.arange(n, dtype=np.int32) + 1. # indexamos en base 1 
  rel_k = np.in1d(pred[:n], label) # lista de booleanos que indican la relevancia de cada ítem
  tp = np.ones(rel_k.sum(), dtype=np.int32).cumsum() # lista con el contador de verdaderos positivos
  denom = arange[rel_k] # posiciones donde se encuentran los ítems relantes
  ap = (tp / denom).sum() / len(label) # average precision
  aps.append(ap)

In [None]:
MAP = np.mean(aps)
print(f'mean average precision = {round(MAP, 5)}')

mean average precision = 0.0
