In [1]:
import os
import pandas as pd
import numpy as np
import pickle

import lightfm as lfm
from lightfm import data

from tqdm.notebook import tqdm

  "LightFM was compiled without OpenMP support. "


In [2]:
SEEN_PICKLE_PATH = '../resources/seen.pkl'
SIMPLE_PREDICTION_PICKLE_PATH = '../resources/simple_prediction.pkl'
SIMPLE_PREDICTION_CSV_PATH = '../resources/simple_prediction.csv'
LIGHTFM_PREDICTION_PICKLE_PATH = '../resources/lightfm_prediction.pkl'
LIGHTFM_PREDICTION_CSV_PATH = '../resources/lightfm_prediction.csv'

RANDOM_SEED = 671993

N_THREADS = 6
N_EPOCHS = 20

In [3]:
original_df = pd.read_csv('../resources/train.csv')

In [4]:
len(original_df["account_id"].unique())

113881

In [5]:
df_train = pd.read_csv(
    '../resources/train_full.csv',
    parse_dates=[
        'tunein',
        'tuneout',
        'end_vod_date'
    ]
)
# df_train = df_train[~np.isinf(df_train.pct_seen)]

In [6]:
len(df_train["account_id"].unique())

113879

In [7]:
missing_acc = set(original_df["account_id"].unique()) - set(df_train["account_id"].unique())
len(missing_acc)

2

In [12]:
available_contents = df_train[df_train['end_vod_date'] > '2021-12-31 23:59:59+00:00']['content_id'].unique()
len(available_contents)

937

### Seen by account
We process which contents have been seen by all the different accounts and store it on a dict.

In [13]:
if os.path.exists(SEEN_PICKLE_PATH):
    seen = pickle.load(open(SEEN_PICKLE_PATH, 'rb'))
else:
    seen = {}
    for account_id in tqdm(range(113880+1)):
        seen[account_id] = df_train\
            .loc[df_train["account_id"] == account_id, "content_id"]\
            .tolist()

    pickle.dump(seen, open(SEEN_PICKLE_PATH, 'wb'))

# Simple Prediction

In [14]:
gb = df_train.groupby("account_id", as_index=False).size()
gb2 = df_train.groupby("content_id", as_index=False).size()

In [16]:
simple_prediction = {}
for account_id in tqdm(seen.keys()):
    simple_prediction[account_id] = gb.index[:200].values
    simple_prediction[account_id] = [t for t in simple_prediction[account_id] if t not in seen[account_id] and t in available_contents]
    simple_prediction[account_id] = simple_prediction[account_id][:20]

    assert len(simple_prediction[account_id]) == 20

  0%|          | 0/113881 [00:00<?, ?it/s]

In [17]:
pickle.dump(simple_prediction, open(SIMPLE_PREDICTION_PICKLE_PATH, 'wb'))

with open(SIMPLE_PREDICTION_CSV_PATH, "wt") as f:
    _ = f.write("account_id,content_ids\n")

    for account_id in tqdm(simple_prediction.keys()):
        _ = f.write(f"{account_id},{' '.join(map(str, simple_prediction[account_id]))}\n")

  0%|          | 0/113881 [00:00<?, ?it/s]

In [18]:
df_train.head(5)

Unnamed: 0,customer_id,account_id,device_type,asset_id,tunein,tuneout,resume,content_id,run_time_min,end_vod_date,view_time_min
0,94636,0,STB,15900,2021-01-18 15:21:00,2021-01-18 17:29:00,0,1503.0,128.0,2021-03-05 23:59:59+00:00,128.0
1,94636,0,STB,13056,2021-01-13 00:12:00,2021-01-13 01:17:00,1,2866.0,86.0,2021-12-31 23:59:59+00:00,65.0
2,94636,0,STB,29811,2021-01-12 21:13:00,2021-01-12 22:32:00,0,3438.0,79.0,2021-05-11 23:59:00+00:00,79.0
3,94636,0,STB,29897,2021-01-17 01:45:00,2021-01-17 02:01:00,1,3498.0,96.0,2023-10-01 23:59:59+00:00,16.0
4,94636,0,CLOUD_CLIENT,6397,2021-02-25 21:05:00,2021-02-25 21:53:00,0,3845.0,43.0,2021-06-01 23:59:00+00:00,48.0


## Dataset Filtering Preparation

In [19]:
df_train.shape

(3657779, 11)

In [20]:
# # accounts which have seen at least N contents
# min_views_acc_cap = gb.loc[gb["size"] >= 5, "account_id"].unique()
#
# df_train = df_train[df_train["account_id"].isin(min_views_acc_cap)]
# df_train.shape

In [21]:
# # contents viewed at least by N accounts
# min_views_content_cap = gb2.loc[gb2["size"] >= 10, "content_id"].unique()
#
# df_train = df_train[df_train["content_id"].isin(min_views_content_cap)]
# df_train.shape

In [22]:
# # Filter contents which have been seen more than 20%
# df_train = df_train[df_train.pct_seen > 0.10]
#
# df_train.shape


# LightFM (Collaborative Filtering)

## Training
We will use lightfm to traing our model.

In [23]:
df_train = df_train[['account_id', 'content_id']]
df_train.shape

(3657779, 2)

In [24]:
df_train = df_train.drop_duplicates()
df_train.shape

(971564, 2)

In [28]:
df_train = df_train.dropna()
df_train.shape


(971470, 2)

In [29]:
missing_acc = set(original_df["account_id"].unique()) - set(df_train["account_id"].unique())
len(missing_acc)

6

In [30]:
ds = data.Dataset()
ds.fit(
    users=df_train["account_id"].unique(),
    items=df_train["content_id"].unique()
)
ds.interactions_shape()

(113875, 4064)

In [31]:
(interactions, weights) = ds.build_interactions(
    df_train[['account_id', 'content_id']].itertuples(index=False)
)
del weights
interactions

<113875x4064 sparse matrix of type '<class 'numpy.int32'>'
	with 971470 stored elements in COOrdinate format>

In [32]:
model = lfm.LightFM(
    learning_rate=0.05,
    loss='warp',
    no_components=20,
    random_state=RANDOM_SEED
)

model.fit(
    interactions,
    epochs=N_EPOCHS,
    num_threads=N_THREADS
)

<lightfm.lightfm.LightFM at 0x157ab67d0>

## Prediction

In [33]:
user_id_mapping, user_feature_mapping, item_id_mapping, item_feature_mapping = ds.mapping()
# item_id_mapping

In [34]:
# # Prediction Test
model.predict(
    np.array([user_id_mapping[2761], user_id_mapping[2834]], dtype=np.int32),
    np.array([item_id_mapping[2645], item_id_mapping[2220]], dtype=np.int32),
    num_threads=N_THREADS
)

array([-3.5945892, -1.30806  ], dtype=float32)

In [35]:
len(df_train["account_id"].unique())

113875

In [36]:
# TODO ver por que tenemos menos records
# (hipotesis es que en algun momento filtramos account_ids)
all_contents = df_train["content_id"].unique()

lightfm_prediction = {}

for account_id in tqdm(seen.keys()):
    not_seen = [t for t in all_contents if t not in seen[account_id]]

    if account_id in missing_acc:
        lightfm_prediction[account_id] = simple_prediction[account_id]
    else:
        pred = model.predict(
            user_id_mapping[account_id],
            np.array([item_id_mapping[t] for t in not_seen]),
            num_threads=N_THREADS
        )

        # Join lists ordered by prediction
        lightfm_prediction[account_id] = [t[1] for t in sorted(zip(pred, not_seen), reverse=True)[:200]]

        # Filtramos por contenidos que sigan disponibles despues del 31-12
        lightfm_prediction[account_id] = [t for t in lightfm_prediction[account_id] if t in available_contents]

        lightfm_prediction[account_id] = list(set(lightfm_prediction[account_id]))

        if len(lightfm_prediction[account_id]) < 20:
            # no_personalizado --> perfiles
            lightfm_prediction[account_id] = list(set(lightfm_prediction[account_id] + [p for p in simple_prediction[account_id] if p not in lightfm_prediction[account_id]]))

        if len(lightfm_prediction[account_id]) > 20:
            lightfm_prediction[account_id] = lightfm_prediction[account_id][:20]

    assert len(set(lightfm_prediction[account_id])) == 20

  0%|          | 0/113881 [00:00<?, ?it/s]

In [44]:
example = set(map(lambda x: x.astype(int), lightfm_prediction[1]))
example

{6,
 136,
 289,
 387,
 392,
 524,
 527,
 658,
 1176,
 1573,
 2186,
 2202,
 2314,
 2827,
 2972,
 3091,
 3353,
 3712,
 3726,
 3727}

In [45]:
pickle.dump(lightfm_prediction, open(LIGHTFM_PREDICTION_PICKLE_PATH, 'wb'))

with open(LIGHTFM_PREDICTION_CSV_PATH, "wt") as f:
    _ = f.write("account_id,content_ids\n")

    for account_id in tqdm(lightfm_prediction.keys()):
        contents = set(map(lambda x: x.astype(int), lightfm_prediction[account_id]))
        _ = f.write(f"{account_id},{' '.join(map(str, contents))}\n")




  0%|          | 0/113881 [00:00<?, ?it/s]