In [12]:
import pandas as pd
from common.data import DataLoader
from common.metrics import map_at_k
from models.popular import PopularRecommender, SegmentRecommender
from models.lightfm import WeightFeaturedLightFM
from recsys_course.const import *

In [2]:
# Load the preprocessed dataset, telling the loader which columns hold the
# user id, item id and interaction date.
loader_options = {
    'user_col': 'user_id',
    'item_col': 'item_id',
    'date_col': 'last_watch_dt',
    # NOTE(review): presumably 0 means "do not filter interactions by watched
    # percentage" -- confirm against DataLoader.from_folder.
    'watched_pct_min': 0,
}
data = DataLoader.from_folder('../data/preprocessed/', **loader_options)

In [11]:
# Which users requested in the sample submission have no recorded interactions
# but do appear in the users table?
submission = pd.read_csv('../data/raw/sample_submission.csv')

submission_user_ids = set(submission['user_id'].tolist())
interacted_user_ids = set(data.interactions[USER_COL])
profiled_user_ids = set(data.users[USER_COL].unique().tolist())

# Displayed as the cell output: (submission minus interacted) restricted to
# users present in data.users.
(submission_user_ids - interacted_user_ids) & profiled_user_ids

{393224,
 524297,
 131083,
 393235,
 655382,
 524312,
 917528,
 655391,
 524321,
 524323,
 655397,
 131118,
 524338,
 655411,
 131127,
 786501,
 70,
 917574,
 917577,
 786507,
 655437,
 917585,
 85,
 1048668,
 917597,
 97,
 131170,
 524387,
 1048675,
 655459,
 262242,
 786533,
 655471,
 786551,
 262263,
 124,
 1048700,
 262272,
 524421,
 135,
 1048713,
 1048714,
 1048718,
 1048720,
 524440,
 524441,
 786584,
 1048731,
 786585,
 131229,
 786588,
 1048735,
 262304,
 131235,
 393379,
 169,
 262314,
 1048750,
 786610,
 180,
 184,
 131260,
 190,
 262335,
 201,
 1048780,
 524493,
 655564,
 131278,
 786640,
 524499,
 1048791,
 220,
 786652,
 262367,
 393443,
 1048806,
 233,
 524521,
 393451,
 1048813,
 262384,
 248,
 1048825,
 262396,
 258,
 917765,
 262413,
 655630,
 262415,
 393491,
 524564,
 277,
 131351,
 786713,
 524575,
 524584,
 262440,
 298,
 262441,
 131372,
 302,
 786735,
 524594,
 262452,
 262454,
 655673,
 524603,
 131388,
 917824,
 786753,
 131396,
 917828,
 786758,
 917831,
 104

In [3]:
# Split the interactions into train / test parts (test_size=0.3 is forwarded
# to the loader's splitter).
train, test = data.get_train_test(test_size=0.3)

# Parse the date column on both parts; the column is overwritten on the
# returned frames themselves.
for split in (train, test):
    split[DATE_COL] = pd.to_datetime(split[DATE_COL])

In [13]:
# Cold users are those that occur in test but never in train.
train_user_ids = set(train['user_id'].unique().tolist())
test_user_ids = set(test['user_id'].unique().tolist())
cold_user_ids = test_user_ids - train_user_ids

# Partition the test interactions with a single membership mask.
is_cold = test['user_id'].isin(cold_user_ids)
test_cold = test[is_cold]
test_warm = test[~is_cold]

## Cold Predictions

In [14]:
# One row per cold user: 'user_id' plus 'real', the list of item ids that user
# has in the cold part of the test set.
test_cold_df = (
    test_cold
    .groupby('user_id')['item_id']
    .apply(list)
    .rename('real')
    .reset_index()
)

In [15]:
# Baseline for cold users: fit PopularRecommender on train, recommend top_k
# items to every cold user and score the result with map_at_k.
top_k = 10  # shared by recommend() and map_at_k() so the two cannot drift apart

popular = PopularRecommender(
    fb__min_watched_pct=10,
    fb__total_dur_min=2000,
    days=10,
    date_col=DATE_COL,
    user_col=USER_COL,
    item_col=ITEM_COL,
)
popular.fit(train)

cold_user_list = test_cold_df['user_id'].tolist()
test_cold_df['recs'] = popular.recommend(cold_user_list, top_k)

# Last expression of the cell -> shown as the cell output.
map_at_k(top_k, test_cold_df['recs'], test_cold_df['real'])

0.27165462904601767

In [17]:
# Same evaluation as the previous cell, but with SegmentRecommender segmented
# by the 'age' and 'sex' user features.
# NOTE: this rebinds `popular` and overwrites test_cold_df['recs'], so the
# PopularRecommender model and its recommendations are no longer available
# after this cell runs.
top_k = 10  # shared by recommend() and map_at_k() so the two cannot drift apart

popular = SegmentRecommender(
    fb__min_watched_pct=10,
    fb__total_dur_min=2000,
    days=10,
    date_col=DATE_COL,
    user_col=USER_COL,
    item_col=ITEM_COL,
    segment=['age', 'sex'],
)
# User features are attached before fitting, in the same order as the
# original cell.
popular.add_user_features(data.users)
popular.fit(train)

cold_user_list = test_cold_df['user_id'].tolist()
test_cold_df['recs'] = popular.recommend(cold_user_list, top_k)

# Last expression of the cell -> shown as the cell output.
map_at_k(top_k, test_cold_df['recs'], test_cold_df['real'])

32it [00:10,  2.94it/s]


0.2930093404790069

In [20]:
# Edge-case probe: ask for recommendations with no user ids at all.
# In the recorded run this returned a one-row Series named 'recs' rather than
# an empty result.
no_user_ids = []
popular.recommend(no_user_ids, 10)

0    [15297, 10440, 13865, 9728, 12192, 4151, 3734,...
Name: recs, dtype: object