In [1]:
import pandas as pd
from common.data import DataLoader
from common.metrics import map_at_k
from models.popular import SegmentRecommender, PopularRecommender
from models.lightfm import WeightFeaturedLightFM
from recsys_course.const import *



In [2]:
# Load the preprocessed dataset through the project's DataLoader, telling it
# which columns hold the user id, item id and interaction date.
# NOTE(review): watched_pct_min=0 presumably disables filtering by watched
# percentage — confirm against DataLoader.from_folder.
data = DataLoader.from_folder(
    '../data/preprocessed/',
    user_col='user_id',
    item_col='item_id',
    date_col='last_watch_dt',
    watched_pct_min=0
)

In [3]:
# Build the training frame as a new object (DataFrame.assign never mutates
# its source) with the date column parsed into pandas datetimes.
train = data.interactions.assign(
    **{DATE_COL: lambda frame: pd.to_datetime(frame[DATE_COL])}
)

In [4]:
# Read the sample submission and keep everything except its 'item_id' column;
# the remaining 'user_id' column lists the users to predict for.
test = pd.read_csv('../data/raw/sample_submission.csv').drop(columns=['item_id'])
test

Unnamed: 0,user_id
0,3
1,11
2,29
3,30
4,33
...,...
193108,1097527
193109,1097537
193110,1097538
193111,1097544


In [5]:
# Cold users: ids requested in the submission that never occur in train.
cold_user_ids = set(test['user_id']) - set(train['user_id'])

# Partition the submission users into cold and warm, each re-indexed from 0.
test_cold = test.loc[lambda df: df['user_id'].isin(cold_user_ids)].reset_index(drop=True)
test_warm = test.loc[lambda df: ~df['user_id'].isin(cold_user_ids)].reset_index(drop=True)

## Cold Predictions

In [6]:
# True for cold users that have a row in the user table (data.users).
has_user_features = test_cold['user_id'].isin(data.users['user_id'])

# Split the cold users by whether user features are available for them.
test_cold_no_features = test_cold.loc[~has_user_features].reset_index(drop=True)
test_cold_features = test_cold.loc[has_user_features].reset_index(drop=True)

In [7]:
# Recommender used for cold users that have no user features at all.
# NOTE(review): the meaning of `days` and the `fb__*` thresholds is defined by
# PopularRecommender — confirm there before tuning them.
fallback = PopularRecommender(
    user_col=USER_COL,
    item_col=ITEM_COL,
    date_col=DATE_COL,
    days=10,
    fb__min_watched_pct=10,
    fb__total_dur_min=2000,
)
fallback.fit(train)

# Second argument is presumably the number of items per user — TODO confirm.
no_feature_user_ids = test_cold_no_features['user_id'].tolist()
test_cold_no_features['item_id'] = fallback.recommend(no_feature_user_ids, 10)

In [8]:
# Recommender used for cold users that do have user features; it is
# configured with the 'age' and 'sex' columns as its segment.
# NOTE(review): how segments drive the ranking is defined by
# SegmentRecommender — confirm there.
popular = SegmentRecommender(
    segment=['age', 'sex'],
    user_col=USER_COL,
    item_col=ITEM_COL,
    date_col=DATE_COL,
    days=10,
    fb__min_watched_pct=10,
    fb__total_dur_min=2000,
)

# User features are registered before fitting, in the same order as before.
popular.add_user_features(data.users)
popular.fit(train)

# Second argument is presumably the number of items per user — TODO confirm.
featured_user_ids = test_cold_features['user_id'].tolist()
test_cold_features['item_id'] = popular.recommend(featured_user_ids, 10)

32it [00:07,  4.03it/s]


In [9]:
# Display the recommendations assigned to the cold users that have features.
test_cold_features

Unnamed: 0,user_id,item_id
0,70,"[15297, 9728, 10440, 4151, 3734, 13865, 2657, ..."
1,85,"[15297, 10440, 9728, 13865, 512, 3734, 12192, ..."
2,97,"[15297, 10440, 9728, 13865, 12192, 3734, 512, ..."
3,124,"[15297, 10440, 9728, 13865, 12192, 3734, 512, ..."
4,135,"[9728, 10440, 15297, 13865, 3734, 14488, 4151,..."
...,...,...
46203,1097453,"[15297, 10440, 9728, 3734, 4151, 13865, 12192,..."
46204,1097494,"[9728, 10440, 15297, 13865, 3734, 14488, 4151,..."
46205,1097537,"[9728, 10440, 13865, 15297, 512, 12192, 14488,..."
46206,1097538,"[9728, 13865, 10440, 15297, 3734, 512, 4685, 1..."


## Warm Predictions

In [None]:
# LightFM-based model for warm users, fitted on interactions only: both
# feature-column arguments are None.
lfm = WeightFeaturedLightFM(
    # column names
    user_col=USER_COL,
    item_col=ITEM_COL,
    date_col=DATE_COL,
    # no side features, no split preprocessing
    user_features_col=None,
    item_features_col=None,
    preprocess_array_split=None,
    # model size; downstream embedding columns rely on this value
    no_components=50,
    # NOTE(review): watched-percentage bounds, semantics defined by the wrapper
    notseen_watched_lower=5,
    notseen_watched_upper=95,
)

lfm.fit(train)

In [None]:
# Item biases (`ib`) and item embedding matrix (`ie`) of the fitted model.
# NOTE(review): assumes lfm.lightfm is a LightFM model and lfm.data a
# lightfm Dataset, whose mapping() returns
# (user ids, user features, item ids, item features) — confirm in the wrapper.
ib, ie = lfm.lightfm.get_item_representations()
_, _, map_, _ = lfm.data.mapping()

# One row per item: external item id followed by its embedding components.
# The number of 'ie_*' columns is taken from the matrix itself so it always
# matches the model's no_components instead of a hand-maintained constant.
item_embeddings = pd.DataFrame(
    zip(map_.keys(), *ie.T),
    columns=['item_id'] + [f'ie_{i}' for i in range(ie.shape[1])],
)

item_embeddings.shape

In [None]:
# User biases (`ub`) and user embedding matrix (`ue`) of the fitted model.
# NOTE(review): assumes lfm.lightfm is a LightFM model and lfm.data a
# lightfm Dataset, whose mapping() returns
# (user ids, user features, item ids, item features) — confirm in the wrapper.
ub, ue = lfm.lightfm.get_user_representations()
map_, _, _, _ = lfm.data.mapping()

# One row per user: external user id followed by its embedding components.
# The column count comes from the matrix width: the model is built with
# no_components=50, so a hard-coded range(20) supplies 21 names for 51-wide
# rows and the DataFrame constructor raises ValueError.
user_embeddings = pd.DataFrame(
    zip(map_.keys(), *ue.T),
    columns=['user_id'] + [f'ue_{i}' for i in range(ue.shape[1])],
)

user_embeddings.shape

In [None]:
# Every (user, item) pair observed in train, labelled y=1.
cbc_positive = train[['user_id', 'item_id']].assign(y=1)

# Pairs taken from data.unused, labelled y=0.
# NOTE(review): presumably pairs the user did not interact with — confirm in DataLoader.
cbc_unused = data.unused.assign(y=0)

# Stack both sets and attach, via left joins, user attributes, item
# attributes, user embeddings and item embeddings (in that order).
cbc_df = (
    pd.concat([cbc_positive, cbc_unused])
    .merge(data.users, on=[USER_COL], how='left')
    .merge(data.items, on=[ITEM_COL], how='left')
    .merge(user_embeddings, on=[USER_COL], how='left')
    .merge(item_embeddings, on=[ITEM_COL], how='left')
)

cbc_df

In [None]:
# Print column dtypes and non-null counts of the assembled training frame,
# to see which columns the left joins left with missing values.
cbc_df.info()

In [None]:
# Replace missing values with an explicit placeholder string per column.
for column, placeholder in {
    'sex': 'unknown',
    'age': 'age_unknown',
    'genres': 'genres_unknown',
    'income': 'income_unknown',
}.items():
    cbc_df[column] = cbc_df[column].fillna(placeholder)

# Turn the comma-separated genre list into a space-separated one.
cbc_df['genres'] = cbc_df['genres'].str.replace(', ', ' ', regex=False)

In [None]:
# Display the genres column to inspect its current values.
cbc_df['genres']

In [None]:
# Preview of the keywords column lower-cased and with ', ' separators turned
# into spaces (the same normalisation applied to genres). The result is only
# displayed, not stored back into cbc_df.
# str.replace needs both the old and the new substring; calling it with a
# single argument raises TypeError. The .str accessor also passes missing
# values through as NaN instead of failing on them.
cbc_df['keywords'].str.lower().str.replace(', ', ' ', regex=False)

In [None]:
# Model input columns, in the exact order the classifier's positional
# cat/text feature indices rely on.
features = [
    # categorical columns (positions 0-4)
    'sex', 'age', 'income', 'release_year_cat', 'content_type',
    # text column (position 5)
    'genres',
    # first 20 item-embedding components, then first 20 user-embedding components
    *(f'ie_{i}' for i in range(20)),
    *(f'ue_{i}' for i in range(20)),
]

In [None]:
from catboost import CatBoostClassifier

# Feature positions refer to the column order of `features`: the first five
# columns are categorical and the sixth ('genres') is a text feature.
cbc = CatBoostClassifier(
    iterations=20,
    cat_features=list(range(5)),
    text_features=[5],
)

# Train on the plain value matrix of the selected columns against label y.
cbc.fit(cbc_df[features].to_numpy(), cbc_df['y'])

In [None]:
# Candidate generation for warm users: store the LightFM wrapper's output in a
# 'recs' column (later exploded into one row per candidate item).
# NOTE(review): 120 is presumably the number of candidates per user — confirm
# against WeightFeaturedLightFM.recommend.
test_warm['recs'] = lfm.recommend(test_warm['user_id'].tolist(), 120)

In [None]:
# One row per (warm user, candidate item), enriched via left joins with item
# attributes, user attributes, user embeddings and item embeddings.
test_pred = (
    test_warm
    .explode('recs')
    .rename(columns={'recs': 'item_id'})
    .merge(data.items, on=['item_id'], how='left')
    .merge(data.users, on=['user_id'], how='left')
    .merge(user_embeddings, on=[USER_COL], how='left')
    .merge(item_embeddings, on=[ITEM_COL], how='left')
)

# Same missing-value placeholders as used for the training frame.
test_pred['sex'] = test_pred['sex'].fillna('unknown')
test_pred['income'] = test_pred['income'].fillna('income_unknown')
test_pred['age'] = test_pred['age'].fillna('age_unknown')
test_pred['genres'] = test_pred['genres'].fillna('genres_unknown')
# Apply the same genres normalisation as at training time (', ' -> ' ').
# Without it the text feature is tokenised differently at inference than it
# was when the classifier was fitted.
test_pred['genres'] = test_pred['genres'].map(lambda genres: genres.replace(', ', ' '))

# Score every candidate with the probability of the positive class.
test_pred['rating'] = cbc.predict_proba(test_pred[features].values)[:, 1]

# Keep the 10 best-scored items per user. The result is an unnamed Series of
# lists indexed by user_id; the submission cell relies on reset_index()
# naming the values column 0.
test_pred = (
    test_pred
    .groupby('user_id')
    .apply(lambda x: x.sort_values('rating', ascending=False)['item_id'].tolist()[:10])
)

In [None]:
# Sanity check: the three prediction parts together must contain exactly one
# row per user of the submission template.
len(test) == sum(
    len(part)
    for part in (test_cold_no_features, test_cold_features, test_pred)
)

In [None]:
# Count missing values per column for the cold users without features.
test_cold_no_features.isna().sum()

In [None]:
# Count missing values per column for the cold users with features.
test_cold_features.isna().sum()

In [None]:
# Count missing values per column for the warm predictions; reset_index()
# turns the user_id index into a regular column so it is checked as well.
test_pred.reset_index().isna().sum()

In [None]:
# test_pred is a Series indexed by user_id; reset_index() exposes user_id as a
# column and names the values column 0, which is renamed to 'item_id' here.
warm_submission = test_pred.reset_index().rename(columns={0: 'item_id'})

# Stack the three parts and write them out without the row index.
submission = pd.concat([
    test_cold_no_features,
    test_cold_features,
    warm_submission,
])
submission.to_csv('../data/submit.csv', index=None)