In [1]:
import pandas as pd
from catboost import CatBoostClassifier

from common.data import DataLoader
from common.metrics import map_at_k
from models.popular import SegmentRecommender, PopularRecommender
from models.lightfm import WeightFeaturedLightFM
from recsys_course.const import *



In [2]:
# Column names the loader should use for the user, item and interaction-date fields.
loader_columns = dict(
    user_col='user_id',
    item_col='item_id',
    date_col='last_watch_dt',
)

# Load the preprocessed dataset from disk.
# NOTE(review): watched_pct_min=0 presumably disables filtering by watched percentage -- confirm in DataLoader.
data = DataLoader.from_folder('../data/preprocessed/', watched_pct_min=0, **loader_columns)

In [3]:
# Work on a copy of the interactions with the date column parsed to datetime,
# leaving `data.interactions` itself untouched.
train = data.interactions.assign(
    **{DATE_COL: lambda interactions: pd.to_datetime(interactions[DATE_COL])}
)

In [4]:
# The sample submission supplies the users to predict for; its `item_id`
# column is dropped here and filled with our own recommendations later on.
sample_submission = pd.read_csv('../data/raw/sample_submission.csv')
test = sample_submission.drop(columns=['item_id'])
test

Unnamed: 0,user_id
0,3
1,11
2,29
3,30
4,33
...,...
193108,1097527
193109,1097537
193110,1097538
193111,1097544


In [5]:
# Cold users: present in the test set but without any interaction in `train`.
train_user_ids = set(train['user_id'].unique().tolist())
test_user_ids = set(test['user_id'].unique().tolist())
cold_user_ids = test_user_ids - train_user_ids

# One boolean mask drives both halves of the split, so they are complementary by construction.
is_cold_user = test['user_id'].isin(cold_user_ids)
test_cold = test[is_cold_user].reset_index(drop=True)
test_warm = test[~is_cold_user].reset_index(drop=True)

## Cold Predictions

In [6]:
# Split cold users by whether they appear in the user table (`data.users`).
has_user_features = test_cold['user_id'].isin(data.users['user_id'].tolist())

test_cold_no_features = test_cold.loc[~has_user_features].reset_index(drop=True)
test_cold_features = test_cold.loc[has_user_features].reset_index(drop=True)

In [7]:
# Popularity-based recommender for cold users that are absent from the user table.
# NOTE(review): `days` and the `fb__*` thresholds are passed through as-is;
# their exact meaning is defined by PopularRecommender -- confirm there.
fallback = PopularRecommender(
    date_col=DATE_COL,
    user_col=USER_COL,
    item_col=ITEM_COL,
    days=10,
    fb__min_watched_pct=10,
    fb__total_dur_min=2000,
)
fallback.fit(train)

# Request 10 recommendations for each of these users.
no_feature_user_ids = test_cold_no_features['user_id'].tolist()
test_cold_no_features['item_id'] = fallback.recommend(no_feature_user_ids, 10)

In [8]:
# Segment-based recommender for cold users that do have profile features;
# it is configured with the ('age', 'sex') pair as its segment.
popular = SegmentRecommender(
    segment=['age', 'sex'],
    date_col=DATE_COL,
    user_col=USER_COL,
    item_col=ITEM_COL,
    days=10,
    fb__min_watched_pct=10,
    fb__total_dur_min=2000,
)

# User features have to be attached before fitting.
popular.add_user_features(data.users)
popular.fit(train)

# Request 10 recommendations for each of these users.
featured_user_ids = test_cold_features['user_id'].tolist()
test_cold_features['item_id'] = popular.recommend(featured_user_ids, 10)

32it [00:07,  4.07it/s]


In [9]:
# Inspect the cold users with profile features together with the item lists assigned to them above.
test_cold_features

Unnamed: 0,user_id,item_id
0,70,"[15297, 9728, 10440, 4151, 3734, 13865, 2657, ..."
1,85,"[15297, 10440, 9728, 13865, 512, 3734, 12192, ..."
2,97,"[15297, 10440, 9728, 13865, 12192, 3734, 512, ..."
3,124,"[15297, 10440, 9728, 13865, 12192, 3734, 512, ..."
4,135,"[9728, 10440, 15297, 13865, 3734, 14488, 4151,..."
...,...,...
46203,1097453,"[15297, 10440, 9728, 3734, 4151, 13865, 12192,..."
46204,1097494,"[9728, 10440, 15297, 13865, 3734, 14488, 4151,..."
46205,1097537,"[9728, 10440, 13865, 15297, 512, 12192, 14488,..."
46206,1097538,"[9728, 13865, 10440, 15297, 3734, 512, 4685, 1..."


## Warm Predictions

In [10]:
# LightFM wrapper trained on interactions only: no user or item side features are passed.
# NOTE(review): the `notseen_watched_*` bounds and `preprocess_array_split`
# are forwarded unchanged; see WeightFeaturedLightFM for their semantics.
lfm_params = dict(
    no_components=50,
    notseen_watched_upper=95,
    notseen_watched_lower=5,
    date_col=DATE_COL,
    user_col=USER_COL,
    item_col=ITEM_COL,
    user_features_col=None,
    item_features_col=None,
    preprocess_array_split=None,
)

lfm = WeightFeaturedLightFM(**lfm_params)
lfm.fit(train)

In [11]:
# Extract the learned item representations from the fitted model.
# NOTE(review): `ib` is presumably the bias vector and `ie` the embedding
# matrix (one row per item) -- confirm against the LightFM API.
ib, ie = lfm.lightfm.get_item_representations()
# The third element of mapping() is used as the item-id mapping here.
_, _, map_, _ = lfm.data.mapping()

# Build the frame straight from the 2-D array and derive the number of
# embedding columns from it, so the cell stays valid if `no_components` changes.
# Assumes the mapping's key order matches the embedding row order, as the
# original positional zip did -- TODO confirm.
item_embeddings = pd.DataFrame(ie, columns=[f'ie_{i}' for i in range(ie.shape[1])])
# insert() raises if the number of ids differs from the number of rows,
# where zip() would have silently truncated to the shorter of the two.
item_embeddings.insert(0, 'item_id', list(map_.keys()))

item_embeddings.shape

(15706, 51)

In [12]:
# Extract the learned user representations from the fitted model.
# NOTE(review): `ub` is presumably the bias vector and `ue` the embedding
# matrix (one row per user) -- confirm against the LightFM API.
ub, ue = lfm.lightfm.get_user_representations()
# The first element of mapping() is used as the user-id mapping here.
map_, _, _, _ = lfm.data.mapping()

# Build the frame straight from the 2-D array and derive the number of
# embedding columns from it, so the cell stays valid if `no_components` changes.
# Assumes the mapping's key order matches the embedding row order, as the
# original positional zip did -- TODO confirm.
user_embeddings = pd.DataFrame(ue, columns=[f'ue_{i}' for i in range(ue.shape[1])])
# insert() raises if the number of ids differs from the number of rows,
# where zip() would have silently truncated to the shorter of the two.
user_embeddings.insert(0, 'user_id', list(map_.keys()))

user_embeddings.shape

(962179, 51)

In [13]:
# Training set for the re-ranking classifier:
#   y = 1 for observed (user, item) pairs from `train`,
#   y = 0 for the pairs in `data.unused`.
cbc_observed = train.loc[:, ['user_id', 'item_id']].assign(y=1)
cbc_unused = data.unused.assign(y=0)

# Left-join user/item metadata and the LightFM embeddings onto every pair,
# so pairs without a match keep their row and get NaN in the joined columns.
cbc_df = (
    pd.concat([cbc_observed, cbc_unused])
    .merge(data.users, on=[USER_COL], how='left')
    .merge(data.items, on=[ITEM_COL], how='left')
    .merge(user_embeddings, on=[USER_COL], how='left')
    .merge(item_embeddings, on=[ITEM_COL], how='left')
)

cbc_df

Unnamed: 0,user_id,item_id,y,age,income,sex,kids_flg,content_type,title,title_orig,...,ie_40,ie_41,ie_42,ie_43,ie_44,ie_45,ie_46,ie_47,ie_48,ie_49
0,176549,9506,1,age_35_44,income_40_60,m,0.0,film,холодное сердце,Frozen,...,-0.085968,-0.277016,-0.167539,-0.172448,0.145825,0.141672,0.212274,0.135558,0.225663,0.529132
1,699317,1659,1,age_35_44,income_40_60,m,0.0,film,три богатыря. ход конем,Tri bogatyrya. Khod konem,...,-0.255111,-0.478870,-0.111820,-0.482972,0.478152,0.031984,0.034501,-0.344941,-0.163555,0.598284
2,656683,7107,1,age_25_34,income_60_90,m,0.0,series,девятаев,V2. Escape from Hell,...,0.206653,0.135595,0.114553,0.312223,0.403134,-0.101877,-0.043148,0.063891,0.170578,-0.151763
3,864613,7638,1,age_65_inf,income_20_40,zh,0.0,series,мишель,Mishel',...,0.079247,0.586656,-0.195071,0.189152,-0.005232,0.459085,-0.198267,-0.107986,0.359136,-0.307118
4,964868,9506,1,age_25_34,income_20_40,zh,0.0,film,холодное сердце,Frozen,...,-0.085968,-0.277016,-0.167539,-0.172448,0.145825,0.141672,0.212274,0.135558,0.225663,0.529132
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15098036,1097557,12597,0,age_35_44,income_20_40,zh,0.0,film,эволюция борна,The Bourne Legacy,...,-0.020262,0.072871,-0.017856,0.073200,-0.033969,-0.345137,0.185851,-0.026299,0.032985,-0.004365
15098037,1097557,9332,0,age_35_44,income_20_40,zh,0.0,film,заноза,Splinter,...,0.034966,0.240361,-0.005437,0.063272,-0.063044,-0.069337,0.065688,0.044222,0.011055,0.024862
15098038,1097557,11352,0,age_35_44,income_20_40,zh,0.0,film,тёмная сторона,Virtualia Episode Three: Dark Side,...,0.072225,-0.082565,0.007786,-0.114298,-0.123827,-0.244393,0.238899,-0.031445,0.037276,0.194009
15098039,1097557,15512,0,age_35_44,income_20_40,zh,0.0,series,в последний раз прощаюсь,,...,-0.058781,-0.034337,-0.066000,-0.046340,0.146761,-0.191709,0.301192,-0.082545,-0.087820,0.028705


In [14]:
# Summarise the training frame: row count, column dtypes and memory usage.
cbc_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15098041 entries, 0 to 15098040
Columns: 122 entries, user_id to ie_49
dtypes: float32(100), float64(3), int64(5), object(14)
memory usage: 8.2+ GB


In [15]:
# Replace missing values with an explicit placeholder per column.
missing_placeholders = {
    'sex': 'unknown',
    'age': 'age_unknown',
    'genres': 'genres_unknown',
    'income': 'income_unknown',
}
for column, placeholder in missing_placeholders.items():
    cbc_df[column] = cbc_df[column].fillna(placeholder)

# Turn the ', '-separated genre string into a space-separated one.
cbc_df['genres'] = cbc_df['genres'].map(lambda genres: genres.replace(', ', ' '))

In [19]:
# Feature columns fed to the classifier, in the positional order it relies on:
# categorical columns first, then the genre text column, then the item and
# user embedding columns.
categorical_columns = ['sex', 'age', 'income', 'release_year_cat', 'content_type']
text_columns = ['genres']
item_embedding_columns = [f'ie_{i}' for i in range(50)]
user_embedding_columns = [f'ue_{i}' for i in range(50)]

features = [
    *categorical_columns,
    *text_columns,
    *item_embedding_columns,
    *user_embedding_columns,
]

In [None]:
# Re-ranking classifier over the (user, item) pairs in `cbc_df`.
# The model is fitted on a plain array, so categorical and text features are
# identified by position. The positions are looked up by column name in
# `features` rather than hard-coded, so they follow any change to that list.
cat_feature_names = ['sex', 'age', 'income', 'release_year_cat', 'content_type']
text_feature_names = ['genres']

cbc = CatBoostClassifier(
    cat_features=[features.index(name) for name in cat_feature_names],
    text_features=[features.index(name) for name in text_feature_names],
    iterations=20
)

cbc.fit(cbc_df[features].values, cbc_df['y'])

In [None]:
# Ask the LightFM model for 150 candidate items per warm user.
warm_user_ids = test_warm['user_id'].tolist()
test_warm['recs'] = lfm.recommend(warm_user_ids, 150)

In [None]:
# One row per (warm user, candidate item), enriched with the same metadata and
# embedding columns that were joined onto the training frame.
test_pred = (
    test_warm
    .explode('recs')
    .rename(columns={'recs': 'item_id'})
    .merge(data.items, on=['item_id'], how='left')
    .merge(data.users, on=['user_id'], how='left')
    .merge(user_embeddings, on=[USER_COL], how='left')
    .merge(item_embeddings, on=[ITEM_COL], how='left')
)

# Use the same missing-value placeholders as the training frame.
test_pred['sex'] = test_pred['sex'].fillna('unknown')
test_pred['income'] = test_pred['income'].fillna('income_unknown')
test_pred['age'] = test_pred['age'].fillna('age_unknown')
test_pred['genres'] = test_pred['genres'].fillna('genres_unknown')
# The training frame replaces ', ' with ' ' in the genre text; the same
# normalisation must be applied here, otherwise the classifier receives
# genre strings in a different format from the one it was fitted on.
test_pred['genres'] = test_pred['genres'].map(lambda x: x.replace(', ', ' '))

# Score every candidate with the predicted probability of the positive class.
test_pred['rating'] = cbc.predict_proba(test_pred[features].values)[:, 1]
# Keep the 10 highest-scored items per user. The result is an unnamed Series
# of lists indexed by user_id, so reset_index() yields a column labelled 0.
test_pred = (
    test_pred
    .groupby('user_id')
    .apply(lambda x: x.sort_values('rating', ascending=False)['item_id'].tolist()[:10])
)

In [None]:
# Sanity check: the three prediction sets together should have exactly as many rows as `test`.
n_predicted_rows = (
    len(test_cold_no_features)
    + len(test_cold_features)
    + len(test_pred)
)
len(test) == n_predicted_rows

In [None]:
# Count missing values per column in the predictions for cold users without profile features.
test_cold_no_features.isna().sum()

In [None]:
# Count missing values per column in the predictions for cold users with profile features.
test_cold_features.isna().sum()

In [None]:
# Count missing values per column in the warm-user predictions (user_id is moved out of the index first).
test_pred.reset_index().isna().sum()

In [None]:
# Warm predictions are a Series indexed by user_id; turn them into a frame
# with the same `user_id` / `item_id` columns as the cold-user frames.
warm_submission = test_pred.reset_index().rename(columns={0: 'item_id'})

# Stack the three prediction sets and write them without the index column.
submission = pd.concat([
    test_cold_no_features,
    test_cold_features,
    warm_submission,
])
submission.to_csv('../data/submit.csv', index=False)