# Cold Start Baselines

Algorithms

1. `best_memes_from_each_source`. Current production (with small diffs)
2. `most_liked`. Same as prod but with randomization of top 100
3. `best_memes_from_each_cluster`. Is based on custom clusters rather than sources

Also, alternative meme stats were tried. The difference is that the alternative version only counts reactions from users that have less than 200 reactions.

**Resolution**

* Most likes on alternative stats go to AB test


In [3]:
import hashlib
import json
from datetime import datetime

import numpy as np
import polars as pl
from sklearn.model_selection import train_test_split

In [68]:
coldstart_df = pl.read_parquet('coldstart_dataset.pq')
meme_features_daily_df = pl.read_parquet('meme_features_daily.pq')

# v2 is calculated on users with less than 200 responses
meme_features_daily_v2_df = pl.read_parquet('meme_features_daily_v2.pq')

# for cluster based approach
meme_clusters_df = pl.read_parquet('meme_clusters.pq')

In [5]:
coldstart_df.slice(2, 1)

user_id,hist_size,dtm,date_dtm,hist_memes,hist_reactions,target_memes,target_reactions
str,i64,datetime[μs],datetime[μs],list[str],list[i64],list[str],list[i64]
"""486,191,407""",20,2024-04-03 19:20:00,2024-04-03 00:00:00,"[""1,237,876"", ""2,829,942"", … ""3,755,263""]","[2, 2, … 1]","[""1,197,484"", ""3,546,640"", … ""893""]","[2, 2, … 2]"


In [70]:
# Using last two weeks of the dataset since the other data were used for models training for cluter-based approaches

validation_df = (
    coldstart_df
    .filter(pl.col('date_dtm') >= datetime(2024, 4, 1))
    .filter(pl.col('date_dtm') < datetime(2024, 4, 14))
    .filter(pl.col('hist_size') <= 30)
    .filter(pl.col('target_memes').list.len() < 200)
)
len(validation_df)

5113

In [12]:
meme_features_daily_df.head(1)

meme_id,language_code,n_memes_sent,n_likes,n_dislikes,age_days,date_dtm,meme_source_id
str,str,u32,i64,i64,i64,datetime[μs],i64
"""12,528""","""ru""",0,0,0,11,2024-03-01 00:00:00,46


In [77]:
class BaseRecommender():
    def recommend(self, user_id, date_dtm, prev_ids, prev_reactions, lang_code=None):
        raise NotImplementedError

    def filter_seen(self, recs, hist_memes):
        return [meme_id for meme_id in recs if meme_id not in hist_memes]

In [78]:
class BestMemeFromEachSource(BaseRecommender):
    """
    Similar to production. Simplifications:
    Memes without stats were omitted  
    Top impression feature is omitted (gives 1.0 vs 0.8 for top 1 meme from a source by its telegram impressions)
    Impressions without reactions are omitted
    """

    score = pl.when(pl.col('age_days') < 14).then(1.0).otherwise(0.8) * pl.col('n_likes') / (pl.col('n_likes') + pl.col('n_dislikes'))

    def __init__(self, meme_features_daily_df):

        self._cache = dict()
        for date_dtm in meme_features_daily_df.select('date_dtm').unique().get_column('date_dtm').to_list():
            for lang_code in ['ru', 'en', None]:
                recs = (
                    meme_features_daily_df
                    .filter(pl.col('date_dtm') == date_dtm)
                    .filter(pl.col('n_likes') + pl.col('n_dislikes') > 0)
                    .with_columns(self.score.alias('score'))
                    .sort('score', descending=True)
                    .group_by('meme_source_id')
                    .agg(pl.all().first())
                )
                if lang_code is not None:
                    recs = recs.filter(pl.col('language_code') == lang_code)

                self._cache[(date_dtm, lang_code)] = (
                    recs
                    .get_column('meme_id')
                    .to_list()
                )


    def recommend(self, user_id, date_dtm, prev_ids, prev_reactions, lang_code=None):
        return self.filter_seen(self._cache[(date_dtm, lang_code)], prev_ids)

In [79]:
class MostLiked(BaseRecommender):
    """
    Similar to production. Simplifications:
    Memes without stats were omitted  
    Top impression feature is omitted (gives 1.0 vs 0.8 for top 1 meme from a source by its telegram impressions)
    Impressions without reactions are omitted
    """

    score = pl.when(pl.col('age_days') < 14).then(1.0).otherwise(0.8) * pl.col('n_likes') / (pl.col('n_likes') + pl.col('n_dislikes'))

    def __init__(self, meme_features_daily_df):

        self._cache = dict()

        for date_dtm in meme_features_daily_df.select('date_dtm').unique().get_column('date_dtm').to_list():
            for lang_code in ['ru', 'en', None]:
                recs = (
                    meme_features_daily_df
                    .filter(pl.col('date_dtm') == date_dtm)
                    .filter(pl.col('n_memes_sent') > 10)
                    .with_columns(self.score.alias('score'))
                )
                if lang_code is not None:
                    recs = recs.filter(pl.col('language_code') == lang_code)

                self._cache[(date_dtm, lang_code)] = (
                    recs
                    .sort('score', descending=True)
                    .head(100)
                    .select(pl.col('meme_id').shuffle(int(date_dtm.timestamp())))
                    .get_column('meme_id')
                    .to_list()
                )

    def recommend(self, user_id, date_dtm, prev_ids, prev_reactions, lang_code=None):
        return self.filter_seen(self._cache[(date_dtm, lang_code)], prev_ids)

In [80]:
class RandomRecs(BaseRecommender):
    """
    Similar to production. Simplifications:
    Memes without stats were omitted  
    Top impression feature is omitted (gives 1.0 vs 0.8 for top 1 meme from a source by its telegram impressions)
    Impressions without reactions are omitted
    """

    score = pl.when(pl.col('age_days') < 14).then(1.0).otherwise(0.8) * pl.col('n_likes') / (pl.col('n_likes') + pl.col('n_dislikes'))

    def __init__(self, meme_features_daily_df):

        self.meme_features_daily_df = meme_features_daily_df

    def recommend(self, user_id, date_dtm, prev_ids, prev_reactions, lang_code=None):
        hash = int(hashlib.sha256(user_id.encode('utf-8')).hexdigest(), 16) % 10**8
        recs = (
            meme_features_daily_df
            .filter(pl.col('date_dtm') == date_dtm)
        )
        if lang_code is not None:
            recs = recs.filter(pl.col('language_code') == lang_code)

        recs = (
            recs.sample(100, seed=hash)
            .get_column('meme_id')
            .to_list()
        )
        return self.filter_seen(recs, prev_ids)

In [81]:
class BestFromEachCluster(BaseRecommender):
    """
    Cluster Based Approach
    """

    score = pl.when(pl.col('age_days') < 14).then(1.0).otherwise(0.8) * pl.col('n_likes') / (pl.col('n_likes') + pl.col('n_dislikes'))

    def __init__(self, meme_features_daily_df, meme_clusters_df):

        self._cache = dict()

        # top 10 memes from 10 clusters
        for date_dtm in meme_features_daily_df.select('date_dtm').unique().get_column('date_dtm').to_list():
            self._cache[date_dtm] = (
            meme_features_daily_df
            .filter(pl.col('date_dtm') == date_dtm)
            .filter(pl.col('n_memes_sent') > 10)
            .join(meme_clusters_df, on='meme_id')
            .with_columns(self.score.alias('score'))
            .sort('score', descending=True)
            .group_by('cluster_id')
            .agg(pl.col('meme_id').head(10))
            .explode(pl.col('meme_id'))
            .select(pl.col('meme_id').shuffle(int(date_dtm.timestamp())))
            .get_column('meme_id')
            .to_list()
        )
        for date_dtm in meme_features_daily_df.select('date_dtm').unique().get_column('date_dtm').to_list():
            for lang_code in ['ru', 'en', None]:
                recs = (
                    meme_features_daily_df
                    .filter(pl.col('date_dtm') == date_dtm)
                    .filter(pl.col('n_memes_sent') > 10)
                    .join(meme_clusters_df, on='meme_id')
                    .with_columns(self.score.alias('score'))
                )
                if lang_code is not None:
                    recs = recs.filter(pl.col('language_code') == lang_code)

                self._cache[(date_dtm, lang_code)] = (
                    recs
                    .sort('score', descending=True)
                    .group_by('cluster_id')
                    .agg(pl.col('meme_id').head(10))
                    .explode(pl.col('meme_id'))
                    .select(pl.col('meme_id').shuffle(int(date_dtm.timestamp())))
                    .get_column('meme_id')
                    .to_list()
                )

    def recommend(self, user_id, date_dtm, prev_ids, prev_reactions, lang_code=None):
        return self.filter_seen(self._cache[(date_dtm, lang_code)], prev_ids)

In [202]:
def estimate_one(recs, target_memes, target_reactions):
    """Matches recs with future seen memes from target list
    Calculates likes and dislikes"""
    likes = 0
    dislikes = 0
    for meme_id in recs:
        if meme_id not in target_memes:
            continue
        idx = target_memes.index(meme_id)
        reaction = target_reactions[idx]
        if reaction == 1:
            likes += 1
            continue
        if reaction == 2:
            dislikes += 1
            continue

    if (likes + dislikes) == 0:
        return None, None, None
    
    lr = likes / (likes + dislikes)

    return likes, dislikes, lr


def estimate(model: BaseRecommender, df: pl.DataFrame):
    rows = []
    for row in df.iter_rows(named=True):
        recs_ru = model.recommend(row['user_id'], row['date_dtm'], row['hist_memes'], row['hist_reactions'], lang_code='ru')[:100]
        recs_en = model.recommend(row['user_id'], row['date_dtm'], row['hist_memes'], row['hist_reactions'], lang_code='en')[:50]
        # recs_ru = []
        # recs_en = []
        # recs_all = []
        recs_all = model.recommend(row['user_id'], row['date_dtm'], row['hist_memes'], row['hist_reactions'])[:50]

        recs = recs_ru + recs_en + recs_all
        recs = list(set(recs))

        likes, dislikes, lr = estimate_one(recs, row['target_memes'], row['target_reactions'])

        rows.append({
            'user_id': row['user_id'],
            'hist_size': row['hist_size'],
            'date_dtm': row['date_dtm'],
            'likes': likes,
            'dislikes': dislikes,
            'lr': lr,
        })

    results_df = pl.DataFrame(rows)

    likes = results_df['likes'].sum()
    n = results_df['likes'].sum() + results_df['dislikes'].sum()
    lr = results_df['likes'].sum() / (results_df['likes'].sum() + results_df['dislikes'].sum())
    lr_micro = results_df['lr'].mean()
    std = np.sqrt(lr * (1 - lr) / n)

    print(f'Likes - {likes}, Like Rate = {lr:.3f} +-{std * 1.98:.3f}, Like Rate Micro = {lr_micro:.3f}')

In [203]:
random_model = RandomRecs(meme_features_daily_df)
estimate(random_model, validation_df)

Likes - 304, Like Rate = 0.528 +-0.041, Like Rate Micro = 0.530


In [204]:
best_meme_from_each_source = BestMemeFromEachSource(meme_features_daily_df)
estimate(best_meme_from_each_source, validation_df)

Likes - 822, Like Rate = 0.544 +-0.025, Like Rate Micro = 0.551


In [205]:
most_liked = MostLiked(meme_features_daily_df)
estimate(most_liked, validation_df)

Likes - 5279, Like Rate = 0.576 +-0.010, Like Rate Micro = 0.541


In [206]:
most_liked_v2 = MostLiked(meme_features_daily_v2_df)
estimate(most_liked_v2, validation_df)

Likes - 9557, Like Rate = 0.592 +-0.008, Like Rate Micro = 0.531


In [208]:
best_from_each_cluster = BestFromEachCluster(meme_features_daily_df, meme_clusters_df)
estimate(best_from_each_cluster, validation_df)

Likes - 4052, Like Rate = 0.577 +-0.012, Like Rate Micro = 0.538


In [207]:
best_from_each_cluster_v2 = BestFromEachCluster(meme_features_daily_v2_df, meme_clusters_df)
estimate(best_from_each_cluster_v2, validation_df)

Likes - 7152, Like Rate = 0.585 +-0.009, Like Rate Micro = 0.543


## Most liked on meme_stats_v2 are going for testing

In [66]:
with open('240421_meme_ids_100.json', 'w') as f:
    ids = most_liked_v2._cache[datetime(2024, 4, 13)]
    ids = [int(id.replace(',', '')) for id in ids]
    json.dump(ids, f)

## Code to generate query string for the bot

In [162]:
'/meme ' + ' '.join([s.replace(',', '') for s in best_from_each_cluster_v2._cache[(datetime(2024, 4, 10), 'ru')][:100]])

'/meme 3858183 1106732 1117990 321242 1721545 1901653 3859587 2279191 3618063 3933620'

## AB test

In [201]:
', '.join(
    [s.replace(',', '') for s in most_liked_v2._cache[(datetime(2024, 4, 10), 'en')][:50]]
    + [s.replace(',', '') for s in most_liked_v2._cache[(datetime(2024, 4, 10), 'ru')][:100]]
    + [s.replace(',', '') for s in most_liked_v2._cache[(datetime(2024, 4, 10), None)][:50]]
)

'4101086, 4442353, 3755262, 4524041, 914304, 1213657, 3477742, 3850309, 4106545, 3918656, 1976055, 3729527, 4370768, 4031941, 3902467, 3940729, 3966109, 4144377, 4131644, 4720051, 4438220, 943398, 3486879, 3958437, 3193252, 4011185, 3855063, 4261258, 4368086, 4255270, 1194244, 10222, 4818828, 3820043, 758408, 3188657, 4451345, 2050874, 4665040, 4106819, 3798967, 1825631, 3140601, 4840661, 4250457, 10202, 4363045, 3823857, 3755199, 4214428, 3604880, 3759401, 3928967, 3859587, 1240438, 4634391, 4002944, 2914449, 1955395, 1902244, 4256739, 1721327, 1285555, 1901653, 1584871, 3517077, 4493086, 4128512, 3570595, 3975285, 1484762, 1811655, 1071204, 4033401, 2294710, 4236782, 881987, 4180263, 1100991, 3867070, 1859048, 4285721, 1466518, 2262302, 4478289, 1859157, 4232654, 1202886, 978202, 2279188, 1892350, 961273, 4033397, 3513207, 3635346, 4320621, 4558947, 4252321, 1084225, 2350587, 4339982, 3724969, 3613758, 1768655, 4148626, 1285566, 2181541, 1103300, 3516406, 1197518, 4036174, 3537906, 2