# Cold Start Baselines

Algorithms

1. `best_memes_from_each_source`. Current production (with small diffs)
2. `most_liked`. Same as prod but with randomization of top 100
3. `best_memes_from_each_cluster`. Is based on custom clusters rather than sources

Also, alternative meme stats were tried. The difference is that the alternative version only counts reactions from users that have less than 200 reactions.

**Resolution**

* Most likes on alternative stats go to AB test


In [35]:
import hashlib
import json
from datetime import datetime

import numpy as np
import polars as pl
from sklearn.model_selection import train_test_split

In [36]:
coldstart_df = pl.read_parquet('coldstart_dataset.pq')
meme_features_daily_df = pl.read_parquet('meme_features_daily.pq')

# v2 is calculated on users with less than 200 responses
meme_features_daily_v2_df = pl.read_parquet('meme_features_daily_v2.pq')

In [37]:
coldstart_df.slice(2, 1)

user_id,hist_size,dtm,date_dtm,hist_memes,hist_reactions,target_memes,target_reactions
str,i64,datetime[μs],datetime[μs],list[str],list[i64],list[str],list[i64]
"""486,191,407""",20,2024-04-03 19:20:00,2024-04-03 00:00:00,"[""1,237,876"", ""2,829,942"", … ""3,755,263""]","[2, 2, … 1]","[""1,197,484"", ""3,546,640"", … ""893""]","[2, 2, … 2]"


In [5]:
user_ids = coldstart_df.get_column('user_id').unique().to_list()
train_user_ids, test_user_ids = train_test_split(user_ids, test_size=0.2, random_state=42)
print(f'train size = {len(train_user_ids)} users, test size = {len(test_user_ids)} users')

train size = 3288 users, test size = 823 users


In [6]:
train_df = (
    coldstart_df
    .filter(pl.col('user_id').is_in(train_user_ids))
    .filter(pl.col('hist_size') <= 30)
)
len(train_df)

7376

In [7]:
last_2w_df = (
    coldstart_df
    .filter(pl.col('date_dtm') >= datetime(2024, 4, 1))
    .filter(pl.col('hist_size') <= 30)
    .filter(pl.col('target_memes').list.len() < 100)
)
len(last_2w_df)

4831

In [8]:
meme_features_daily_df.head(1)

meme_id,n_memes_sent,n_likes,n_dislikes,age_days,date_dtm,meme_source_id
str,u32,i64,i64,i64,datetime[μs],i64
"""12,528""",0,0,0,11,2024-03-01 00:00:00,46


In [39]:
# Similar to production. Simplifications:
# Memes without stats were omitted
# Top impression feature is omitted (gives 1.0 vs 0.8 for top 1 meme from a source by its telegram impressions)
# Impressions without reactions are omitted

score = pl.when(pl.col('age_days') < 14).then(1.0).otherwise(0.8) * pl.col('n_likes') / (pl.col('n_likes') + pl.col('n_dislikes'))


def best_meme_from_each_source(meme_features_daily_df, date_dtm):
    return (
        meme_features_daily_df
        .filter(pl.col('date_dtm') == date_dtm)
        .filter(pl.col('n_likes') + pl.col('n_dislikes') > 0)
        .with_columns(score.alias('score'))
        .sort('score', descending=True)
        .group_by('meme_source_id')
        .agg(pl.all().first())
        .get_column('meme_id')
        .to_list()
    )

best_meme_from_each_source_cache = {}
for date_dtm in train_df.select('date_dtm').unique().get_column('date_dtm').to_list():
    best_meme_from_each_source_cache[date_dtm] = best_meme_from_each_source(meme_features_daily_df, date_dtm)

best_meme_from_each_source_v2_cache = {}
for date_dtm in train_df.select('date_dtm').unique().get_column('date_dtm').to_list():
    best_meme_from_each_source_v2_cache[date_dtm] = best_meme_from_each_source(meme_features_daily_v2_df, date_dtm)

In [40]:
# Most liked

score = pl.when(pl.col('age_days') < 14).then(1.0).otherwise(0.8) * pl.col('n_likes') / (pl.col('n_likes') + pl.col('n_dislikes'))


def most_liked(meme_features_daily_df, date_dtm):
    return (
        meme_features_daily_df
        .filter(pl.col('date_dtm') == date_dtm)
        .filter(pl.col('n_memes_sent') > 10)
        .with_columns(score.alias('score'))
        .sort('score', descending=True)
        .head(100)
        .select(pl.col('meme_id').shuffle(int(date_dtm.timestamp())))
        .get_column('meme_id')
        .to_list()
    )

most_liked_cache = {}
for date_dtm in train_df.select('date_dtm').unique().get_column('date_dtm').to_list():
    most_liked_cache[date_dtm] = most_liked(meme_features_daily_df, date_dtm)

most_liked_v2_cache = {}
for date_dtm in train_df.select('date_dtm').unique().get_column('date_dtm').to_list():
    most_liked_v2_cache[date_dtm] = most_liked(meme_features_daily_v2_df, date_dtm)

In [12]:
# Similar to production. Simplifications:
# Memes without stats were omitted
# Top impression feature is omitted (gives 1.0 vs 0.8 for top 1 meme from a source by its telegram impressions)
# Impressions without reactions are omitted

score = pl.when(pl.col('age_days') < 14).then(1.0).otherwise(0.8) * pl.col('n_likes') / (pl.col('n_likes') + pl.col('n_dislikes'))

def random_recs(user_id, meme_features_daily_df, date_dtm):
    hash = int(hashlib.sha256(user_id.encode('utf-8')).hexdigest(), 16) % 10**8
    return (
        meme_features_daily_df
        .filter(pl.col('date_dtm') == date_dtm)
        .sample(1000, seed=hash)
        .get_column('meme_id')
        .to_list()
    )

In [13]:
def filter_seen(recs, hist_memes):
    """Filters memes that were seen"""
    return [meme_id for meme_id in recs if meme_id not in hist_memes]

In [17]:
def estimate(recs, target_memes, target_reactions):
    """Matches recs with future seen memes from target list
    Calculates likes and dislikes"""
    likes = 0
    dislikes = 0
    for meme_id in recs:
        if meme_id not in target_memes:
            continue
        idx = target_memes.index(meme_id)
        reaction = target_reactions[idx]
        if reaction == 1:
            likes += 1
            continue
        if reaction == 2:
            dislikes += 1
            continue

    if (likes + dislikes) == 0:
        return None, None, None
    
    lr = likes / (likes + dislikes)

    return likes, dislikes, lr

In [19]:
# Testing best_meme_from_each_source
# Taking tops from the cache
# Filtering seen
# Taking top 100 (need to choose thresholds, for this alg top-10 performed similarly)

rows = []
for row in train_df.iter_rows(named=True):
    recs = best_meme_from_each_source_cache[row['date_dtm']]
    recs = filter_seen(recs, row['hist_memes'])
    recs = recs[:100]
    likes, dislikes, lr = estimate(recs, row['target_memes'], row['target_reactions'])
    rows.append({
        'user_id': row['user_id'],
        'hist_size': row['hist_size'],
        'date_dtm': row['date_dtm'],
        'likes': likes,
        'dislikes': dislikes,
        'lr': lr,
    })
results_df = pl.DataFrame(rows)

likes = results_df['likes'].sum()
n = results_df['likes'].sum() + results_df['dislikes'].sum()
lr = results_df['likes'].sum() / (results_df['likes'].sum() + results_df['dislikes'].sum())
std = np.sqrt(lr * (1 - lr) / n)
print(f'Likes - {likes}, Like Rate = {lr:.3f} +-{std * 1.98:.3f}')

Likes - 1829, Like Rate = 0.474 +-0.016


In [20]:
# Testing random recs
# I think it has no sense due to biased dataset (no actually random memes in production)

rows = []
for row in train_df.iter_rows(named=True):
    recs = random_recs(row['user_id'], meme_features_daily_df, row['date_dtm'])
    recs = filter_seen(recs, row['hist_memes'])
    recs = recs[:100]
    likes, dislikes, lr = estimate(recs, row['target_memes'], row['target_reactions'])
    rows.append({
        'user_id': row['user_id'],
        'hist_size': row['hist_size'],
        'date_dtm': row['date_dtm'],
        'likes': likes,
        'dislikes': dislikes,
        'lr': lr
    })
results_df = pl.DataFrame(rows)

likes = results_df['likes'].sum()
n = results_df['likes'].sum() + results_df['dislikes'].sum()
lr = results_df['likes'].sum() / (results_df['likes'].sum() + results_df['dislikes'].sum())
std = np.sqrt(lr * (1 - lr) / n)
print(f'Likes - {likes}, Like Rate = {lr:.3f} +-{std * 1.98:.3f}')


Likes - 710, Like Rate = 0.489 +-0.026


In [21]:
# Testing most liked

rows = []
for row in train_df.iter_rows(named=True):
    recs = most_liked_cache[row['date_dtm']]
    recs = filter_seen(recs, row['hist_memes'])
    recs = recs[:100]
    likes, dislikes, lr = estimate(recs, row['target_memes'], row['target_reactions'])
    rows.append({
        'user_id': row['user_id'],
        'hist_size': row['hist_size'],
        'date_dtm': row['date_dtm'],
        'likes': likes,
        'dislikes': dislikes,
        'lr': lr
    })
results_df = pl.DataFrame(rows)

likes = results_df['likes'].sum()
n = results_df['likes'].sum() + results_df['dislikes'].sum()
lr = results_df['likes'].sum() / (results_df['likes'].sum() + results_df['dislikes'].sum())
std = np.sqrt(lr * (1 - lr) / n)
print(f'Likes - {likes}, Like Rate = {lr:.3f} +-{std * 1.98:.3f}')


Likes - 6812, Like Rate = 0.613 +-0.009


## Testing best meme from each cluster algorithm

Clusters achieved using ALS + KMeans

Num clusters = 10

Validation details

* Train = 1.03.24 - 1.04.24
* Test = 1.04.24 - 13.04.24

In [52]:
meme_clusters_df = pl.read_parquet('meme_clusters.pq')

In [53]:
# cluster based

score = pl.when(pl.col('age_days') < 14).then(1.0).otherwise(0.8) * pl.col('n_likes') / (pl.col('n_likes') + pl.col('n_dislikes'))

def best_memes_from_each_cluster(meme_features_daily_df, date_dtm):
    return (
        meme_features_daily_df
        .filter(pl.col('date_dtm') == date_dtm)
        .filter(pl.col('n_memes_sent') > 10)
        .join(meme_clusters_df, on='meme_id')
        .with_columns(score.alias('score'))
        .sort('score', descending=True)
        .group_by('cluster_id')
        .agg(pl.col('meme_id').head(10))
        .explode(pl.col('meme_id'))
        .select(pl.col('meme_id').shuffle(int(date_dtm.timestamp())))
        .get_column('meme_id')
        .to_list()
    )

best_memes_from_each_cluster_cache = {}
for date_dtm in train_df.select('date_dtm').unique().get_column('date_dtm').to_list():
    best_memes_from_each_cluster_cache[date_dtm] = best_memes_from_each_cluster(meme_features_daily_df, date_dtm)

best_memes_from_each_cluster_v2_cache = {}
for date_dtm in train_df.select('date_dtm').unique().get_column('date_dtm').to_list():
    best_memes_from_each_cluster_v2_cache[date_dtm] = best_memes_from_each_cluster(meme_features_daily_v2_df, date_dtm)

In [54]:
rows = []
for row in last_2w_df.iter_rows(named=True):
    recs = best_meme_from_each_source_cache[row['date_dtm']]
    recs = filter_seen(recs, row['hist_memes'])
    recs = recs[:100]
    likes, dislikes, lr = estimate(recs, row['target_memes'], row['target_reactions'])
    rows.append({
        'user_id': row['user_id'],
        'hist_size': row['hist_size'],
        'date_dtm': row['date_dtm'],
        'likes': likes,
        'dislikes': dislikes,
        'lr': lr,
    })
results_df = pl.DataFrame(rows)

likes = results_df['likes'].sum()
n = results_df['likes'].sum() + results_df['dislikes'].sum()
lr = results_df['likes'].sum() / (results_df['likes'].sum() + results_df['dislikes'].sum())
lr_micro = results_df['lr'].mean()
std = np.sqrt(lr * (1 - lr) / n)
print(f'Likes - {likes}, Like Rate = {lr:.3f} +-{std * 1.98:.3f}, Like Rate Micro = {lr_micro:.3f}')

Likes - 512, Like Rate = 0.537 +-0.032, Like Rate Micro = 0.536


In [43]:
rows = []
for row in last_2w_df.iter_rows(named=True):
    recs = best_memes_from_each_cluster_cache[row['date_dtm']]
    recs = filter_seen(recs, row['hist_memes'])
    recs = recs[:100]
    likes, dislikes, lr = estimate(recs, row['target_memes'], row['target_reactions'])
    rows.append({
        'user_id': row['user_id'],
        'hist_size': row['hist_size'],
        'date_dtm': row['date_dtm'],
        'likes': likes,
        'dislikes': dislikes,
        'lr': lr,
    })
results_df = pl.DataFrame(rows)

likes = results_df['likes'].sum()
n = results_df['likes'].sum() + results_df['dislikes'].sum()
lr = results_df['likes'].sum() / (results_df['likes'].sum() + results_df['dislikes'].sum())
lr_micro = results_df['lr'].mean()
std = np.sqrt(lr * (1 - lr) / n)
print(f'Likes - {likes}, Like Rate = {lr:.3f} +-{std * 1.98:.3f}, Like Rate Micro = {lr_micro:.3f}')

Likes - 5215, Like Rate = 0.591 +-0.010, Like Rate Micro = 0.532


In [56]:
rows = []
for row in last_2w_df.iter_rows(named=True):
    recs = best_memes_from_each_cluster_v2_cache[row['date_dtm']]
    recs = filter_seen(recs, row['hist_memes'])
    recs = recs[:100]
    likes, dislikes, lr = estimate(recs, row['target_memes'], row['target_reactions'])
    rows.append({
        'user_id': row['user_id'],
        'hist_size': row['hist_size'],
        'date_dtm': row['date_dtm'],
        'likes': likes,
        'dislikes': dislikes,
        'lr': lr,
    })
results_df = pl.DataFrame(rows)

likes = results_df['likes'].sum()
n = results_df['likes'].sum() + results_df['dislikes'].sum()
lr = results_df['likes'].sum() / (results_df['likes'].sum() + results_df['dislikes'].sum())
lr_micro = results_df['lr'].mean()
std = np.sqrt(lr * (1 - lr) / n)
print(f'Likes - {likes}, Like Rate = {lr:.3f} +-{std * 1.98:.3f}, Like Rate Micro = {lr_micro:.3f}')

Likes - 5030, Like Rate = 0.573 +-0.010, Like Rate Micro = 0.526


In [44]:
# Testing most liked

rows = []
for row in last_2w_df.iter_rows(named=True):
    recs = most_liked_cache[row['date_dtm']]
    recs = filter_seen(recs, row['hist_memes'])
    recs = recs[:100]
    likes, dislikes, lr = estimate(recs, row['target_memes'], row['target_reactions'])
    rows.append({
        'user_id': row['user_id'],
        'hist_size': row['hist_size'],
        'date_dtm': row['date_dtm'],
        'likes': likes,
        'dislikes': dislikes,
        'lr': lr,
    })
results_df = pl.DataFrame(rows)

likes = results_df['likes'].sum()
n = results_df['likes'].sum() + results_df['dislikes'].sum()
lr = results_df['likes'].sum() / (results_df['likes'].sum() + results_df['dislikes'].sum())
lr_micro = results_df['lr'].mean()
std = np.sqrt(lr * (1 - lr) / n)
print(f'Likes - {likes}, Like Rate = {lr:.3f} +-{std * 1.98:.3f}, Like Rate Micro = {lr_micro:.3f}')

Likes - 1801, Like Rate = 0.567 +-0.017, Like Rate Micro = 0.504


In [45]:
# Testing most liked v2

rows = []
for row in last_2w_df.iter_rows(named=True):
    recs = most_liked_v2_cache[row['date_dtm']]
    recs = filter_seen(recs, row['hist_memes'])
    recs = recs[:100]
    likes, dislikes, lr = estimate(recs, row['target_memes'], row['target_reactions'])
    rows.append({
        'user_id': row['user_id'],
        'hist_size': row['hist_size'],
        'date_dtm': row['date_dtm'],
        'likes': likes,
        'dislikes': dislikes,
        'lr': lr,
    })
results_df = pl.DataFrame(rows)

likes = results_df['likes'].sum()
n = results_df['likes'].sum() + results_df['dislikes'].sum()
lr = results_df['likes'].sum() / (results_df['likes'].sum() + results_df['dislikes'].sum())
lr_micro = results_df['lr'].mean()
std = np.sqrt(lr * (1 - lr) / n)
print(f'Likes - {likes}, Like Rate = {lr:.3f} +-{std * 1.98:.3f}, Like Rate Micro = {lr_micro:.3f}')

Likes - 5150, Like Rate = 0.578 +-0.010, Like Rate Micro = 0.531


## Most liked on meme_stats_v2 are going for testing

In [57]:
with open('meme_ids.json', 'w') as f:
    ids = most_liked_v2_cache[datetime(2024, 4, 13)]
    ids = [int(id.replace(',', '')) for id in ids]
    json.dump(ids, f)

## Code to generate query string for the bot

In [285]:
'/meme ' + ' '.join([s.replace(',', '') for s in best_memes_from_each_cluster_cache[datetime(2024, 4, 10)][:100]])

'/meme 3537906 961188 3486879 3940783 2829942 861412 1213515 1964789 3940785 3517077 10222 3772372 3902342 3516170 3644890 3316350 2895478 1103300 3546640 3823857 3600766 2279191 2086964 3454946 3564685 2694493 2886632 1623255 1994543 3513207 3574336 2350587 881987 3652728 3940729 3477742 3644716 1197491 3635346 3697354 3604881 2377586 3467378 3140410 1245424 3551923 3719351 3724910 1267028 3569939 229958 2740166 3746267 3495787 3604880 3644177 3533161 3702327 3520485 3539067 3875767 2625535 2279188 693384 3859587 1169378 2352815 3569301 2713467 2392622 3775191 10152 3855063 2005796 2050874 2249024 3471463 3592903 1988648 3745276 3925003 2501564 1825631 2953444 3520907 1202825 2821161 3551186 476907 1068395 3587199 1190719 2060890 3565438 3661643 3570595 2091903 3832346 321242 3574793'