# Cold Start Baselines

Algorithms

1. `best_memes_from_each_source`. Current production (with small diffs)

In [22]:
import hashlib
from datetime import datetime

import numpy as np
import polars as pl
from sklearn.model_selection import train_test_split

In [9]:
coldstart_df = pl.read_parquet('coldstart_dataset.pq')
meme_features_daily_df = pl.read_parquet('meme_features_daily.pq')

In [10]:
coldstart_df.slice(2, 1)

user_id,hist_size,dtm,date_dtm,hist_memes,hist_reactions,target_memes,target_reactions
str,i64,datetime[μs],datetime[μs],list[str],list[i64],list[str],list[i64]
"""486,191,407""",20,2024-04-03 19:20:00,2024-04-03 00:00:00,"[""1,237,876"", ""2,829,942"", … ""3,755,263""]","[2, 2, … 1]","[""1,197,484"", ""3,546,640"", … ""893""]","[2, 2, … 2]"


In [11]:
user_ids = coldstart_df.get_column('user_id').unique().to_list()
train_user_ids, test_user_ids = train_test_split(user_ids, test_size=0.2, random_state=42)
print(f'train size = {len(train_user_ids)} users, test size = {len(test_user_ids)} users')

train size = 3288 users, test size = 823 users


In [12]:
train_df = (
    coldstart_df
    .filter(pl.col('user_id').is_in(train_user_ids))
    .filter(pl.col('hist_size') <= 30)
)
len(train_df)

7337

In [13]:
meme_features_daily_df.head(1)

meme_id,n_memes_sent,n_likes,n_dislikes,age_days,date_dtm,meme_source_id
str,u32,i64,i64,i64,datetime[μs],i64
"""12,528""",0,0,0,11,2024-03-01 00:00:00,46


In [14]:
# Similar to production. Simplifications:
# Memes without stats were omitted
# Top impression feature is omitted (gives 1.0 vs 0.8 for top 1 meme from a source by its telegram impressions)
# Impressions without reactions are omitted

score = pl.when(pl.col('age_days') < 14).then(1.0).otherwise(0.8) * pl.col('n_likes') / (pl.col('n_likes') + pl.col('n_dislikes'))


def best_meme_from_each_source(meme_features_daily_df, date_dtm):
    return (
        meme_features_daily_df
        .filter(pl.col('date_dtm') == date_dtm)
        .filter(pl.col('n_likes') + pl.col('n_dislikes') > 0)
        .with_columns(score.alias('score'))
        .sort('score', descending=True)
        .group_by('meme_source_id')
        .agg(pl.all().first())
        .get_column('meme_id')
        .to_list()
    )

best_meme_from_each_source_cache = {}
for date_dtm in train_df.select('date_dtm').unique().get_column('date_dtm').to_list():
    best_meme_from_each_source_cache[date_dtm] = best_meme_from_each_source(meme_features_daily_df, date_dtm)

In [15]:
best_meme_from_each_source_cache[datetime(2024, 4, 1)][:10]

['1,474,105',
 '3,970,854',
 '3,755,263',
 '1,288,860',
 '3,719,949',
 '1,405,096',
 '469',
 '1,085,286',
 '3,098,702',
 '232']

In [16]:
# Similar to production. Simplifications:
# Memes without stats were omitted
# Top impression feature is omitted (gives 1.0 vs 0.8 for top 1 meme from a source by its telegram impressions)
# Impressions without reactions are omitted

score = pl.when(pl.col('age_days') < 14).then(1.0).otherwise(0.8) * pl.col('n_likes') / (pl.col('n_likes') + pl.col('n_dislikes'))

def random_recs(user_id, meme_features_daily_df, date_dtm):
    hash = int(hashlib.sha256(user_id.encode('utf-8')).hexdigest(), 16) % 10**8
    return (
        meme_features_daily_df
        .filter(pl.col('date_dtm') == date_dtm)
        .sample(1000, seed=hash)
        .get_column('meme_id')
        .to_list()
    )

In [17]:
def filter_seen(recs, hist_memes):
    """Filters memes that were seen"""
    return [meme_id for meme_id in recs if meme_id not in hist_memes]

In [18]:
def estimate(recs, target_memes, target_reactions):
    """Matches recs with future seen memes from target list
    Calculates likes and dislikes"""
    likes = 0
    dislikes = 0
    for meme_id in recs:
        if meme_id not in target_memes:
            continue
        idx = target_memes.index(meme_id)
        reaction = target_reactions[idx]
        if reaction == 1:
            likes += 1
            continue
        if reaction == 2:
            dislikes += 1
            continue

    if (likes + dislikes) == 0:
        return None, None

    return likes, dislikes

In [26]:
# Testing best_meme_from_each_source
# Taking tops from the cache
# Filtering seen
# Taking top 100 (need to choose thresholds, for this alg top-10 performed similarly)

rows = []
for row in train_df.iter_rows(named=True):
    recs = best_meme_from_each_source_cache[row['date_dtm']]
    recs = filter_seen(recs, row['hist_memes'])
    recs = recs[:100]
    likes, dislikes = estimate(recs, row['target_memes'], row['target_reactions'])
    rows.append({
        'user_id': row['user_id'],
        'hist_size': row['hist_size'],
        'date_dtm': row['date_dtm'],
        'likes': likes,
        'dislikes': dislikes,
    })
results_df = pl.DataFrame(rows)

likes = results_df['likes'].sum()
n = results_df['likes'].sum() + results_df['dislikes'].sum()
lr = results_df['likes'].sum() / (results_df['likes'].sum() + results_df['dislikes'].sum())
std = np.sqrt(lr * (1 - lr) / n)
print(f'Likes - {likes}, Like Rate = {lr:.3f} +-{std * 1.98:.3f}')

Likes - 1883, Like Rate = 0.485 +-0.016


In [25]:
# Testing random recs
# I think it has no sense due to biased dataset (no actually random memes in production)

rows = []
for row in train_df.iter_rows(named=True):
    recs = random_recs(row['user_id'], meme_features_daily_df, row['date_dtm'])
    recs = filter_seen(recs, row['hist_memes'])
    recs = recs[:100]
    likes, dislikes = estimate(recs, row['target_memes'], row['target_reactions'])
    rows.append({
        'user_id': row['user_id'],
        'hist_size': row['hist_size'],
        'date_dtm': row['date_dtm'],
        'likes': likes,
        'dislikes': dislikes,
    })
results_df = pl.DataFrame(rows)

likes = results_df['likes'].sum()
n = results_df['likes'].sum() + results_df['dislikes'].sum()
lr = results_df['likes'].sum() / (results_df['likes'].sum() + results_df['dislikes'].sum())
std = np.sqrt(lr * (1 - lr) / n)
print(f'Likes - {likes}, Like Rate = {lr:.3f} +-{std * 1.98:.3f}')


Likes - 715, Like Rate = 0.495 +-0.026
