# Cold Start - AB 240422 Analysis

**ab_240422** appeared to have the same performance as the previous **best_meme_from_each_source**. Possible reasons why these results differ from the previous offline simulation:

* The most probable reason is bias in the training data
* In the previous simulation, I removed users who blocked the bot
* In the current simulation, I kept only the top 40 items for each new user

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import hashlib
import json
from datetime import datetime

import numpy as np
import polars as pl
from sklearn.model_selection import train_test_split

In [3]:
from models import ColdStartRecommender, BestMemeFromEachSource, MostLiked, SelectedSources, CustomSources
from estimate import estimate_cs, estimate_cs_prod

In [64]:
# Cold-start log. Rows carry user_id, dtm / date_dtm and the list columns
# target_memes / target_reactions / target_recommended_by (see the sample row
# displayed further down). Paths are relative to the notebook's working directory.
coldstart_df = pl.read_parquet('coldstart.pq')
# Feature table handed to the recommender constructors below
# (presumably per-meme daily statistics — confirm against the export job).
meme_features_daily_df = pl.read_parquet('meme_features_daily.pq')

# v2 is calculated on users with fewer than 200 responses
meme_features_daily_v2_df = pl.read_parquet('meme_features_daily_v2.pq')

In [65]:
# Meme and meme-source tables, read from the notebook's working directory.
# NOTE(review): neither frame is referenced by the cells visible in this section —
# confirm they are still needed.
meme_df = pl.read_parquet('meme.pq')
meme_source_df = pl.read_parquet('meme_source.pq')

In [6]:
# Show one row (offset 2, length 1) to illustrate the schema of the cold-start log.
coldstart_df.slice(2, 1)

user_id,dtm,date_dtm,target_memes,target_reactions,target_recommended_by
i64,datetime[μs],datetime[μs],list[i64],list[i64],list[str]
211558,2024-03-18 04:07:42.589074,2024-03-18 00:00:00,"[1195419, 171119, … 790223]","[2, 2, … 2]","[""best_meme_from_each_source"", ""best_meme_from_each_source"", … ""best_meme_from_each_source""]"


In [7]:
# Validation window and per-user truncation, named so they can be tuned in one place
# instead of being repeated as literals.
VALIDATION_START = datetime(2024, 4, 1)  # inclusive lower bound on date_dtm
VALIDATION_END = datetime(2024, 5, 8)    # exclusive upper bound on date_dtm
MAX_TARGETS_PER_USER = 30
TARGET_LIST_COLUMNS = ['target_memes', 'target_reactions', 'target_recommended_by']

validation_df = (
    coldstart_df
    .filter(
        (pl.col('date_dtm') >= VALIDATION_START)
        & (pl.col('date_dtm') < VALIDATION_END)
    )
    # Keep only the first MAX_TARGETS_PER_USER entries of every target list.
    # All three columns get the same limit so they stay the same length
    # (presumably they are element-aligned — confirm against the export).
    .with_columns(
        [pl.col(name).list.head(MAX_TARGETS_PER_USER) for name in TARGET_LIST_COLUMNS]
    )
)

In [8]:
# Like statistics printed by estimate_cs_prod for the 'best_meme_from_each_source'
# label over the validation window. The label matches values seen in the
# `target_recommended_by` column; presumably the helper restricts to reactions
# tagged with it — confirm in `estimate`.
estimate_cs_prod('best_meme_from_each_source', validation_df)

Likes - 18785, Like Rate = 0.475 +- 0.013, Like Rate Micro = 0.215 +- 0.008


In [9]:
# Same production estimate for the 'random_best_ab_240422' label
# (presumably the ab_240422 experiment arm named in the title — confirm).
estimate_cs_prod('random_best_ab_240422', validation_df)

Likes - 4292, Like Rate = 0.512 +- 0.027, Like Rate Micro = 0.046 +- 0.005


In [10]:
# Offline estimate for the BestMemeFromEachSource recommender built on the base
# feature table. min_sent_thr=0 presumably disables the minimum-sent filter —
# confirm in `models`.
best_meme_from_each_source = BestMemeFromEachSource(meme_features_daily_df, min_sent_thr=0)
estimate_cs(best_meme_from_each_source, validation_df)

Likes - 1090, Like Rate = 0.526 +- 0.024, Like Rate Micro = 0.497 +- 0.025


In [11]:
# Offline estimate for the MostLiked recommender built on the base feature table.
most_liked = MostLiked(meme_features_daily_df)
estimate_cs(most_liked, validation_df)

Likes - 1225, Like Rate = 0.538 +- 0.032, Like Rate Micro = 0.462 +- 0.027


In [12]:
# Same MostLiked recommender, but built on the v2 feature table
# (the one loaded from 'meme_features_daily_v2.pq').
most_liked_v2 = MostLiked(meme_features_daily_v2_df)
estimate_cs(most_liked_v2, validation_df)

Likes - 3987, Like Rate = 0.553 +- 0.015, Like Rate Micro = 0.490 +- 0.015


In [54]:
# Offline estimate for SelectedSources on the base feature table with min_sent_thr=10.
# lang_codes=None presumably turns off language filtering in estimate_cs — confirm
# in `estimate`.
selected_sources = SelectedSources(meme_features_daily_df, min_sent_thr=10)
estimate_cs(selected_sources, validation_df, lang_codes=None)

Likes - 2066, Like Rate = 0.548 +- 0.021, Like Rate Micro = 0.500 +- 0.021


In [20]:
# Offline estimate for SelectedSources on the v2 feature table with min_sent_thr=10.
# Rebinds `selected_sources`, replacing the base-table instance created earlier.
selected_sources = SelectedSources(meme_features_daily_v2_df, min_sent_thr=10)
estimate_cs(selected_sources, validation_df, lang_codes=None)

Likes - 1679, Like Rate = 0.546 +- 0.022, Like Rate Micro = 0.512 +- 0.022


In [55]:
# NOTE(review): this cell is identical to the previous SelectedSources v2 cell, yet the
# recorded outputs differ (1679 vs 1731 likes). Either the estimate is not
# deterministic or the inputs changed between executions — TODO confirm, fix a seed
# if sampling is involved, and keep only one of the two cells.
selected_sources = SelectedSources(meme_features_daily_v2_df, min_sent_thr=10)
estimate_cs(selected_sources, validation_df, lang_codes=None)

Likes - 1731, Like Rate = 0.556 +- 0.022, Like Rate Micro = 0.519 +- 0.023


In [26]:
# NOTE(review): this binding is overwritten by the very next cell before any visible
# use, so in top-to-bottom execution it has no effect — consider deleting the cell.
selected_sources = SelectedSources(meme_features_daily_v2_df, min_sent_thr=10)

In [67]:
# Recommender used by the `/meme` listing in the next cell: base feature table,
# with a stricter min_sent_thr of 20 (the estimates above used 10).
selected_sources = SelectedSources(meme_features_daily_df, min_sent_thr=20)

In [70]:
# Print the recommendations as `/meme <id> <id> ...` lines, 10 ids per line.
# recommend() is called once instead of once per line, so every line is sliced
# from the same list. Batches follow the actual result length, so no empty
# `/meme ` line is printed when fewer than 100 memes come back.
meme_ids_per_line = 10
recommended_meme_ids = selected_sources.recommend(100, datetime(2024, 5, 9))
# Cap at 100 so that, as before, at most the first 100 ids are printed.
for start in range(0, min(len(recommended_meme_ids), 100), meme_ids_per_line):
    meme_ids = recommended_meme_ids[start:start + meme_ids_per_line]
    print('/meme ' + ' '.join([str(meme_id) for meme_id in meme_ids]))

/meme 7021127 7406558 7309298 7800850 6930231 7711423 7429731 6738541 7263923 7653362
/meme 7650435 7463358 7728665 5762991 7570548 7800821 7510581 7273747 6762141 7021128
/meme 6611543 7760103 7626659 6725244 7159369 6952525 6738682 6593078 7186939 7319145
/meme 6691237 7800853 6691124 7330873 7761635 7272474 6738684 7336684 7001085 7313587
/meme 7589133 7746601 6489929 7743723 3530822 7564457 7264007 6452942 7447514 7317863
/meme 7589421 6623572 7425200 6738189 6593069 7631669 7263240 6931153 6513288 7234885
/meme 7040075 7309297 5894155 7826016 7527928 7728617 7341026 7341027 6695762 5323574
/meme 7809768 6690818 6522792 7462187 7826020 7762417 6895401 7341028 5497329 6965529
/meme 7041011 6953202 7657763 7701364 4558947
/meme 


In [72]:
# Switch the recommender to the v2 feature table (min_sent_thr=10) for the
# listings printed by the cells below.
selected_sources = SelectedSources(meme_features_daily_v2_df, min_sent_thr=10)

In [73]:
# Print the recommendations as `/meme <id> <id> ...` lines, 10 ids per line.
# recommend() is called once instead of once per line, so every line is sliced
# from the same list. Batches follow the actual result length, so no empty
# `/meme ` line is printed when fewer than 100 memes come back.
meme_ids_per_line = 10
recommended_meme_ids = selected_sources.recommend(100, datetime(2024, 5, 9))
# Cap at 100 so that, as before, at most the first 100 ids are printed.
for start in range(0, min(len(recommended_meme_ids), 100), meme_ids_per_line):
    meme_ids = recommended_meme_ids[start:start + meme_ids_per_line]
    print('/meme ' + ' '.join([str(meme_id) for meme_id in meme_ids]))

/meme 12632 2687384 4691317 7800835 7263240 7800833 7273747 5594231 1190355 7313587
/meme 7737207 121592 5762991 1173406 6393105 6470439 7648698 6522792 7746601 1023569
/meme 121513 6680926 7003156 2163445 7728617 3855063 6305615 7564457 6953202 2341860
/meme 7462187 7650435 6691124 7425200 7800853 5688394 7309298 7743723 130092 6494305
/meme 6931153 1173405 2744097 1976242 1220485 3530822 1017007 5903961 7398465 6952525
/meme 285571 6965529 7510581 2010680 2086971 1573399 6902677 7264007 6623571 7515039
/meme 5111892 7527928 7186939 7341028 6606148 7234885 7447514 12684 7118266 6690818
/meme 7797882 7570548 5439952 6452942 7532191 5048161 1190740 7109188 12664 6611543
/meme 
/meme 


In [74]:
# Dump every recommended meme id as comma-separated rows of `batch_size` ids.
all_meme_ids = selected_sources.recommend(1000, datetime(2024, 5, 9))
batch_size = 5
# Step through the list with range() instead of a hand-maintained batch counter,
# and build the text with join() rather than repeated string concatenation.
# Each row keeps its trailing newline, so the printed text is unchanged
# (including the final blank line).
res = ''.join(
    ', '.join(str(meme_id) for meme_id in all_meme_ids[start:start + batch_size]) + '\n'
    for start in range(0, len(all_meme_ids), batch_size)
)
print(res)

12632, 2687384, 4691317, 7800835, 7263240
7800833, 7273747, 5594231, 1190355, 7313587
7737207, 121592, 5762991, 1173406, 6393105
6470439, 7648698, 6522792, 7746601, 1023569
121513, 6680926, 7003156, 2163445, 7728617
3855063, 6305615, 7564457, 6953202, 2341860
7462187, 7650435, 6691124, 7425200, 7800853
5688394, 7309298, 7743723, 130092, 6494305
6931153, 1173405, 2744097, 1976242, 1220485
3530822, 1017007, 5903961, 7398465, 6952525
285571, 6965529, 7510581, 2010680, 2086971
1573399, 6902677, 7264007, 6623571, 7515039
5111892, 7527928, 7186939, 7341028, 6606148
7234885, 7447514, 12684, 7118266, 6690818
7797882, 7570548, 5439952, 6452942, 7532191
5048161, 1190740, 7109188, 12664, 6611543

