# Cold Start Dataset

Two files are prepared in this notebook

* Train set for the cold start offline tests
* Historical features

## Train set

Each train sample is a snapshot of one user at some moment in time: the memes already seen (history) and the memes seen afterwards (target).

Fields

* `user_id`
* `hist_size`: number of memes with reactions in the history at that moment in time
* `dtm`: timestamp of the train sample
* `hist_memes`, `hist_reactions`: memes from history
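* `target_memes`, `target_reactions`: memes from the future
* `date_dtm`: `dtm` truncated to the day
* `target_recommended_by`: `recommended_by` values of the target memes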

Be careful with out-of-time validation. Samples of the same user share values (over time, target items move into the history). Consider splitting by user only, or by both user and time.

## Historical features

Features are computed for every day. They contain the fields of the `meme_stats` table, which is used in production.

Fields

* `meme_id`
* `n_likes`
* `n_dislikes`
* `n_memes_sent`
* `age_days`

In [1]:
import polars as pl
from datetime import datetime
import numpy as np
from tqdm.notebook import tqdm

In [2]:
# Datetime format of the timestamp columns in the raw CSV exports; it is passed
# to `str.to_datetime` wherever `created_at` / `sent_at` are parsed.
# NOTE(review): presumably matches values like "April 1, 2024, 09:05 AM" — confirm against the CSVs.
dtm_fmt = "%B %-d, %Y, %I:%M %p"

In [3]:
# Raw exports: user-meme reaction log, users and memes.
# NOTE(review): the file names suggest a 2024-04-01..2024-05-06 reaction window
# and 2024-05-07 snapshots of the user/meme tables — confirm.
user_meme_df = pl.read_csv('user_meme_reaction_240401_240506.csv')
user_df = pl.read_csv('user_240507.csv')
meme_df = pl.read_csv('meme_240507.csv')

In [4]:
# Only new users: parse `created_at`, keep accounts created on or after
# 2024-03-01 and drop columns that are not referenced in the rest of the notebook.

user_proc_df = (
    user_df
    .with_columns(pl.col('created_at').str.to_datetime(dtm_fmt))
    .filter(pl.col('created_at') >= datetime(2024, 3, 1))
    .drop('type', 'blocked_bot_at', 'inviter_id', 'last_active_at')
)

In [8]:
# Parse `sent_at` and drop `reacted_at`.
# NOTE(review): the original comment said "Remove non-reacts", but rows with a
# null `reaction_id` are kept and filled with 2 (the value that the stats
# functions in this notebook count as a dislike) — confirm this is intended.
# Add reaction number: a per-user running count over rows ordered by `sent_at`
# (presumably 1-based, as in the toy frame used to test `prepare_data_user`;
# verify with the installed polars version).
# Leave only new users (inner join with `user_proc_df`).

user_meme_proc_df = (
    user_meme_df
    .with_columns(pl.col('sent_at').str.to_datetime(dtm_fmt))
    .drop('reacted_at')
    .with_columns(pl.col('reaction_id').fill_null(2))
    .sort('user_id', 'sent_at')
    .with_columns(reaction_num=pl.first().cum_count().over('user_id'))
    .join(user_proc_df.select('id'), left_on='user_id', right_on='id', how='inner')
)

In [9]:
# Per-column non-null counts for rows recommended by 'random_best_ab_240422'
# (ad-hoc check; the result is only displayed, not stored).
user_meme_df.filter(pl.col('recommended_by') == 'random_best_ab_240422').count()

user_id,meme_id,recommended_by,sent_at,reaction_id,reacted_at
u32,u32,u32,u32,u32,u32
10423,10423,10423,10423,9292,9292


In [10]:
def prepare_data_user(user_meme_proc_df, user_id, chunk_size=10):
    """
    Build the cold-start train samples of a single user.

    The user's rows are ordered by `sent_at` and cut at `reaction_num`
    thresholds 0, chunk_size, 2 * chunk_size, ... below the user's maximum
    `reaction_num`. One sample is emitted per threshold: rows with
    `reaction_num <= threshold` form the history and all remaining rows form
    the target. Hence the history is empty at threshold 0 when `reaction_num`
    is 1-based, and the targets of consecutive samples overlap.

    Args:
        user_meme_proc_df: polars DataFrame with the columns `user_id`,
            `reaction_num`, `sent_at`, `meme_id`, `reaction_id` and
            `recommended_by`.
        user_id: value matched against the `user_id` column.
        chunk_size: step between consecutive thresholds; must be positive.

    Returns:
        A list of dicts, one per threshold, with the keys `user_id`,
        `hist_size`, `dtm`, `date_dtm`, `hist_memes`, `hist_reactions`,
        `target_memes`, `target_reactions` and `target_recommended_by`.
        The list is empty when the user has no rows.

    Raises:
        ValueError: if `chunk_size` is not positive (the loop would never end).
    """
    if chunk_size <= 0:
        raise ValueError(f'chunk_size must be positive, got {chunk_size}')

    res = []
    user_meme_slice = user_meme_proc_df.filter(pl.col('user_id') == user_id).sort('sent_at')

    if user_meme_slice.height == 0:
        return res

    n_reactions = user_meme_slice.select('reaction_num').max().item()

    cur_idx = 0

    while cur_idx < n_reactions:
        # Split once per threshold and reuse both parts for every field,
        # instead of re-filtering the slice for each dictionary entry.
        hist = user_meme_slice.filter(pl.col('reaction_num') <= cur_idx)
        # Never empty here: cur_idx < n_reactions guarantees at least one row.
        target = user_meme_slice.filter(pl.col('reaction_num') > cur_idx)

        res.append({
            'user_id': user_id,
            'hist_size': hist.height,
            # Timestamp of the sample = moment the first target meme was sent.
            'dtm': target.select('sent_at').min().item(),
            'date_dtm': target.select(pl.col('sent_at').dt.truncate('1d')).min().item(),
            'hist_memes': hist.get_column('meme_id').to_list(),
            'hist_reactions': hist.get_column('reaction_id').to_list(),
            'target_memes': target.get_column('meme_id').to_list(),
            'target_reactions': target.get_column('reaction_id').to_list(),
            'target_recommended_by': target.get_column('recommended_by').to_list(),
        })
        cur_idx += chunk_size

    return res

In [11]:
# Toy frame: one user with five reactions sent on five consecutive days,
# used to eyeball the output of `prepare_data_user` with chunk_size=2.
test_user_meme_df = pl.DataFrame(
    {
        'user_id': [1, 1, 1, 1, 1],
        'reaction_num': [1, 2, 3, 4, 5],
        'reaction_id': [1, 1, 1, 1, 1],
        'meme_id': [1, 2, 3, 4, 5],
        'sent_at': [datetime(2024, 1, day, 1) for day in (1, 2, 3, 4, 5)],
        'recommended_by': ['best_meme_from_each_source' for _ in range(5)],
    }
)
prepare_data_user(test_user_meme_df, 1, chunk_size=2)

[{'user_id': 1,
  'hist_size': 0,
  'dtm': datetime.datetime(2024, 1, 1, 1, 0),
  'date_dtm': datetime.datetime(2024, 1, 1, 0, 0),
  'hist_memes': [],
  'hist_reactions': [],
  'target_memes': [1, 2, 3, 4, 5],
  'target_reactions': [1, 1, 1, 1, 1],
  'target_recommended_by': ['best_meme_from_each_source',
   'best_meme_from_each_source',
   'best_meme_from_each_source',
   'best_meme_from_each_source',
   'best_meme_from_each_source']},
 {'user_id': 1,
  'hist_size': 2,
  'dtm': datetime.datetime(2024, 1, 3, 1, 0),
  'date_dtm': datetime.datetime(2024, 1, 3, 0, 0),
  'hist_memes': [1, 2],
  'hist_reactions': [1, 1],
  'target_memes': [3, 4, 5],
  'target_reactions': [1, 1, 1],
  'target_recommended_by': ['best_meme_from_each_source',
   'best_meme_from_each_source',
   'best_meme_from_each_source']},
 {'user_id': 1,
  'hist_size': 4,
  'dtm': datetime.datetime(2024, 1, 5, 1, 0),
  'date_dtm': datetime.datetime(2024, 1, 5, 0, 0),
  'hist_memes': [1, 2, 3, 4],
  'hist_reactions': [1, 1, 

In [12]:
# Build the train set: collect the samples of every new user into one frame.
# NOTE(review): each `prepare_data_user` call filters the whole
# `user_meme_proc_df` for one user, so the cost grows with users x rows;
# pre-splitting the frame by `user_id` may be worth considering.
res = []
chunk_size = 10

for user_id in tqdm(user_proc_df.get_column('id').to_list()):
    res.extend(prepare_data_user(user_meme_proc_df, user_id, chunk_size=chunk_size))
coldstart_dataset_df = pl.DataFrame(res)

  0%|          | 0/8293 [00:00<?, ?it/s]

In [13]:
# Persist the train set for the cold start offline tests.
coldstart_dataset_df.write_parquet('coldstart_dataset.pq')

## Meme features v1

Trying to reimplement the current production meme_stats table

In [14]:
# Meme metadata used by the daily stats: id, parsed creation time, language.
meme_proc_df = meme_df.select(
    'id',
    pl.col('created_at').str.to_datetime(dtm_fmt),
    'language_code',
)

In [15]:
# Reaction log for the feature tables: every user present in `user_df` is kept
# and null `reaction_id` values are left untouched (unlike `user_meme_proc_df`).
user_meme_proc_v2_df = (
    user_meme_df
    .drop('reacted_at')
    .with_columns(pl.col('sent_at').str.to_datetime(dtm_fmt))
    .join(user_df.select('id'), how='inner', left_on='user_id', right_on='id')
)

In [16]:
def get_meme_stats_day(user_meme_proc_df, meme_proc_df, date_dtm):
    """
    Compute per-meme counters as of `date_dtm`.

    Only rows sent strictly before `date_dtm` are counted and only memes
    created strictly before `date_dtm` are returned. Memes without any
    counted row get zero counters.

    Returns a frame with `meme_id`, `language_code`, `n_memes_sent`
    (all rows, including those with a null reaction), `n_likes`
    (`reaction_id == 1`), `n_dislikes` (`reaction_id == 2`), `age_days`
    and `date_dtm` (the snapshot date truncated to the day).
    """
    reaction = pl.col('reaction_id')
    snapshot = pl.lit(date_dtm)

    # Counters over everything sent before the snapshot moment.
    sent_before = user_meme_proc_df.filter(pl.col('sent_at') < date_dtm)
    stats = sent_before.group_by('meme_id').agg(
        # non-null count + null count = number of rows in the group
        (reaction.count() + reaction.null_count()).alias('n_memes_sent'),
        (reaction == 1).cast(pl.Int64).sum().alias('n_likes'),
        (reaction == 2).cast(pl.Int64).sum().alias('n_dislikes'),
    )

    # Memes that already existed at the snapshot moment.
    known_memes = (
        meme_proc_df
        .select(pl.col('id').alias('meme_id'), 'language_code', 'created_at')
        .filter(pl.col('created_at') < date_dtm)
    )

    # The left join keeps memes that were never sent; their counters become 0.
    with_stats = known_memes.join(stats, on='meme_id', how='left').fill_null(0)
    with_age = with_stats.with_columns(
        age_days=(snapshot - pl.col('created_at')).dt.total_days()
    )
    return (
        with_age
        .drop('created_at')
        .with_columns(date_dtm=snapshot.dt.truncate('1d'))
    )

In [17]:
# Testing: smoke check of the meme stats for a single snapshot date
# (the result is only displayed).
get_meme_stats_day(user_meme_proc_v2_df, meme_proc_df, datetime(2024, 4, 1))

meme_id,language_code,n_memes_sent,n_likes,n_dislikes,age_days,date_dtm
str,str,u32,i64,i64,i64,datetime[μs]
"""1,694,876""","""fr""",0,0,0,22,2024-04-01 00:00:00
"""5,028""","""en""",0,0,0,42,2024-04-01 00:00:00
"""12,486""","""ru""",0,0,0,42,2024-04-01 00:00:00
"""3,770,729""","""ru""",0,0,0,2,2024-04-01 00:00:00
"""3,323,726""","""ru""",0,0,0,6,2024-04-01 00:00:00
…,…,…,…,…,…,…
"""12,504""","""ru""",0,0,0,42,2024-04-01 00:00:00
"""3,946,802""","""ru""",0,0,0,0,2024-04-01 00:00:00
"""7,096""","""ru""",0,0,0,42,2024-04-01 00:00:00
"""2,377,569""","""ru""",0,0,0,15,2024-04-01 00:00:00


In [18]:
# Snapshot dates are derived from the data (every distinct day of `sent_at`)
# instead of hard-coded constants; not very reliable.
# NOTE(review): the original comment said "Drop the first day", but no date is
# removed here — the first day stays in the list; confirm which one is intended.
dates_dtm = user_meme_proc_v2_df.select(pl.col('sent_at').dt.truncate('1d')).unique().sort('sent_at').get_column('sent_at').to_list()

In [19]:
# One meme-stats frame per snapshot date, stacked into a single table.
res = []
for date_dtm in dates_dtm:
    res.append(get_meme_stats_day(user_meme_proc_v2_df, meme_proc_df, date_dtm))

# Attach `meme_source_id` from the raw meme table (join on the meme id).
meme_features_daily_df = (
    pl.concat(res)
    .join(meme_df.select('id', 'meme_source_id'), left_on='meme_id', right_on='id')
)

In [20]:
# Persist the daily meme features (v1).
meme_features_daily_df.write_parquet('meme_features_daily.pq')

## User features

Trying to reimplement the current production user_stats table

In [21]:
# User id with parsed creation time; joined in `get_user_stats_day` to compute the user age.
user_created_df = user_df.select(
    'id',
    pl.col('created_at').str.to_datetime(dtm_fmt),
)

In [22]:
def get_user_stats_day(user_meme_proc_df, date_dtm):
    """
    Compute per-user counters as of `date_dtm`.

    Only rows sent strictly before `date_dtm` are counted, so users without
    such rows are absent from the result. Creation times are taken from the
    notebook-level `user_created_df`.

    Returns a frame with `user_id`, `n_memes_sent` (all rows, including those
    with a null reaction), `n_likes` (`reaction_id == 1`), `n_dislikes`
    (`reaction_id == 2`), `user_age_days` and `date_dtm` (the snapshot date
    truncated to the day).
    """
    reaction = pl.col('reaction_id')
    snapshot = pl.lit(date_dtm)

    sent_before = user_meme_proc_df.filter(pl.col('sent_at') < date_dtm)
    per_user = sent_before.group_by('user_id').agg(
        # non-null count + null count = number of rows in the group
        (reaction.count() + reaction.null_count()).alias('n_memes_sent'),
        (reaction == 1).cast(pl.Int64).sum().alias('n_likes'),
        (reaction == 2).cast(pl.Int64).sum().alias('n_dislikes'),
    )

    with_created = per_user.join(user_created_df, left_on='user_id', right_on='id')
    with_age = with_created.with_columns(
        user_age_days=(snapshot - pl.col('created_at')).dt.total_days()
    )
    return (
        with_age
        .drop('created_at')
        .with_columns(date_dtm=snapshot.dt.truncate('1d'))
    )


In [23]:
# Testing: smoke check of the user stats for a single snapshot date.
# The displayed result has no rows for 2024-04-01.
get_user_stats_day(user_meme_proc_v2_df, datetime(2024, 4, 1))

user_id,n_memes_sent,n_likes,n_dislikes,user_age_days,date_dtm
str,u32,i64,i64,i64,datetime[μs]


In [24]:
# One user-stats frame per snapshot date, stacked into a single table.
res = []
for date_dtm in dates_dtm:
    res.append(get_user_stats_day(user_meme_proc_v2_df, date_dtm))

user_features_daily_df = (
    pl.concat(res)
)

In [25]:
# Persist the daily user features.
user_features_daily_df.write_parquet('user_features_daily.pq')

## Meme features V2 - more strict filtration

Ignore old users (more than 200 reactions) in meme stats

The idea is that users with many responses have an overly high like rate and sometimes a strange notion of what a good meme is.

In [26]:
# Peek at the user features that drive the v2 filtering below.
user_features_daily_df.head(2)

user_id,n_memes_sent,n_likes,n_dislikes,user_age_days,date_dtm
str,u32,i64,i64,i64,datetime[μs]
"""1,443,261,702""",17,6,10,43,2024-04-02 00:00:00
"""486,191,407""",18,5,12,0,2024-04-02 00:00:00


In [27]:
def get_meme_stats_day_v2(user_meme_proc_df, meme_proc_df, user_features_daily_df, date_dtm):
    """
    Compute per-meme counters as of `date_dtm`, ignoring heavy users.

    A row is ignored when `user_features_daily_df` reports `n_memes_sent` of
    200 or more for its user on the day the row was sent. Rows whose
    (user, day) pair has no entry in `user_features_daily_df` are kept.
    With the table built by `get_user_stats_day`, that is the case for a
    user's first active day, because a user only gets a row once they have
    activity before the snapshot day. An inner join with the "fresh" users
    would silently drop those first-day reactions.

    Args:
        user_meme_proc_df: reaction log with `user_id`, `meme_id`, `sent_at`
            and `reaction_id`.
        meme_proc_df: meme table with `id`, `language_code` and `created_at`.
        user_features_daily_df: daily user features with `user_id`,
            `date_dtm` and `n_memes_sent`.
        date_dtm: snapshot moment; only rows sent and memes created strictly
            before it are used.

    Returns:
        A frame with `meme_id`, `language_code`, `n_memes_sent`, `n_likes`
        (`reaction_id == 1`), `n_dislikes` (`reaction_id == 2`), `age_days`
        and `date_dtm` (the snapshot date truncated to the day). Memes
        without any counted row get zero counters.
    """
    # (user, day) pairs on which the user already counts as "old".
    heavy_users = (
        user_features_daily_df
        .filter(pl.col('n_memes_sent') >= 200)
        .select('user_id', 'date_dtm')
    )

    stats = (
        user_meme_proc_df
        .filter(pl.col('sent_at') < date_dtm)
        .with_columns(pl.col('sent_at').dt.truncate('1d').alias('date_dtm'))
        # Anti join: drop only the rows of heavy users and keep users that
        # have no feature row for that day yet.
        .join(heavy_users, on=['user_id', 'date_dtm'], how='anti')
        .drop('date_dtm')
        .group_by('meme_id').agg(
            # non-null count + null count = number of rows in the group
            (pl.col('reaction_id').count() + pl.col('reaction_id').null_count()).alias('n_memes_sent'),
            (pl.col('reaction_id') == 1).cast(pl.Int64).sum().alias('n_likes'),
            (pl.col('reaction_id') == 2).cast(pl.Int64).sum().alias('n_dislikes'),
        )
    )
    return (
        meme_proc_df
        .select(pl.col('id').alias('meme_id'), 'language_code', 'created_at')
        .filter(pl.col('created_at') < date_dtm)
        # The left join keeps memes that were never sent; their counters become 0.
        .join(stats, on='meme_id', how='left')
        .fill_null(0)
        .with_columns((pl.lit(date_dtm) - pl.col('created_at')).dt.total_days().alias('age_days'))
        .drop('created_at')
        .with_columns(pl.lit(date_dtm).dt.truncate('1d').alias('date_dtm'))
    )

In [28]:
# Testing: smoke check of the filtered meme stats for a single snapshot date
# (the result is only displayed).
get_meme_stats_day_v2(user_meme_proc_v2_df, meme_proc_df, user_features_daily_df, datetime(2024, 4, 1))

meme_id,language_code,n_memes_sent,n_likes,n_dislikes,age_days,date_dtm
str,str,u32,i64,i64,i64,datetime[μs]
"""1,694,876""","""fr""",0,0,0,22,2024-04-01 00:00:00
"""5,028""","""en""",0,0,0,42,2024-04-01 00:00:00
"""12,486""","""ru""",0,0,0,42,2024-04-01 00:00:00
"""3,770,729""","""ru""",0,0,0,2,2024-04-01 00:00:00
"""3,323,726""","""ru""",0,0,0,6,2024-04-01 00:00:00
…,…,…,…,…,…,…
"""12,504""","""ru""",0,0,0,42,2024-04-01 00:00:00
"""3,946,802""","""ru""",0,0,0,0,2024-04-01 00:00:00
"""7,096""","""ru""",0,0,0,42,2024-04-01 00:00:00
"""2,377,569""","""ru""",0,0,0,15,2024-04-01 00:00:00


In [29]:
# Snapshot dates are derived from the data (every distinct day of `sent_at`)
# instead of hard-coded constants; not very reliable.
# NOTE(review): the original comment said "Drop the first day", but no date is
# removed here — the first day stays in the list; confirm which one is intended.
dates_dtm = user_meme_proc_v2_df.select(pl.col('sent_at').dt.truncate('1d')).unique().sort('sent_at').get_column('sent_at').to_list()

In [30]:
# One filtered meme-stats frame per snapshot date, stacked into a single table.
res = []
for date_dtm in dates_dtm:
    res.append(get_meme_stats_day_v2(user_meme_proc_v2_df, meme_proc_df, user_features_daily_df, date_dtm))

# Attach `meme_source_id` from the raw meme table (join on the meme id).
meme_features_daily_v2_df = (
    pl.concat(res)
    .join(meme_df.select('id', 'meme_source_id'), left_on='meme_id', right_on='id')
)

In [31]:
# Persist the daily meme features (v2, heavy users filtered out).
meme_features_daily_v2_df.write_parquet('meme_features_daily_v2.pq')