# Cold Start Dataset

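Three files are prepared in this notebook:

* Train set for the cold start offline tests (`coldstart_dataset.pq`)
* Daily historical meme features (`meme_features_daily.pq`)
* Daily historical user features (`user_features_daily.pq`)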

## Train set

Each train sample is a snapshot of one user at a given moment in time: the memes already seen (history) and the memes seen afterwards (target).

Fields

* `user_id`
* `hist_size`: number of memes the user has already reacted to at that moment in time
* `dtm`: timestamp of the train sample
* `hist_memes`, `hist_reactions`: memes from history
* `target_memes`, `target_reactions`: memes from future

Be careful with out-of-time validation: samples of the same user overlap, because memes that are targets in an earlier sample move into the history of later samples. Split either by user only, or by both user and time.

## Historical features

Features computed for every day. They contain the fields from `meme_stats` that are used in prod.

Fields

* `meme_id`
* `n_likes`
* `n_dislikes`
* `n_memes_sent`
* `age_days`

In [15]:
import polars as pl
from datetime import datetime
import numpy as np
from tqdm.notebook import tqdm

In [16]:
# Timestamp pattern used to parse the `created_at` / `sent_at` string columns:
# full month name, unpadded day, 4-digit year, 12-hour clock with AM/PM
dtm_fmt = '%B %-d, %Y, %I:%M %p'

In [17]:
# Load the four raw CSV exports, in this order:
# user-meme reactions, users, memes, meme sources
user_meme_df, user_df, meme_df, meme_source_df = (
    pl.read_csv(csv_path)
    for csv_path in (
        'user_meme_reaction_240301_240413.csv',
        'user_240414.csv',
        'memes_240414.csv',
        'meme_source_240414.csv',
    )
)

In [18]:
# Keep users whose type is not 'blocked_bot' and who were created
# on or after 2024-03-01; then drop the listed columns.

user_proc_df = (
    user_df
    .with_columns(pl.col('created_at').str.to_datetime(dtm_fmt))
    .filter(
        (pl.col('type') != 'blocked_bot')
        & (pl.col('created_at') >= datetime(2024, 3, 1))
    )
    .drop('type', 'blocked_bot_at', 'inviter_id', 'last_active_at')
)

In [5]:
# Per-user reaction log for the train set:
#   1. keep only rows with a non-null reaction_id
#   2. parse sent_at and sort by (user_id, sent_at)
#   3. number the rows within each user -> reaction_num
#   4. keep only users present in user_proc_df

user_meme_proc_df = (
    user_meme_df
    .filter(pl.col('reaction_id').is_not_null())
    .drop('reacted_at')
    .with_columns(pl.col('sent_at').str.to_datetime(dtm_fmt))
    .sort(['user_id', 'sent_at'])
    .with_columns(pl.first().cum_count().over('user_id').alias('reaction_num'))
    .join(
        user_proc_df.select('id'),
        left_on='user_id',
        right_on='id',
        how='inner',
    )
)

In [6]:
# Show one row to inspect the resulting columns and dtypes
user_meme_proc_df.head(1)

user_id,meme_id,recommended_by,sent_at,reaction_id,reaction_num
str,str,str,datetime[μs],i64,u32
"""1,002,452,686""","""1,237,876""","""like_spread_an…",2024-03-24 04:09:00,1,1


In [7]:
def prepare_data_user(user_meme_proc_df, user_id, chunk_size=10):
    """
    Build the cold-start train samples of a single user.

    The user's rows are ordered by `sent_at` and cut at the positions
    0, chunk_size, 2 * chunk_size, ... (compared against `reaction_num`).
    One sample is produced per cut: rows with `reaction_num` <= cut form the
    history, rows with `reaction_num` > cut form the target.
    The first cut is 0, so the first sample has an empty history when
    `reaction_num` starts from 1 (as in the notebook's toy example).

    Args:
        user_meme_proc_df: polars DataFrame with the columns `user_id`,
            `sent_at`, `reaction_num`, `meme_id` and `reaction_id`.
        user_id: value matched against the `user_id` column.
        chunk_size: step between consecutive cuts. Must be positive.

    Returns:
        List of dicts with the keys `user_id`, `hist_size`, `dtm`,
        `date_dtm`, `hist_memes`, `hist_reactions`, `target_memes`,
        `target_reactions`. Empty list when the user has no rows.

    Raises:
        ValueError: if `chunk_size` is not positive (the cut position would
            never advance and the loop would not terminate).
    """
    if chunk_size <= 0:
        raise ValueError(f'chunk_size must be positive, got {chunk_size!r}')

    res = []
    user_meme_slice = (
        user_meme_proc_df
        .filter(pl.col('user_id') == user_id)
        .sort('sent_at')
    )

    if user_meme_slice.height == 0:
        return res

    n_reactions = user_meme_slice.select('reaction_num').max().item()

    cur_idx = 0

    while cur_idx < n_reactions:
        # Split once per cut and reuse both halves for every field below
        hist_df = user_meme_slice.filter(pl.col('reaction_num') <= cur_idx)
        target_df = user_meme_slice.filter(pl.col('reaction_num') > cur_idx)

        res.append({
            'user_id': user_id,
            'hist_size': hist_df.height,
            # Sample timestamp = earliest sent_at among the target rows
            'dtm': target_df.select('sent_at').min().item(),
            # Same moment truncated to the start of the day
            'date_dtm': target_df.select(pl.col('sent_at').dt.truncate('1d')).min().item(),
            'hist_memes': hist_df.get_column('meme_id').to_list(),
            'hist_reactions': hist_df.get_column('reaction_id').to_list(),
            'target_memes': target_df.get_column('meme_id').to_list(),
            'target_reactions': target_df.get_column('reaction_id').to_list(),
        })
        cur_idx += chunk_size

    return res

In [8]:
# Toy check: one user, five reactions on consecutive days, cut every two reactions
test_user_meme_df = pl.DataFrame(
    {
        'user_id': [1] * 5,
        'reaction_num': list(range(1, 6)),
        'reaction_id': [1] * 5,
        'meme_id': list(range(1, 6)),
        'sent_at': [datetime(2024, 1, day, 1) for day in range(1, 6)],
    }
)
prepare_data_user(test_user_meme_df, 1, chunk_size=2)

[{'user_id': 1,
  'hist_size': 0,
  'dtm': datetime.datetime(2024, 1, 1, 1, 0),
  'date_dtm': datetime.datetime(2024, 1, 1, 0, 0),
  'hist_memes': [],
  'hist_reactions': [],
  'target_memes': [1, 2, 3, 4, 5],
  'target_reactions': [1, 1, 1, 1, 1]},
 {'user_id': 1,
  'hist_size': 2,
  'dtm': datetime.datetime(2024, 1, 3, 1, 0),
  'date_dtm': datetime.datetime(2024, 1, 3, 0, 0),
  'hist_memes': [1, 2],
  'hist_reactions': [1, 1],
  'target_memes': [3, 4, 5],
  'target_reactions': [1, 1, 1]},
 {'user_id': 1,
  'hist_size': 4,
  'dtm': datetime.datetime(2024, 1, 5, 1, 0),
  'date_dtm': datetime.datetime(2024, 1, 5, 0, 0),
  'hist_memes': [1, 2, 3, 4],
  'hist_reactions': [1, 1, 1, 1],
  'target_memes': [5],
  'target_reactions': [1]}]

In [None]:
chunk_size = 10
res = []

# One prepare_data_user call per id in user_proc_df; all samples go into one flat list
for user_id in tqdm(user_proc_df['id'].to_list()):
    res += prepare_data_user(user_meme_proc_df, user_id, chunk_size=chunk_size)

coldstart_dataset_df = pl.DataFrame(res)

In [10]:
# Persist the train set as a parquet file
coldstart_dataset_df.write_parquet('coldstart_dataset.pq')

## Historical features

In [19]:
# Two-column lookup: meme id and its parsed creation timestamp
meme_created_df = meme_df.select(
    'id',
    pl.col('created_at').str.to_datetime(dtm_fmt),
)

In [20]:
# Keep only rows whose user has type == 'user'; rows without a reaction stay in

user_meme_proc_v2_df = (
    user_meme_df
    .drop('reacted_at')
    .with_columns(pl.col('sent_at').str.to_datetime(dtm_fmt))
    .join(
        user_df.filter(pl.col('type') == 'user').select('id'),
        left_on='user_id',
        right_on='id',
        how='inner',
    )
)

In [21]:
def get_meme_stats_day(user_meme_proc_df, meme_created_df, date_dtm):
    """
    Per-meme counters over all rows sent strictly before `date_dtm`.

    Output columns: `meme_id`, `n_memes_sent` (non-null plus null
    `reaction_id` counts), `n_likes` (`reaction_id` == 1), `n_dislikes`
    (`reaction_id` == 2), `age_days` (whole days between `date_dtm` and the
    meme's `created_at`) and `date_dtm` (the snapshot moment truncated to
    the day). Memes without a match in `meme_created_df` do not appear,
    because the join uses the default join type.
    """
    reaction = pl.col('reaction_id')
    snapshot = pl.lit(date_dtm)

    sent_before = user_meme_proc_df.filter(pl.col('sent_at') < date_dtm)

    per_meme = sent_before.group_by('meme_id').agg(
        (reaction.count() + reaction.null_count()).alias('n_memes_sent'),
        (reaction == 1).cast(pl.Int64).sum().alias('n_likes'),
        (reaction == 2).cast(pl.Int64).sum().alias('n_dislikes'),
    )

    with_age = (
        per_meme
        .join(meme_created_df, left_on='meme_id', right_on='id')
        .with_columns((snapshot - pl.col('created_at')).dt.total_days().alias('age_days'))
        .drop('created_at')
    )

    return with_age.with_columns(snapshot.dt.truncate('1d').alias('date_dtm'))

In [22]:
# Testing: meme stats snapshot for 2024-04-01 (only rows sent strictly before that moment are counted)
get_meme_stats_day(user_meme_proc_v2_df, meme_created_df, datetime(2024, 4, 1))

meme_id,n_memes_sent,n_likes,n_dislikes,age_days,date_dtm
str,u32,i64,i64,i64,datetime[μs]
"""3,282,095""",6,2,4,6,2024-04-01 00:00:00
"""3,379,190""",19,3,16,5,2024-04-01 00:00:00
"""1,200,106""",1,0,1,27,2024-04-01 00:00:00
"""3,609,318""",56,29,25,3,2024-04-01 00:00:00
"""3,494,088""",12,7,5,4,2024-04-01 00:00:00
…,…,…,…,…,…
"""2,812,451""",5,1,4,11,2024-04-01 00:00:00
"""3,081,777""",148,78,60,8,2024-04-01 00:00:00
"""3,297,779""",24,10,10,6,2024-04-01 00:00:00
"""3,862,528""",17,14,3,1,2024-04-01 00:00:00


In [27]:
# Snapshot dates come from the data itself (every distinct day in `sent_at`), not from hard-coded constants.
# NOTE(review): the first day is NOT removed from this list. Its snapshot is empty anyway,
# because the stats functions only count rows sent strictly before the snapshot date.
dates_dtm = (
    user_meme_proc_v2_df
    .get_column('sent_at')
    .dt.truncate('1d')
    .unique()
    .sort()
    .to_list()
)

In [33]:
# One per-meme stats frame per snapshot date, stacked, then each meme's source id is attached
res = [
    get_meme_stats_day(user_meme_proc_v2_df, meme_created_df, date_dtm)
    for date_dtm in dates_dtm
]

meme_features_daily_df = pl.concat(res).join(
    meme_df.select('id', 'meme_source_id'),
    left_on='meme_id',
    right_on='id',
)

In [34]:
# Persist the daily meme features as a parquet file
meme_features_daily_df.write_parquet('meme_features_daily.pq')

In [23]:
# Two-column lookup: user id and its parsed creation timestamp
user_created_df = user_df.select(
    'id',
    pl.col('created_at').str.to_datetime(dtm_fmt),
)

In [24]:
def get_user_stats_day(user_meme_proc_df, meme_created_df, date_dtm, user_created=None):
    """
    Per-user counters over all rows sent strictly before `date_dtm`.

    Args:
        user_meme_proc_df: polars DataFrame with the columns `user_id`,
            `sent_at` and `reaction_id`.
        meme_created_df: NOT used by this function. It is kept in the
            signature only so that existing positional calls keep working.
        date_dtm: snapshot moment; only rows with `sent_at` < `date_dtm`
            are counted.
        user_created: optional polars DataFrame with the columns `id` and
            `created_at`. When omitted, the notebook-level `user_created_df`
            is used, which is what this function always read before.

    Returns:
        polars DataFrame with the columns `user_id`, `n_memes_sent`
        (non-null plus null `reaction_id` counts), `n_likes`
        (`reaction_id` == 1), `n_dislikes` (`reaction_id` == 2),
        `user_age_days` (whole days between `date_dtm` and the user's
        `created_at`) and `date_dtm` (snapshot moment truncated to the day).
        Users without a match in the creation-date frame do not appear,
        because the join uses the default join type.
    """
    # Make the dependency on the creation-date frame explicit and overridable
    created_df = user_created_df if user_created is None else user_created

    return (
        user_meme_proc_df
        .filter(pl.col('sent_at') < date_dtm)
        .group_by('user_id').agg(
            (pl.col('reaction_id').count() + pl.col('reaction_id').null_count()).alias('n_memes_sent'),
            (pl.col('reaction_id') == 1).cast(pl.Int64).sum().alias('n_likes'),
            (pl.col('reaction_id') == 2).cast(pl.Int64).sum().alias('n_dislikes'),
        )
        .join(created_df, left_on='user_id', right_on='id')
        .with_columns((pl.lit(date_dtm) - pl.col('created_at')).dt.total_days().alias('user_age_days'))
        .drop('created_at')
        .with_columns(pl.lit(date_dtm).dt.truncate('1d').alias('date_dtm'))
    )


In [25]:
# Testing: user stats snapshot for 2024-04-01
# NOTE: get_user_stats_day does not use its second argument; user creation dates come from user_created_df
get_user_stats_day(user_meme_proc_v2_df, meme_created_df, datetime(2024, 4, 1))

user_id,n_memes_sent,n_likes,n_dislikes,user_age_days,date_dtm
str,u32,i64,i64,i64,datetime[μs]
"""853,469,181""",1862,93,1750,42,2024-04-01 00:00:00
"""1,137,935,514""",1,0,0,42,2024-04-01 00:00:00
"""258,848,832""",14,9,5,42,2024-04-01 00:00:00
"""5,474,862,476""",1000,981,5,42,2024-04-01 00:00:00
"""1,385,198,552""",2354,2248,12,42,2024-04-01 00:00:00
…,…,…,…,…,…
"""1,863,992,215""",524,293,207,40,2024-04-01 00:00:00
"""648,981,358""",10305,7144,3043,42,2024-04-01 00:00:00
"""1,157,135,335""",5263,26,5204,42,2024-04-01 00:00:00
"""1,994,356,606""",1,0,0,1,2024-04-01 00:00:00


In [31]:
# One per-user stats frame per snapshot date, stacked into a single frame
res = [
    get_user_stats_day(user_meme_proc_v2_df, meme_created_df, date_dtm)
    for date_dtm in dates_dtm
]

user_features_daily_df = pl.concat(res)

In [35]:
# Persist the daily user features as a parquet file
user_features_daily_df.write_parquet('user_features_daily.pq')