# Cold Start Dataset

Two files are prepared in this notebook

* Train set for the cold start offline tests
* Historical features

## Train set

Train samples are collections of seen and future (target) items at some moment of time

Fields

* `user_id`
* `hist_size`: memes with reactions at the moment of time
* `dtm`: timestamp of the train sample
* `hist_memes`, `hist_reactions`: memes from history
* `target_memes`, `target_reactions`: memes from future

Be careful with out-of-time validation. User samples contains same values (target goes to history through the time). Consider only user or both user and time splits.

## Historical features

Features for every day. Contains fields from meme_stats which is used in prod.

Fields

* `meme_id`
* `n_likes`
* `n_dislikes`
* `n_memes_sent`
* `age_days`

In [1]:
import os
import polars as pl
from datetime import datetime
import numpy as np
from tqdm.notebook import tqdm
from dotenv import load_dotenv

In [2]:
dtm_fmt = "%B %-d, %Y, %I:%M %p"

In [30]:
user_meme_df = pl.read_parquet('user_meme_reaction.pq')
user_df = pl.read_parquet('user.pq')
meme_df = pl.read_parquet('meme.pq').filter(pl.col('status').is_in(['ok', 'published']))
meme_source_df = pl.read_parquet('meme_source.pq')
meme_raw_vk_df = pl.read_parquet('meme_raw_vk.pq')

In [4]:
user_meme_proc_df = (
    user_meme_df
    .with_columns(pl.col('reaction_id').fill_null(2))
    .sort('user_id', 'sent_at')
    .with_columns(reaction_num=pl.first().cum_count().over('user_id'))
)

# Cold Start dataset

In [5]:
coldstart_df = (
    user_meme_proc_df
    .sort('user_id', 'reaction_num')
    .filter(pl.col('reaction_num') <= 100)
    .group_by('user_id')
    .agg(
        pl.col('sent_at').min().alias('dtm'),
        pl.col('sent_at').min().dt.truncate('1d').alias('date_dtm'),
        pl.col('meme_id').alias('target_memes'),
        pl.col('reaction_id').alias('target_reactions'),
        pl.col('recommended_by').alias('target_recommended_by'),
    )
)

In [6]:
coldstart_df.head()

user_id,dtm,date_dtm,target_memes,target_reactions,target_recommended_by
i64,datetime[μs],datetime[μs],list[i64],list[i64],list[str]
23045,2024-03-04 23:03:03.047408,2024-03-04 00:00:00,"[1197419, 1200107, … 2201702]","[2, 2, … 2]","[""sorted_by_user_source_lr_meme_lr_meme_age"", ""sorted_by_user_source_lr_meme_lr_meme_age"", … ""classic""]"
45933,2024-04-01 10:08:03.474343,2024-04-01 00:00:00,"[1197491, 3918601, … 1267028]","[1, 1, … 2]","[""like_spread_and_recent"", ""like_spread_and_recent"", … ""like_spread_and_recent""]"
211558,2024-03-18 04:07:42.589074,2024-03-18 00:00:00,"[1195419, 171119, … 790223]","[2, 2, … 2]","[""best_meme_from_each_source"", ""best_meme_from_each_source"", … ""best_meme_from_each_source""]"
283585,2024-03-27 19:10:20.263604,2024-03-27 00:00:00,"[2618594, 3024098, … 4897140]","[1, 1, … 2]","[""like_spread_and_recent"", ""like_spread_and_recent"", … ""best_meme_from_each_source""]"
291455,2024-04-01 18:10:23.036944,2024-04-01 00:00:00,"[2320851, 3898147, 3724910]","[2, 2, 2]","[""like_spread_and_recent"", ""like_spread_and_recent"", ""like_spread_and_recent""]"


In [7]:
coldstart_df.write_parquet('coldstart.pq')

# Sequential dataset

In [8]:
def prepare_data_user(user_meme_proc_df, user_id, chunk_size=30):
    """
    Splits reactions on chunks
    Fills historical and target memes
    The number of output samples is equal to the number of chunks
    """
    res = []
    window_size_backward = chunk_size
    window_size_forward = chunk_size * 3

    user_meme_slice = user_meme_proc_df.filter(pl.col('user_id') == user_id).sort('sent_at')
    
    if len(user_meme_slice) == 0:
        return res

    n_reactions = user_meme_slice.select('reaction_num').max().item()

    cur_idx = 0

    while cur_idx < n_reactions:

        filt_hist = (pl.col('reaction_num') <= cur_idx) & (pl.col('reaction_num') > cur_idx - window_size_backward)
        filt_target = (pl.col('reaction_num') > cur_idx) & (pl.col('reaction_num') < cur_idx + window_size_forward)

        d = {
            'user_id': user_id,
            'hist_size': len(user_meme_slice.filter(filt_hist)),
            'dtm': user_meme_slice.filter(filt_target).select('sent_at').min().item(),
            'date_dtm': user_meme_slice.filter(filt_target).select(pl.col('sent_at').dt.truncate('1d')).min().item(),
            'hist_memes': user_meme_slice.filter(filt_hist).get_column('meme_id').to_list(),
            'hist_reactions': user_meme_slice.filter(filt_hist).get_column('reaction_id').to_list(),
            'target_memes': user_meme_slice.filter(filt_target).get_column('meme_id').to_list(),
            'target_reactions': user_meme_slice.filter(filt_target).get_column('reaction_id').to_list(),
            'target_recommended_by': user_meme_slice.filter(filt_target).get_column('recommended_by').to_list(),
        }
        res.append(d)
        cur_idx += chunk_size

    return res

In [9]:
test_user_meme_df = pl.DataFrame({'user_id': [1] * 5, 'reaction_num': list(range(1, 6)), 'reaction_id': [1] * 5, 'meme_id': list(range(1, 6)), 'sent_at': [datetime(2024, 1, i, 1) for i in range(1, 6)], 'recommended_by': ['best_meme_from_each_source'] * 5})
prepare_data_user(test_user_meme_df, 1, chunk_size=2)

[{'user_id': 1,
  'hist_size': 0,
  'dtm': datetime.datetime(2024, 1, 1, 1, 0),
  'date_dtm': datetime.datetime(2024, 1, 1, 0, 0),
  'hist_memes': [],
  'hist_reactions': [],
  'target_memes': [1, 2, 3, 4, 5],
  'target_reactions': [1, 1, 1, 1, 1],
  'target_recommended_by': ['best_meme_from_each_source',
   'best_meme_from_each_source',
   'best_meme_from_each_source',
   'best_meme_from_each_source',
   'best_meme_from_each_source']},
 {'user_id': 1,
  'hist_size': 2,
  'dtm': datetime.datetime(2024, 1, 3, 1, 0),
  'date_dtm': datetime.datetime(2024, 1, 3, 0, 0),
  'hist_memes': [1, 2],
  'hist_reactions': [1, 1],
  'target_memes': [3, 4, 5],
  'target_reactions': [1, 1, 1],
  'target_recommended_by': ['best_meme_from_each_source',
   'best_meme_from_each_source',
   'best_meme_from_each_source']},
 {'user_id': 1,
  'hist_size': 2,
  'dtm': datetime.datetime(2024, 1, 5, 1, 0),
  'date_dtm': datetime.datetime(2024, 1, 5, 0, 0),
  'hist_memes': [3, 4],
  'hist_reactions': [1, 1],
  'ta

In [10]:
res = []
chunk_size = 30

for user_id in tqdm(user_df.get_column('id').to_list()):
    res.extend(prepare_data_user(user_meme_proc_df, user_id, chunk_size=chunk_size))
sequential_dataset = pl.DataFrame(res)

  0%|          | 0/11139 [00:00<?, ?it/s]

In [11]:
sequential_dataset.write_parquet('sequential_dataset.pq')

# VK percentiles

In [12]:
vk_stats_df = (
    meme_raw_vk_df
    .select(pl.col('id').alias('raw_meme_id'), 'views', 'likes', 'meme_source_id')
    .join(meme_source_df.select(pl.col('id').alias('meme_source_id'), 'url'), on='meme_source_id')
    .group_by('meme_source_id', 'url')
    .agg(
        pl.count().alias('n_memes'),
        pl.quantile('likes', 0.25).alias('raw_likes_p25'),
        pl.quantile('likes', 0.50).alias('raw_likes_p50'),
        pl.quantile('likes', 0.75).alias('raw_likes_p75'),
    )
    .sort('n_memes', descending=True)
)
vk_stats_df

  pl.count().alias('n_memes'),


meme_source_id,url,n_memes,raw_likes_p25,raw_likes_p50,raw_likes_p75
i64,str,u32,f64,f64,f64
3,"""https://vk.com…",3518,159.0,216.0,300.0
62,"""https://vk.com…",1761,525.0,736.0,1083.0
49,"""https://vk.com…",1384,313.0,471.0,794.0
67,"""https://vk.com…",1266,78.0,120.0,184.0
7,"""https://vk.com…",1191,197.0,271.0,505.0
…,…,…,…,…,…
73,"""https://vk.com…",127,453.0,647.0,924.0
51,"""https://vk.com…",112,281.0,546.0,917.0
58,"""https://vk.com…",109,108.0,159.0,296.0
55,"""https://vk.com…",95,223.0,347.0,552.0


In [13]:
vk_meme_stats_df = (
    meme_df
    .select(pl.col('id').alias('meme_id'), 'raw_meme_id', 'meme_source_id')
    .join(meme_source_df.select('id', 'type'), left_on='meme_source_id', right_on='id')
    .filter(pl.col('type') == 'vk')
    .join(meme_raw_vk_df.select(pl.col('id').alias('raw_meme_id'), 'likes', 'views'), on='raw_meme_id')
    .join(vk_stats_df, on='meme_source_id')
    .select('meme_id', pl.col('meme_source_id').alias('vk_meme_source_id'), pl.col('views').alias('raw_views'), pl.col('likes').alias('raw_likes'), 'raw_likes_p25', 'raw_likes_p50', 'raw_likes_p75')
)

In [14]:
vk_meme_stats_df

meme_id,vk_meme_source_id,raw_views,raw_likes,raw_likes_p25,raw_likes_p50,raw_likes_p75
i64,i64,i64,i64,f64,f64,f64
84939,71,1570,14,74.0,122.0,188.0
816078,3,9264,146,159.0,216.0,300.0
1374823,69,4231,30,55.0,89.0,159.0
673517,62,22667,501,525.0,736.0,1083.0
460988,63,50272,1391,622.0,926.0,1597.0
…,…,…,…,…,…,…
8137816,54,7012,176,226.0,312.0,497.0
7438661,56,28875,97,47.0,71.0,130.0
7438663,56,20514,172,47.0,71.0,130.0
8181555,50,17414,925,452.0,788.0,1769.0


In [15]:
vk_stats_df.write_parquet('vk_stats.pq')

## Meme features v1

Trying to reimplement the current production meme_stats table

In [16]:
def get_meme_stats_day(date_dtm, user_features_daily_df=None):
    stats = (
        user_meme_proc_df
        .filter(pl.col('sent_at') < date_dtm)
        .with_columns(pl.col('sent_at').dt.truncate('1d').alias('date_dtm'))
    )
    if user_features_daily_df is not None:
        fresh_users = (
            user_features_daily_df
            .filter(pl.col('n_memes_sent') < 300)
            .select('user_id', 'date_dtm')
        )
        stats = stats.join(fresh_users, on=['user_id', 'date_dtm'], how='inner')
    
    stats = (
        stats
        .group_by('meme_id').agg(
            (pl.col('reaction_id').count() + pl.col('reaction_id').null_count()).alias('n_memes_sent'),
            (pl.col('reaction_id') == 1).cast(pl.Int64).sum().alias('n_likes'),
            (pl.col('reaction_id') == 2).cast(pl.Int64).sum().alias('n_dislikes'),
        )
    )

    return (
        meme_df
        .select(pl.col('id').alias('meme_id'), 'language_code', 'created_at', 'meme_source_id', 'status')
        .join(meme_source_df.select('id', 'url'), left_on='meme_source_id', right_on='id')
        .filter(pl.col('created_at') < date_dtm)
        .join(stats, on='meme_id', how='left')
        .join(vk_meme_stats_df, on='meme_id', how='left')
        .fill_null(0)
        .with_columns((pl.lit(date_dtm) - pl.col('created_at')).dt.total_days().alias('age_days'))
        .drop('created_at')
        .with_columns(pl.lit(date_dtm).dt.truncate('1d').alias('date_dtm'))
    )

In [17]:
# Testing
get_meme_stats_day(datetime(2024, 4, 1))

meme_id,language_code,meme_source_id,status,url,n_memes_sent,n_likes,n_dislikes,vk_meme_source_id,raw_views,raw_likes,raw_likes_p25,raw_likes_p50,raw_likes_p75,age_days,date_dtm
i64,str,i64,str,str,u32,i64,i64,i64,i64,i64,f64,f64,f64,i64,datetime[μs]
334508,"""ru""",109,"""ok""","""https://t.me/k…",14,6,8,0,0,0,0.0,0.0,0.0,38,2024-04-01 00:00:00
221082,"""ru""",110,"""ok""","""https://t.me/w…",6,2,4,0,0,0,0.0,0.0,0.0,40,2024-04-01 00:00:00
317457,"""en""",27,"""ok""","""https://t.me/a…",18,9,9,0,0,0,0.0,0.0,0.0,39,2024-04-01 00:00:00
1645268,"""ru""",110,"""ok""","""https://t.me/w…",26,13,13,0,0,0,0.0,0.0,0.0,22,2024-04-01 00:00:00
212537,"""ru""",97,"""ok""","""https://t.me/t…",18,5,13,0,0,0,0.0,0.0,0.0,40,2024-04-01 00:00:00
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
3560,"""ru""",91,"""ok""","""https://t.me/a…",62,21,41,0,0,0,0.0,0.0,0.0,42,2024-04-01 00:00:00
84932,"""ru""",109,"""ok""","""https://t.me/k…",23,9,14,0,0,0,0.0,0.0,0.0,41,2024-04-01 00:00:00
585,"""ru""",80,"""ok""","""https://t.me/m…",16,4,12,0,0,0,0.0,0.0,0.0,43,2024-04-01 00:00:00
384892,"""ru""",23,"""ok""","""https://t.me/l…",34,14,20,0,0,0,0.0,0.0,0.0,38,2024-04-01 00:00:00


In [18]:
# Not very reliable though no constants
# Drop the first day
dates_dtm = user_meme_df.select(pl.col('sent_at').dt.truncate('1d')).unique().sort('sent_at').get_column('sent_at').to_list()

In [19]:
res = []
for date_dtm in dates_dtm:
    res.append(get_meme_stats_day(date_dtm))

meme_features_daily_df = (
    pl.concat(res)
    .join(meme_df.select('id', 'meme_source_id'), left_on='meme_id', right_on='id')
)

In [20]:
meme_features_daily_df.write_parquet('meme_features_daily.pq')

## User features

Trying to reimplement the current production user_stats table

In [21]:
def get_user_stats_day(user_meme_proc_df, date_dtm):
    return (
        user_meme_proc_df
        .select('user_id', 'meme_id', 'sent_at', 'reaction_id')
        .filter(pl.col('sent_at') < date_dtm)
        .group_by('user_id').agg(
            (pl.col('reaction_id').count() + pl.col('reaction_id').null_count()).alias('n_memes_sent'),
            (pl.col('reaction_id') == 1).cast(pl.Int64).sum().alias('n_likes'),
            (pl.col('reaction_id') == 2).cast(pl.Int64).sum().alias('n_dislikes'),
        )
        .join(user_df.select('id', 'created_at'), left_on='user_id', right_on='id')
        .with_columns((pl.lit(date_dtm) - pl.col('created_at')).dt.total_days().alias('user_age_days'))
        .drop('created_at')
        .with_columns(pl.lit(date_dtm).dt.truncate('1d').alias('date_dtm'))
    )


In [22]:
# Testing
get_user_stats_day(user_meme_df, datetime(2024, 4, 1))

user_id,n_memes_sent,n_likes,n_dislikes,user_age_days,date_dtm
i64,u32,i64,i64,i64,datetime[μs]
320313545,4378,62,4273,42,2024-04-01 00:00:00
1137935514,1,0,0,42,2024-04-01 00:00:00
912433863,350,273,37,42,2024-04-01 00:00:00
6369180715,17,14,0,6,2024-04-01 00:00:00
984118712,85,43,38,42,2024-04-01 00:00:00
…,…,…,…,…,…
1396732769,10782,9415,1261,41,2024-04-01 00:00:00
355168099,1330,87,1222,40,2024-04-01 00:00:00
329524183,5210,72,5098,42,2024-04-01 00:00:00
6906848105,6118,5654,420,41,2024-04-01 00:00:00


In [23]:
res = []
for date_dtm in dates_dtm:
    res.append(get_user_stats_day(user_meme_proc_df, date_dtm))

user_features_daily_df = (
    pl.concat(res)
)

In [24]:
user_features_daily_df.write_parquet('user_features_daily.pq')

## Meme features V2 - more strict filtration

Ignore old users (more than 200 reactions) in meme stats

The idea is that users with many responces have too high like rate with sometimes strange taste of good memes.

In [25]:
res = []
for date_dtm in dates_dtm:
    res.append(get_meme_stats_day(date_dtm, user_features_daily_df))

meme_features_daily_v2_df = (
    pl.concat(res)
    .join(meme_df.select('id', 'meme_source_id'), left_on='meme_id', right_on='id')
)

In [26]:
meme_features_daily_v2_df.filter(pl.col('meme_id') == 112898).filter(pl.col('date_dtm') == datetime(2024, 5, 1))

meme_id,language_code,meme_source_id,status,url,n_memes_sent,n_likes,n_dislikes,vk_meme_source_id,raw_views,raw_likes,raw_likes_p25,raw_likes_p50,raw_likes_p75,age_days,date_dtm,meme_source_id_right
i64,str,i64,str,str,u32,i64,i64,i64,i64,i64,f64,f64,f64,i64,datetime[μs],i64
112898,"""ru""",12,"""ok""","""https://t.me/m…",16,13,3,0,0,0,0.0,0.0,0.0,71,2024-05-01 00:00:00,12


In [27]:
meme_features_daily_v2_df.write_parquet('meme_features_daily_v2.pq')