Copyright (c) Meta Platforms, Inc. and affiliates.
All rights reserved.

This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.


In [4]:
import pandas as pd

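**The dataset can be downloaded from https://socialmediaarchive.org/record/55?&ln=en. Save `instagram_notification_auction_base_dataset.csv` in the notebook's working directory before running the cells below.**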

In [5]:
# File name of the base dataset CSV; it is given as a bare name, so it is
# looked up in the notebook's current working directory.
data_file_name = "instagram_notification_auction_base_dataset.csv"

# Read the whole base dataset into the DataFrame that every later cell uses.
df = pd.read_csv(filepath_or_buffer=data_file_name)

In [6]:
# Display the column labels of the loaded frame; the cells below reference
# several of them by name (e.g. 'auction_event_date', 'user_id_anonymized').
df.columns

Index(['ndid', 'user_id', 'user_id_anonymized', 'notification_type',
       'notification_value', 'dummy_bidder_bid', 'pacing_multiplier',
       'clicked', 'sent', 'auction_event_time', 'auction_type', 'event_id',
       'auction_event_date'],
      dtype='object')

## Derived Dataset 1

In [7]:
# time window = 1 day
# A (day, user) pair is kept only if that user received at least `threshold`
# distinct notification types within the day; rows of all other pairs are dropped.
threshold = 2
group_keys = ['auction_event_date', 'user_id_anonymized']

# Number of distinct notification types for every (day, user) pair.
types_per_window = df.groupby(group_keys)['notification_type'].nunique()

# Index holding the (day, user) pairs that reach the threshold.
kept_pairs = types_per_window[types_per_window >= threshold].index
keeps = set(kept_pairs)

# Vectorised membership test: build one MultiIndex from the two key columns and
# check it against the kept pairs in a single call, instead of evaluating a
# Python lambda once per row with `df.apply(..., axis=1)`.
is_kept = pd.MultiIndex.from_frame(df[group_keys]).isin(kept_pairs)

# Rows of `df` that belong to a kept (day, user) pair, in their original order.
df_keeps = df.loc[is_kept, :]

In [8]:
# Overall effect of the filter: how many rows were dropped from the base frame.
total_entries = df.shape[0]
removed_entries = total_entries - df_keeps.shape[0]
print('{} entries out of {} entries removed.'.format(removed_entries, total_entries))
print('***' * 20)

# Distinct users per day, before and after filtering, computed once per frame.
# The groupby result is sorted by day, which gives the same order as sorting
# the unique day values.
users_total_by_day = df.groupby('auction_event_date')['user_id_anonymized'].nunique()
users_kept_by_day = df_keeps.groupby('auction_event_date')['user_id_anonymized'].nunique()

for auction_event_date, users_total in users_total_by_day.items():
    # A day with no surviving user has no entry in the filtered counts -> report 0.
    users_kept = users_kept_by_day.get(auction_event_date, 0)
    print('{} users out of {} users are kept in day {}'.format(
        users_kept, users_total, auction_event_date
    ))

318722 entries out of 409698 entries removed.
************************************************************
4601 users out of 32144 users are kept in day 2022-12-14
3909 users out of 29052 users are kept in day 2022-12-15
4163 users out of 29812 users are kept in day 2022-12-16
3704 users out of 28425 users are kept in day 2022-12-17


2910 users out of 26267 users are kept in day 2022-12-18
2908 users out of 26575 users are kept in day 2022-12-19
3615 users out of 29600 users are kept in day 2022-12-20
3945 users out of 31536 users are kept in day 2022-12-21
4420 users out of 32523 users are kept in day 2022-12-22


4440 users out of 32284 users are kept in day 2022-12-23


In [9]:
# Collapse the filtered rows to one row per (day, user):
#   - auction_event_time: pandas' `last` aggregation, i.e. the last entry of the
#     group in the row order of `df_keeps` (not necessarily the latest timestamp)
#   - notification_type / notification_value: all values of the group gathered
#     into Python lists
window_keys = ['auction_event_date', 'user_id_anonymized']
per_user_window = df_keeps.groupby(window_keys)

df_derived_1 = per_user_window.agg(
    auction_event_time=('auction_event_time', 'last'),
    notification_type=('notification_type', list),
    notification_value=('notification_value', list),
).reset_index()

In [11]:
# Write derived dataset 1 (one-day window) to CSV without the row index.
# The file name is relative, so it lands in the current working directory.
one_day_output_file = 'instagram_notification_auction_derived_dataset_one_day_window.csv'
df_derived_1.to_csv(one_day_output_file, index=False)

## Derived Dataset 2

In [12]:
# time window = 2 days
# note: the time window is the only difference compared to derived dataset 1
# A (window, user) pair is kept only if that user received at least `threshold`
# distinct notification types within the two-day window; all other rows are dropped.
threshold = 2
group_keys = ['auction_event_date', 'user_id_anonymized']

# Maps every calendar day to the label of the two-day window that contains it.
date_map = {
    '2022-12-14': '2022-12-14~15',
    '2022-12-15': '2022-12-14~15',
    '2022-12-16': '2022-12-16~17',
    '2022-12-17': '2022-12-16~17',
    '2022-12-18': '2022-12-18~19',
    '2022-12-19': '2022-12-18~19',
    '2022-12-20': '2022-12-20~21',
    '2022-12-21': '2022-12-20~21',
    '2022-12-22': '2022-12-22~23',
    '2022-12-23': '2022-12-22~23',
}

# Fail loudly (KeyError, as a plain `date_map[x]` lookup would) when the frame
# contains a date that is neither a known day nor an already-assigned window label.
window_labels = set(date_map.values())
unknown_dates = set(df['auction_event_date'].unique()) - set(date_map) - window_labels
if unknown_dates:
    raise KeyError(
        'auction_event_date values missing from date_map: {}'.format(
            sorted(unknown_dates, key=str)
        )
    )

# NOTE: this overwrites df['auction_event_date'] in place with the window label.
# Values that already are window labels are left untouched, so re-running this
# cell is safe. Cells that expect per-day dates in `df` must reload the CSV first.
df['auction_event_date'] = df['auction_event_date'].map(
    lambda day: date_map.get(day, day)
)

# Number of distinct notification types for every (window, user) pair.
types_per_window = df.groupby(group_keys)['notification_type'].nunique()

# Index holding the (window, user) pairs that reach the threshold.
kept_pairs = types_per_window[types_per_window >= threshold].index
keeps = set(kept_pairs)

# Vectorised membership test: one MultiIndex lookup instead of a Python lambda
# evaluated once per row with `df.apply(..., axis=1)`.
is_kept = pd.MultiIndex.from_frame(df[group_keys]).isin(kept_pairs)

# Rows of `df` that belong to a kept (window, user) pair, in their original order.
df_keeps = df.loc[is_kept, :]

In [13]:
# Overall effect of the filter: how many rows were dropped from the frame.
removed_entries = df.shape[0] - df_keeps.shape[0]
print('{} entries removed.'.format(removed_entries))
print('***' * 20)

# Distinct users per window label, before and after filtering, computed once
# per frame. The groupby result is sorted by label, which gives the same order
# as sorting the unique label values.
users_total_by_window = df.groupby('auction_event_date')['user_id_anonymized'].nunique()
users_kept_by_window = df_keeps.groupby('auction_event_date')['user_id_anonymized'].nunique()

for auction_event_date, users_total in users_total_by_window.items():
    # A window with no surviving user has no entry in the filtered counts -> report 0.
    users_kept = users_kept_by_window.get(auction_event_date, 0)
    print('{} users out of {} users are kept in day {}'.format(
        users_kept, users_total, auction_event_date
    ))

276112 entries removed.
************************************************************
8909 users out of 42773 users are kept in day 2022-12-14~15
7857 users out of 40206 users are kept in day 2022-12-16~17
6319 users out of 38226 users are kept in day 2022-12-18~19
8379 users out of 43735 users are kept in day 2022-12-20~21


9608 users out of 45714 users are kept in day 2022-12-22~23


In [14]:
# Collapse the filtered rows to one row per (window label, user):
#   - auction_event_time: pandas' `last` aggregation, i.e. the last entry of the
#     group in the row order of `df_keeps` (not necessarily the latest timestamp)
#   - notification_type / notification_value: all values of the group gathered
#     into Python lists
window_keys = ['auction_event_date', 'user_id_anonymized']
per_user_window = df_keeps.groupby(window_keys)

df_derived_2 = per_user_window.agg(
    auction_event_time=('auction_event_time', 'last'),
    notification_type=('notification_type', list),
    notification_value=('notification_value', list),
).reset_index()

In [16]:
# Write derived dataset 2 (two-day window) to CSV without the row index.
# The file name is relative, so it lands in the current working directory.
two_day_output_file = 'instagram_notification_auction_derived_dataset_two_day_window.csv'
df_derived_2.to_csv(two_day_output_file, index=False)