In [2]:
# Do all imports
import pandas as pd
import scipy
import numpy as np
from numpy.f2py.auxfuncs import throw_error
from pandas import interval_range
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

import os

PWD = os.getcwd()

import sys

sys.path.insert(1, PWD + './transformations.py')
sys.path.insert(1, PWD + './helpers.py')

from transformations import Winsorizer, FeatureMultiplier, Average
from helpers import logistic_map

import psycopg2

import json
import requests
import time
from http import HTTPStatus

import requests
from requests.exceptions import HTTPError


import dotenv

dotenv.load_dotenv()

True

In [3]:
APP_ID = 2
CAMPAIGN_ID = "4,5"
TIME_WINDOW = '7d'
FREQ = "D"  ## or "D" for daily or  "3D" 3 daily, "W-Mon"
TARGET_COUNTRY = "United_States"
VIEWS_THRESHOLD = 100

# how many % do we pull rpm's closer to its groups mean
# when rpm is under the mean
PAYOUT_UPLIFTING_PERC = 0.15

DEFAULT_RPM = 2 ## rpm we are paying to creators used as a base, currently 2$
PROCEEDS_PERCENTAGE_FOR_ATTRIBUTION = 0.8
RPM_CAP = 4  ## max rpm allowed

TARGET_ROI = 3.0

print(f"""
APP_ID: {APP_ID}
CAMPAIGN_ID: {CAMPAIGN_ID}
TIME_WINDOW: {TIME_WINDOW}
FREQ: {FREQ}
TARGET_COUNTRY: {TARGET_COUNTRY}
VIEWS_THRESHOLD: {VIEWS_THRESHOLD}

PAYOUT_UPLIFTING_PERC: {PAYOUT_UPLIFTING_PERC * 100}%

DEFAULT_RPM: {DEFAULT_RPM}$
PROCEEDS_PERCENTAGE_FOR_ATTRIBUTION: {PROCEEDS_PERCENTAGE_FOR_ATTRIBUTION * 100}%
RPM_CAP: {RPM_CAP}$

TARGET_ROI: {TARGET_ROI}
""")


APP_ID: 2
CAMPAIGN_ID: 4,5
TIME_WINDOW: 7d
FREQ: D
TARGET_COUNTRY: United_States
VIEWS_THRESHOLD: 100

PAYOUT_UPLIFTING_PERC: 15.0%

DEFAULT_RPM: 2$
PROCEEDS_PERCENTAGE_FOR_ATTRIBUTION: 80.0%
RPM_CAP: 4$

TARGET_ROI: 3.0



In [26]:
# define engagement factor rates
engagement_factor_weights = {
    "likes_rate":        1.0,
    "comments_rate":     1.2,
    "saves_rate":        0.5,
    "shares_rate":       0.0,
    "likes_to_comments": 1.0,
}

# engagement volume weights
engagement_volume_weights = {
    "views":    0.8,
    "likes":    1.0,
    "comments": 1.2,
    "saves":    0.3,
    "shares":   0.3,
}

# roi we are targeting for each group of views,
# posts with more views are likely to get paid more, so we cut their revenue more
# give it to posts with low views, like robin hood principal
target_roi_by_groups = [
    (-np.inf,   5_000,       1.5),
    (5_000,     10_000,      1.8),
    (10_000,    50_000,      2.3),
    (50_000,    100_000,     2.9),
    (100_000,   1_000_000,   3.0),
    (1_000_000, np.inf,      3.5),
]

# Controls how strongly marginal returns decay as volume increases; higher values imply stronger saturation effects.
#    - 1 => full effect
#    - 0 => no effect
#
# diminishing return effect by categories
# low views are less affected by diminishing returns effect as they
# don't have much room to show their effects
diminishing_returns_effect_by_group = [
    (-np.inf,   5_000,       0.2),
    (5_000,     10_000,      0.3),
    (10_000,    50_000,      0.4),
    (50_000,    100_000,     0.5),
    (100_000,   1_000_000,   0.8),
    (1_000_000, np.inf,      0.9),
]

# other configuration variables


# group data into groups by views to calculate incentive boost within each group
vol_cut_ranges = [
      -np.inf,
            0,
        1_000,
       10_000,
      100_000,
    1_000_000,
       np.inf,
]

# %-+ allowed incentive boost
incentive_boost_effect = 0.2
# how curved or linear incentive boost should be, less is more linear
incentive_boost_order = 2

## Load Data

In [13]:
query = f"""
select
    toDate(toString(person.properties.applicationInstalledAt)) as date,
    sum(properties.proceeds) as proceeds,
    properties.$geoip_country_name as country,
    person.properties.attribution_source as attributionSource
FROM events
WHERE
    event in  ('sw_trial_converted', 'sw_subscription_start', 'sw_renewal', 'sw_refund')
    and attributionSource in ('tiktok', 'instagram') and person.properties.applicationInstalledAt is not null
    and properties.$geoip_country_name in ('{TARGET_COUNTRY.replace("_", " ")}', 'United Kingdom')
    and "date" >= NOW() - INTERVAL '{TIME_WINDOW.rstrip('d')} days'
group by "date", attributionSource, country
order by "date" desc
"""

revenueRetries = 5
revenueRetryCodes = [
    HTTPStatus.TOO_MANY_REQUESTS,
    HTTPStatus.INTERNAL_SERVER_ERROR,
    HTTPStatus.BAD_GATEWAY,
    HTTPStatus.SERVICE_UNAVAILABLE,
    HTTPStatus.GATEWAY_TIMEOUT,
]

revenueResponse = None

for n in range(revenueRetries):
    try:
        revenueResponse = requests.post(
            f'https://us.posthog.com/api/projects/{276292}/query/',
            headers={
                "Authorization": f'Bearer {os.getenv("POSTHOG_KEY")}',
                'Content-Type': 'application/json',
            },
            data=json.dumps({
                'query': {
                    'kind': "HogQLQuery",
                    'query': query
                },
                'async': False,
                'refresh': 'force_blocking', # ignore cache
                'name': "Cohorted Proceeds by date, country and attribution source",
            })
        )
        revenueResponse.raise_for_status()

        break

    except HTTPError as exc:
        code = exc.response.status_code

        if code in revenueRetryCodes:
            # exponential backoff
            time.sleep(2 ** (n + 1))
            continue

        customExc = None
        try:
            customExc = Exception(f"{exc.response.json()}")
        except:
            pass

        if customExc is not None:
            raise customExc
        else:
            raise exc

if revenueResponse is None or revenueResponse.status_code != HTTPStatus.OK or revenueResponse.headers.get('Content-Type') != 'application/json':
    raise Exception(f"Unexpected revenue response code from posthog {revenueResponse.status_code}, {revenueResponse.headers.get('Content-Type')}")

revenueJson = revenueResponse.json()
revenueDataRaw = revenueJson['results']
revenueColumns = revenueJson['columns']

In [28]:
revenue_attribution_map = {
    'tiktok': 'TikTok',
    'instagram': 'Instagram'
}

df_revenue = (
    pd.DataFrame(revenueDataRaw, columns=revenueColumns)
    .assign(
        date=lambda c: pd.to_datetime(c.date, format="%Y-%m-%d"),
        country=lambda x: x.country.map({"United_Kingdom": "United_States"}).fillna(x.country),
        channel=lambda x: x.attributionSource.map(revenue_attribution_map)
    )
    .drop(columns=['attributionSource'])
)

Unnamed: 0,date,proceeds,country,channel
0,2026-02-05,50.326506,United States,Instagram
1,2026-02-05,28.837631,United States,TikTok
2,2026-02-04,50.98,United States,Instagram
3,2026-02-04,8.49,United States,TikTok
4,2026-02-03,54.28067,United States,Instagram


In [29]:
stat_cols = [
    "views_diff",
    "likes_diff",
    "comments_diff",
    "saves_diff",
    "shares_diff"
]

df_scrape_data = (
    pd
    .read_sql_query(
        sql=f"""
            select
                sma.date,
                sma.id as account_id,
                sma.campaign_id as campaign_id,
                sma.country as account_country,
                sma.channel as account_channel,
                sma.total_views_diff as views_diff,
                sma.total_likes_diff as likes_diff,
                sma.total_comments_diff as comments_diff,
                sma.total_saves_diff as saves_diff,
                sma.total_shares_diff as shares_diff,
                sma.total_posts_diff as posts_diff
            from social_media_accounts_daily_diff sma
            where
                sma.date >= CURRENT_DATE - INTERVAL %(TIME_WINDOW)s
                and total_views_diff > %(VIEWS_THRESHOLD)s
                and total_views_diff > sma.total_likes_diff
                and sma.country = %(TARGET_COUNTRY)s
                and sma.campaign_id in ({CAMPAIGN_ID})
            order by sma.date desc
        """,
        con=os.getenv("DATABASE_URL"),
        params={
            "TIME_WINDOW": TIME_WINDOW,
            "VIEWS_THRESHOLD": VIEWS_THRESHOLD,
            "TARGET_COUNTRY": TARGET_COUNTRY
        },
        parse_dates=['date']
    )

)
df_scrape_data[stat_cols] = df_scrape_data[stat_cols].clip(0, np.inf)

## Data prep

In [30]:
def add_aggregate_metrics(df):
    df = (
        df
        .assign(
            likes_rate=lambda x: x['likes_diff'] / x["views_diff"],
            comments_rate=lambda x: x['comments_diff'] / x["views_diff"],
            saves_rate=lambda x: x['saves_diff'] / x["views_diff"],
            shares_rate=lambda x: x['shares_diff'] / x["views_diff"],
            likes_to_comments=lambda x: (x['comments_diff'] / x["likes_diff"]).clip(0, 1),
        )
    )
    return df

In [31]:
def get_labels_for_groups(series: pd.Series, groups_with_labels):
    """
    Label a pandas Series based on value ranges.

    Parameters
    ----------
    series : pd.Series
        Numeric pandas Series to label.
    groups_with_labels : list of tuples
        Each tuple: (lower_bound, upper_bound, label)
        Bounds are inclusive on the left, exclusive on the right.

    Returns
    -------
    pd.Series
        Labeled Series.
    """
    labels = pd.Series(index=series.index)

    for low, high, label in groups_with_labels:
        mask = (series >= low) & (series < high)
        labels.loc[mask] = label

    return labels.values

In [32]:
# aggregate to get account level metrics per date
df_posts_model = (
    df_scrape_data
    .pipe(lambda d: add_aggregate_metrics(d))
    .sort_values("likes_rate")
    .dropna()
    .groupby([
        pd.Grouper(key="date", freq=FREQ, label="left", closed="left"),
        "account_id",
        "campaign_id",
        "account_country",
        "account_channel"
    ], as_index=False)
    # .resample(FREQ, on  = "date", label = "left", closed = 'left')
    .agg({v: "sum" for _, v in enumerate(stat_cols)})
    .pipe(lambda x: add_aggregate_metrics(x).fillna(0))
)

df_revenue_model = (
    df_revenue
    .groupby([
        pd.Grouper(key="date", freq=FREQ, label="left", closed="left"),
        "country",
        "channel"
    ])
    .agg({
        "proceeds": "sum"
    })
    .reset_index()
)

### Computation

#### Engagement Factor / Diminishing Returns

  - Weighted average of engagements rates (e. g. likes/views, comments/views, comments/likes etc.)
  - Because of its negative correlation with views, we can use this as a diminishing returns proxy

In [33]:
def compute_engagement_factor(rates, weights):
    pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("weights", FeatureMultiplier(weights)),
        ("average", Average()),
        ("winsor", Winsorizer(lower_quantile=0.03, upper_quantile=0.99)),  ## clip extremes
        ("minmax", MinMaxScaler(feature_range=(0, 1)))  # scale to 0-1
    ])

    return pipeline.fit_transform(rates)

engagement_factor_weights_series = pd.Series(engagement_factor_weights)

df_posts_model["engagement_factor_raw"] = compute_engagement_factor(
    rates=df_posts_model[["likes_rate", "comments_rate", "saves_rate", "shares_rate", "likes_to_comments"]].clip(0, 1),
    weights=engagement_factor_weights_series,
)

# amplitude of diminishing returns effect is decided by the views
df_posts_model["engagement_factor"] = (
        df_posts_model["engagement_factor_raw"]
        ** get_labels_for_groups(df_posts_model['views_diff'], diminishing_returns_effect_by_group)
)

In [34]:
# scatter_plot(
#     df_posts_model.query("engagement_factor > 0").assign(views_log = lambda x: np.log(x.views_diff+1)),
#     "views_log",
#     "engagement_factor",
#     xlabel = "views_log",
#     ylabel = "engagement_factor",
#     title = "Log Views vs. Engagement Factor"
# )

#### Incentive boost

- Statistical transformation on the engagement_factor
    - boxcox to make the distribution near normal
    - abs max scaling
    - stratified normalization --> to make every number comparable for ranking
    - logistic mapping for incentive multiplier
- here how it works:
    - mean performance is 1
    - the output is in 0.8 to 1.2
      - if above mean ==> max 20% boost (1.2)
      - if belov mean ==> min 20% deboost (0.8)

In [35]:
def compute_incentive_boost(df, engagement_col, group_col, boxcox_lambda=None):
    boost, _ = scipy.stats.boxcox(df[engagement_col] + 1, lmbda=boxcox_lambda)
    df = df[[engagement_col, group_col]].copy()

    df['boost'] = boost
    df['boost'] /= df['boost'].max()
    df['boost'] = df['boost'] / df.groupby(group_col, observed=False)['boost'].transform('mean')

    return df['boost'].clip(0, 2)


df_posts_model["vol_cut"] = pd.cut(df_posts_model['views_diff'], vol_cut_ranges)

df_posts_model["incentive_boost_raw"] = compute_incentive_boost(
    df=df_posts_model,
    engagement_col="engagement_factor",
    group_col="vol_cut",
)

df_posts_model['incentive_boost'] = logistic_map(df_posts_model['incentive_boost_raw'], d=incentive_boost_effect, k=incentive_boost_order)

In [36]:
# x = np.linspace(0,2,100)
# y = logistic_map(x, d=incentive_boost_effect, k=incentive_boost_order)
#
# fig, ax = plt.subplots(figsize = (12,5))
# ax.plot(x, y, ls = "", marker = 'o', markersize = 2)

In [37]:
# scatter_plot(
#     df_posts_model.query("engagement_factor > 0").assign(views_log = lambda x: np.log(x.views_diff+1)),
#     "views_log",
#     "incentive_boost_raw",
#     xlabel = "views_log",
#     ylabel = "incentive_boost",
#     title = "Log Views vs. Incentive Boost"
# )

#### Engagement Volume

  - Weighted average of volume metrics (e. g. views, shares, comments, likes)

In [38]:
def compute_engagement_volume(volumes, weights):
    pipeline = Pipeline([
        ("weights", FeatureMultiplier(weights)),
        ("average", Average()),
        ("winsor", Winsorizer(lower_quantile=0, upper_quantile=1)),
    ])

    return pipeline.fit_transform(volumes)


engagement_volume_weights_series = pd.Series(engagement_volume_weights)

df_posts_model['engagement_volume'] = compute_engagement_volume(
    volumes=(df_posts_model[stat_cols]).pipe(lambda x: x / x.quantile(0.9)).values,
    weights=engagement_volume_weights_series
)

#### Quality Volume

  - Main metric for attribution, defined as:
    - engagement_volume * engagement_factor * incentive_boost

In [39]:
df_posts_model["quality_volume"] = (
    df_posts_model["engagement_volume"] *
    df_posts_model["engagement_factor"] *
    df_posts_model["incentive_boost"]
)

#### Computing the payout

- **attr_revenue_on_view**: Attributed proceeds, if the allocation is made purely on the view percentages.
- **attr_revenue_on_quality**: Attributed proceeds, if the allocation is made on the quality volume percentage.
- **payout_raw**: Payout if a flat RPM of DEFAULT_RPM is being choosed,
- **payout_on_view**: Payout, computed on the attribted proceeds and assumed ROI, if the allocation is made purely on the view percentage.
- **payout_on_quality**: Payout, computed on the attribted proceeds and assumed ROI, if the allocation is made on the quality volume percentage.
    - payout is capped to the RPM_CAP to prevent unrealistic payouts for small creators

In [40]:
def cap_rpm(views, payout):
    rpm = payout / (views / 1000)
    return np.where(rpm < RPM_CAP, payout, RPM_CAP * (views / 1000))

In [41]:
df_payout = (
    df_posts_model
    .merge(
        df_revenue_model, left_on=["date", 'account_channel'], right_on=["date", 'channel']
    )
    .assign(
        total_views_on_day=lambda x: x.groupby(["date", "account_channel"]).views_diff.transform('sum'),
        view_percentage=lambda x: x['views_diff'] / x.groupby(["date", "account_channel"]).views_diff.transform('sum'),
        quality_percentage=lambda x: x['quality_volume'] / x.groupby(["date", "account_channel"]).quality_volume.transform('sum'),
        revenue_on_view=lambda x: x.proceeds * x["view_percentage"],
        revenue_on_quality=lambda x: x.proceeds * x["quality_percentage"],
        attr_revenue_on_view=lambda x: x.revenue_on_view * PROCEEDS_PERCENTAGE_FOR_ATTRIBUTION,
        attr_revenue_on_quality=lambda x: x.revenue_on_quality * PROCEEDS_PERCENTAGE_FOR_ATTRIBUTION,
    )
    .assign(
        payout_raw=lambda x: x.views_diff * DEFAULT_RPM / 1000,
        payout_on_view=lambda x: x.attr_revenue_on_view / get_labels_for_groups(x.views_diff, target_roi_by_groups),

        payout_on_quality_without_uplifting= lambda x: cap_rpm(x.views_diff, x.attr_revenue_on_quality / get_labels_for_groups(x.views_diff, target_roi_by_groups)),
        rpm_without_uplifting= lambda x: (x.payout_on_quality_without_uplifting / (x.views_diff / 1000)),
    )
    .assign(
        rpm_without_uplifting_mean=lambda x: x.groupby("vol_cut", observed=True).rpm_without_uplifting.transform('mean'),
        rpm=lambda x: ((x.rpm_without_uplifting_mean - x.rpm_without_uplifting) * PAYOUT_UPLIFTING_PERC * (x.rpm_without_uplifting < x.rpm_without_uplifting_mean) + x.rpm_without_uplifting).clip(0, RPM_CAP),
        payout_on_quality=lambda x: (x.views_diff / 1000) * x.rpm,
        payout_percentage=lambda x: x.payout_on_quality / x.groupby(["date", "account_channel"]).payout_on_quality.transform('sum'),
        payout_stayyou_custom_temp=lambda x: np.minimum(x.revenue_on_view / 3.0, (x.views_diff / 1000) * RPM_CAP)
    )
    .sort_values("views_diff", ascending=False)
)
# df_payout[[
#     'date',
#     'channel',
#     'account_id',
#     'proceeds',
#     'views_diff',
#     'view_percentage',
#     'payout_stayyou_custom_temp'
# ]]

Unnamed: 0,date,channel,account_id,proceeds,views_diff,view_percentage,payout_stayyou_custom_temp
0,2026-02-03,Instagram,12451,54.28067,126975,0.902458,16.328683
10,2026-02-05,TikTok,12281,28.837631,22675,0.976655,9.388139
13,2026-02-05,Instagram,12471,50.326506,21132,0.841343,14.113943
4,2026-02-04,TikTok,12281,8.49,18180,0.926417,2.621759
3,2026-02-03,Instagram,12485,54.28067,10914,0.07757,1.403514
9,2026-02-04,Instagram,12485,50.98,8475,0.791612,13.45213
14,2026-02-05,Instagram,12485,50.326506,3985,0.158657,2.661559
2,2026-02-03,Instagram,12471,54.28067,2617,0.0186,0.33654
8,2026-02-04,Instagram,12471,50.98,2029,0.18952,3.220575
5,2026-02-04,TikTok,12283,8.49,1270,0.064717,0.183148


## Summary

In [42]:
# filter last 3 days as they data in these days are not complete

# cutoff = now minus 3 days
three_days_before = pd.Timestamp.now() - pd.Timedelta(days=3)

# filter rows BEFORE last 3 days
df_payout_filtered = df_payout[df_payout['date'] < three_days_before]
df_revenue_filtered = df_revenue[df_revenue['date'] < three_days_before]

In [43]:
# Overview of the weekly data
(
    df_payout_filtered[[
        'date',
        'account_id',
        'views_diff',
        'likes_diff',
        'comments_diff',
        'saves_diff',
        'shares_diff',
        'revenue_on_view',
        'revenue_on_quality',
        'payout_on_quality_without_uplifting',
        'payout_on_quality'
    ]]
    .groupby(['account_id'])
    .resample('W-MON', on='date')
    .sum(numeric_only=True)
    .assign(
        rpm=lambda x: (x.payout_on_quality / (x.views_diff / 1000)).round(2).fillna(0),
    )
    .sort_values("views_diff", ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,account_id,views_diff,likes_diff,comments_diff,saves_diff,shares_diff,revenue_on_view,revenue_on_quality,payout_on_quality_without_uplifting,payout_on_quality,rpm
account_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
12451,2026-02-09,12451,126975,5474,11,0,0,48.986048,32.076961,8.553856,8.553856,0.07
12281,2026-02-09,24562,40855,5369,2,90,24,36.029695,32.167744,11.188781,11.789919,0.29
12471,2026-02-09,37413,25778,199,6,0,0,53.013173,73.714185,27.298966,27.298966,1.06
12485,2026-02-09,37455,23374,493,6,0,0,52.551609,49.553638,22.249182,23.244503,0.99
12283,2026-02-09,24566,1590,79,1,18,2,0.946914,4.5711,1.770676,2.078192,1.31
12449,2026-02-09,24898,396,27,0,5,1,0.351022,0.588787,0.31402,0.340025,0.86
12452,2026-02-09,24904,395,7,0,0,0,1.036345,0.242391,0.129275,0.182807,0.46


In [44]:
# overview of the data group by vol cuts
(
    df_payout_filtered
    .query("views_diff > 0")
    .groupby("vol_cut", observed=False)
    .agg(
        {
            "account_id": "count",
            "views_diff": "sum",
            "total_views_on_day": "sum",
            "quality_volume": "sum",
            "revenue_on_view": "sum",
            "revenue_on_quality": "sum",
            "attr_revenue_on_view": "sum",
            "attr_revenue_on_quality": "sum",
            "payout_raw": "sum",
            "payout_on_view": "sum",
            "payout_on_quality_without_uplifting": "sum",
            "payout_on_quality": "sum"
        }
    )
    .assign(
        view_perc=lambda x: x.views_diff / x.views_diff.sum(),
        attr_perc_on_quality=lambda x: x.attr_revenue_on_quality / x.attr_revenue_on_quality.sum(),
        attr_perc_on_view=lambda x: x.attr_revenue_on_view / x.attr_revenue_on_view.sum(),
        rpm_without_uplifting=lambda x: x.payout_on_quality_without_uplifting / (x.views_diff / 1000),
        rpm=lambda x: x.payout_on_quality / (x.views_diff / 1000),

    )
)[[
    'account_id',
    'views_diff',
    'revenue_on_view',
    'payout_on_quality_without_uplifting',
    'rpm_without_uplifting',
    'payout_on_quality',
    'rpm'
]]

Unnamed: 0_level_0,account_id,views_diff,revenue_on_view,payout_on_quality_without_uplifting,rpm_without_uplifting,payout_on_quality,rpm
vol_cut,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"(-inf, 0.0]",0,0,0.0,0.0,,0.0,
"(0.0, 1000.0]",5,1111,1.784836,1.723295,1.551121,1.802832,1.622711
"(1000.0, 10000.0]",5,18376,59.561854,33.722969,1.835164,34.962249,1.902604
"(10000.0, 100000.0]",4,72901,82.582067,27.504636,0.377287,28.16933,0.386405
"(100000.0, 1000000.0]",1,126975,48.986048,8.553856,0.067366,8.553856,0.067366
"(1000000.0, inf]",0,0,0.0,0.0,,0.0,


In [45]:
# final rpm
virality_perc = 1.25
results = {
    "total_payout_without_uplifting": df_payout_filtered['payout_on_quality_without_uplifting'].sum().round(1),
    "total_payout": df_payout_filtered['payout_on_quality'].sum().round(1),
    "total_payout_with_virality_costs": (df_payout_filtered['payout_on_quality'].sum() * virality_perc).round(1),
    "total_revenue": df_revenue_filtered['proceeds'].sum().round(1),
    "rpm_before_virality_costs": (df_revenue_filtered['proceeds'].sum() / (df_payout_filtered['payout_on_quality'].sum())).round(2),
    "overall_rpm": (df_revenue_filtered['proceeds'].sum() / (df_payout_filtered['payout_on_quality'].sum() * virality_perc)).round(2)
}
for k, v in results.items():
    print("{:<35} {:>10}".format(k, v))

total_payout_without_uplifting            71.5
total_payout                              73.5
total_payout_with_virality_costs          91.9
total_revenue                            192.9
rpm_before_virality_costs                 2.63
overall_rpm                                2.1


## Upsert Data to Postgres

In [47]:
# PREPARE INSERTION DATA

# we store always the last two weeks although window is always 30 days
# in order
two_week_ago = pd.Timestamp.now() - pd.Timedelta(days=14)
df_payout_last_two_weeks = df_payout[df_payout['date'] >= two_week_ago]

# MAKE DATA CONTINUOUS BY DATE
# filling missing entries because no views were generated on that day
# however some videos do not pass this threshold and therefore removed from datapoints
# we need to fill these gaps to make sure data is continuous
group_cols = ['account_id', 'campaign_id', 'account_country', 'account_channel']

df_payout_last_two_weeks_continuous = (
    df_payout_last_two_weeks.groupby(group_cols)['date']
      .apply(lambda s: pd.date_range(s.min(), s.max(), freq="D"))
      .explode()
      .rename("date")
      .reset_index()
)

zero_cols = list(set(df_payout_last_two_weeks.columns.values) - set(group_cols + ['vol_cut', 'date']))
default_values_dict = dict.fromkeys(zero_cols, 0)
default_values_dict['vol_cut'] = pd.Interval(left=-np.inf, right=0, closed='right')

df_payout_last_two_weeks = (
    df_payout_last_two_weeks_continuous.merge(
        df_payout_last_two_weeks,
        on=group_cols + ['date'],
        how='left',
    )
    .fillna(default_values_dict)
    .assign(
        date_str=lambda x: x.date.dt.strftime("%Y-%m-%d"),
        vol_cut_str=lambda x: x.vol_cut.map(lambda y: str(y)),
        # TODO: changed temporarily for custom rpm calculation
        payout_on_quality=lambda x: x.payout_stayyou_custom_temp,
        rpm=lambda x: (x.payout_stayyou_custom_temp / (x.views_diff / 1000)).fillna(0)
    )
)

Unnamed: 0,account_id,campaign_id,account_country,account_channel,date,views_diff,likes_diff,comments_diff,saves_diff,shares_diff,...,payout_on_view,payout_on_quality_without_uplifting,rpm_without_uplifting,rpm_without_uplifting_mean,rpm,payout_on_quality,payout_percentage,payout_stayyou_custom_temp,date_str,vol_cut_str
0,12281,4,United_States,TikTok,2026-02-04,18180,2425,1,43,5,...,2.735749,2.583777,0.142122,0.362561,0.144211,2.621759,0.780725,2.621759,2026-02-04,"(10000.0, 100000.0]"
1,12281,4,United_States,TikTok,2026-02-05,22675,2944,1,47,19,...,9.796319,8.605003,0.379493,0.362561,0.41403,9.388139,0.849566,9.388139,2026-02-05,"(10000.0, 100000.0]"
2,12283,4,United_States,TikTok,2026-02-04,1270,74,0,17,2,...,0.293037,0.490676,0.386359,2.000615,0.144211,0.183148,0.195662,0.183148,2026-02-04,"(1000.0, 10000.0]"
3,12283,4,United_States,TikTok,2026-02-05,320,5,1,1,0,...,0.211984,1.28,4.0,1.23077,0.41403,0.13249,0.126374,0.13249,2026-02-05,"(0.0, 1000.0]"
4,12449,5,United_States,TikTok,2026-02-04,174,7,0,1,1,...,0.040148,0.075532,0.434094,1.23077,0.144211,0.025093,0.023612,0.025093,2026-02-04,"(0.0, 1000.0]"
5,12449,5,United_States,TikTok,2026-02-05,222,20,0,4,0,...,0.147064,0.238488,1.074269,1.23077,0.41403,0.091915,0.02406,0.091915,2026-02-05,"(0.0, 1000.0]"
6,12451,4,United_States,Instagram,2026-02-03,126975,5474,11,0,0,...,13.062946,8.553856,0.067366,0.067366,0.128598,16.328683,0.459711,16.328683,2026-02-03,"(100000.0, 1000000.0]"
7,12452,4,United_States,Instagram,2026-02-03,193,2,0,0,0,...,0.039711,0.02388,0.123731,1.23077,0.128598,0.024819,0.003006,0.024819,2026-02-03,"(0.0, 1000.0]"
8,12452,4,United_States,Instagram,2026-02-04,202,5,0,0,0,...,0.513006,0.105395,0.521758,1.23077,1.587272,0.320629,0.006177,0.320629,2026-02-04,"(0.0, 1000.0]"
9,12471,5,United_States,Instagram,2026-02-03,2617,46,2,0,0,...,0.538464,6.400395,2.445699,2.000615,0.128598,0.33654,0.343977,0.33654,2026-02-03,"(1000.0, 10000.0]"


In [48]:
INSERT_BATCH_SIZE = 100

payout_insert_cols = {
    'date_str': 'date',
    'account_id': 'account_id',
    'campaign_id': 'campaign_id',
    'account_country': 'account_country',
    'views_diff': 'views_diff',
    'likes_diff': 'likes_diff',
    'comments_diff': 'comments_diff',
    'saves_diff': 'saves_diff',
    'shares_diff': 'shares_diff',
    'likes_rate': 'likes_rate',
    'comments_rate': 'comments_rate',
    'saves_rate': 'saves_rate',
    'shares_rate': 'shares_rate',
    'likes_to_comments': 'likes_to_comments',
    'engagement_factor_raw': 'engagement_factor_raw',
    'engagement_factor': 'engagement_factor',
    'vol_cut_str': 'vol_cut',
    'incentive_boost_raw': 'incentive_boost_raw',
    'incentive_boost': 'incentive_boost',
    'engagement_volume': 'engagement_volume',
    'quality_volume': 'quality_volume',
    'proceeds': 'proceeds',
    'total_views_on_day': 'total_views_on_day',
    'view_percentage': 'view_percentage',
    'quality_percentage': 'quality_percentage',
    'revenue_on_view': 'revenue_on_view',
    'revenue_on_quality': 'revenue_on_quality',
    'attr_revenue_on_view': 'attr_revenue_on_view',
    'attr_revenue_on_quality': 'attr_revenue_on_quality',
    'payout_raw': 'payout_raw',
    'payout_on_view': 'payout_on_view',
    'payout_on_quality_without_uplifting': 'payout_on_quality_without_uplifting',
    'rpm_without_uplifting': 'rpm_without_uplifting',
    'rpm_without_uplifting_mean': 'rpm_without_uplifting_mean',
    'rpm': 'rpm',
    'payout_on_quality': 'payout_on_quality',
    'payout_percentage': 'payout_percentage',
}

upsert_ignore_list = ['date', 'account_id', 'campaign_id']
upsert_keys = filter(lambda x: x not in upsert_ignore_list, payout_insert_cols.values())

df_payout_last_two_weeks_reduced = df_payout_last_two_weeks[payout_insert_cols.keys()]

data = list(df_payout_last_two_weeks_reduced.itertuples(index=False, name=None))
UPSERT_COMMA_SEPERATOR = ",\n"

with psycopg2.connect(os.getenv("DATABASE_URL")) as conn:
    with conn.cursor() as cur:
        insert_query = f"""
            insert into public.account_daily_rpm_calculations ({", ".join(payout_insert_cols.values())}) values %s
            on conflict on constraint account_daily_rpm_calculations_pk do update
            set
                {UPSERT_COMMA_SEPERATOR.join(list(map(lambda x: f"{x} = excluded.{x}", upsert_keys)))},
                updated_at = NOW()
        """
        psycopg2.extras.execute_values (
            cur, insert_query, data, template=None, page_size=INSERT_BATCH_SIZE
        )
    conn.commit()
