In [1]:
import pandas as pd
import numpy as np

In [2]:
dtype = {
    "Sale": np.int32,
    "SalesAmountInEuro": np.float64,
    "Time_delay_for_conversion": np.int32,
    "click_timestamp": np.int32,
    "nb_clicks_1week": pd.Int64Dtype(),
    "product_price": np.float64,
    "product_age_group": str,
    "device_type": str,
    "audience_id": str,
    "product_gender": str,
    "product_brand": str,
    "product_category1": str,
    "product_category2": str,
    "product_category3": str,
    "product_category4": str,
    "product_category5": str,
    "product_category6": str,
    "product_category7": str,
    "product_country": str,
    "product_id": str,
    "product_title": str,
    "partner_id": str,
    "user_id": str,
}
na_values = {
    "click_timestamp": "0",
    "nb_clicks_1week": "-1",
    "product_price": "-1",
    "product_age_group": "-1",
    "device_type": "-1",
    "audience_id": "-1",
    "product_gender": "-1",
    "product_brand": "-1",
    "product_category1": "-1",
    "product_category2": "-1",
    "product_category3": "-1",
    "product_category4": "-1",
    "product_category5": "-1",
    "product_category6": "-1",
    "product_category7": "-1",
    "product_country": "-1",
    "product_id": "-1",
    "product_title": "-1",
    "partner_id": "-1",
    "user_id": "-1",
}

criteo = pd.read_csv(
    "../data/criteo/Criteo_Conversion_Search/CriteoSearchData",
    names=dtype.keys(),
    dtype=dtype,
    na_values=na_values,
    header=None,
    sep="\t",
)

In [3]:
criteo = criteo.dropna(subset=["partner_id", "user_id", "product_id"])

In [4]:
"""
0a) What is the description of the dataset?
"""

criteo.describe()

Unnamed: 0,Sale,SalesAmountInEuro,Time_delay_for_conversion,click_timestamp,nb_clicks_1week,product_price
count,12167300.0,12167300.0,12167300.0,12167300.0,9251207.0,12167300.0
mean,0.1051584,11.98982,37812.05,1600548000.0,390.399443,11.33515
std,0.3067574,95.72121,227648.5,2261341.0,1342.472732,120.5586
min,0.0,-1.0,-1.0,1596439000.0,0.0,0.0
25%,0.0,-1.0,-1.0,1598628000.0,5.0,0.0
50%,0.0,-1.0,-1.0,1600681000.0,37.0,0.0
75%,0.0,-1.0,-1.0,1602489000.0,204.0,0.0
max,1.0,62458.77,2588699.0,1604302000.0,25390.0,76161.01


In [5]:
"""
0b) What is the description of the dataset per user?
"""

criteo.groupby(["user_id"]).size().describe()

count    1.043725e+07
mean     1.165756e+00
std      6.184687e-01
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      4.240000e+02
dtype: float64

In [6]:
"""
1a) What is the description of the conversions?
"""

criteo[criteo.Sale == 1].describe()

Unnamed: 0,Sale,SalesAmountInEuro,Time_delay_for_conversion,click_timestamp,nb_clicks_1week,product_price
count,1279493.0,1279493.0,1279493.0,1279493.0,951362.0,1279493.0
mean,1.0,122.5263,359580.9,1600399000.0,239.66282,107.7912
std,0.0,271.0661,614097.1,2255931.0,990.883248,357.5153
min,1.0,0.0,-1.0,1596439000.0,0.0,0.0
25%,1.0,25.87189,707.0,1598493000.0,3.0,17.9
50%,1.0,56.66,7260.0,1600444000.0,22.0,40.0
75%,1.0,133.25,463808.0,1602357000.0,124.0,99.99
max,1.0,62458.77,2588699.0,1604302000.0,23822.0,76161.01


In [7]:
"""
1b) What is the description of the conversions per user?
"""

criteo[criteo.Sale == 1].groupby(["user_id"]).size().describe()

count    1.192588e+06
mean     1.072871e+00
std      3.443121e-01
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      3.900000e+01
dtype: float64

In [8]:
"""
1c) What is the description of the users who convert more than once?
"""

user_conversions = criteo[criteo.Sale == 1].groupby(["user_id"]).size()
users_converting_multiple_times = user_conversions[user_conversions > 1]
users_converting_multiple_times.describe()

count    69781.000000
mean         2.245396
std          0.752219
min          2.000000
25%          2.000000
50%          2.000000
75%          2.000000
max         39.000000
dtype: float64

In [9]:
"""
1d) What is the description of the users who convert more than once by advertisers?
"""

user_advertiser_conversions = (
    criteo[criteo.Sale == 1].groupby(["user_id", "partner_id"]).size()
)
users_converting_multiple_times_by_advertiser = user_advertiser_conversions[
    user_advertiser_conversions > 1
].reset_index(name="conversion_count")
print(users_converting_multiple_times_by_advertiser.describe())
users_converting_multiple_times_by_advertiser.sort_values(
    by=["conversion_count"], ascending=False
).head(50)

       conversion_count
count      61844.000000
mean           2.237048
std            0.719252
min            2.000000
25%            2.000000
50%            2.000000
75%            2.000000
max           33.000000


Unnamed: 0,user_id,partner_id,conversion_count
11064,2DE4628E1966648A141ACA926D160E91,E3DDEB04F8AFF944B11943BB57D2F620,33
13632,3874BC72C4E8C39D4F475097F6004DCE,319A2412BDB0EF669733053640B80112,29
11550,2FED9D0F367311691498360CE6D6EE98,319A2412BDB0EF669733053640B80112,28
5531,1763D336EC150511173C0D9785493879,E3DDEB04F8AFF944B11943BB57D2F620,23
14571,3C6ED50E3B2CE4E430B8AB6F915CCEF6,E3DDEB04F8AFF944B11943BB57D2F620,23
16525,446E482E5A7472B5F045458D2F587DDD,319A2412BDB0EF669733053640B80112,23
27380,712EA806588FA01E34A6A8427E8537CC,4D4806B2F138ED89BEB51D3010AF2033,22
30147,7C8EB12D8897021A3894F1EEEB750B55,319A2412BDB0EF669733053640B80112,22
41900,AC8D778C9BF92A9CD075CFC9D1928769,5D4FE9350AF7E4CAE2CD24EDEC6F9050,17
29548,7A1114DDD7E3841696BCF39E1E011C01,925829CF82DFDC74DE3F88F003CE7BEF,17


In [10]:
"""
1e) What is the description of the multi-conversion rate per advertiser?
Where multi-conversion rate is
(number of users who convert more than once for an advertiser) / (number of users who convert for an advertiser)
"""

conversions = criteo[criteo.Sale == 1]
user_advertiser_conversions = conversions.groupby(["user_id", "partner_id"]).size()
multiconverters = (
    user_advertiser_conversions[user_advertiser_conversions > 1]
    .reset_index(name="multiconverter_count")
    .groupby(["partner_id"])
    .size()
    .reset_index(name="multiconverter_count")
)
converters = (
    user_advertiser_conversions.reset_index(name="converter_count")
    .groupby(["partner_id"])
    .size()
    .reset_index(name="converter_count")
)

multiconverter_rates = pd.merge(
    multiconverters, converters, how="inner", on="partner_id"
)
multiconverter_rates = (
    multiconverter_rates.assign(
        multiconverter_rate=multiconverter_rates.apply(
            lambda r: r.multiconverter_count / r.converter_count,
            axis=1,
        )
    )
    .sort_values(by=["multiconverter_rate"], ascending=False)
    .reset_index(drop=True)
)

multiconverter_rates.head(40)

Unnamed: 0,partner_id,multiconverter_count,converter_count,multiconverter_rate
0,319A2412BDB0EF669733053640B80112,2178,26280,0.082877
1,E3DDEB04F8AFF944B11943BB57D2F620,39097,477888,0.081812
2,4FE3DF46D9F3D57C872A28B9AEA876E3,384,5313,0.072276
3,E1703315F9986F033724DA149DC0B052,248,3582,0.069235
4,5D4FE9350AF7E4CAE2CD24EDEC6F9050,1102,16096,0.068464
5,F122B91F6D102E4630817566839A4F1F,2837,46732,0.060708
6,9F30550A2589106D327B2F7C829CA250,113,1918,0.058916
7,CE13C4ECD6D0EF4A9F02EB0E17B45766,2,34,0.058824
8,919377389312B4EE6C56FD94A6997C16,73,1264,0.057753
9,C6619E0137BA48395F6D230E5824A585,608,10741,0.056606


In [11]:
multiconverter_rates.sort_values(by=["converter_count"], ascending=False)

Unnamed: 0,partner_id,multiconverter_count,converter_count,multiconverter_rate
1,E3DDEB04F8AFF944B11943BB57D2F620,39097,477888,0.081812
10,9D9E93D1D461D7BAE47FB67EC0E01B62,2767,52064,0.053146
5,F122B91F6D102E4630817566839A4F1F,2837,46732,0.060708
77,9FF550C0B17A3C493378CB6E2DEEE6E4,916,45704,0.020042
0,319A2412BDB0EF669733053640B80112,2178,26280,0.082877
...,...,...,...,...
27,362124BA45449ED677E2A20FE6FE41B7,2,57,0.035088
71,1CD374F58ED161103C9A74E7EB8B4AC2,1,48,0.020833
52,968C498E6F9CA55F8C3DF73AAAC9CF86,1,41,0.024390
7,CE13C4ECD6D0EF4A9F02EB0E17B45766,2,34,0.058824


In [12]:
"""
2a) What is the description of the impressions?
"""

criteo.loc[criteo.Sale != 1].describe()

Unnamed: 0,Sale,SalesAmountInEuro,Time_delay_for_conversion,click_timestamp,nb_clicks_1week,product_price
count,10887803.0,10887803.0,10887803.0,10887800.0,8299845.0,10887803.0
mean,0.0,-1.0,-1.0,1600566000.0,407.677488,0.0
std,0.0,0.0,0.0,2261325.0,1375.996325,0.0
min,0.0,-1.0,-1.0,1596439000.0,0.0,0.0
25%,0.0,-1.0,-1.0,1598647000.0,6.0,0.0
50%,0.0,-1.0,-1.0,1600723000.0,40.0,0.0
75%,0.0,-1.0,-1.0,1602514000.0,216.0,0.0
max,0.0,-1.0,-1.0,1604302000.0,25390.0,0.0


In [13]:
"""
2b) What is the description of the impressions per user?
"""

criteo.loc[criteo.Sale != 1].groupby(["user_id"]).size().describe()

count    9.495914e+06
mean     1.146578e+00
std      5.823203e-01
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      4.240000e+02
dtype: float64

In [14]:
"""
3) You have a 100% attribution rate right?
"""

num_conversions_without_impressions = criteo.loc[
    (criteo.Sale == 1) & (criteo.click_timestamp == -1)
].shape[0]
print(
    "100% attribution rate"
    if num_conversions_without_impressions == 0
    else "<100% attribution rate"
)

100% attribution rate


In [15]:
"""
4) What is the distribution of contribution values after capping?
"""


def __compute_product_count(conversion, cap: int) -> int:
    sell_price = conversion["SalesAmountInEuro"]
    offer_price = conversion["product_price"]
    if sell_price and offer_price:
        return min(cap, max(1, sell_price // offer_price))
    else:
        return 1


max_purchase_counts = 5

conversions = criteo[criteo.Sale == 1]
capped_conversions = conversions.assign(
    count=conversions.apply(
        lambda conversion: __compute_product_count(conversion, max_purchase_counts),
        axis=1,
    ),
)
capped_conversions["count"].describe()

count    1.279493e+06
mean     1.680549e+00
std      1.311158e+00
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      2.000000e+00
max      5.000000e+00
Name: count, dtype: float64

In [16]:
"""
5) you have 30 days available right?
"""

print(
    (criteo.click_timestamp.max() - criteo.click_timestamp.min()) // (60 * 60 * 24),
    "days of criteo data",
)

90 days of criteo data


In [29]:
"""
6) What is the domain size of product_category3 for these partners?
"""

x = [
    ("domain size of product_category3 for all advertisers", criteo),
    (
        "domain size of product_category3 for advertisers with converting users",
        conversions,
    ),
]

for msg, d in x:
    pc3 = d.groupby(["partner_id"]).product_category3.nunique()
    print(msg)
    print(pc3.describe())
    print("\n")

domain size of product_category3 for all advertisers
count    292.000000
mean      63.965753
std      100.010990
min        0.000000
25%        7.000000
50%       29.000000
75%       79.250000
max      809.000000
Name: product_category3, dtype: float64


domain size of product_category3 for advertisers with converting users
count    274.000000
mean      39.430657
std       68.147899
min        0.000000
25%        5.000000
50%       17.000000
75%       48.000000
max      723.000000
Name: product_category3, dtype: float64




In [30]:
big_advertiser = "E3DDEB04F8AFF944B11943BB57D2F620"

for msg, d in x:
    d = d[d.partner_id != big_advertiser]
    pc3 = d.groupby(["partner_id"]).product_category3.nunique()
    print(msg)
    print(pc3.describe())
    print("\n")

domain size of product_category3 for all advertisers
count    291.000000
mean      61.405498
std       90.089154
min        0.000000
25%        7.000000
50%       29.000000
75%       78.500000
max      667.000000
Name: product_category3, dtype: float64


domain size of product_category3 for advertisers with converting users
count    273.000000
mean      36.926740
std       54.194317
min        0.000000
25%        5.000000
50%       17.000000
75%       48.000000
max      411.000000
Name: product_category3, dtype: float64




In [31]:
"""
7) How many distinct partners/advertisers are in this dataset?
"""

x = [
    ("advertisers in the dataset", criteo),
    ("advertisers with users who have converted", conversions),
]
for msg, d in x:
    print(msg, d.partner_id.unique().shape[0])

advertisers in the dataset 292
advertisers with users who have converted 274


In [32]:
big_advertiser = "E3DDEB04F8AFF944B11943BB57D2F620"

for msg, d in x:
    d = d[d.partner_id != big_advertiser]
    print(msg, d.partner_id.unique().shape[0])

advertisers in the dataset 291
advertisers with users who have converted 273
