In [1]:
import pandas as pd
import numpy as np

In [2]:
# Column schema of the headerless, tab-separated Criteo file.
# NOTE: the insertion order of this dict is also the column order handed to
# `read_csv(names=...)`, so entries must not be reordered.
dtype = {
    "Sale": np.int32,
    "SalesAmountInEuro": np.float64,
    "Time_delay_for_conversion": np.int32,
    # Nullable integer: "0" is declared as a missing marker for this column in
    # `na_values` below, and a plain NumPy int column cannot hold NA (read_csv
    # would raise as soon as one such row appears). Same approach as
    # nb_clicks_1week.
    "click_timestamp": pd.Int32Dtype(),
    "nb_clicks_1week": pd.Int64Dtype(),
    "product_price": np.float64,
    "product_age_group": str,
    "device_type": str,
    "audience_id": str,
    "product_gender": str,
    "product_brand": str,
    "product_category1": str,
    "product_category2": str,
    "product_category3": str,
    "product_category4": str,
    "product_category5": str,
    "product_category6": str,
    "product_category7": str,
    "product_country": str,
    "product_id": str,
    "product_title": str,
    "partner_id": str,
    "user_id": str,
}

# Per-column raw values that should be parsed as missing.
# Sale, SalesAmountInEuro and Time_delay_for_conversion are intentionally not
# listed, so their -1 values stay in the data as regular numbers.
na_values = {
    "click_timestamp": "0",
    "nb_clicks_1week": "-1",
    "product_price": "-1",
    "product_age_group": "-1",
    "device_type": "-1",
    "audience_id": "-1",
    "product_gender": "-1",
    "product_brand": "-1",
    "product_category1": "-1",
    "product_category2": "-1",
    "product_category3": "-1",
    "product_category4": "-1",
    "product_category5": "-1",
    "product_category6": "-1",
    "product_category7": "-1",
    "product_country": "-1",
    "product_id": "-1",
    "product_title": "-1",
    "partner_id": "-1",
    "user_id": "-1",
}

criteo = pd.read_csv(
    "../data/criteo/Criteo_Conversion_Search/CriteoSearchData",
    # Materialize the keys view into a list: `names` is documented as a
    # sequence of column labels.
    names=list(dtype),
    dtype=dtype,
    na_values=na_values,
    header=None,
    sep="\t",
)

In [3]:
# Discard rows in which any of the three identifier columns is missing.
identifier_columns = ["partner_id", "user_id", "product_id"]
criteo = criteo.dropna(subset=identifier_columns)

In [4]:
"""
0a) What is the description of the dataset?
"""
# Summary statistics of the whole frame; the bare last expression is displayed.
dataset_summary = criteo.describe()
dataset_summary

Unnamed: 0,Sale,SalesAmountInEuro,Time_delay_for_conversion,click_timestamp,nb_clicks_1week,product_price
count,12167300.0,12167300.0,12167300.0,12167300.0,9251207.0,12167300.0
mean,0.1051584,11.98982,37812.05,1600548000.0,390.399443,11.33515
std,0.3067574,95.72121,227648.5,2261341.0,1342.472732,120.5586
min,0.0,-1.0,-1.0,1596439000.0,0.0,0.0
25%,0.0,-1.0,-1.0,1598628000.0,5.0,0.0
50%,0.0,-1.0,-1.0,1600681000.0,37.0,0.0
75%,0.0,-1.0,-1.0,1602489000.0,204.0,0.0
max,1.0,62458.77,2588699.0,1604302000.0,25390.0,76161.01


In [6]:
"""
0b) What is the description of the dataset per user?
"""
# Count how many rows each user_id has, then summarize those counts.
rows_per_user = criteo.groupby("user_id").size()
rows_per_user.describe()

count    1.043725e+07
mean     1.165756e+00
std      6.184687e-01
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      4.240000e+02
dtype: float64

In [7]:
"""
1a) What is the description of the conversions?
"""
# Conversions are the rows whose Sale flag equals 1.
is_conversion = criteo.Sale.eq(1)
criteo.loc[is_conversion].describe()

Unnamed: 0,Sale,SalesAmountInEuro,Time_delay_for_conversion,click_timestamp,nb_clicks_1week,product_price
count,1279493.0,1279493.0,1279493.0,1279493.0,951362.0,1279493.0
mean,1.0,122.5263,359580.9,1600399000.0,239.66282,107.7912
std,0.0,271.0661,614097.1,2255931.0,990.883248,357.5153
min,1.0,0.0,-1.0,1596439000.0,0.0,0.0
25%,1.0,25.87189,707.0,1598493000.0,3.0,17.9
50%,1.0,56.66,7260.0,1600444000.0,22.0,40.0
75%,1.0,133.25,463808.0,1602357000.0,124.0,99.99
max,1.0,62458.77,2588699.0,1604302000.0,23822.0,76161.01


In [8]:
"""
1b) What is the description of the conversions per user?
"""
# Per-user count of rows with Sale == 1, summarized.
converting_rows = criteo.loc[criteo.Sale.eq(1)]
converting_rows.groupby("user_id").size().describe()

count    1.192588e+06
mean     1.072871e+00
std      3.443121e-01
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      3.900000e+01
dtype: float64

In [9]:
"""
2a) What is the description of the impressions?
"""
# Every row whose Sale flag is not 1 is counted as an impression in this cell.
is_impression = criteo.Sale.ne(1)
criteo[is_impression].describe()

Unnamed: 0,Sale,SalesAmountInEuro,Time_delay_for_conversion,click_timestamp,nb_clicks_1week,product_price
count,10887803.0,10887803.0,10887803.0,10887800.0,8299845.0,10887803.0
mean,0.0,-1.0,-1.0,1600566000.0,407.677488,0.0
std,0.0,0.0,0.0,2261325.0,1375.996325,0.0
min,0.0,-1.0,-1.0,1596439000.0,0.0,0.0
25%,0.0,-1.0,-1.0,1598647000.0,6.0,0.0
50%,0.0,-1.0,-1.0,1600723000.0,40.0,0.0
75%,0.0,-1.0,-1.0,1602514000.0,216.0,0.0
max,0.0,-1.0,-1.0,1604302000.0,25390.0,0.0


In [10]:
"""
2b) What is the description of the impressions per user?
"""
# Per-user count of rows with Sale != 1, summarized.
impression_rows = criteo[criteo.Sale.ne(1)]
impression_rows.groupby("user_id").size().describe()

count    9.495914e+06
mean     1.146578e+00
std      5.823203e-01
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      4.240000e+02
dtype: float64

In [32]:

"""
3) You have a 100% attribution rate right?
"""
# The loading cell declares "0" (not -1) as the missing marker for
# click_timestamp, so comparing against -1 alone can never match anything.
# A conversion has no usable click when its timestamp is missing or
# non-positive; `<= 0` still covers the -1 value that was tested originally.
click_missing = criteo.click_timestamp.isna() | (criteo.click_timestamp <= 0)
num_conversions_without_impressions = int(((criteo.Sale == 1) & click_missing).sum())
print("100% attribution rate" if num_conversions_without_impressions == 0 else "<100% attribution rate")

100% attribution rate


In [35]:
"""
4) What is the distribution of contribution values? I know you cap them at 5 so are they mostly 5 or like 2.5 on average or what?
"""
def __compute_product_count(conversion, cap: int) -> int:
    """Estimate how many units were bought in one conversion, capped at ``cap``.

    Args:
        conversion: Mapping-like row (e.g. a pandas Series or a dict) exposing
            the keys ``"SalesAmountInEuro"`` and ``"product_price"``.
        cap: Upper bound applied to the estimated unit count.

    Returns:
        * ``min(cap, SalesAmountInEuro // product_price)`` when both values
          are present and non-zero;
        * ``0`` when the product price is known but the sale amount is zero
          or missing;
        * ``1`` when the product price is zero or missing.

    A value counts as missing when it is NaN. NaN is truthy in Python, so
    without the explicit check a missing price would reach ``min(cap, nan)``,
    which silently evaluates to ``cap``.
    """
    sell_price = conversion["SalesAmountInEuro"]
    offer_price = conversion["product_price"]

    # `x == x` is False only for NaN, which keeps this check free of imports.
    has_sell_price = bool(sell_price) and sell_price == sell_price
    has_offer_price = bool(offer_price) and offer_price == offer_price

    if has_sell_price and has_offer_price:
        # Floor division on floats yields a float; cast so the result matches
        # the declared ``int`` return type.
        return int(min(cap, sell_price // offer_price))
    if has_offer_price:
        return 0
    return 1

# Upper bound applied to each conversion's estimated unit count.
max_purchase_counts = 5

# Rows flagged as sales.
conversions = criteo.loc[criteo.Sale == 1]

# Row-wise estimate of units purchased; the cap is forwarded positionally.
purchase_counts = conversions.apply(
    __compute_product_count,
    axis=1,
    args=(max_purchase_counts,),
)
capped_conversions = conversions.assign(count=purchase_counts)
capped_conversions["count"].describe()

count    1.279493e+06
mean     1.447966e+00
std      1.488017e+00
min      0.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      2.000000e+00
max      5.000000e+00
Name: count, dtype: float64

In [39]:

"""
5) you have 30 days available right?
"""
# Whole days between the earliest and the latest click timestamp
# (timestamps are divided by the number of seconds in a day).
seconds_per_day = 60 * 60 * 24
click_times = criteo.click_timestamp
span_in_days = (click_times.max() - click_times.min()) // seconds_per_day
print(span_in_days, "days of criteo data")

90 days
