In [1]:
from etl.loader import DataLoader
from etl.transformer import DataTransformer
from dataviz.sankey_bipartite import SankeyBipartite
from dataviz.sankey_tree_graph import SankeyTree
from utils.os_ import get_git_root
import os

import pandas as pd
import altair as alt

# Dataset preparation

## Source info

In [2]:
# get_git_root is a project helper; presumably it returns the repository root
# that contains the current working directory — confirm in utils.os_.
git_root = get_git_root(os.getcwd())
# Directory holding the raw input files: "<git_root>/raw".
path_to_file = os.path.join(git_root, "raw")
# Workbook file name; it is joined with path_to_file when the data is loaded.
filename = "Analytics Interview Question_mobile_new_2024.xlsx"

## Load raw dataset

In [3]:
# Build the full workbook path once, then read its "Data" sheet into df_raw.
workbook_path = os.path.join(path_to_file, filename)
data_loader = DataLoader()
df_raw = data_loader.load_data_xlsx_from_tab(path=workbook_path, sheet_name="Data")

## Transform dataset

In [4]:
# Switches passed to DataTransformer.transform_data; every one is enabled here.
transform_options = {
    "fill_super_region": True,
    "drop_post_book": True,
    "map_property_to_super_region": True,
    "replace_us_client_country": True,
    "treat_apac_2022w45_outlier": True,
}

data_transformer = DataTransformer(df_raw)
# Keep an independent copy of the transformed frame for the rest of the notebook.
df = data_transformer.transform_data(**transform_options).copy()

In [5]:
# Print the (rows, columns) shape of the transformed dataset.
print(df.shape)

(73648, 12)


In [6]:
# Preview ten random rows. The fixed seed keeps the preview identical across
# "Restart Kernel -> Run All" executions.
df.sample(10, random_state=42)

Unnamed: 0,client_region,client_country,platform,mobile,property_region,property_country,booking_window,date,year,week,net_gross_booking_usd,net_orders
16982,APAC,Australia,Desktop,Desktop,EMEA,Austria,4-7 days,2022-11-21,2022,47,179.2604,2
5755,North America,United States of America,Mobile App,Mobile,EMEA,Greece,2-3 days,2022-11-07,2022,45,467.1616,5
3020,LATAM,Brazil,Desktop,Desktop,APAC,Thailand,2-3 days,2022-11-07,2022,45,224.4997,4
579,APAC,South Korea,Desktop,Desktop,EMEA,Italy,46-60 days,2022-11-07,2022,45,13271.0312,52
18544,EMEA,Norway,Desktop,Desktop,EMEA,Denmark,2-3 days,2022-11-21,2022,47,10672.2572,32
4392,APAC,Hong Kong,Mobile App,Mobile,APAC,Hong Kong,46-60 days,2022-11-07,2022,45,3461.4181,10
21045,North America,United States of America,Desktop,Desktop,EMEA,Ukraine,4-7 days,2022-11-21,2022,47,533.6486,2
59452,APAC,Australia,Mobile App,Mobile,North America,United States of America,4-7 days,2023-11-20,2023,47,5012.6706,9
1985,EMEA,United Kingdom,Desktop,Desktop,EMEA,Latvia,8-14 days,2022-11-07,2022,45,268.5355,3
10257,EMEA,United Kingdom,Desktop,Desktop,EMEA,Iceland,+90 days,2022-11-14,2022,46,5157.0919,11


# Overview of dataset

## Absolute values by super region, year and platform

In [7]:
# Weekly totals per client region, year and device class (`mobile`); this frame
# feeds the Altair line/point chart built in the next cell.
df_chart = (
    df.groupby(["client_region", "year", "week", "mobile"])
    .agg({"net_gross_booking_usd": "sum", "net_orders": "sum"})
    .reset_index()
)

# Average ticket is derived from the unscaled totals, i.e. before the unit
# conversions below.
df_chart["avg_ticket"] = df_chart["net_gross_booking_usd"] / df_chart["net_orders"]

# Rescale for readable axes: bookings in million USD, orders in thousands.
df_chart["net_gross_booking_usd"] = df_chart["net_gross_booking_usd"] / 1_000_000
df_chart["net_orders"] = df_chart["net_orders"] / 1_000

# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" to the output.
df_chart.info()

# Last expression of the cell -> rendered with the rich DataFrame display.
df_chart.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   client_region          64 non-null     string 
 1   year                   64 non-null     int32  
 2   week                   64 non-null     int32  
 3   mobile                 64 non-null     string 
 4   net_gross_booking_usd  64 non-null     float64
 5   net_orders             64 non-null     float64
 6   avg_ticket             64 non-null     float64
dtypes: float64(3), int32(2), string(2)
memory usage: 3.1 KB
None
  client_region  year  week   mobile  net_gross_booking_usd  net_orders  \
0          APAC  2022    45  Desktop               8.129804      22.452   
1          APAC  2022    45   Mobile               4.319801       8.756   
2          APAC  2022    46  Desktop               9.067502      25.292   
3          APAC  2022    46   Mobile               2.641798     

In [8]:
# Metric shown on the y-axis. To chart another metric, activate one of the
# commented pairs below instead.
y_var = "net_gross_booking_usd"
y_var_name = "Net Gross Bookings (Million USD)"

# y_var = "net_orders"
# y_var_name = "Net Orders (thousands)"

# y_var = "avg_ticket"
# y_var_name = "Average Ticket (USD)"

# Appearance of marks that are not part of the current selection.
alpha_transparency = 0.1
gray_color_code = "#666666"

# Point selection on client_region, bound to the legend.
selection = alt.selection_point(fields=['client_region'], bind='legend', empty=True)

# Encodings shared by the line layer and the point layer.
base_chart = alt.Chart(df_chart).encode(
    x=alt.X("week:O", title="Week"),
    y=alt.Y(f"{y_var}:Q", title=y_var_name),
    color=alt.condition(
        selection,
        alt.Color("client_region:N", title="Client Region"),
        alt.value(gray_color_code),
    ),
    opacity=alt.condition(selection, alt.value(1), alt.value(alpha_transparency)),
)

# Line layer: dash pattern per platform; the detail channel splits lines by year.
line_chart = base_chart.mark_line().encode(
    strokeDash=alt.StrokeDash("mobile:N", title="Platform"),
    detail=alt.Detail("year:N"),
)

# Point layer: marker shape per year, plus hover tooltips.
hover_fields = [
    alt.Tooltip("client_region:N", title="Client Region"),
    alt.Tooltip("mobile:N", title="Platform"),
    alt.Tooltip("year:N", title="Year"),
    alt.Tooltip("week:O", title="Week"),
    alt.Tooltip(f"{y_var}:Q", title=y_var_name, format=",.2f"),
]
point_chart = base_chart.mark_point().encode(
    shape=alt.Shape("year:N", title="Year"),
    tooltip=hover_fields,
)

# Stack both layers, keep shape and dash scales independent, attach the selection.
layered_chart = (
    alt.layer(line_chart, point_chart)
    .resolve_scale(shape='independent', strokeDash='independent')
    .add_params(selection)
    .properties(
        title=f"{y_var_name} by Client Region, Year, and Week",
        width=600,
        height=400,
    )
    .interactive()
)

layered_chart


## Sankey chart view

In [9]:
# Metric whose flows are shown in the Sankey chart (swap with the commented line).
# y_var = "net_gross_booking_usd"
y_var = "net_orders"

# Row filter: starts all-True; uncomment conditions below to narrow the rows.
filter_rule = pd.Series(True, index=df.index, dtype=bool)
# filter_rule &= df['mobile'] == 'Mobile'

# apply filter
df_sankey = df[filter_rule].copy()

# Sum and row-level mean of the metric for every client -> property region pair.
df_sankey = (
    df_sankey[['client_region', 'property_region', y_var]]
    .groupby(['client_region', 'property_region'])[y_var]
    .agg(['sum', 'mean'])
    .reset_index()
)

# Normalize both aggregates by their client_region total, as a percentage
# rounded to 2 decimals. Note that 'mean' is divided by the sum of the pair
# means within the client region, not by an overall mean.
df_sankey['sum'] = (df_sankey['sum'] / df_sankey.groupby('client_region')['sum'].transform('sum')).mul(100).round(2)
df_sankey['mean'] = (df_sankey['mean'] / df_sankey.groupby('client_region')['mean'].transform('sum')).mul(100).round(2)

# info() prints its own report and returns None; print(info()) added a stray "None".
df_sankey.info()

# Seeded so the preview is reproducible on re-run; rendered with rich display.
df_sankey.sample(5, random_state=42)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   client_region    20 non-null     string 
 1   property_region  20 non-null     string 
 2   sum              20 non-null     float64
 3   mean             20 non-null     float64
dtypes: float64(2), string(2)
memory usage: 768.0 bytes
None
   client_region     property_region   sum   mean
7           EMEA               LATAM  1.01   2.31
3           APAC       North America  7.89  36.61
10         LATAM                APAC  2.70   2.59
5           EMEA                APAC  9.11   7.93
14         LATAM  Sub-Saharan Africa  0.20   1.92


In [10]:
# Client region -> property region flows, using the normalized "sum" column.
bipartite_options = {
    "flow_column": "sum",
    "source_column": "client_region",
    "target_column": "property_region",
    "normalized": True,
}
sankey_chart_creator = SankeyBipartite(df=df_sankey, **bipartite_options)
sankey_chart_creator.generate_sankey_chart()


## YoY Growth

In [11]:
def calc_yoy_over_dimensions(df, dimensions, date_cols, verbose=True):
    """Compute period-over-period percentage change per dimension group.

    The frame is aggregated (sum of ``net_gross_booking_usd`` and
    ``net_orders``) over ``dimensions + date_cols``, ``avg_ticket`` is derived
    as bookings / orders, and each metric is compared with the previous row of
    the same ``dimensions`` group after sorting by ``dimensions + date_cols``.
    The result is therefore a true year-over-year change only when consecutive
    rows of a group are one year apart (e.g. ``date_cols=["year"]`` with no
    missing years).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``net_gross_booking_usd``, ``net_orders`` and every column
        named in ``dimensions`` and ``date_cols``.
    dimensions : list of str
        Columns that define the groups being compared over time.
    date_cols : list of str
        Time columns that order the rows inside each group.
    verbose : bool, default True
        When True, print ``DataFrame.info()`` of the aggregated frame.

    Returns
    -------
    pandas.DataFrame
        ``dimensions + date_cols`` plus ``yoy_net_gross_booking_usd``,
        ``yoy_net_orders`` and ``yoy_avg_ticket`` (in percent). Rows where any
        of these is NaN (such as the first period of each group) are dropped,
        and the absolute metric columns are removed.
    """
    dimensions = list(dimensions)
    date_cols = list(date_cols)
    base_metrics = ["net_gross_booking_usd", "net_orders"]

    df_yoy = (
        df.groupby(dimensions + date_cols)
        .agg({metric: "sum" for metric in base_metrics})
        .reset_index()
    )
    df_yoy["avg_ticket"] = df_yoy["net_gross_booking_usd"] / df_yoy["net_orders"]
    metrics = base_metrics + ["avg_ticket"]

    if verbose:
        # info() prints its own report and returns None, so it must not be
        # wrapped in print() (that appended a stray "None" line).
        df_yoy.info()

    df_yoy = df_yoy.sort_values(dimensions + date_cols)

    yoy_cols = []
    for metric in metrics:
        yoy_col = f"yoy_{metric}"
        # fill_method=None: do not forward-fill missing values before taking
        # the change. The implicit padding is deprecated in pandas and would
        # turn a NaN metric (e.g. 0 / 0 avg_ticket) into a spurious 0 % change.
        df_yoy[yoy_col] = (
            df_yoy.groupby(dimensions)[metric].pct_change(fill_method=None) * 100
        )
        yoy_cols.append(yoy_col)

    # Use the explicit list of computed columns rather than matching "yoy" as a
    # substring, which would also catch an unrelated dimension column.
    return df_yoy.dropna(subset=yoy_cols).drop(columns=metrics)


In [12]:
# Change between years at the finest breakdown used in this notebook:
# client region x property region x platform x device class (`mobile`).
dimensions = ["client_region", "property_region", "platform", "mobile"]
date_cols = ["year"]

df_yoy = calc_yoy_over_dimensions(df, dimensions, date_cols)
df_yoy

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   client_region          120 non-null    string 
 1   property_region        120 non-null    string 
 2   platform               120 non-null    string 
 3   mobile                 120 non-null    string 
 4   year                   120 non-null    int32  
 5   net_gross_booking_usd  120 non-null    float64
 6   net_orders             120 non-null    int32  
 7   avg_ticket             120 non-null    float64
dtypes: float64(2), int32(2), string(4)
memory usage: 6.7 KB
None


Unnamed: 0,client_region,property_region,platform,mobile,year,yoy_net_gross_booking_usd,yoy_net_orders,yoy_avg_ticket
1,APAC,APAC,Desktop,Desktop,2023,20.208134,34.815412,-10.835021
3,APAC,APAC,Mobile App,Mobile,2023,76.128056,199.216828,-41.136982
5,APAC,APAC,Mobile Web,Mobile,2023,72.768609,98.593167,-13.00375
7,APAC,EMEA,Desktop,Desktop,2023,-4.04123,2.804148,-6.658659
9,APAC,EMEA,Mobile App,Mobile,2023,119.766499,142.751323,-9.468465
11,APAC,EMEA,Mobile Web,Mobile,2023,25.762452,46.261682,-14.015448
13,APAC,LATAM,Desktop,Desktop,2023,-14.408372,13.735343,-24.744916
15,APAC,LATAM,Mobile App,Mobile,2023,102.540711,183.636364,-28.591416
17,APAC,LATAM,Mobile Web,Mobile,2023,-14.293115,20.689655,-28.985724
19,APAC,North America,Desktop,Desktop,2023,3.068492,11.033843,-7.173805


In [13]:
dimensions = ["client_region", "mobile"]
date_cols = ["year"]

# Wide YoY table -> long format (one row per region / device class / metric),
# which is the shape the faceted bar chart in the next cell expects.
df_yoy = (
    calc_yoy_over_dimensions(df, dimensions, date_cols)
    .drop(columns=date_cols)
    .melt(id_vars=dimensions, var_name="metric", value_name="yoy")
    .sort_values(dimensions + ["metric"])
)
df_yoy

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   client_region          16 non-null     string 
 1   mobile                 16 non-null     string 
 2   year                   16 non-null     int32  
 3   net_gross_booking_usd  16 non-null     float64
 4   net_orders             16 non-null     int32  
 5   avg_ticket             16 non-null     float64
dtypes: float64(2), int32(2), string(2)
memory usage: 768.0 bytes
None


Unnamed: 0,client_region,mobile,metric,yoy
16,APAC,Desktop,yoy_avg_ticket,-10.940453
0,APAC,Desktop,yoy_net_gross_booking_usd,13.720001
8,APAC,Desktop,yoy_net_orders,27.689849
17,APAC,Mobile,yoy_avg_ticket,-26.707476
1,APAC,Mobile,yoy_net_gross_booking_usd,77.60046
9,APAC,Mobile,yoy_net_orders,142.317292
18,EMEA,Desktop,yoy_avg_ticket,-9.624061
2,EMEA,Desktop,yoy_net_gross_booking_usd,12.91264
10,EMEA,Desktop,yoy_net_orders,24.936616
19,EMEA,Mobile,yoy_avg_ticket,-10.885475


In [14]:
# YoY % per client region, one panel per metric. xOffset places the Desktop and
# Mobile bars side by side: with only a color channel the bar mark stacks them,
# which adds two unrelated percentages on top of each other.
alt.Chart(df_yoy).mark_bar().encode(
    x=alt.X("client_region:N", title="Client Region"),
    xOffset=alt.XOffset("mobile:N"),
    y=alt.Y("yoy:Q", title="YoY %"),
    color=alt.Color("mobile:N", title="Platform"),
    column=alt.Column("metric:N", title="Metric"),
    tooltip=[
        alt.Tooltip("client_region:N", title="Client Region"),
        alt.Tooltip("mobile:N", title="Platform"),
        alt.Tooltip("metric:N", title="Metric"),
        alt.Tooltip("yoy:Q", title="YoY %").format(",.2f")
    ]
).properties(
    title="YoY % by Client Region, Platform, and Metric",
    width=200,
    height=200
).interactive()


In [15]:
dimensions = ["property_region", "mobile"]
date_cols = ["year"]

# Same reshaping as for client regions, keyed by property region instead:
# one row per property region / device class / metric.
df_yoy = (
    calc_yoy_over_dimensions(df, dimensions, date_cols)
    .drop(columns=date_cols)
    .melt(id_vars=dimensions, var_name="metric", value_name="yoy")
    .sort_values(dimensions + ["metric"])
)
df_yoy

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   property_region        20 non-null     string 
 1   mobile                 20 non-null     string 
 2   year                   20 non-null     int32  
 3   net_gross_booking_usd  20 non-null     float64
 4   net_orders             20 non-null     int32  
 5   avg_ticket             20 non-null     float64
dtypes: float64(2), int32(2), string(2)
memory usage: 928.0 bytes
None


Unnamed: 0,property_region,mobile,metric,yoy
20,APAC,Desktop,yoy_avg_ticket,-11.881339
0,APAC,Desktop,yoy_net_gross_booking_usd,14.394733
10,APAC,Desktop,yoy_net_orders,29.818964
21,APAC,Mobile,yoy_avg_ticket,-27.346209
1,APAC,Mobile,yoy_net_gross_booking_usd,69.063174
11,APAC,Mobile,yoy_net_orders,132.696975
22,EMEA,Desktop,yoy_avg_ticket,-10.554192
2,EMEA,Desktop,yoy_net_gross_booking_usd,6.783354
12,EMEA,Desktop,yoy_net_orders,19.383297
23,EMEA,Mobile,yoy_avg_ticket,-10.606615


In [16]:
# YoY % per property region, one panel per metric. xOffset places the Desktop
# and Mobile bars side by side: with only a color channel the bar mark stacks
# them, which adds two unrelated percentages on top of each other.
alt.Chart(df_yoy).mark_bar().encode(
    x=alt.X("property_region:N", title="Property Region"),
    xOffset=alt.XOffset("mobile:N"),
    y=alt.Y("yoy:Q", title="YoY %"),
    color=alt.Color("mobile:N", title="Platform"),
    column=alt.Column("metric:N", title="Metric"),
    tooltip=[
        alt.Tooltip("property_region:N", title="Property Region"),
        alt.Tooltip("mobile:N", title="Platform"),
        alt.Tooltip("metric:N", title="Metric"),
        alt.Tooltip("yoy:Q", title="YoY %").format(",.2f")
    ]
).properties(
    title="YoY % by Property Region, Platform, and Metric",
    width=200,
    height=200
).interactive()


In [17]:
dimensions = ["client_region", "platform"]
date_cols = ["year"]

# Restrict to the two mobile platforms before computing the change, then
# reshape to one row per client region / platform / metric.
mobile_platforms = ["Mobile App", "Mobile Web"]
df_mobile_only = df[df["platform"].isin(mobile_platforms)]
df_yoy = (
    calc_yoy_over_dimensions(df_mobile_only, dimensions, date_cols)
    .drop(columns=date_cols)
    .melt(id_vars=dimensions, var_name="metric", value_name="yoy")
    .sort_values(dimensions + ["metric"])
)
df_yoy

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   client_region          16 non-null     string 
 1   platform               16 non-null     string 
 2   year                   16 non-null     int32  
 3   net_gross_booking_usd  16 non-null     float64
 4   net_orders             16 non-null     int32  
 5   avg_ticket             16 non-null     float64
dtypes: float64(2), int32(2), string(2)
memory usage: 768.0 bytes
None


Unnamed: 0,client_region,platform,metric,yoy
16,APAC,Mobile App,yoy_avg_ticket,-34.822461
0,APAC,Mobile App,yoy_net_gross_booking_usd,86.881021
8,APAC,Mobile App,yoy_net_orders,186.726107
17,APAC,Mobile Web,yoy_avg_ticket,-13.549747
1,APAC,Mobile Web,yoy_net_gross_booking_usd,64.126557
9,APAC,Mobile Web,yoy_net_orders,89.85087
18,EMEA,Mobile App,yoy_avg_ticket,-8.936254
2,EMEA,Mobile App,yoy_net_gross_booking_usd,65.674776
10,EMEA,Mobile App,yoy_net_orders,81.932748
19,EMEA,Mobile Web,yoy_avg_ticket,-11.45057


In [18]:
# YoY % per client region for the mobile platforms, one panel per metric.
# xOffset places the Mobile App and Mobile Web bars side by side: with only a
# color channel the bar mark stacks them, which adds two unrelated percentages
# on top of each other.
alt.Chart(df_yoy).mark_bar().encode(
    x=alt.X("client_region:N", title="Client Region"),
    xOffset=alt.XOffset("platform:N"),
    y=alt.Y("yoy:Q", title="YoY %"),
    color=alt.Color("platform:N", title="Mobile Platform"),
    column=alt.Column("metric:N", title="Metric"),
    tooltip=[
        alt.Tooltip("client_region:N", title="Client Region"),
        alt.Tooltip("platform:N", title="Mobile Platform"),
        alt.Tooltip("metric:N", title="Metric"),
        alt.Tooltip("yoy:Q", title="YoY %").format(",.2f")
    ]
).properties(
    title="YoY % by Client Region, Mobile Platform, and Metric",
    width=200,
    height=200
).interactive()


## Mobile vs. Desktop Share of Wallet

In [19]:
def aggregate_and_normalize_share(df, dimensions, total_dimensions, normalize=True):
    """Sum bookings and orders per group and optionally convert them to shares.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``net_gross_booking_usd``, ``net_orders`` and the columns
        listed in ``dimensions``.
    dimensions : list of str
        Grouping columns for the aggregation.
    total_dimensions : list of str
        Columns defining the totals each share is measured against. Only used
        when ``normalize`` is True.
    normalize : bool, default True
        When True, replace both summed metrics by their percentage of the
        ``total_dimensions`` total (rounded to 2 decimals) and print the
        per-total sum of the bookings share as a sanity check.

    Returns
    -------
    pandas.DataFrame
        One row per ``dimensions`` combination with ``net_gross_booking_usd``,
        ``net_orders`` (absolute sums or percentage shares) and ``avg_ticket``
        (always absolute bookings divided by absolute orders).
    """
    share_metrics = ["net_gross_booking_usd", "net_orders"]

    df_agg = (
        df.groupby(dimensions)
        .agg({metric: "sum" for metric in share_metrics})
        .reset_index()
    )
    # Average ticket is based on the absolute totals, so it is derived before
    # the metrics are (optionally) overwritten with shares.
    df_agg["avg_ticket"] = df_agg["net_gross_booking_usd"].div(df_agg["net_orders"])

    if not normalize:
        return df_agg

    for metric in share_metrics:
        bucket_total = df_agg.groupby(total_dimensions)[metric].transform("sum")
        df_agg[metric] = df_agg[metric].div(bucket_total).mul(100).round(2)

    # Sanity check: bookings shares per total should add up to ~100 (rounding
    # can leave them a hundredth off).
    print(df_agg.groupby(total_dimensions)["net_gross_booking_usd"].sum())

    return df_agg

In [20]:
# Share of each platform within every (client_region, year) total.
dimensions = ["client_region", "platform", "mobile", "year"]
total_dimensions = ["client_region", "year"]

# The call also prints the per-total sum of the bookings share as a sanity check.
df_agg = aggregate_and_normalize_share(df, dimensions, total_dimensions)

client_region  year
APAC           2022    100.01
               2023    100.00
EMEA           2022    100.00
               2023    100.00
LATAM          2022    100.00
               2023     99.99
North America  2022    100.00
               2023    100.00
Name: net_gross_booking_usd, dtype: float64


In [21]:
# Rows of the share table whose device class (`mobile`) is Desktop.
print("Desktop")
df_agg[df_agg["mobile"] == "Desktop"]

Desktop


Unnamed: 0,client_region,platform,mobile,year,net_gross_booking_usd,net_orders,avg_ticket
0,APAC,Desktop,Desktop,2022,73.34,71.25,353.952488
1,APAC,Desktop,Desktop,2023,63.78,56.64,315.228481
6,EMEA,Desktop,Desktop,2022,78.26,74.15,351.115128
7,EMEA,Desktop,Desktop,2023,72.44,67.38,317.323594
12,LATAM,Desktop,Desktop,2022,83.44,81.23,462.351192
13,LATAM,Desktop,Desktop,2023,78.2,74.27,307.034643
18,North America,Desktop,Desktop,2022,76.1,68.05,279.674072
19,North America,Desktop,Desktop,2023,68.49,59.53,278.028128


In [22]:
# Rows of the share table whose device class (`mobile`) is Mobile.
print("Mobile")
df_agg[df_agg["mobile"] == "Mobile"]

Mobile


Unnamed: 0,client_region,platform,mobile,year,net_gross_booking_usd,net_orders,avg_ticket
2,APAC,Mobile App,Mobile,2022,15.79,15.57,348.745913
3,APAC,Mobile App,Mobile,2023,22.57,27.79,227.304004
4,APAC,Mobile Web,Mobile,2022,10.88,13.18,283.793344
5,APAC,Mobile Web,Mobile,2023,13.65,15.57,245.340062
8,EMEA,Mobile App,Mobile,2022,6.57,8.74,250.196476
9,EMEA,Mobile App,Mobile,2023,8.93,11.57,227.838285
10,EMEA,Mobile Web,Mobile,2022,15.17,17.1,295.022382
11,EMEA,Mobile Web,Mobile,2023,18.63,21.05,261.240636
14,LATAM,Mobile App,Mobile,2022,4.07,6.49,282.227228
15,LATAM,Mobile App,Mobile,2023,9.51,11.54,240.335384


In [23]:
dimensions = ["mobile", "year"]
total_dimensions = ["year"]

df_agg = aggregate_and_normalize_share(df, dimensions, total_dimensions)

# The Sankey input gets `year` as a string column; df_agg itself is left untouched.
df_sankey = df_agg.assign(year=df_agg["year"].astype(str))

# Flow of the net_orders share from each year to each device class.
year_to_device_options = {
    "flow_column": "net_orders",
    "source_column": "year",
    "target_column": "mobile",
    "normalized": True,
}
sankey_chart_creator = SankeyBipartite(df=df_sankey, **year_to_device_options)
sankey_chart_creator.generate_sankey_chart()

year
2022    100.0
2023    100.0
Name: net_gross_booking_usd, dtype: float64


In [24]:
# Absolute totals per platform, device class and year. With normalize=False the
# sums are returned as-is and total_dimensions is not used by the function.
dimensions = ["platform", "mobile", "year"]
total_dimensions = ["year"]

df_agg = aggregate_and_normalize_share(df, dimensions, total_dimensions, normalize=False)

In [25]:
# Display the absolute (non-normalized) aggregates.
df_agg

Unnamed: 0,platform,mobile,year,net_gross_booking_usd,net_orders,avg_ticket
0,Desktop,Desktop,2022,320188700.0,1065379,300.539717
1,Desktop,Desktop,2023,348996600.0,1210587,288.287069
2,Mobile App,Mobile,2022,38220540.0,183998,207.722601
3,Mobile App,Mobile,2023,66363360.0,332041,199.864945
4,Mobile Web,Mobile,2022,60569300.0,286099,211.707489
5,Mobile Web,Mobile,2023,91450850.0,452960,201.896076


In [26]:
# Bookings tree rooted at the device class column, followed by platform and year.
device_rooted_levels = {
    "root_nodes_col": "mobile",
    "sequence_cols": ["platform", "year"],
}
sankey_tree_chart = SankeyTree(
    dataframe=df_agg, metric="net_gross_booking_usd", **device_rooted_levels
)

sankey_tree_chart.plot()

In [27]:
# Bookings tree rooted at the year column, followed by platform and device class.
year_rooted_levels = {
    "root_nodes_col": "year",
    "sequence_cols": ["platform", "mobile"],
}
sankey_tree_chart = SankeyTree(
    dataframe=df_agg, metric="net_gross_booking_usd", **year_rooted_levels
)

sankey_tree_chart.plot()