In [1]:
from etl.loader import DataLoader
from etl.transformer import DataTransformer
from dataviz.sankey import SankeyChartCreator
from utils.os_ import get_git_root
import os

import pandas as pd
import altair as alt

# Dataset preparation

## source info

In [2]:
git_root = get_git_root(os.getcwd())
path_to_file = os.path.join(git_root, "raw")
filename = "Analytics Interview Question_mobile_new_2024.xlsx"

## load raw dataset

In [3]:
data_loader = DataLoader()
df_raw = data_loader.load_data_xlsx_from_tab(
    path=os.path.join(path_to_file, filename), 
    sheet_name="Data")

## Transform dataset

In [4]:
data_transformer = DataTransformer(df_raw)
df = data_transformer.transform_data(
    fill_super_region=True,
    drop_post_book=True,
    map_property_to_super_region=True,
    replace_us_client_country=True,
    treat_apac_2022w45_outlier=True,
).copy()

In [5]:
print(df.shape)

(73648, 12)


In [6]:
df.sample(10)

Unnamed: 0,client_region,client_country,platform,mobile,property_region,property_country,booking_window,date,year,week,net_gross_booking_usd,net_orders
24105,EMEA,Norway,Mobile Web,Mobile,EMEA,Lithuania,4-7 days,2022-11-21,2022,47,344.7346,2
53148,LATAM,Brazil,Mobile Web,Mobile,EMEA,Netherlands,+90 days,2023-11-13,2023,46,719.534,2
30410,EMEA,United Kingdom,Mobile App,Mobile,APAC,Japan,46-60 days,2022-11-28,2022,48,196.3951,1
66846,LATAM,Brazil,Desktop,Desktop,EMEA,France,0-1 days,2023-11-27,2023,48,748.3041,3
66658,EMEA,United Kingdom,Desktop,Desktop,North America,United States of America,+90 days,2023-11-27,2023,48,337006.1593,444
6223,APAC,South Korea,Mobile Web,Mobile,North America,Canada,+90 days,2022-11-07,2022,45,170.52,1
30192,EMEA,United Kingdom,Mobile App,Mobile,APAC,Australia,8-14 days,2022-11-28,2022,48,1060.894,3
13533,EMEA,United Kingdom,Mobile App,Mobile,APAC,India,8-14 days,2022-11-14,2022,46,2733.4465,6
8110,North America,United States of America,Mobile Web,Mobile,Sub-Saharan Africa,Mauritius,2-3 days,2022-11-07,2022,45,372.0405,1
69890,EMEA,Norway,Mobile App,Mobile,EMEA,Spain & Canary Islands,15-30 days,2023-11-27,2023,48,-410.8607,0


# Overview of dataset

## absolute values By Super Region, Year and Platform

In [7]:
# altair chart of sum of net_gross_bookings_usd by client_region (color encoding), year (shape encoding), week (x_axis)
# requires transformation and aggregation before plotting

df_chart = df.groupby(["client_region", "year", "week", "mobile"]).agg({"net_gross_booking_usd": "sum", "net_orders": "sum"}).reset_index()

# avg ticket
df_chart["avg_ticket"] = df_chart["net_gross_booking_usd"] / df_chart["net_orders"]

# million usd
df_chart["net_gross_booking_usd"] = df_chart["net_gross_booking_usd"] / 1_000_000

# net orders to thousands
df_chart["net_orders"] = df_chart["net_orders"] / 1_000


print(df_chart.info())
print(df_chart.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   client_region          64 non-null     string 
 1   year                   64 non-null     int32  
 2   week                   64 non-null     int32  
 3   mobile                 64 non-null     string 
 4   net_gross_booking_usd  64 non-null     float64
 5   net_orders             64 non-null     float64
 6   avg_ticket             64 non-null     float64
dtypes: float64(3), int32(2), string(2)
memory usage: 3.1 KB
None
  client_region  year  week   mobile  net_gross_booking_usd  net_orders  \
0          APAC  2022    45  Desktop               8.129804      22.452   
1          APAC  2022    45   Mobile               4.319801       8.756   
2          APAC  2022    46  Desktop               9.067502      25.292   
3          APAC  2022    46   Mobile               2.641798     

In [8]:
y_var = "net_gross_booking_usd"
y_var_name = "Net Gross Bookings (Million USD)"

# y_var = "net_orders"
# y_var_name = "Net Orders (thousands)"

# y_var = "avg_ticket"
# y_var_name = "Average Ticket (USD)"

# Define selection
selection = alt.selection_point(fields=['client_region'], bind='legend', empty=True)

# Define alpha transparency level and gray color code
alpha_transparency = 0.1
gray_color_code = "#666666"

# Modify the line chart to include selection
line_chart = alt.Chart(df_chart).mark_line().encode(
    x=alt.X("week:O", title="Week"),
    y=alt.Y(f"{y_var}:Q", title=y_var_name),
    color=alt.condition(selection,
                        alt.Color("client_region:N", title="Client Region"),
                        alt.value(gray_color_code)),
    strokeDash=alt.StrokeDash("mobile:N", title="Platform"),
    detail=alt.Detail("year:N"),
    opacity=alt.condition(selection, alt.value(1), alt.value(alpha_transparency))
)

# Modify the point chart to include selection
point_chart = alt.Chart(df_chart).mark_point().encode(
    x=alt.X("week:O", title="Week"),
    y=alt.Y(f"{y_var}:Q", title=y_var_name),
    color=alt.condition(selection,
                        alt.Color("client_region:N", title="Client Region"),
                        alt.value(gray_color_code)),
    shape=alt.Shape("year:N", title="Year"),
    opacity=alt.condition(selection, alt.value(1), alt.value(alpha_transparency)),
    tooltip=[
        alt.Tooltip("client_region:N", title="Client Region"),
        alt.Tooltip("mobile:N", title="Platform"),
        alt.Tooltip("year:N", title="Year"),
        alt.Tooltip("week:O", title="Week"),
        alt.Tooltip(f"{y_var}:Q", title=y_var_name).format(",.2f")
    ]
)

# Add selection to the layered chart
layered_chart = alt.layer(line_chart, point_chart).resolve_scale(
    shape='independent',
    strokeDash='independent'
).add_params(
    selection
).properties(
    title=f"{y_var_name} by Client Region, Year, and Week",
    width=600,
    height=400
).interactive()

layered_chart


## Sankey chart view

In [13]:
df_sankey = df[['client_region', 'property_region', 'net_gross_booking_usd']]\
    .groupby(['client_region', 'property_region'])\
    ['net_gross_booking_usd']\
    .agg(['sum', 'mean'])\
    .reset_index()

# normalize agg metrics by each source client_region
df_sankey['sum'] = df_sankey['sum'] / df_sankey.groupby('client_region')['sum'].transform('sum')
df_sankey['mean'] = df_sankey['mean'] / df_sankey.groupby('client_region')['mean'].transform('sum')

print(df_sankey.info())
print(df_sankey.sample(5))
df_sankey

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   client_region    20 non-null     string 
 1   property_region  20 non-null     string 
 2   sum              20 non-null     float64
 3   mean             20 non-null     float64
dtypes: float64(2), string(2)
memory usage: 768.0 bytes
None
   client_region     property_region           sum         mean
14         LATAM  Sub-Saharan Africa  1.131536e+05   890.973164
9           EMEA  Sub-Saharan Africa  1.727566e+06  1133.573534
5           EMEA                APAC  1.706292e+07  2329.726633
4           APAC  Sub-Saharan Africa  3.085133e+05   531.919435
11         LATAM                EMEA  4.798767e+06  2063.098662


Unnamed: 0,client_region,property_region,sum,mean
0,APAC,APAC,74393900.0,6180.435573
1,APAC,EMEA,13195620.0,1601.604002
2,APAC,LATAM,742593.4,530.802987
3,APAC,North America,13111730.0,8470.110141
4,APAC,Sub-Saharan Africa,308513.3,531.919435
5,EMEA,APAC,17062920.0,2329.726633
6,EMEA,EMEA,101683900.0,9400.381315
7,EMEA,LATAM,2515065.0,900.488873
8,EMEA,North America,16463580.0,16220.274486
9,EMEA,Sub-Saharan Africa,1727566.0,1133.573534


In [14]:
from dataviz.sankey import SankeyChartCreator

sankey_chart_creator = SankeyChartCreator(
     df=df_sankey, 
     flow_column="mean", 
     source_column="client_region", 
     target_column="property_region"
)
sankey_chart_creator.generate_sankey_chart()


In [15]:
(sankey_chart_creator.target_column, 
sankey_chart_creator.source_column, 
sankey_chart_creator.flow_column, 
sankey_chart_creator.source_nodes, 
sankey_chart_creator.target_nodes, 
sankey_chart_creator.intersection_node_values, 
sankey_chart_creator.all_node_values)

AttributeError: 'SankeyChartCreator' object has no attribute 'intersection_node_values'