In [1]:
from etl.loader import DataLoader
from etl.transformer import DataTransformer
from utils.os_ import get_git_root
import os

import pandas as pd
import altair as alt

# Dataset preparation

## Source info

In [2]:
# Raw workbook name, and the folder that holds it: the "raw" directory under
# whatever get_git_root returns for the current working directory
# (presumably the repository root — confirm in utils.os_).
filename = "Analytics Interview Question_mobile_new_2024.xlsx"
git_root = get_git_root(os.getcwd())
path_to_file = os.path.join(git_root, "raw")

## Load raw dataset

In [3]:
# Full path to the workbook, built from the folder and file name set in the
# source-info cell.
raw_workbook_path = os.path.join(path_to_file, filename)

# Read the "Data" sheet of the workbook through the project's loader.
data_loader = DataLoader()
df_raw = data_loader.load_data_xlsx_from_tab(path=raw_workbook_path, sheet_name="Data")

## Transform dataset

In [4]:
# Every optional step exposed by transform_data is switched on here.
# NOTE(review): the exact effect of each flag is defined in
# etl.transformer.DataTransformer — check there before toggling one.
transform_options = {
    "fill_super_region": True,
    "drop_post_book": True,
    "map_property_to_super_region": True,
    "treat_apac_2022w45_outlier": True,
}

data_transformer = DataTransformer(df_raw)

# .copy() gives this notebook its own frame, detached from whatever object
# the transformer returned.
df = data_transformer.transform_data(**transform_options).copy()

In [5]:
# Preview ten random rows. The seed is fixed so that Restart & Run All shows
# the same rows every time; without it the preview changes on each run.
df.sample(10, random_state=42)

Unnamed: 0,client_region,client_country,platform,mobile,property_region,property_country,booking_window,date,year,week,net_gross_booking_usd,net_orders
53203,LATAM,Brazil,Mobile Web,Mobile,EMEA,United Kingdom,4-7 days,2023-11-13,2023,46,1011.0488,1
48324,North America,US,Desktop,Desktop,LATAM,Trinidad and Tobago,2-3 days,2023-11-13,2023,46,2699.9,6
55288,APAC,Australia,Desktop,Desktop,APAC,United Arab Emirates,31-45 days,2023-11-20,2023,47,2193.6523,6
36529,EMEA,United Kingdom,Desktop,Desktop,Sub-Saharan Africa,Tanzania,4-7 days,2023-11-06,2023,45,-213.47,0
16110,EMEA,United Kingdom,Mobile Web,Mobile,APAC,Vietnam,0-1 days,2022-11-14,2022,46,980.2898,10
47605,North America,US,Desktop,Desktop,EMEA,France,+90 days,2023-11-13,2023,46,69827.8249,181
46393,EMEA,United Kingdom,Desktop,Desktop,APAC,Qatar,4-7 days,2023-11-13,2023,46,1999.5337,5
6977,EMEA,Norway,Mobile Web,Mobile,EMEA,Denmark,4-7 days,2022-11-07,2022,45,3118.3335,9
12183,North America,US,Desktop,Desktop,APAC,Nepal,15-30 days,2022-11-14,2022,46,636.52,3
5235,EMEA,United Kingdom,Mobile App,Mobile,EMEA,Portugal,8-14 days,2022-11-07,2022,45,107.6587,1


# Overview of absolute values by Super Region, Year and Platform

In [6]:
# Chart input: sum of net_gross_booking_usd and net_orders by client_region,
# year, week and platform group (the "mobile" column). The data has to be
# aggregated to that grain before plotting.
group_cols = ["client_region", "year", "week", "mobile"]
df_chart = (
    df.groupby(group_cols)
    .agg({"net_gross_booking_usd": "sum", "net_orders": "sum"})
    .reset_index()
)

# Average ticket is derived from the raw sums, so it must be computed before
# the two columns are rescaled below.
df_chart["avg_ticket"] = df_chart["net_gross_booking_usd"] / df_chart["net_orders"]

# Rescale for readable axes: bookings in million USD, orders in thousands.
df_chart["net_gross_booking_usd"] = df_chart["net_gross_booking_usd"] / 1_000_000
df_chart["net_orders"] = df_chart["net_orders"] / 1_000

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
df_chart.info()
print(df_chart.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   client_region          64 non-null     string 
 1   year                   64 non-null     int32  
 2   week                   64 non-null     int32  
 3   mobile                 64 non-null     string 
 4   net_gross_booking_usd  64 non-null     float64
 5   net_orders             64 non-null     float64
 6   avg_ticket             64 non-null     float64
dtypes: float64(3), int32(2), string(2)
memory usage: 3.1 KB
None
  client_region  year  week   mobile  net_gross_booking_usd  net_orders  \
0          APAC  2022    45  Desktop               8.129804      22.452   
1          APAC  2022    45   Mobile               4.319801       8.756   
2          APAC  2022    46  Desktop               9.067502      25.292   
3          APAC  2022    46   Mobile               2.641798     

In [7]:
# Metric shown on the y-axis. To chart a different metric, swap the active
# pair for one of the commented alternatives.
y_var, y_var_name = "net_gross_booking_usd", "Net Gross Bookings (Million USD)"
# y_var, y_var_name = "net_orders", "Net Orders (thousands)"
# y_var, y_var_name = "avg_ticket", "Average Ticket (USD)"

# Styling applied to marks that fall outside the current selection.
alpha_transparency = 0.1
gray_color_code = "#666666"

# Point selection on client_region, bound to the legend.
selection = alt.selection_point(fields=["client_region"], bind="legend", empty=True)

# Selection-driven encodings shared by both layers: selected marks keep their
# region color at full opacity, the rest turn gray and nearly transparent.
region_color = alt.condition(
    selection,
    alt.Color("client_region:N", title="Client Region"),
    alt.value(gray_color_code),
)
selection_opacity = alt.condition(
    selection,
    alt.value(1),
    alt.value(alpha_transparency),
)

# Encodings common to the line and point layers.
base_chart = alt.Chart(df_chart).encode(
    x=alt.X("week:O", title="Week"),
    y=alt.Y(f"{y_var}:Q", title=y_var_name),
    color=region_color,
    opacity=selection_opacity,
)

# Line layer: dash pattern by platform; year goes in the detail channel so
# each year is drawn as a separate line.
line_chart = base_chart.mark_line().encode(
    strokeDash=alt.StrokeDash("mobile:N", title="Platform"),
    detail=alt.Detail("year:N"),
)

# Point layer: marker shape by year, plus the hover tooltip.
point_tooltip = [
    alt.Tooltip("client_region:N", title="Client Region"),
    alt.Tooltip("mobile:N", title="Platform"),
    alt.Tooltip("year:N", title="Year"),
    alt.Tooltip("week:O", title="Week"),
    alt.Tooltip(f"{y_var}:Q", title=y_var_name, format=",.2f"),
]
point_chart = base_chart.mark_point().encode(
    shape=alt.Shape("year:N", title="Year"),
    tooltip=point_tooltip,
)

# Stack the two layers, keep the shape and strokeDash scales independent,
# attach the selection parameter and enable pan/zoom.
layered_chart = (
    alt.layer(line_chart, point_chart)
    .resolve_scale(shape="independent", strokeDash="independent")
    .add_params(selection)
    .properties(
        title=f"{y_var_name} by Client Region, Year, and Week",
        width=600,
        height=400,
    )
    .interactive()
)

layered_chart
