In [1]:
from etl.loader import DataLoader
from etl.transformer import DataTransformer
from dataviz.sankey import SankeyChartCreator
from utils.os_ import get_git_root
import os

import pandas as pd
import altair as alt

# Dataset preparation

## source info

In [2]:
git_root = get_git_root(os.getcwd())
path_to_file = os.path.join(git_root, "raw")
filename = "Analytics Interview Question_mobile_new_2024.xlsx"

## load raw dataset

In [3]:
data_loader = DataLoader()
df_raw = data_loader.load_data_xlsx_from_tab(
    path=os.path.join(path_to_file, filename), 
    sheet_name="Data")

## Transform dataset

In [4]:
data_transformer = DataTransformer(df_raw)
df = data_transformer.transform_data(
    fill_super_region=True,
    drop_post_book=True,
    map_property_to_super_region=True,
    replace_us_client_country=True,
    treat_apac_2022w45_outlier=True,
).copy()

In [5]:
print(df.shape)

(73648, 12)


In [6]:
df.sample(10)

Unnamed: 0,client_region,client_country,platform,mobile,property_region,property_country,booking_window,date,year,week,net_gross_booking_usd,net_orders
44295,APAC,Australia,Desktop,Desktop,APAC,Guam,8-14 days,2023-11-13,2023,46,-34.8,-1
70186,LATAM,Brazil,Mobile App,Mobile,EMEA,Norway,61-90 days,2023-11-27,2023,48,-148.7411,0
41883,APAC,South Korea,Mobile Web,Mobile,APAC,Turkey,0-1 days,2023-11-06,2023,45,220.8026,3
3352,North America,United States of America,Desktop,Desktop,LATAM,Dominican Republic,0-1 days,2022-11-07,2022,45,18058.34,56
72212,EMEA,United Kingdom,Mobile Web,Mobile,EMEA,Italy,31-45 days,2023-11-27,2023,48,2585.7789,8
42361,EMEA,Norway,Mobile Web,Mobile,EMEA,Italy,4-7 days,2023-11-06,2023,45,82.1523,1
37840,North America,United States of America,Desktop,Desktop,North America,Mexico,61-90 days,2023-11-06,2023,45,87504.0965,85
1869,EMEA,United Kingdom,Desktop,Desktop,APAC,Indonesia,46-60 days,2022-11-07,2022,45,708.1672,3
65737,EMEA,Norway,Desktop,Desktop,EMEA,Hungary,31-45 days,2023-11-27,2023,48,1135.438,3
5355,EMEA,United Kingdom,Mobile App,Mobile,APAC,Turkey,46-60 days,2022-11-07,2022,45,242.8182,1


In [7]:
df[df["property_region"].isin(["Unknown Country"])]["property_country"].value_counts()

property_country
Cote d'ivoire            41
Sao Tome and Principe     4
Name: count, dtype: Int64

In [8]:
set(df["property_region"])

{'APAC',
 'Cocos Islands',
 'Curacao',
 'EMEA',
 'LATAM',
 'Macedonia',
 'North America',
 'Reunion',
 'Spain & Canary Islands',
 'St. Barthelemy',
 'St. Kitts and Nevis',
 'St. Lucia',
 'St. Martin',
 'St. Vincent and the Grenadines',
 'Sub-Saharan Africa',
 'Svalbard',
 'Swaziland',
 'Taiwan, Republic of China',
 'Turks and Caicos',
 'U.S. Virgin Islands',
 'Unknown Country'}

# Overview of dataset

## absolute values By Super Region, Year and Platform

In [9]:
# altair chart of sum of net_gross_bookings_usd by client_region (color encoding), year (shape encoding), week (x_axis)
# requires transformation and aggregation before plotting

df_chart = df.groupby(["client_region", "year", "week", "mobile"]).agg({"net_gross_booking_usd": "sum", "net_orders": "sum"}).reset_index()

# avg ticket
df_chart["avg_ticket"] = df_chart["net_gross_booking_usd"] / df_chart["net_orders"]

# million usd
df_chart["net_gross_booking_usd"] = df_chart["net_gross_booking_usd"] / 1_000_000

# net orders to thousands
df_chart["net_orders"] = df_chart["net_orders"] / 1_000


print(df_chart.info())
print(df_chart.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   client_region          64 non-null     string 
 1   year                   64 non-null     int32  
 2   week                   64 non-null     int32  
 3   mobile                 64 non-null     string 
 4   net_gross_booking_usd  64 non-null     float64
 5   net_orders             64 non-null     float64
 6   avg_ticket             64 non-null     float64
dtypes: float64(3), int32(2), string(2)
memory usage: 3.1 KB
None
  client_region  year  week   mobile  net_gross_booking_usd  net_orders  \
0          APAC  2022    45  Desktop               8.129804      22.452   
1          APAC  2022    45   Mobile               4.319801       8.756   
2          APAC  2022    46  Desktop               9.067502      25.292   
3          APAC  2022    46   Mobile               2.641798     

In [10]:
y_var = "net_gross_booking_usd"
y_var_name = "Net Gross Bookings (Million USD)"

# y_var = "net_orders"
# y_var_name = "Net Orders (thousands)"

# y_var = "avg_ticket"
# y_var_name = "Average Ticket (USD)"

# Define selection
selection = alt.selection_point(fields=['client_region'], bind='legend', empty=True)

# Define alpha transparency level and gray color code
alpha_transparency = 0.1
gray_color_code = "#666666"

# Modify the line chart to include selection
line_chart = alt.Chart(df_chart).mark_line().encode(
    x=alt.X("week:O", title="Week"),
    y=alt.Y(f"{y_var}:Q", title=y_var_name),
    color=alt.condition(selection,
                        alt.Color("client_region:N", title="Client Region"),
                        alt.value(gray_color_code)),
    strokeDash=alt.StrokeDash("mobile:N", title="Platform"),
    detail=alt.Detail("year:N"),
    opacity=alt.condition(selection, alt.value(1), alt.value(alpha_transparency))
)

# Modify the point chart to include selection
point_chart = alt.Chart(df_chart).mark_point().encode(
    x=alt.X("week:O", title="Week"),
    y=alt.Y(f"{y_var}:Q", title=y_var_name),
    color=alt.condition(selection,
                        alt.Color("client_region:N", title="Client Region"),
                        alt.value(gray_color_code)),
    shape=alt.Shape("year:N", title="Year"),
    opacity=alt.condition(selection, alt.value(1), alt.value(alpha_transparency)),
    tooltip=[
        alt.Tooltip("client_region:N", title="Client Region"),
        alt.Tooltip("mobile:N", title="Platform"),
        alt.Tooltip("year:N", title="Year"),
        alt.Tooltip("week:O", title="Week"),
        alt.Tooltip(f"{y_var}:Q", title=y_var_name).format(",.2f")
    ]
)

# Add selection to the layered chart
layered_chart = alt.layer(line_chart, point_chart).resolve_scale(
    shape='independent',
    strokeDash='independent'
).add_params(
    selection
).properties(
    title=f"{y_var_name} by Client Region, Year, and Week",
    width=600,
    height=400
).interactive()

layered_chart


## Sankey chart view

In [11]:
from dataviz.sankey import SankeyChartCreator

df_sankey = df[['client_region', 'property_region', 'net_gross_booking_usd']]\
    .groupby(['client_region', 'property_region'])\
    ['net_gross_booking_usd']\
    .agg(['sum'])\
    .reset_index()

print(df_sankey.info())
print(df_sankey.sample(5))
df_sankey

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71 entries, 0 to 70
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   client_region    71 non-null     string 
 1   property_region  71 non-null     string 
 2   sum              71 non-null     float64
dtypes: float64(1), string(2)
memory usage: 1.8 KB
None
    client_region                 property_region           sum
38          LATAM                            EMEA  4.278552e+06
2            APAC                         Curacao  3.169276e+02
55  North America                       Macedonia  3.793538e+03
46          LATAM  St. Vincent and the Grenadines  1.129343e+03
5            APAC                       Macedonia  2.368988e+02


Unnamed: 0,client_region,property_region,sum
0,APAC,APAC,7.012307e+07
1,APAC,Cocos Islands,9.887493e+02
2,APAC,Curacao,3.169276e+02
3,APAC,EMEA,1.182390e+07
4,APAC,LATAM,7.375738e+05
...,...,...,...
66,North America,Swaziland,2.203520e+03
67,North America,"Taiwan, Republic of China",7.631840e+05
68,North America,Turks and Caicos,6.149879e+05
69,North America,U.S. Virgin Islands,7.078611e+05


In [12]:
# Example usage:
sankey_chart_creator = SankeyChartCreator(
     df=df_sankey, 
     flow_column="sum", 
     source_column="client_region", 
     target_column="property_region"
)
sankey_chart_creator.generate_sankey_chart()


StopIteration: 

In [None]:
(sankey_chart_creator.target_column, 
sankey_chart_creator.source_column, 
sankey_chart_creator.flow_column, 
sankey_chart_creator.source_nodes, 
sankey_chart_creator.target_nodes, 
sankey_chart_creator.intersection_node_values, 
sankey_chart_creator.all_node_values)