In [55]:
import pathlib as Path
import sys
import polars as pl
import numpy as np
import pandas as pd
import time
import copy
import matplotlib.pyplot as plt
import altair as alt
import hvplot.polars as hvpl 
import seaborn as sns
import autoviz as av
import sweetviz as sv
import polars.selectors as cs
import altair as alt

alt.data_transformers.enable("vegafusion")


DataTransformerRegistry.enable('vegafusion')

In [56]:
import polars as pl

# **Create a Dummy Dataset with One Property Star Rating**
dummy_data = pl.DataFrame({
    "prop_starrating": ["4", "4", "4", "4"],  # **Only one unique star rating**
    "proportion": [0.5, 0.3, 0.15, 0.05],  # **Sum to 1**
    "click_bool": ["1", "1", "1", "0"],  # **Clicks & Non-Clicks**
    "booking_bool": ["1", "0", "1", "0"]  # **Bookings & Non-Bookings**
})
agg_df=dummy_data
dummy_data



prop_starrating,proportion,click_bool,booking_bool
str,f64,str,str
"""4""",0.5,"""1""","""1"""
"""4""",0.3,"""1""","""0"""
"""4""",0.15,"""1""","""1"""
"""4""",0.05,"""0""","""0"""


In [57]:
import altair as alt
import polars as pl

# **Create a Dataset that Includes Clicked & Not Clicked**
agg_df = pl.DataFrame({
    "prop_starrating": ["4", "4"],  # **Same property star rating for both**
    "proportion": [0.7, 0.3],  # **Proportions for Clicked & Not Clicked**
    "click_bool": ["1", "0"],  # **"1" = Clicked, "0" = Not Clicked**
})

# **Multi-Selection Dropdown for Clicked vs Not Clicked**
click_selection = alt.param(
    name="click_bool_selection",
    bind=alt.binding_select(
        options=["All", "0", "1"],  # ✅ "All" = Both, "0" = Not Clicked, "1" = Clicked
        labels=["Both", "Not Clicked", "Clicked"],
        name="Click Status"
    ),
    value="All"  # ✅ Default to showing both Clicked & Not Clicked
)

# **Convert `click_bool` values to string before filtering**
agg_df = agg_df.with_columns(
    agg_df["click_bool"].cast(str)
)

# **Fix Filtering Logic: Show Selected Click Status or Both**
filter_condition = (
    (click_selection == "All") | (alt.datum.click_bool == click_selection)
)

# **Y-Axis Scale Settings**
y_axis = alt.Y(
    "proportion:Q",
    title="Proportion",
    scale=alt.Scale(domain=[0, 1]),  # ✅ Ensures full range from 0 to 1
    axis=alt.Axis(tickCount=10)  # ✅ Better tick spacing
)

# **Bar Chart for Clicked vs. Not Clicked (Side-by-Side, Not Stacked)**
chart = alt.Chart(agg_df).mark_bar(opacity=0.8).encode(
    x=alt.X("prop_starrating:O", title="Property Star Rating"),  
    xOffset=alt.XOffset("click_bool:N"),  # ✅ Moves Clicked and Not Clicked next to each other
    y=y_axis,  
    color=alt.Color(
        "click_bool:N",
        title="Click Status",
        scale=alt.Scale(domain=["0", "1"], range=["#ff7f0e", "#1f77b4"]),  # Orange & Blue
        legend=alt.Legend(title="Click Status")
    ),
    tooltip=["prop_starrating", "proportion", "click_bool"]
).transform_filter(
    filter_condition
).add_params(
    click_selection
).properties(
    width=400,
    height=400,
    title="Property Star Rating: Clicked vs. Not Clicked (Grouped)"
).configure_axisX(labelAngle=45)

chart


In [58]:
import altair as alt
import polars as pl

# Create a Dataset that Includes Clicked & Not Clicked
agg_df = pl.DataFrame({
    "prop_starrating": ["4", "3","2"],  
    "proportion": [0.7, 0.2,0.1],  
    "click_bool": [1,0,1],
    "booking_bool": [1,0,0]
})

# Multi-Selection Dropdown for Clicked vs Not Clicked
click_selection = alt.param(
    name="click_bool_selection",
    bind=alt.binding_select(
        options=["All", "0", "1"],
        labels=["Both", "Not Clicked", "Clicked"],
        name="Click Status"
    ),
    value="All"
)

# Multi-Selection Dropdown for Booked vs Not Booked
book_selection = alt.param(
    name="book_bool_selection",
    bind=alt.binding_select(
        options=["All", "0", "1"],
        labels=["Both", "Not Booked", "Booked"],
        name="Booking Status"
    ),
    value="All"
)

# Combined Filtering Logic
filter_condition = (
    ((click_selection == "All") | 
     ((click_selection == "0") & (alt.datum.click_bool == 0)) |
     ((click_selection == "1") & (alt.datum.click_bool == 1))) &
    ((book_selection == "All") | 
     ((book_selection == "0") & (alt.datum.booking_bool == 0)) |
     ((book_selection == "1") & (alt.datum.booking_bool == 1)))
)

y_axis = alt.Y(
    "proportion:Q",
    title="Proportion",
    scale=alt.Scale(domain=[0, 1]),
    axis=alt.Axis(tickCount=10)
)

zoom = alt.selection_interval(bind="scales")

# Updated chart with both click and booking status
chart = alt.Chart(agg_df).transform_calculate(
    click_label="datum.click_bool == 1 ? 'Clicked' : 'Not Clicked'",
    book_label="datum.booking_bool == 1 ? 'Booked' : 'Not Booked'"
).mark_bar(opacity=0.8).encode(
    x=alt.X("prop_starrating:O", title="Property Star Rating"),
    xOffset=alt.XOffset("book_label:N"),
    y=y_axis,
    color=alt.Color(
        "book_label:N",
        title="Booking Status",
        scale=alt.Scale(domain=["Not Booked", "Booked"], range=["#ff7f0e", "#1f77b4"]),
    ),
    tooltip=["prop_starrating", "proportion", "click_label:N", "book_label:N"]
).transform_filter(
    filter_condition
).add_params(
    click_selection,
    book_selection,
    zoom
).properties(
    width=500,
    height=400,
    title="Property Star Rating by Booking and Click Status"
).configure_axisX(
    labelAngle=45
)

chart


In [59]:
import altair as alt
import polars as pl

# ✅ Dummy Aggregated DataFrame (Replace with actual dataset)
agg_df = pl.DataFrame({
    "prop_starrating": ["1", "2", "3", "4", "5"],
    "click_bool": ["1", "1", "0", "0", "1"],
    "booking_bool": ["0", "1", "0", "0", "1"],
    "proportion": [0.1, 0.2, 0.15, 0.3, 0.25],
    "click_proportion": [0.3, 0.4, 0.3, 0.5, 0.35]  # Click-based proportions
})

# ✅ Convert DataFrame to Altair-compatible format
agg_df = agg_df.to_pandas()

# ✅ Dropdown for Clicked vs. Not Clicked
click_selection = alt.param(
    name="click_bool_selection",
    bind=alt.binding_select(
        options=["All", "0", "1"],
        labels=["Both", "Not Clicked", "Clicked"],
        name="Click Status"
    ),
    value="All"
)

# ✅ Dropdown for Booked vs. Not Booked
book_selection = alt.param(
    name="book_bool_selection",
    bind=alt.binding_select(
        options=["All", "0", "1"],
        labels=["Both", "Not Booked", "Booked"],
        name="Booking Status"
    ),
    value="All"
)

# ✅ Dropdown for Overall Proportion vs. Given Clicked
proportion_type_selection = alt.param(
    name="proportion_type_selection",
    bind=alt.binding_select(
        options=["Overall", "Given Clicked"],
        labels=["Overall Proportion", "Proportion Given Clicked"],
        name="Proportion Type"
    ),
    value="Overall"
)

# ✅ Precompute Adjusted Proportion (Outside Altair)
agg_df["adjusted_proportion"] = agg_df.apply(
    lambda row: row["proportion"] / row["click_proportion"]
    if row["click_proportion"] > 0 and proportion_type_selection == "Given Clicked"
    else row["proportion"],
    axis=1
)

# ✅ Filtering Logic
filter_condition = (
    ((click_selection == "All") | ((click_selection == "0") & (alt.datum.click_bool == "0")) | ((click_selection == "1") & (alt.datum.click_bool == "1"))) &
    ((book_selection == "All") | ((book_selection == "0") & (alt.datum.booking_bool == "0")) | ((book_selection == "1") & (alt.datum.booking_bool == "1")))
)

# ✅ Chart Definition
chart = alt.Chart(agg_df).transform_calculate(
    click_label="datum.click_bool == '1' ? 'Clicked' : 'Not Clicked'",
    book_label="datum.booking_bool == '1' ? 'Booked' : 'Not Booked'"
).mark_bar(opacity=0.8).encode(
    x=alt.X("prop_starrating:O", title="Property Star Rating"),
    xOffset=alt.XOffset("click_label:N"),  # ✅ Moves Clicked & Not Clicked Side by Side
    y=alt.Y("adjusted_proportion:Q", title="Proportion", scale=alt.Scale(domain=[0, 1])),
    color=alt.Color(
        "book_label:N",
        scale=alt.Scale(domain=['Not Booked', 'Booked'], range=['#ff7f0e', '#1f77b4']),
        legend=alt.Legend(title="Booking Status")
    ),
    column=alt.Column("book_label:N", title="Booking Status"),
    tooltip=["prop_starrating", "adjusted_proportion", "click_label:N", "book_label:N"]
).transform_filter(
    filter_condition
).add_params(
    click_selection,
    book_selection,
    proportion_type_selection
).properties(
    width=500,
    height=400,
    title="Property Star Rating by Booking and Click Status"
).configure_axisX(
    labelAngle=45
)

chart


In [60]:
import altair as alt
import polars as pl

# Create a Dataset that Includes Clicked & Not Clicked
agg_df = pl.DataFrame({
    "prop_starrating": ["4", "3", "2"],  
    "proportion": [0.7, 0.2, 0.1],  
    "click_bool": [1, 0, 1],
    "booking_bool": [1, 0, 0]
})

# Multi-Selection Dropdown for Clicked vs Not Clicked
click_selection = alt.param(
    name="click_bool_selection",
    bind=alt.binding_select(
        options=["All", "0", "1"],
        labels=["Both", "Not Clicked", "Clicked"],
        name="Click Status"
    ),
    value="All"
)

# Multi-Selection Dropdown for Booked vs Not Booked
book_selection = alt.param(
    name="book_bool_selection",
    bind=alt.binding_select(
        options=["All", "0", "1"],
        labels=["Both", "Not Booked", "Booked"],
        name="Booking Status"
    ),
    value="All"
)

# Combined Filtering Logic
filter_condition = (
    ((click_selection == "All") | 
     ((click_selection == "0") & (alt.datum.click_bool == 0)) |
     ((click_selection == "1") & (alt.datum.click_bool == 1))) &
    ((book_selection == "All") | 
     ((book_selection == "0") & (alt.datum.booking_bool == 0)) |
     ((book_selection == "1") & (alt.datum.booking_bool == 1)))
)

# Define Y-Axis
y_axis = alt.Y(
    "proportion:Q",
    title="Overall Proportion",
    scale=alt.Scale(domain=[0, 1]),
    axis=alt.Axis(tickCount=10)
)

# Enable Zooming
zoom = alt.selection_interval(bind="scales")

# ✅ **Dynamic Title Using `alt.expr`**
title_expr = alt.expr(
    f"click_bool_selection === '1' ? 'Property Star Rating for Clicked Users' : "
    f"click_bool_selection === '0' ? 'Property Star Rating for Not Clicked Users' : "
    f"'Property Star Rating by Booking and Click Status'"
)

chart = alt.Chart(agg_df).transform_calculate(
    click_label="datum.click_bool == 1 ? 'Clicked' : 'Not Clicked'",
    book_label="datum.booking_bool == 1 ? 'Booked' : 'Not Booked'"
).mark_bar(opacity=0.8).encode(
    x=alt.X("prop_starrating:O", title="Property Star Rating"),
    xOffset=alt.XOffset("book_label:N"),
    y=y_axis,
    color=alt.Color(
        "book_label:N",
        title="Booking Status",
        scale=alt.Scale(domain=["Not Booked", "Booked"], range=["#ff7f0e", "#1f77b4"]),
    ),
    tooltip=["prop_starrating", "proportion", "click_label:N", "book_label:N"]
).transform_filter(
    filter_condition
).add_params(
    click_selection,
    book_selection,
    zoom
).properties(
    width=500,
    height=400,
    title=alt.Title(
        text=title_expr,  # ✅ Dynamically updates based on Clicked/Not Clicked
        fontSize=14,
        anchor="middle",
        fontWeight="bold"
    )
)

chart


In [61]:
import altair as alt
import polars as pl

# Sample Data: Daily Aggregated Clicks & Bookings
agg_df = pl.DataFrame({
    "date": ["2024-02-01", "2024-02-02", "2024-02-03", "2024-02-04"],
    "click_count": [120, 150, 170, 140],
    "book_count": [30, 45, 50, 35]
})

# Convert date to a string format Altair can handle
agg_df = agg_df.with_columns(pl.col("date").str.strptime(pl.Date, "%Y-%m-%d").cast(str))

# Dropdown for Selecting Clicked vs Booked
selection_param = alt.param(
    name="selection",
    bind=alt.binding_select(
        options=["Both", "Clicked", "Booked"],
        labels=["Both", "Only Clicked", "Only Booked"],
        name="Filter"
    ),
    value="Both"
)

# Filtering Logic
filter_condition = (
    (selection_param == "Both") | 
    ((selection_param == "Clicked") & (alt.datum.metric == "click_count")) | 
    ((selection_param == "Booked") & (alt.datum.metric == "book_count"))
)
title_expr = alt.expr(
    f"click_bool_selection === 'Clicked' ? 'Property Star Rating for Clicked Users' : "
    f"click_bool_selection === 'Booked' ? 'C' : "
    f"'Property Star Rating by Booking and Click Status'"
)
# Convert Data to Long Format (so Altair can handle it)
melted_df = agg_df.unpivot(index=["date"], variable_name="metric", value_name="count")
print(melted_df)
print(melted_df["date"].min(), melted_df["date"].max())
# Create the Line Chart
chart = alt.Chart(melted_df).mark_line(point=True).encode(
    x=alt.X("date:T", title="Date", scale=alt.Scale(domain=(melted_df["date"].min(), melted_df["date"].max()))),

    y=alt.Y("count:Q", title="Count"),
    color=alt.Color("metric:N", title="Metric", scale=alt.Scale(domain=["click_count", "book_count"], range=["#1f77b4", "#ff7f0e"])),
    tooltip=["date", "metric", "count"]
).transform_filter(
    filter_condition
).add_params(
    selection_param
).properties(
    width=700,
    height=400,
    title="Daily Clicked vs Booked Count"
).configure_axisX(
    labelAngle=45
)

chart


shape: (8, 3)
┌────────────┬─────────────┬───────┐
│ date       ┆ metric      ┆ count │
│ ---        ┆ ---         ┆ ---   │
│ str        ┆ str         ┆ i64   │
╞════════════╪═════════════╪═══════╡
│ 2024-02-01 ┆ click_count ┆ 120   │
│ 2024-02-02 ┆ click_count ┆ 150   │
│ 2024-02-03 ┆ click_count ┆ 170   │
│ 2024-02-04 ┆ click_count ┆ 140   │
│ 2024-02-01 ┆ book_count  ┆ 30    │
│ 2024-02-02 ┆ book_count  ┆ 45    │
│ 2024-02-03 ┆ book_count  ┆ 50    │
│ 2024-02-04 ┆ book_count  ┆ 35    │
└────────────┴─────────────┴───────┘
2024-02-01 2024-02-04


In [62]:
import altair as alt
import polars as pl

# Sample Data: Daily Aggregated Clicks & Bookings
agg_df = pl.DataFrame({
    "date": ["2024-02-01", "2024-02-02", "2024-02-03", "2024-02-04"],
    "click_count": [120, 150, 170, 140],
    "book_count": [30, 45, 50, 35]
})

# Convert date to a string format Altair can handle
agg_df = agg_df.with_columns(pl.col("date").str.strptime(pl.Date, "%Y-%m-%d").cast(str))

# Dropdown for Selecting Clicked vs Booked
selection_param = alt.param(
    name="selection",
    bind=alt.binding_select(
        options=["Both", "Clicked", "Booked"],
        labels=["Both", "Only Clicked", "Only Booked"],
        name="Filter"
    ),
    value="Both"
)

# Filtering Logic
filter_condition = (
    (selection_param == "Both") | 
    ((selection_param == "Clicked") & (alt.datum.metric == "click_count")) | 
    ((selection_param == "Booked") & (alt.datum.metric == "book_count"))
)

# ✅ **Fix: Dynamic Title Using `alt.expr`**
title_expr = alt.expr(
    "selection === 'Clicked' ? 'Daily Click Count' : "
    "selection === 'Booked' ? 'Daily Booking Count' : "
    "'Daily Clicked vs Booked Count'"
)

# Convert Data to Long Format (so Altair can handle it)
melted_df = agg_df.unpivot(index=["date"], variable_name="metric", value_name="count")

# Create the Line Chart
chart = alt.Chart(melted_df).mark_line(point=True).encode(
    x=alt.X("date:T", title="Date", scale=alt.Scale(domain=(melted_df["date"].min(), melted_df["date"].max()))),
    y=alt.Y("count:Q", title="Count"),
    color=alt.Color("metric:N", title="Metric", scale=alt.Scale(domain=["click_count", "book_count"], range=["#1f77b4", "#ff7f0e"])),
    tooltip=["date", "metric", "count"]
).transform_filter(
    filter_condition
).add_params(
    selection_param
).properties(
    width=700,
    height=400,
    title=alt.TitleParams(
        text=title_expr,  # ✅ **Correctly uses an expression for the title**
        fontSize=14,
        anchor="middle",
        fontWeight="bold"
    )
).configure_axisX(
    labelAngle=45
)

chart


In [63]:
import altair as alt
import polars as pl
import numpy as np

# Simulated Data: Original (with outliers) & Cleaned (without outliers)
np.random.seed(42)
original_data = np.random.normal(loc=50, scale=10, size=100).tolist() + [150, 160, 170]  # Outliers added
cleaned_data = [x for x in original_data if x < 100]  # Removing outliers

# Create a DataFrame
df = pl.DataFrame({
    "value": original_data + cleaned_data,
    "dataset_type": ["Original (with outliers)"] * len(original_data) + ["Cleaned (without outliers)"] * len(cleaned_data)
})
print(df)
print(df.pivot(index="value", on="dataset_type"))  
# Dropdown for selecting dataset type
dataset_selection = alt.param(
    name="dataset_type",
    bind=alt.binding_select(
        options=["Both", "Original (with outliers)", "Cleaned (without outliers)"],
        labels=["Both", "Original", "Without Outliers"],
        name="Dataset"
    ),
    value="Both"
)

# Filtering Logic
filter_condition = (
    (dataset_selection == "Both") | 
    (alt.datum.dataset_type == dataset_selection)
)

title_expr = alt.expr(
    "dataset_type === 'Original (with outliers)' ? 'Original Distribution' : "
    "dataset_type === 'Cleaned (without outliers)' ? 'Post Process Distibution' : "
    "'Value Distribution with and without Outliers'"
)

# Create the Histogram
chart = alt.Chart(df).mark_bar(opacity=0.6).encode(
    x=alt.X("value:Q", bin=alt.Bin(maxbins=30), title="Value Distribution"),
    y=alt.Y("count()", title="Frequency"),
    color=alt.Color("dataset_type:N", title="Dataset Type", scale=alt.Scale(domain=["Original (with outliers)", "Cleaned (without outliers)"], range=["#ff7f0e", "#1f77b4"])),
    tooltip=["dataset_type", "value"]
).transform_filter(
    filter_condition
).add_params(
    dataset_selection
).properties(
    width=600,
    height=400,
    title=alt.TitleParams(
        text=title_expr,
        fontSize=14,
        anchor="middle",
        fontWeight="bold"
    )
).configure_axisX(
    labelAngle=45
)

chart


shape: (203, 2)
┌───────────┬────────────────────────────┐
│ value     ┆ dataset_type               │
│ ---       ┆ ---                        │
│ f64       ┆ str                        │
╞═══════════╪════════════════════════════╡
│ 54.967142 ┆ Original (with outliers)   │
│ 48.617357 ┆ Original (with outliers)   │
│ 56.476885 ┆ Original (with outliers)   │
│ 65.230299 ┆ Original (with outliers)   │
│ 47.658466 ┆ Original (with outliers)   │
│ …         ┆ …                          │
│ 35.364851 ┆ Cleaned (without outliers) │
│ 52.961203 ┆ Cleaned (without outliers) │
│ 52.610553 ┆ Cleaned (without outliers) │
│ 50.051135 ┆ Cleaned (without outliers) │
│ 47.654129 ┆ Cleaned (without outliers) │
└───────────┴────────────────────────────┘
shape: (0, 0)
┌┐
╞╡
└┘


In [68]:
import altair as alt
import polars as pl
import numpy as np

class Visualizer:
    def __init__(
        self,
        df: pl.DataFrame,
        default_width: int = 800,
        default_height: int = 400,
        color_scheme: dict = None,
        theme: dict = None,
        debug: bool = False
    ):
        """
        Initialize the Visualizer with a Polars DataFrame and optional settings.

        Args:
            df (pl.DataFrame): The data source.
            default_width (int): Default width for charts.
            default_height (int): Default height for charts.
            color_scheme (dict): A dictionary to define color settings.
            theme (dict): A dictionary to define overall chart theme settings.
            debug (bool): If True, prints additional debugging info.
        """
        self.df = df
        self.default_width = default_width
        self.default_height = default_height
        self.debug = debug

        # Default color scheme if not provided
        self.color_scheme = color_scheme or {
            "booking": "#1f77b4",
            "not_booking": "#d62728",
            "click": "#1f77b4",
            "not_click": "#ff7f0e"
        }

        # Default theme settings (expand as needed)
        self.theme = theme or {
            "title_fontsize": 14,
            "title_anchor": "middle",
            "title_fontweight": "bold",
            "axis_label_angle": 45
        }

    def plot_categorical_distribution(self, var_x: str, click: bool = False) -> alt.Chart:
        n_rows = self.df.shape[0]
        df_agg = (
            self.df.group_by([var_x, "click_bool", "booking_bool"])
            .agg(((pl.count() / n_rows) * 100).alias("proportion"))
            .sort("proportion", descending=True)
        )

        if self.debug:
            print(f"Aggregated DataFrame: {df_agg}")

        click_selection = alt.param(
            name="click_bool_selection",
            bind=alt.binding_select(
                options=["All", 0, 1],
                labels=["Both", "Not Clicked", "Clicked"],
                name="Click Status"
            ),
            value="All"
        )

        book_selection = alt.param(
            name="book_bool_selection",
            bind=alt.binding_select(
                options=["All", 0, 1],
                labels=["Both", "Not Booked", "Booked"],
                name="Booking Status"
            ),
            value="All"
        )

        filter_condition = (
            ((click_selection == "All") | 
             ((click_selection == 0) & (alt.datum.click_bool == 0)) |
             ((click_selection == 1) & (alt.datum.click_bool == 1))) &
            ((book_selection == "All") | 
             ((book_selection == 0) & (alt.datum.booking_bool == 0)) |
             ((book_selection == 1) & (alt.datum.booking_bool == 1)))
        )

        y_axis = alt.Y(
            "proportion:Q",
            title=f"Percentage of {var_x.replace('_', ' ').title()}",
            scale=alt.Scale(domain=[0, 100]),
            axis=alt.Axis(tickCount=10)
        )

        zoom = alt.selection_interval(bind="scales")

        title_expr = alt.expr(
            f"click_bool_selection === 1 ? 'Distribution of {var_x.replace('_', ' ').title()} for Clicked Users' : "
            f"click_bool_selection === 0 ? 'Distribution of {var_x.replace('_', ' ').title()} for Not Clicked Users' : "
            f"'Distribution of {var_x.replace('_', ' ').title()} by Click & Booking Status'"
        )

        chart = alt.Chart(df_agg).transform_calculate(
            click_label="datum.click_bool == 1 ? 'Clicked' : 'Not Clicked'",
            book_label="datum.booking_bool == 1 ? 'Booked' : 'Not Booked'"
        ).mark_bar(opacity=0.8).encode(
            x=alt.X(f"{var_x}:O", title=var_x.replace('_', ' ').title()),
            xOffset=alt.XOffset("book_label:N"),
            y=y_axis,
            color=alt.Color(
                "book_label:N",
                title="Booking Status",
                scale=alt.Scale(domain=["Not Booked", "Booked"], range=[self.color_scheme["not_booking"], self.color_scheme["booking"]]),
            ),
            tooltip=[alt.Tooltip(var_x, title=var_x.replace("_", " ").title()), 
                     alt.Tooltip("proportion", title="Percentage"),
                     alt.Tooltip("click_label:N", title="Click Status"), 
                     alt.Tooltip("book_label:N", title="Booking Status")]
        ).transform_filter(
            filter_condition
        ).add_params(
            click_selection,
            book_selection,
            zoom
        ).properties(
            width=self.default_width,
            height=self.default_height,
            title=alt.TitleParams(
                text=title_expr,
                fontSize=self.theme["title_fontsize"],
                anchor=self.theme["title_anchor"],
                fontWeight=self.theme["title_fontweight"]
            )
        )

        return chart

    def plot_timeseries(self, datetime_col: str = "date") -> alt.Chart:
        df_local = self.df
        if datetime_col not in df_local.columns:
            if self.debug:
                print(f"Column '{datetime_col}' not found. Extracting date from a time column...")
            df_local = df_local.with_columns(
                pl.col("^.*time.*$").dt.date().alias(datetime_col)
            )

        df_agg = (
            df_local.group_by(datetime_col)
            .agg(
                pl.col("click_bool").sum().alias("click_count"),
                pl.col("booking_bool").sum().alias("book_count")
            )
            .sort(datetime_col)
        )

        melted_df = df_agg.melt(id_vars=[datetime_col], variable_name="metric", value_name="count")
        first_date, last_date = melted_df[datetime_col].min(), melted_df[datetime_col].max()

        selection_param = alt.param(
            name="selection",
            bind=alt.binding_select(
                options=["Both", "click_count", "book_count"],
                labels=["Both", "Clicked", "Booked"],
                name="Filter"
            ),
            value="Both"
        )

        filter_condition = (
            (selection_param == "Both") | 
            ((selection_param == "click_count") & (alt.datum.metric == "click_count")) | 
            ((selection_param == "book_count") & (alt.datum.metric == "book_count"))
        )

        title_expr = alt.expr(
            "selection === 'click_count' ? 'Daily Click Count' : "
            "selection === 'book_count' ? 'Daily Booking Count' : "
            "'Daily Clicked vs Booked Count'"
        )

        zoom = alt.selection_interval(bind="scales")

        chart = alt.Chart(melted_df).mark_line(point=True).encode(
            x=alt.X(
                f"{datetime_col}:T",
                title="Date",
                scale=alt.Scale(domain=[first_date, last_date])
            ),
            y=alt.Y("count:Q", title="Count"),
            color=alt.Color("metric:N", title="Metric", scale=alt.Scale(domain=["click_count", "book_count"], range=[self.color_scheme["click"], self.color_scheme["not_click"]])),
            tooltip=[alt.Tooltip(datetime_col, title="Date"), alt.Tooltip("metric:N", title="Metric"), alt.Tooltip("count:Q", title="Count")]
        ).transform_filter(
            filter_condition
        ).add_params(
            selection_param,
            zoom
        ).properties(
            width=self.default_width,
            height=self.default_height,
            title=alt.TitleParams(
                text=title_expr,
                fontSize=self.theme["title_fontsize"],
                anchor=self.theme["title_anchor"],
                fontWeight=self.theme["title_fontweight"]
            )
        ).configure_axisX(
            labelAngle=self.theme["axis_label_angle"]
        )

        return chart

    def plot_histogram(self) -> alt.Chart:
        np.random.seed(42)
        original_data = np.random.normal(loc=50, scale=10, size=100).tolist() + [150, 160, 170]
        cleaned_data = [x for x in original_data if x < 100]

        df_hist = pl.DataFrame({
            "value": original_data + cleaned_data,
            "dataset_type": ["Original (with outliers)"] * len(original_data) + ["Cleaned (without outliers)"] * len(cleaned_data)
        })

        dataset_selection = alt.param(
            name="dataset_type",
            bind=alt.binding_select(
                options=["Both", "Original (with outliers)", "Cleaned (without outliers)"],
                labels=["Both", "Original", "Without Outliers"],
                name="Dataset"
            ),
            value="Both"
        )

        filter_condition = (
            (dataset_selection == "Both") | 
            (alt.datum.dataset_type == dataset_selection)
        )

        title_expr = alt.expr(
            "dataset_type === 'Original (with outliers)' ? 'Original Distribution' : "
            "dataset_type === 'Cleaned (without outliers)' ? 'Post Process Distribution' : "
            "'Value Distribution with and without Outliers'"
        )

        chart = alt.Chart(df_hist).mark_bar(opacity=0.6).encode(
            x=alt.X("value:Q", bin=alt.Bin(maxbins=30), title="Value Distribution"),
            y=alt.Y("count()", title="Frequency"),
            color=alt.Color("dataset_type:N", title="Dataset Type",
                            scale=alt.Scale(domain=["Original (with outliers)", "Cleaned (without outliers)"],
                                            range=[self.color_scheme["not_booking"], self.color_scheme["booking"]])),
            tooltip=["dataset_type", "value"]
        ).transform_filter(
            filter_condition
        ).add_params(
            dataset_selection
        ).properties(
            width=int(self.default_width * 0.75),
            height=self.default_height,
            title=alt.TitleParams(
                text=title_expr,
                fontSize=self.theme["title_fontsize"],
                anchor=self.theme["title_anchor"],
                fontWeight=self.theme["title_fontweight"]
            )
        ).configure_axisX(
            labelAngle=self.theme["axis_label_angle"]
        )

        return chart

# Example usage:
visualizer = Visualizer(train_df, default_width=800, default_height=400, debug=True)
chart1 = visualizer.plot_categorical_distribution("booking_day_of_week")
chart2 = visualizer.plot_timeseries("booking_date")
chart3 = visualizer.plot_histogram()
chart1.display()
chart2.display()
chart3.display()


NameError: name 'train_df' is not defined

In [65]:
import polars as pl
from datetime import datetime, timedelta

# Create a list of dates for testing
dates = [datetime(2023, 1, 1) + timedelta(days=i) for i in range(10)]

# Create a sample DataFrame
df_sample = pl.DataFrame({
    "booking_day_of_week": ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday", "Monday", "Tuesday", "Wednesday"],
    "booking_date": dates,
    "click_bool": [1, 0, 1, 0, 1, 0, 0, 1, 0, 1],
    "booking_bool": [0, 0, 1, 0, 1, 0, 0, 1, 1, 0]
})

print(df_sample)


shape: (10, 4)
┌─────────────────────┬─────────────────────┬────────────┬──────────────┐
│ booking_day_of_week ┆ booking_date        ┆ click_bool ┆ booking_bool │
│ ---                 ┆ ---                 ┆ ---        ┆ ---          │
│ str                 ┆ datetime[μs]        ┆ i64        ┆ i64          │
╞═════════════════════╪═════════════════════╪════════════╪══════════════╡
│ Monday              ┆ 2023-01-01 00:00:00 ┆ 1          ┆ 0            │
│ Tuesday             ┆ 2023-01-02 00:00:00 ┆ 0          ┆ 0            │
│ Wednesday           ┆ 2023-01-03 00:00:00 ┆ 1          ┆ 1            │
│ Thursday            ┆ 2023-01-04 00:00:00 ┆ 0          ┆ 0            │
│ Friday              ┆ 2023-01-05 00:00:00 ┆ 1          ┆ 1            │
│ Saturday            ┆ 2023-01-06 00:00:00 ┆ 0          ┆ 0            │
│ Sunday              ┆ 2023-01-07 00:00:00 ┆ 0          ┆ 0            │
│ Monday              ┆ 2023-01-08 00:00:00 ┆ 1          ┆ 1            │
│ Tuesday             ┆

In [66]:
# Assuming the Visualizer class code is already defined and imported

# Initialize the Visualizer with the sample DataFrame
visualizer = Visualizer(df_sample)

# Generate the charts
chart1 = visualizer.plot_categorical_distribution("booking_day_of_week", click=False)
chart2 = visualizer.plot_timeseries("booking_date")
chart3 = visualizer.plot_histogram()

# Display the charts (works in a Jupyter Notebook or similar interactive environment)
chart1.display()
chart2.display()
chart3.display()
