In [None]:
# Corpus_Browser.ipynb

# Install required packages (run only if not installed)
# !pip install plotly itables ipywidgets

# Import required libraries
import pandas as pd
import polars as pl
from pathlib import Path
import ipywidgets as widgets
from IPython.display import display, clear_output
import plotly.express as px
import itables

# Initialize itables for interactive tables in notebook.
# This is called once, before any table is rendered further down.
itables.init_notebook_mode()

# -------------------------------
# Step 1: Define paths to CSV summary files
# One summary CSV per corpus; the loader functions below read these
# module-level names to decide which file to open.
# Replace these with actual CSV file paths after running Task 1
CDLK_CSV = "xmi_basic_info_cdlk.csv"
KLP1_CSV = "xmi_basic_info_klp1.csv"

# -------------------------------
# Step 2: Define loaders for Pandas and Polars dataframes
def load_corpus_pandas(corpus_name):
    """
    Read the summary CSV of the requested corpus into a pandas DataFrame.

    Args:
        corpus_name: Corpus identifier. "CDLK" selects CDLK_CSV; every other
            value selects KLP1_CSV.

    Returns:
        The DataFrame produced by pd.read_csv for the selected file.
    """
    # Only an exact "CDLK" match picks the CDLK file; everything else falls
    # through to the KLP1 file.
    csv_path = CDLK_CSV if corpus_name == "CDLK" else KLP1_CSV
    return pd.read_csv(csv_path)

def load_corpus_polars(corpus_name):
    """
    Read the summary CSV of the requested corpus into a polars DataFrame.

    Args:
        corpus_name: Corpus identifier. "CDLK" selects CDLK_CSV; every other
            value selects KLP1_CSV.

    Returns:
        The DataFrame produced by pl.read_csv for the selected file.
    """
    # Only an exact "CDLK" match picks the CDLK file; everything else falls
    # through to the KLP1 file.
    csv_path = CDLK_CSV if corpus_name == "CDLK" else KLP1_CSV
    return pl.read_csv(csv_path)

# -------------------------------
# Step 3: Setup UI widgets for user selection

# Dropdown choosing which corpus summary to browse (CDLK is preselected).
corpus_selector = widgets.Dropdown(
    description="Corpus:",
    options=["CDLK", "KLP1"],
    value="CDLK",
)

# Dropdown choosing which dataframe library loads the CSV (Pandas is preselected).
backend_selector = widgets.Dropdown(
    description="Backend:",
    options=["Pandas", "Polars"],
    value="Pandas",
)

# Output widget that the selection handler writes its results into.
output_area = widgets.Output()

# -------------------------------
# Step 4: Define event handler for selection changes
def _render_corpus_summary(df, corpus):
    """
    Render the preview table, descriptive statistics and histograms for df.

    Args:
        df: pandas DataFrame with the corpus summary. It is expected to
            contain the columns doc_text_length, token_count and
            sentence_count, which are plotted below.
        corpus: Corpus name; used only as the prefix of the chart titles.
    """
    # (column to plot, label used in the chart title)
    histogram_columns = (
        ("doc_text_length", "Document Text Length"),
        ("token_count", "Token Count"),
        ("sentence_count", "Sentence Count"),
    )

    # Display interactive table preview
    print("Preview of the first 10 rows:")
    # itables.show() renders the table itself and returns None, so it must not
    # be wrapped in display() -- that would emit a stray "None" under the table.
    itables.show(df.head(10))

    # Show descriptive statistics
    print("\nBasic Statistics:")
    display(df.describe())

    # Plot one histogram per summary column using Plotly Express
    for column, label in histogram_columns:
        fig = px.histogram(df, x=column, nbins=30,
                           title=f"{corpus} {label} Distribution")
        fig.show()


def on_selection_change(change):
    """
    Reload the selected corpus and redraw the whole view in output_area.

    Reads the current values of corpus_selector and backend_selector, loads
    the matching CSV with the chosen backend, and renders a 10-row preview,
    descriptive statistics and three histograms.

    Args:
        change: Value passed by the widget observer, or None when the handler
            is invoked by hand. It is not inspected; the current widget
            values are read directly instead.
    """
    with output_area:
        # Drop whatever the previous selection rendered.
        clear_output()
        corpus = corpus_selector.value
        backend = backend_selector.value
        print(f"Loading {corpus} corpus using {backend} backend...\n")

        if backend == "Pandas":
            df = load_corpus_pandas(corpus)
        else:  # Polars backend
            # Convert Polars DataFrame to Pandas for interactive display and plotting
            df = load_corpus_polars(corpus).to_pandas()

        # Both backends end up with a pandas frame, so rendering is shared.
        _render_corpus_summary(df, corpus)

# -------------------------------
# Step 5: Link UI widgets to event handler
# Both dropdowns share one handler; names='value' limits the callback to
# changes of the dropdown's selected value.
corpus_selector.observe(on_selection_change, names='value')
backend_selector.observe(on_selection_change, names='value')

# -------------------------------
# Step 6: Display the widgets and output area in notebook
display(corpus_selector, backend_selector, output_area)

# Initial trigger to load default view
# The handler does not read its argument, so None is passed for this manual
# call that renders the default selection.
on_selection_change(None)
