In [1]:
import os
import pandas as pd


# Location of the scenes annotation sheet (CSV). The path is relative, so it is
# resolved against the notebook's current working directory.
scenes_file = '../resources/scenes_mastersheet.csv'

In [2]:
# Load the annotation sheet into a DataFrame.
scenes_df = pd.read_csv(scenes_file)
# Trailing expression: shows the column labels as the cell output.
scenes_df.columns

Index(['World', 'Level', 'Scene', 'x Hi (entry)', 'x Lo (entry)',
       'x Hi (exit)', 'x Lo (exit)', 'Entry point', 'Exit point', 'Layout',
       'Enemy', '2-Horde', '3-Horde', '4-Horde', 'Roof', 'Gap',
       'Multiple gaps', 'Variable gaps', 'Gap enemy', 'Pillar gap', 'Valley',
       'Pipe valley', 'Empty valley', 'Enemy valley', 'Roof valley', '2-Path',
       '3-Path', 'Risk/Reward', 'Stair up', 'Stair down', 'Empty stair valley',
       'Enemy stair valley', 'Gap stair valley', 'Reward', 'Moving platform',
       'Flagpole', 'Beginning', 'Bonus zone', 'Legend : ', 'XXXX = checkpoint',
       'Added patterns'],
      dtype='object')

In [3]:

def curate_dataframe(df):
    """
    Build a curated copy of the scenes sheet holding identifiers and features.

    The input frame is left untouched: the relevant columns are selected and
    copied first, and the 'scene_ID' label is added to the copy only.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns 'World', 'Level', 'Scene' and every column
        listed in ``feature_cols`` below. Any other column is ignored.

    Returns
    -------
    pd.DataFrame
        A new DataFrame with columns, in order:
        'scene_ID', 'World', 'Level', 'Scene', followed by the feature columns.
        'scene_ID' is the string ``w{World}l{Level}s{Scene}``.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``df``.
    """
    # Columns that identify a scene
    id_cols = ['World', 'Level', 'Scene']

    # List of feature columns you want to keep
    feature_cols = [
        'Enemy', '2-Horde', '3-Horde', '4-Horde', 'Roof', 'Gap',
        'Multiple gaps', 'Variable gaps', 'Gap enemy', 'Pillar gap', 'Valley',
        'Pipe valley', 'Empty valley', 'Enemy valley', 'Roof valley', '2-Path',
        '3-Path', 'Risk/Reward', 'Stair up', 'Stair down', 'Empty stair valley',
        'Enemy stair valley', 'Gap stair valley', 'Reward', 'Moving platform',
        'Flagpole', 'Beginning', 'Bonus zone'
    ]

    # Copy BEFORE adding anything, so the caller's DataFrame is never mutated.
    curated_df = df[id_cols + feature_cols].copy()

    # Build the label column-wise: each column is formatted with its own dtype
    # (a row-wise apply would format values with the row's common dtype) and
    # no Python-level loop over rows is needed.
    scene_ids = (
        "w" + curated_df['World'].astype(str)
        + "l" + curated_df['Level'].astype(str)
        + "s" + curated_df['Scene'].astype(str)
    )
    curated_df.insert(0, 'scene_ID', scene_ids)

    return curated_df

# Curated table: scene identifiers plus the selected feature columns.
curated_df = curate_dataframe(scenes_df)
# NOTE(review): a bare expression only renders when it is the last statement of
# a cell; if the line below stays in the same cell it displays nothing.
curated_df

# Feature-only table: drop the identifier columns so that only the feature
# columns are passed on to dimensionality reduction.
df_features = curated_df.drop(columns=['scene_ID', 'World', 'Level', 'Scene'])

In [4]:
# Dimensionality reduction
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import umap.umap_ as umap

def dimensionality_reduction(
    df_features,
    method="pca",
    n_components=2,
    random_state=42,
    dr_params=None
):
    """
    Apply a dimensionality reduction technique to the given feature DataFrame.

    Parameters
    ----------
    df_features : pd.DataFrame
        A DataFrame containing only the feature columns (no IDs or labels).
        All features should be numeric (or pre-encoded as numeric).

    method : str, optional
        The dimensionality reduction method to use (case-insensitive). Options:
          - "none" : return the original data without changes
          - "pca"  : Principal Component Analysis
          - "umap" : Uniform Manifold Approximation and Projection
          - "tsne" : t-distributed Stochastic Neighbor Embedding
        Default is "pca".

    n_components : int, optional
        Number of components (dimensions) to project down to.
        Typically 2 or 3 if you're plotting. Default is 2.
        The value is forwarded unchanged to the reducer, so any form the
        chosen reducer accepts can be used; the output columns are named
        from the width of the embedding actually produced.

    random_state : int, optional
        Random seed forwarded to the reducer (PCA, UMAP, t-SNE).
        Default is 42.

    dr_params : dict, optional
        A dictionary of additional parameters for the DR method.
        Examples:
          - For PCA:  {"svd_solver": "full"}
          - For UMAP: {"n_neighbors": 15, "min_dist": 0.1}
          - For t-SNE: {"perplexity": 30, "learning_rate": 200}
        Entries here take precedence over ``n_components`` and
        ``random_state`` if the same key is given in both places.
        Default is None (no extra parameters).

    Returns
    -------
    pd.DataFrame
        A new DataFrame with the transformed features, indexed like
        ``df_features``, whose columns are named DR_1, DR_2, ..., one per
        column of the embedding.
        If method="none", returns a copy of df_features.

    Raises
    ------
    ValueError
        If ``method`` is not one of "none", "pca", "umap", "tsne".
    """
    method = method.lower()

    # If no DR is needed, just return the original data.
    if method == "none":
        return df_features.copy()

    # Pick the reducer class first so that an unknown method fails before any
    # work is done; the names are only looked up for the branch that is taken.
    if method == "pca":
        reducer_cls = PCA
    elif method == "umap":
        reducer_cls = umap.UMAP
    elif method == "tsne":
        reducer_cls = TSNE
    else:
        raise ValueError("method must be one of ['none', 'pca', 'umap', 'tsne'].")

    # Merge into one kwargs dict instead of passing both explicit keywords and
    # **dr_params: a key present in both would otherwise raise a
    # "multiple values for keyword argument" TypeError.
    reducer_kwargs = {"n_components": n_components, "random_state": random_state}
    reducer_kwargs.update(dr_params or {})

    reducer = reducer_cls(**reducer_kwargs)
    embedding = reducer.fit_transform(df_features.to_numpy())

    # Name the columns from the embedding itself: n_components is not always
    # an int equal to the output width (e.g. a variance fraction for PCA).
    dr_columns = [f"DR_{i + 1}" for i in range(embedding.shape[1])]
    df_reduced = pd.DataFrame(embedding, columns=dr_columns, index=df_features.index)

    return df_reduced


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Run every supported method on the feature-only table "df_features"
# (numerical columns such as 'Enemy', 'Gap', 'Roof', 'Valley', ...).
# Each result is kept in its own global so later cells can plot it.

# 1) PCA Example
df_pca = dimensionality_reduction(
    df_features,
    method="pca",
    n_components=2,
    random_state=42,
    dr_params={"svd_solver": "full"}  # forwarded to the PCA constructor
)
print(df_pca.head())

# 2) UMAP Example
df_umap = dimensionality_reduction(
    df_features,
    method="umap",
    n_components=2,
    random_state=42,
    dr_params={"n_neighbors": 15, "min_dist": 0.1}
)
print(df_umap.head())

# 3) t-SNE Example
df_tsne = dimensionality_reduction(
    df_features,
    method="tsne",
    n_components=2,
    random_state=42,
    dr_params={"perplexity": 30, "learning_rate": 200}
)
print(df_tsne.head())

# 4) No Dimensionality Reduction: method="none" returns a copy of df_features
df_none = dimensionality_reduction(df_features, method="none")
print(df_none.head())  # Same columns and values as df_features


       DR_1      DR_2
0 -0.339596 -0.246100
1  0.381437  0.603183
2 -0.351999 -0.124654
3  0.277867  0.038365
4 -0.545734  0.438426


  warn(


        DR_1       DR_2
0  33.720116 -34.881977
1  32.133862 -33.344692
2  32.238438 -30.521679
3  32.921875 -33.703979
4  31.672804 -30.393167
       DR_1      DR_2
0 -2.528442 -3.821689
1  6.720411  5.306932
2 -8.886157 -7.270678
3  1.139122  6.946015
4 -4.916055 -7.888424
   Enemy  2-Horde  3-Horde  4-Horde  Roof  Gap  Multiple gaps  Variable gaps  \
0      0        0        0        0     0    0              0              0   
1      1        0        0        0     0    0              0              0   
2      0        0        0        0     0    0              0              0   
3      1        0        0        0     0    0              0              0   
4      0        1        0        0     0    0              0              0   

   Gap enemy  Pillar gap  ...  Stair up  Stair down  Empty stair valley  \
0          0           0  ...         0           0                   0   
1          0           0  ...         0           0                   0   
2          0      

In [12]:
import plotly.graph_objects as go
import matplotlib.colors as mcolors

def interactive_plot_2d(df_curated, df_reduced,
                        x_dim="DR_1", y_dim="DR_2",
                        title="Interactive 2D Scatter",
                        out_html=None):
    """
    Creates an interactive Plotly scatter plot with:
      - One base hue per World, brightness variation per Level.
      - Hover tooltips showing w{World}l{Level}s{Scene}.
      - Legend placed outside the plot area, to its right, top-aligned.
      - Optional export to an HTML file.

    Parameters
    ----------
    df_curated : pd.DataFrame
        Should have columns: 'World', 'Level', 'Scene', ...
        Must share its index with df_reduced (the two are joined on index).
        It is not modified.

    df_reduced : pd.DataFrame
        Contains 2D coordinates, e.g. DR_1 and DR_2 (or your chosen x,y).
        Must share its index with df_curated.

    x_dim : str, optional
        Column name for x-axis in df_reduced. Default "DR_1".

    y_dim : str, optional
        Column name for y-axis in df_reduced. Default "DR_2".

    title : str, optional
        The plot title.

    out_html : str or None, optional
        If provided, saves the figure to the specified HTML file.

    Returns
    -------
    None
        Displays an interactive plot in the notebook,
        and optionally writes it to an HTML file.
    """

    # 1. Align both frames on their shared index (join returns a new frame,
    #    so neither input is modified by the steps below).
    df = df_curated.join(df_reduced, how="inner")

    # 2. Create a comprehensive label like w1l2s3.
    #    Built column-wise on purpose: a row-wise apply hands each row over as
    #    a single-dtype Series, so integer ids sitting next to float
    #    coordinates would be rendered as "w1.0l2.0s3.0".
    df["scene_ID"] = (
        "w" + df["World"].astype(str)
        + "l" + df["Level"].astype(str)
        + "s" + df["Scene"].astype(str)
    )

    # 3. Define base hues per World (internally)
    base_hues = {
        1: "red",
        2: "blue",
        3: "green",
        4: "#FFA500",  # more vibrant orange
        5: "purple",
        6: "brown",
        7: "yellow",   # replaced pink with yellow
        8: "gray"
    }

    # 4. Identify levels per world
    worlds = sorted(df["World"].unique())
    world_levels = {
        w: sorted(df.loc[df["World"] == w, "Level"].unique()) for w in worlds
    }

    # Helper to vary brightness
    def adjust_brightness(color, factor):
        rgb = mcolors.to_rgb(color)
        if factor <= 1:
            # Darken: scale every channel toward black.
            return tuple(max(0.0, chan * factor) for chan in rgb)
        # Lighten: blend toward white. Plain scaling cannot brighten a channel
        # that is already 0 or 1, so saturated hues such as "red" would come
        # out identical for every factor above 1.
        blend = min(1.0, factor - 1)
        return tuple(chan + (1 - chan) * blend for chan in rgb)

    # Build final color dictionary: (w, lvl) -> color
    wl_colors = {}
    for w in worlds:
        base = base_hues.get(w, "gray")  # fallback if not defined
        lvls = world_levels[w]
        count_lvls = len(lvls)

        # Spread brightness from 0.7 to 1.3
        for i, lvl in enumerate(lvls):
            if count_lvls == 1:
                factor = 1.0
            else:
                fraction = i / (count_lvls - 1)
                factor = 0.7 + 0.6 * fraction
            wl_colors[(w, lvl)] = adjust_brightness(base, factor)

    # 5. Create a Plotly figure and add one trace per (World, Level)
    fig = go.Figure()

    for (w, lvl), color_rgb in wl_colors.items():
        # Convert (r, g, b) in [0,1] to a CSS string like 'rgba(...)'
        color_str = f"rgba({int(color_rgb[0]*255)},{int(color_rgb[1]*255)},{int(color_rgb[2]*255)},0.8)"

        subset = df[(df["World"] == w) & (df["Level"] == lvl)]

        fig.add_trace(
            go.Scatter(
                x=subset[x_dim],
                y=subset[y_dim],
                mode='markers',
                name=f"w{w}l{lvl}",
                marker=dict(color=color_str),
                # Hover shows the comprehensive scene_ID and x,y
                text=subset["scene_ID"],
                hovertemplate=(
                    "Scene ID: %{text}<br>"
                    f"{x_dim}: %{{x}}<br>"
                    f"{y_dim}: %{{y}}<extra></extra>"
                )
            )
        )

    # 6. Update layout: place legend outside the plot area (right side, top)
    fig.update_layout(
        title=title,
        xaxis_title=x_dim,
        yaxis_title=y_dim,
        legend=dict(
            x=1.02,  # just past the right edge of the plot area
            y=1,     # top aligned
            xanchor='left',
            yanchor='top',
            bgcolor='rgba(0,0,0,0)'  # transparent background
        ),
        margin=dict(r=200),  # extra space on the right to accommodate legend
        hovermode='closest'
    )

    # 7. Optionally save to HTML, then display
    if out_html:
        fig.write_html(out_html)

    fig.show()


In [13]:
# Plot the UMAP embedding of the scenes; the figure is also written to the
# given HTML file (path relative to the working directory).
interactive_plot_2d(
    df_curated=curated_df,
    df_reduced=df_umap,
    x_dim="DR_1",
    y_dim="DR_2",
    title="UMAP projection of scenes annotations",
    out_html="scenes_annotations_umap.html"
)

In [14]:
# Plot the PCA embedding of the scenes; the figure is also written to the
# given HTML file (path relative to the working directory).
interactive_plot_2d(
    df_curated=curated_df,
    df_reduced=df_pca,
    x_dim="DR_1",
    y_dim="DR_2",
    title="PCA projection of scenes annotations",
    out_html="scenes_annotations_pca.html"
)

In [10]:
# Quick check of the column labels of the PCA projection.
# NOTE(review): this cell's execution count (In [10]) is lower than the cells
# above it, i.e. the notebook was run out of order - re-run top to bottom
# before sharing.
df_pca.columns

Index(['DR_1', 'DR_2'], dtype='object')