In [None]:
# Plotly renderer configuration (the safe_show helper is defined further down)
import plotly.io as pio
import os

# Candidate renderers in priority order: the first assignment that does not
# raise is kept.
# NOTE(review): presumably assigning a registered renderer name never raises,
# in which case 'vscode' always wins and the browser fallback below is never
# reached - confirm against plotly's renderer validation.
preferred_renderers = ['vscode', 'notebook', 'jupyterlab']
for r in preferred_renderers:
    try:
        pio.renderers.default = r
    except Exception:
        # This name was rejected; try the next candidate.
        continue
    else:
        break

# Last resort when no default renderer ended up being set.
if not pio.renderers.default:
    pio.renderers.default = 'browser'

# Safe show helper
from pathlib import Path

def safe_show(fig, name: str, outdir: Path):
    """Display ``fig``; if displaying fails, export it to ``outdir`` instead.

    Args:
        fig: Figure-like object providing ``show()``, ``write_html(path)`` and
            ``write_image(path, scale=...)``.
        name: File stem used for the fallback ``<name>.html`` / ``<name>.png``.
        outdir: Directory that receives the fallback files (``Path`` or path
            string); it is created when missing.

    Returns:
        None. Errors raised while writing the HTML fallback propagate; a
        failed PNG export is reported on stdout and otherwise ignored.
    """
    try:
        fig.show()
        return
    except Exception as e:
        print(f"Plotly show failed ({e}). Saving to outputs as fallback.")

    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    fig.write_html(outdir / f"{name}.html")

    # PNG export is best-effort, but say so instead of failing silently so a
    # missing image file can be diagnosed.
    try:
        import kaleido  # noqa: F401 - availability check only
        fig.write_image(outdir / f"{name}.png", scale=2)
    except Exception as img_err:
        print(f"PNG export skipped ({img_err}).")

# Captain Coaster Cluster Analysis
This notebook loads rollercoaster data, engineers features from ratings and specs, clusters coasters, and visualizes clusters interactively.

# 0. Goal and streamlined workflow
- Visualize coasters in 2D via PCA and UMAP, colored by `avg_rating`.
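- Cluster coasters with KMeans on the standardized spec features (the same inputs that feed PCA, not the 2D projection itself), then show the clusters in the PCA and UMAP plots.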
- Explore how properties (e.g., laps, inversions, height, speed, length, duration) relate to higher scores.

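We exclude `avg_rating` and `total_ratings` from PCA to avoid rating driving the projection. Clustering is performed on the same standardized spec features that feed PCA (the KMeans cell fits on `X_scaled`), so ratings do not influence the clusters either; PCA and UMAP are used to display them.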

In [2]:
# Import libraries and set paths
import os
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime

import plotly.io as pio
# For VS Code notebooks:
pio.renderers.default = 'vscode'
# For classic Jupyter Notebook:
# pio.renderers.default = 'notebook'

import plotly.express as px
import plotly.graph_objects as go

# Models and preprocessing
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, silhouette_samples

# Optional UMAP
try:
    import umap
    HAS_UMAP = True
except Exception:
    HAS_UMAP = False

# Optional HDBSCAN
try:
    import hdbscan
    HAS_HDBSCAN = True
except Exception:
    HAS_HDBSCAN = False

# Project root. Defaults to the original author's local checkout; set the
# ROLLERCOASTER_DATA_DIR environment variable to run the notebook elsewhere.
DATA_DIR = Path(os.environ.get('ROLLERCOASTER_DATA_DIR', 'c:/Users/Lenovo/rollercoaster'))
mapping_path = DATA_DIR / 'ratings_data/rating_to_rfdb_mapping_enhanced.csv'
merged_path = DATA_DIR / 'ratings_data/rating_to_rfdb_mapping_merged.csv'
output_dir = DATA_DIR / 'outputs/cluster_figures'
output_dir.mkdir(parents=True, exist_ok=True)

# Rating/spec columns used to judge which CSV is the more complete one.
cols = ['avg_rating','total_ratings','height_m','speed_kmh','track_length_m','inversions_count']


def non_null_count(d):
    """Return the number of non-null cells of ``d`` summed over ``cols``.

    Columns of ``cols`` that are absent from ``d`` contribute 0.
    """
    return sum(d[c].notna().sum() if c in d.columns else 0 for c in cols)


# Load the most complete CSV: start from the enhanced mapping and switch to the
# merged file when it exists and has at least as many non-null ratings/specs.
df_map = pd.read_csv(mapping_path)
df = df_map.copy()
if merged_path.exists():
    df_merged = pd.read_csv(merged_path)
    if non_null_count(df_merged) >= non_null_count(df_map):
        df = df_merged.copy()
print(f"Loaded rows: {len(df)}")
df.head(3)

Loaded rows: 1299


Unnamed: 0,coaster_id,ratings_coaster,ratings_park,rfdb_coaster_folder,rfdb_park_folder,csv_count,full_path,coaster_similarity,park_similarity,combined_similarity,...,pct_3.0_stars,pct_3.5_stars,pct_4.0_stars,pct_4.5_stars,pct_5.0_stars,scraped_at,height_m,speed_kmh,track_length_m,inversions_count
0,686,10 Inversion Roller Coaster,Chimelong Paradise,invertigo,kingsisland,5,rfdb_csvs\kingsisland\invertigo,66.7,40.0,58.7,...,,,,,,2025-12-05T16:10:05.976986,29.87,72.42,850.09,10
1,382,Abismo,Parque de Atracciones de Madrid,abismo,parquedeatraccionesdemadrid,1,rfdb_csvs\parquedeatraccionesdemadrid\abismo,100.0,93.1,97.9,...,,,,,,2025-12-05T16:09:51.019097,46.02,104.61,449.88,2
2,2570,Abyss,Adventure World,theboss,sixflagsstlouis,5,rfdb_csvs\sixflagsstlouis\theboss,66.7,25.0,54.2,...,,,,,,2025-12-05T16:09:51.001869,29.87,85.3,630.02,3


In [3]:
# Utility: Safe figure display (defined early)
import plotly.io as pio

def safe_show(fig, name: str = None):
    """Display ``fig``; if displaying fails, try to save it under ``output_dir``.

    Args:
        fig: Figure to display; must provide ``show()`` and ``write_image(path)``.
        name: Optional file stem. When falsy, nothing is saved after a failed
            display.

    Fallback order after ``fig.show()`` raises: ``<name>.png`` via
    ``fig.write_image``; only if that raises, ``<name>.html`` via
    ``pio.write_html``. Every outcome is reported on stdout, including the
    reasons when both exports fail.

    Relies on the notebook globals ``output_dir`` and ``pio``.
    """
    try:
        fig.show()
        return
    except Exception as e:
        print(f"Plotly show failed ({e}). Saving to outputs as fallback.")

    if not name:
        return

    try:
        png_path = output_dir / f"{name}.png"
        fig.write_image(str(png_path))
        print(f"Saved PNG to {png_path}")
        return
    except Exception as img_err:
        # Keep the reason: the name bound by `except ... as` is cleared when
        # the handler exits.
        png_error = img_err

    try:
        html_path = output_dir / f"{name}.html"
        pio.write_html(fig, file=str(html_path), auto_open=False)
        print(f"Saved HTML to {html_path}")
    except Exception as html_err:
        print(f"Could not save image or HTML (PNG: {png_error}; HTML: {html_err}).")

In [4]:
# 2. Inspect and Summarize Columns
df.info()
# describe() is not the cell's last expression, so its result has to be printed
# explicitly; otherwise it is computed and silently discarded.
print("\nSummary statistics (first 20 columns):")
print(df.describe(include='all').T.head(20))
print("\nMissing values per column (top 20):")
print(df.isna().sum().sort_values(ascending=False).head(20))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1299 entries, 0 to 1298
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   coaster_id           1299 non-null   int64  
 1   ratings_coaster      1299 non-null   object 
 2   ratings_park         1299 non-null   object 
 3   rfdb_coaster_folder  1299 non-null   object 
 4   rfdb_park_folder     1299 non-null   object 
 5   csv_count            1299 non-null   int64  
 6   full_path            1299 non-null   object 
 7   coaster_similarity   1299 non-null   float64
 8   park_similarity      1299 non-null   float64
 9   combined_similarity  1299 non-null   float64
 10  match_reason         1299 non-null   object 
 11  match_type           1299 non-null   object 
 12  avg_rating           1052 non-null   float64
 13  total_ratings        1052 non-null   float64
 14  pct_0.5_stars        0 non-null      float64
 15  pct_1.0_stars        0 non-null      f

In [5]:
# 3. Clean and Preprocess Data
# Numeric columns kept for analysis; the pct_* star columns are left out
# because they are often empty.
NUMERIC_COLS = [
    'avg_rating', 'total_ratings',
    'height_m', 'speed_kmh', 'track_length_m', 'inversions_count',
]
# Categorical columns, restricted to those present in the frame.
CAT_COLS = list(filter(
    lambda col: col in df.columns,
    ['ratings_coaster', 'ratings_park', 'rfdb_park_folder', 'rfdb_coaster_folder'],
))

# Limit each spec column to its own 1st..99th percentile range.
for col in ('height_m', 'speed_kmh', 'track_length_m', 'inversions_count'):
    if col not in df.columns:
        continue
    lower_bound = df[col].quantile(0.01)
    upper_bound = df[col].quantile(0.99)
    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

# Missing numeric values take the column median.
for col in NUMERIC_COLS:
    if col not in df.columns:
        continue
    df[col] = df[col].fillna(df[col].median())

# Missing categorical values take the most frequent value, or 'Unknown' when
# the column has no mode at all.
for col in CAT_COLS:
    modes = df[col].mode()
    fill_value = 'Unknown' if modes.empty else modes.iloc[0]
    df[col] = df[col].fillna(fill_value)

print('Preprocessing complete. Using numeric columns:', NUMERIC_COLS)

Preprocessing complete. Using numeric columns: ['avg_rating', 'total_ratings', 'height_m', 'speed_kmh', 'track_length_m', 'inversions_count']


In [6]:
# 4. Feature Engineering for Clustering
# Rating signals are kept out of the PCA input.
FEATURE_EXCLUDE = {'avg_rating','total_ratings'}

# Numeric columns that actually exist in the frame.
FEATURE_COLS = []
for col in NUMERIC_COLS:
    if col in df.columns:
        FEATURE_COLS.append(col)
FEATURE_COLS_PCA = [col for col in FEATURE_COLS if col not in FEATURE_EXCLUDE]

# Non-numeric entries are coerced to NaN here and imputed in the next step.
X = df[FEATURE_COLS_PCA].apply(pd.to_numeric, errors='coerce')
print(f"Using {len(FEATURE_COLS_PCA)} PCA features (excluding {sorted(FEATURE_EXCLUDE)}): {FEATURE_COLS_PCA}")

nan_counts = X.isna().sum().sort_values(ascending=False)
print('Initial NaNs per feature (top 10):')
print(nan_counts.head(10))

Using 4 PCA features (excluding ['avg_rating', 'total_ratings']): ['height_m', 'speed_kmh', 'track_length_m', 'inversions_count']
Initial NaNs per feature (top 10):
height_m            0
speed_kmh           0
track_length_m      0
inversions_count    0
dtype: int64


In [7]:
# 5. Scale Features
from sklearn.impute import SimpleImputer
import numpy as np

# Median-impute whatever NaNs survived the coercion step.
imp = SimpleImputer(strategy='median').fit(X)
X_imputed = imp.transform(X)

# Safety net: any NaN still present after imputation is replaced by zero.
still_has_nan = np.isnan(X_imputed).any()
if still_has_nan:
    X_imputed = np.nan_to_num(X_imputed)

# Standardize the imputed matrix.
scaler = StandardScaler().fit(X_imputed)
X_scaled = scaler.transform(X_imputed)
print('Scaled feature matrix shape:', X_scaled.shape)

Scaled feature matrix shape: (1299, 4)


In [8]:
# 6. Dimensionality Reduction (PCA and UMAP)
# Choose a safe n_components based on data shape: at most 10, and no more than
# the number of features or (samples - 1).
max_components = int(min(X_scaled.shape[0] - 1, X_scaled.shape[1], 10))
if max_components < 2:
    max_components = 2  # ensure at least 2 components for plotting
pca = PCA(n_components=max_components, random_state=42)
X_pca = pca.fit_transform(X_scaled)
explained = pca.explained_variance_ratio_
fig_pca_var = px.bar(x=list(range(1, len(explained)+1)), y=explained, labels={'x':'PC', 'y':'Explained Variance Ratio'}, title=f'PCA Explained Variance (n_components={max_components})')
safe_show(fig_pca_var, 'pca_explained_variance')

# Feature loadings for PC1/PC2 (based on FEATURE_COLS_PCA), ranked by the sum
# of their absolute loadings on the two components.
loadings = pd.DataFrame(pca.components_[:2], columns=FEATURE_COLS_PCA, index=['PC1','PC2']).T
loadings['abs_sum'] = loadings['PC1'].abs() + loadings['PC2'].abs()
top_feats = loadings.sort_values('abs_sum', ascending=False).head(12)
fig_load = px.bar(top_feats, x='abs_sum', y=top_feats.index, orientation='h', title='Top Feature Contributions to PC1/PC2 (excluding avg_rating)')
fig_load.update_layout(yaxis={'title':'feature'}, xaxis={'title':'|loading| sum'})
safe_show(fig_load, 'pca_top_feature_loadings')

# 2D PCA frame used by the plotting cells. The fallbacks get an explicit dtype
# so a missing column yields a well-typed all-NaN/None column instead of
# relying on the default dtype of an empty Series.
pca_df = pd.DataFrame(X_pca[:, :2], columns=['PC1','PC2'])
pca_df['avg_rating'] = df.get('avg_rating', pd.Series(np.nan, index=df.index, dtype=float))
if 'name_col' in globals():
    # A hover-name column was already chosen (e.g. by a previous run of a
    # later cell); reuse it.
    pca_df[name_col] = df[name_col]
else:
    pca_df['ratings_coaster'] = df.get('ratings_coaster', pd.Series(None, index=df.index, dtype=object))
    name_col = 'ratings_coaster'

# UMAP (optional): only computed when the package could be imported.
if HAS_UMAP:
    reducer = umap.UMAP(n_neighbors=30, min_dist=0.1, metric='euclidean', random_state=42)
    X_umap = reducer.fit_transform(X_scaled)
    umap_df = pd.DataFrame(X_umap, columns=['UMAP1','UMAP2'])
    umap_df[name_col] = pca_df[name_col]
    umap_df['avg_rating'] = pca_df['avg_rating']
    print('PCA and UMAP computed.')
else:
    umap_df = None
    # Do not claim a UMAP embedding exists when it was skipped.
    print('PCA computed; UMAP not available, skipping UMAP embedding.')


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



PCA and UMAP computed.


In [9]:
# 7. Cluster with KMeans
from sklearn.metrics import silhouette_score

# Fit one model per candidate k and record its silhouette score.
k_range = list(range(2, 13))
models = {}
sil_scores = []
for k in k_range:
    km = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X_scaled)
    models[k] = km
    sil_scores.append(silhouette_score(X_scaled, km.labels_))

fig_sil = px.line(x=k_range, y=sil_scores, markers=True, labels={'x':'k','y':'silhouette_score'}, title='Silhouette vs k')
safe_show(fig_sil, 'silhouette_vs_k')

# Keep the k with the highest silhouette score (first one on ties).
best_idx = int(np.argmax(sil_scores))
best_k = k_range[best_idx]
kmeans = models[best_k]
labels_km = kmeans.predict(X_scaled)
print('Chosen k:', best_k)

Chosen k: 4


In [10]:
# 8. Cluster with DBSCAN
# db_labels stays None when DBSCAN cannot be run.
db_labels = None
try:
    db = DBSCAN(eps=1.5, min_samples=10).fit(X_scaled)
    db_labels = db.labels_
    print('DBSCAN labels:', np.unique(db_labels))
except Exception as e:
    print('DBSCAN not available or failed:', e)

DBSCAN labels: [-1  0]


In [11]:
# 9. Evaluate Clusters (Silhouette, Inertia)
# Quality metrics of the selected KMeans model on the scaled features.
sil_km = silhouette_score(X_scaled, labels_km)
inertia_km = kmeans.inertia_
for line in (f"KMeans silhouette: {sil_km:.3f}", f"KMeans inertia: {inertia_km:.2f}"):
    print(line)

KMeans silhouette: 0.437
KMeans inertia: 2091.82


In [12]:
# A. PCA scatter colored by avg_rating and clustering on PCA
pca_df['cluster'] = labels_km

# Hover label: first candidate column found in df, else the row index.
name_candidates = ['coaster_name','ratings_coaster','rfdb_coaster_folder','ratings_url']
name_col = None
for candidate in name_candidates:
    if candidate in df.columns:
        name_col = candidate
        break
if name_col is None:
    name_col = 'index'
    df = df.reset_index()
pca_df[name_col] = df[name_col]

# Figure 1: PCA scatter on a continuous rating color scale
fig_pca_rating = px.scatter(
    pca_df, x='PC1', y='PC2',
    color='avg_rating', color_continuous_scale='Turbo',
    hover_data=[name_col],
    title='PCA Colored by Average Rating',
)
fig_pca_rating.update_layout(coloraxis_colorbar=dict(title='Avg Rating'))
safe_show(fig_pca_rating, 'pca_colored_by_rating')

# Figure 2: the KMeans labels (labels_km) with per-cluster centroids in PC space
by_cluster = pca_df.groupby('cluster')
centroids = by_cluster[['PC1','PC2']].mean().reset_index()
# Mean avg_rating per cluster, shown in the centroid labels
rating_means = by_cluster['avg_rating'].mean().round(2).reset_index()
centroids = centroids.merge(rating_means, on='cluster', how='left')

fig_pca_clusters = px.scatter(
    pca_df, x='PC1', y='PC2', color='cluster',
    hover_data=[name_col,'avg_rating'],
    title='PCA Clusters (KMeans on PCA)',
)
fig_pca_clusters.add_scatter(
    x=centroids['PC1'], y=centroids['PC2'], mode='markers',
    marker=dict(size=14, symbol='x', color='black'),
    name='Centroids',
)

# One annotation per centroid ("C<cluster>: <mean rating>") on a
# semi-transparent background so it stays readable over the points.
annotations = [
    dict(
        x=float(row['PC1']), y=float(row['PC2']),
        xref='x', yref='y',
        text=f"C{int(row['cluster'])}: {float(row['avg_rating']):.2f}",
        showarrow=False,
        font=dict(size=12, color='black'),
        align='center',
        bgcolor='rgba(255,255,255,0.7)',
        bordercolor='rgba(0,0,0,0.2)',
        borderpad=4,
        xanchor='center', yanchor='bottom',
    )
    for _, row in centroids.iterrows()
]
fig_pca_clusters.update_layout(annotations=annotations)
safe_show(fig_pca_clusters, 'pca_clusters_with_centroids_rating')

In [13]:
# 10.4 UMAP colored by average rating (continuous)
import plotly.express as px

if not (HAS_UMAP and umap_df is not None):
    print('UMAP embedding not available; skipping rating-colored UMAP plot.')
else:
    # Make sure there is a usable avg_rating column; missing ratings are filled
    # with the median for visualization only.
    rating_missing = 'avg_rating' not in umap_df.columns or umap_df['avg_rating'].isna().all()
    if rating_missing:
        if 'avg_rating' in df.columns:
            umap_df['avg_rating'] = df['avg_rating'].fillna(df['avg_rating'].median())
        else:
            umap_df['avg_rating'] = 0

    # Hover shows the name column only when one has been chosen already.
    hover_cols = [name_col] if 'name_col' in globals() else None

    # Continuous-color scatter of the UMAP embedding
    fig_umap_rating = px.scatter(
        umap_df, x='UMAP1', y='UMAP2',
        color='avg_rating', color_continuous_scale='Turbo',
        hover_data=hover_cols,
        title='UMAP Scatter Colored by Average Rating',
    )
    fig_umap_rating.update_layout(coloraxis_colorbar=dict(title='Avg Rating'))
    safe_show(fig_umap_rating, 'umap_scatter_avg_rating')

In [14]:
# B. UMAP scatter colored by avg_rating and clusters with labeled centroids
# Plots the UMAP embedding colored by cluster label, overlays one centroid
# marker per cluster and annotates each with "C<cluster>: <mean avg_rating>".
# Runs only when UMAP was imported and an embedding frame exists; every
# "ensure" step below only fills in what is missing, so existing columns on
# umap_df are left untouched.
if HAS_UMAP and umap_df is not None:
    # Ensure avg_rating exists; fill missing with median for visualization only
    if 'avg_rating' not in umap_df.columns or umap_df['avg_rating'].isna().all():
        umap_df['avg_rating'] = df['avg_rating'].fillna(df['avg_rating'].median()) if 'avg_rating' in df.columns else 0
    # Ensure name column exists for hover
    if 'name_col' not in globals():
        name_candidates = ['coaster_name','ratings_coaster','rfdb_coaster_folder','ratings_url']
        name_col = next((c for c in name_candidates if c in df.columns), None)
        if name_col is None:
            # No candidate column: fall back to the row index as the label.
            name_col = 'index'; df = df.reset_index()
    if name_col not in umap_df.columns:
        umap_df[name_col] = df[name_col]
    # Attach clusters computed on PCA features to UMAP dataframe
    # (taken from pca_df['cluster'] when present, otherwise from labels_km).
    if 'cluster' not in umap_df.columns:
        umap_df['cluster'] = pca_df.get('cluster', pd.Series(labels_km, index=pca_df.index))
    # Base UMAP scatter colored by cluster
    fig_umap_scatter = px.scatter(umap_df, x='UMAP1', y='UMAP2', color='cluster', hover_data=[name_col,'avg_rating'], title='UMAP Clusters (KMeans on PCA)')
    # Compute centroids and mean ratings for labels in UMAP space
    centroids_umap = umap_df.groupby('cluster')[['UMAP1','UMAP2']].mean().reset_index()
    rating_means_umap = umap_df.groupby('cluster')['avg_rating'].mean().round(2).reset_index()
    centroids_umap = centroids_umap.merge(rating_means_umap, on='cluster', how='left')
    # Add centroid markers
    fig_umap_scatter.add_scatter(x=centroids_umap['UMAP1'], y=centroids_umap['UMAP2'], mode='markers',
                                 marker=dict(size=14, symbol='x', color='black'), name='Centroids')
    # Add readable labels as semi-transparent annotations
    annotations = []
    for _, row in centroids_umap.iterrows():
        annotations.append(dict(
            x=float(row['UMAP1']), y=float(row['UMAP2']),
            xref='x', yref='y',
            text=f"C{int(row['cluster'])}: {float(row['avg_rating']):.2f}",
            showarrow=False,
            font=dict(size=12, color='black'),
            align='center',
            bgcolor='rgba(255,255,255,0.7)',
            bordercolor='rgba(0,0,0,0.2)',
            borderpad=4,
            xanchor='center', yanchor='bottom'
        ))
    fig_umap_scatter.update_layout(annotations=annotations)
    safe_show(fig_umap_scatter, 'umap_clusters_with_centroids_rating')
else:
    print('UMAP not available; skip UMAP clustering visualization.')

In [15]:
# 10.1 Enhanced PCA cluster visualization: centroids and counts
import numpy as np
import plotly.graph_objects as go

pca_df['cluster'] = labels_km

# Hover label: first candidate found in either pca_df or df, else the row index.
name_candidates = ['coaster_name','ratings_coaster','rfdb_coaster_folder','ratings_url']
name_col = None
for candidate in name_candidates:
    if candidate in pca_df.columns or candidate in df.columns:
        name_col = candidate
        break
if name_col is None:
    name_col = 'index'
    if 'index' not in df.columns:
        df = df.reset_index()
if name_col not in pca_df.columns:
    pca_df[name_col] = df[name_col]

# Per-cluster centroid in PC space plus the number of points in each cluster
by_cluster = pca_df.groupby('cluster')
centroids = by_cluster[['PC1','PC2']].mean().reset_index()
counts = by_cluster.size().reset_index(name='count')
centroids = centroids.merge(counts, on='cluster')

# One marker trace per cluster, in ascending cluster order
fig_pca_enh = go.Figure()
for cl, sub in pca_df.groupby('cluster'):
    cluster_trace = go.Scatter(
        x=sub['PC1'], y=sub['PC2'], mode='markers',
        marker=dict(size=6, opacity=0.7),
        name=f"Cluster {cl}",
        hovertemplate="PC1=%{x:.2f}<br>PC2=%{y:.2f}<br>Name=%{text}",
        text=sub[name_col],
    )
    fig_pca_enh.add_trace(cluster_trace)

# Centroid markers labelled "C<cluster> (n=<size>)"
centroid_labels = [
    f"C{int(c)} (n={int(n)})"
    for c, n in zip(centroids['cluster'], centroids['count'])
]
fig_pca_enh.add_trace(go.Scatter(
    x=centroids['PC1'], y=centroids['PC2'], mode='markers+text',
    marker=dict(size=14, symbol='x', color='black'),
    text=centroid_labels,
    textposition='top center', name='Centroids',
))
fig_pca_enh.update_layout(title='PCA Scatter with Cluster Centroids and Sizes', xaxis_title='PC1', yaxis_title='PC2')
safe_show(fig_pca_enh, 'pca_scatter_centroids')

In [16]:
# 10.6 Star rating distribution profiles per cluster
# This cell builds no figure: it only reports that the pct_* star-percentage
# columns are skipped, so no star-profile figure exists after it runs.
print('Skipping star percentage columns (pct_*); they are empty or unreliable in this dataset.')

Skipping star percentage columns (pct_*); they are empty or unreliable in this dataset.


In [17]:
# 10.5 Build df_clusters for profiles
# Copy of df with the KMeans label attached as a 'cluster' column.
df_clusters = df.assign(cluster=labels_km)
print('df_clusters created:', df_clusters.shape)

df_clusters created: (1299, 30)


In [18]:
# Cluster profiles and radar with coverage diagnostics and mean ratings
import numpy as np
import plotly.graph_objs as go

# Spec columns for the radar: prefer a display-named column when the frame has
# one, otherwise fall back to the raw column.
spec_candidates = [
    ('Height (ft)', 'height_m'),
    ('Speed (MPH)', 'speed_kmh'),
    ('Track Length (ft)', 'track_length_m'),
    ('inversions_count', 'inversions_count'),
]
spec_cols = []
for disp, orig in spec_candidates:
    if disp in df.columns:
        spec_cols.append(disp)
    elif orig in df.columns:
        spec_cols.append(orig)

# Axis labels for the radar. The raw columns are named *_m / *_kmh, so they are
# labelled with those units rather than ft / mph; display-named columns keep
# their own name.
# NOTE(review): units are inferred from the column names - confirm against the
# data source.
spec_display_names = {
    'height_m': 'Height (m)',
    'speed_kmh': 'Speed (km/h)',
    'track_length_m': 'Track Length (m)',
    'inversions_count': 'Inversions',
}
spec_cols_named = [spec_display_names.get(c, c) for c in spec_cols]

# Diagnostics: per cluster, how many rows have every spec column present
has_all_specs = df_clusters[spec_cols].notna().all(axis=1)
coverage = (
    df_clusters
    .assign(all_specs_present=has_all_specs)
    .groupby('cluster')
    .agg(total=('cluster','size'), present=('all_specs_present','sum'))
)
coverage['pct_present'] = (coverage['present'] / coverage['total'] * 100).round(1)
print('Spec coverage per cluster:\n', coverage)

# Mean average rating per cluster for context (used in the legend entries)
if 'avg_rating' in df_clusters.columns:
    rating_summary = (
        df_clusters.groupby('cluster')['avg_rating'].mean().round(2).reset_index()
    )
    print('Mean average rating per cluster:\n', rating_summary)
    rating_map = dict(zip(rating_summary['cluster'], rating_summary['avg_rating']))
else:
    rating_summary = None
    rating_map = {}

# Use only rows with complete specs to avoid bias
df_specs = df_clusters[has_all_specs].copy()
if df_specs.empty:
    # Fallback: no complete row at all, so fill gaps with the df medians
    df_specs = df_clusters.copy()
    for c in spec_cols:
        df_specs[c] = df_specs[c].fillna(df[c].median() if c in df.columns else 0)

spec_means = df_specs.groupby('cluster')[spec_cols].mean().reset_index()
# Min-max normalize the cluster means to 0..1 per column so the specs share
# one radial axis.
min_vals = spec_means[spec_cols].min()
max_vals = spec_means[spec_cols].max()
range_vals = (max_vals - min_vals).replace(0, 1)  # avoid div-by-zero
spec_norm = (spec_means[spec_cols] - min_vals) / range_vals
spec_norm['cluster'] = spec_means['cluster']

fig_radar = go.Figure()
for _, row in spec_norm.iterrows():
    cl = int(row['cluster'])
    avg = rating_map.get(cl, None)
    trace_name = f"Cluster {cl}" if avg is None else f"Cluster {cl} — Avg {avg:.2f}"
    # The readable labels are passed as theta directly, so no separate
    # tick relabelling of the angular axis is needed.
    fig_radar.add_trace(go.Scatterpolar(r=row[spec_cols].values,
                                        theta=spec_cols_named,
                                        fill='toself',
                                        name=trace_name))

title_text = "Cluster Spec Profiles (complete-spec rows)"
fig_radar.update_layout(polar=dict(radialaxis=dict(visible=True)), showlegend=True, title=title_text)
safe_show(fig_radar, "cluster_specs_radar_named")


Spec coverage per cluster:
          total  present  pct_present
cluster                             
0          791      791        100.0
1           56       56        100.0
2          267      267        100.0
3          185      185        100.0
Mean average rating per cluster:
    cluster  avg_rating
0        0        2.29
1        1        2.67
2        2        3.19
3        3        3.82


In [19]:
# 13. Save Clustered Dataset and Figures
# Clustered table: the working frame plus cluster label and 2D coordinates.
clustered = df.copy()
clustered['cluster'] = labels_km
clustered['PC1'] = pca_df['PC1']
clustered['PC2'] = pca_df['PC2']
if HAS_UMAP and umap_df is not None:
    clustered['UMAP1'] = umap_df['UMAP1']
    clustered['UMAP2'] = umap_df['UMAP2']
out_csv = DATA_DIR / 'outputs/cc_clusters.csv'
# Do not rely on another cell having created the outputs folder.
out_csv.parent.mkdir(parents=True, exist_ok=True)
clustered.to_csv(out_csv, index=False, encoding='utf-8')
print(f"Saved clustered dataset to: {out_csv}")

# (output file stem, notebook variable holding the figure). Variables are
# looked up by name so figures from skipped cells are simply reported.
# The cluster scatter is created as `fig_pca_clusters`; the previous entry
# 'fig_pca_scatter' matched no variable, so that figure was never exported.
figure_exports = [
    ('pca_explained_variance', 'fig_pca_var'),
    ('pca_scatter_clusters', 'fig_pca_clusters'),
    ('pca_scatter_centroids', 'fig_pca_enh'),
    ('pca_scatter_avg_rating', 'fig_pca_rating'),
    ('umap_scatter_clusters', 'fig_umap_scatter'),
    ('umap_scatter_avg_rating', 'fig_umap_rating'),
    ('pca_top_feature_loadings', 'fig_load'),
    ('cluster_star_profiles', 'fig_star'),
]

saved_figures = []
skipped_figures = []
for name, fig_var in figure_exports:
    f = globals().get(fig_var)
    if f is None:
        skipped_figures.append(f"{fig_var} (not defined)")
        continue
    try:
        f.write_image(str(output_dir / f'{name}.png'))
        saved_figures.append(name)
    except Exception as e:
        # Export stays best-effort, but the reason is reported rather than
        # swallowed.
        skipped_figures.append(f"{fig_var} ({e})")

print(f"Saved {len(saved_figures)} figure(s) to {output_dir}")
if skipped_figures:
    print('Skipped figures: ' + '; '.join(skipped_figures))


Saved clustered dataset to: c:\Users\Lenovo\rollercoaster\outputs\cc_clusters.csv


Resorting to unclean kill browser.
Resorting to unclean kill browser.


fig_star not defined or write failed; skipping image save.
Saved figures to c:\Users\Lenovo\rollercoaster\outputs\cluster_figures


In [20]:
# Utility: Safe figure display
import plotly.io as pio
def safe_show(fig, name: str = None):
    """Display ``fig``; if displaying fails, try to save it under ``output_dir``.

    NOTE(review): this redefines the ``safe_show`` already defined in an
    earlier cell with the same signature; consider keeping a single definition.

    Args:
        fig: Figure to display; must provide ``show()`` and ``write_image(path)``.
        name: Optional file stem. When falsy, nothing is saved after a failed
            display.

    Fallback order after ``fig.show()`` raises: ``<name>.png`` via
    ``fig.write_image``; only if that raises, ``<name>.html`` via
    ``pio.write_html``. Every outcome is reported on stdout, including the
    reasons when both exports fail.

    Relies on the notebook globals ``output_dir`` and ``pio``.
    """
    try:
        fig.show()
        return
    except Exception as e:
        print(f"Plotly show failed ({e}). Saving to outputs as fallback.")

    if not name:
        return

    try:
        png_path = output_dir / f"{name}.png"
        fig.write_image(str(png_path))
        print(f"Saved PNG to {png_path}")
        return
    except Exception as img_err:
        # Keep the reason: the name bound by `except ... as` is cleared when
        # the handler exits.
        png_error = img_err

    try:
        html_path = output_dir / f"{name}.html"
        pio.write_html(fig, file=str(html_path), auto_open=False)
        print(f"Saved HTML to {html_path}")
    except Exception as html_err:
        print(f"Could not save image or HTML (PNG: {png_error}; HTML: {html_err}).")

In [21]:
# C. Property effects on rating: visuals to explain high scores
# (preferred column, fallback column) pairs in plotting order; a fallback of
# None means only the first name is considered.
prop_candidates = [
    ('Track Length (ft)', 'track_length_m'),
    ('Speed (MPH)', 'speed_kmh'),
    ('Height (ft)', 'height_m'),
    ('ride_duration_s', None),
    ('inversions_count', None),
    ('laps_count', None),
]
props = []
for preferred, fallback in prop_candidates:
    if preferred in df.columns:
        props.append(preferred)
    elif fallback is not None and fallback in df.columns:
        props.append(fallback)

available_props = [p for p in props if p in df.columns]
print('Available property columns:', available_props)

import statsmodels.api as sm
import numpy as np
import plotly.express as px

# LOWESS smoother used for the trend lines
lowess = sm.nonparametric.lowess

# One scatter + trend figure per property with at least 5 usable rows
figs = []
for c in available_props:
    sub = df[[c, 'avg_rating']].dropna()
    if len(sub) >= 5:
        z = np.array(lowess(sub['avg_rating'], sub[c], frac=0.3, return_sorted=True))
        f = px.scatter(sub, x=c, y='avg_rating', opacity=0.6, title=f'{c} vs Avg Rating')
        f.add_scatter(x=z[:,0], y=z[:,1], mode='lines', name='LOWESS')
        figs.append(f)

if figs:
    from plotly.subplots import make_subplots
    # Two panels per row, filled left to right and top to bottom
    rows = int(np.ceil(len(figs)/2))
    panel_titles = [f.layout.title.text for f in figs]
    fig_props = make_subplots(rows=rows, cols=2, subplot_titles=panel_titles)
    for position, f in enumerate(figs):
        row_offset, col_offset = divmod(position, 2)
        r, c = row_offset + 1, col_offset + 1
        # Copy the scatter points and then the LOWESS line into the panel
        fig_props.add_trace(f.data[0], row=r, col=c)
        fig_props.add_trace(f.data[1], row=r, col=c)
    fig_props.update_layout(height=300*rows, title='Property Effects on Avg Rating')
    safe_show(fig_props, 'property_effects')
else:
    print('No sufficient data to plot property effects.')

Available property columns: ['track_length_m', 'speed_kmh', 'height_m', 'inversions_count']



invalid value encountered in divide

