# 03 - Risk Scoring Framework

This notebook demonstrates the multi-dimensional risk scoring methodology.

**Risk Components:**
1. **Frequency** (35%): Historical event count
2. **Trend** (25%): Are events increasing over time?
3. **MTBF** (20%): Mean time between failures (inverted: a shorter time between failures means higher risk)
4. **Age** (10%): Equipment age
5. **Recency** (10%): Time since last event (inverted: a more recent event means higher risk)

In [None]:
import sys
sys.path.insert(0, '../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from data_loader import load_pmu_disturbance_data
from risk_scorer import PMURiskScorer

# Location of the PMU disturbance workbook, relative to the notebook's working directory.
DATA_PATH = '../../data/PMU_disturbance.xlsx'

# matplotlib 3.6 renamed the bundled seaborn style sheets with a 'seaborn-v0_8-' prefix.
# Fall back to the pre-3.6 name so the notebook also runs on older installs.
PLOT_STYLE = 'seaborn-v0_8-whitegrid'
if PLOT_STYLE not in plt.style.available:
    PLOT_STYLE = 'seaborn-whitegrid'
plt.style.use(PLOT_STYLE)

## 1. Load Data

In [None]:
# Load the two tables returned by the project loader: one row set for PMUs,
# one for disturbance events (as reported by the message below).
loaded_tables = load_pmu_disturbance_data(DATA_PATH)
pmu_df, dist_df = loaded_tables
print(f"Loaded {len(pmu_df)} PMUs and {len(dist_df)} events")

## 2. Calculate Risk Scores

In [None]:
# Default weights: one entry per risk component, matching the percentages
# listed in the introduction of this notebook.
default_weights = {'frequency': 0.35, 'trend': 0.25, 'mtbf': 0.20, 'age': 0.10, 'recency': 0.10}

# Build the scorer with the default weighting and compute the score table.
scorer = PMURiskScorer(pmu_df, dist_df, weights=default_weights)
results = scorer.calculate_risk_scores()
print(f"Calculated risk scores for {len(results)} sections")

# Last expression of the cell, so Jupyter renders the first 10 rows as a table.
results.head(10)

## 3. Risk Score Distribution

In [None]:
# Two panels: the distribution of the overall risk score (left) and the number
# of sections in each risk category (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Histogram
mean_score = results['risk_score'].mean()  # computed once, reused for the line and its label
axes[0].hist(results['risk_score'], bins=30, color='steelblue', edgecolor='white')
axes[0].axvline(mean_score, color='red', linestyle='--', label=f'Mean: {mean_score:.1f}')
axes[0].set_xlabel('Risk Score')
axes[0].set_ylabel('Number of Sections')
axes[0].set_title('Risk Score Distribution')
axes[0].legend()

# Category counts
colors = {'Low': 'green', 'Medium': 'orange', 'High': 'red'}
category_counts = results['category'].value_counts()
# value_counts() sorts by frequency, which scrambles the severity order on the x-axis.
# Show the known categories as Low -> Medium -> High, then any unexpected labels
# (drawn in gray) so that nothing is silently dropped.
ordered_categories = [c for c in colors if c in category_counts.index]
ordered_categories += [c for c in category_counts.index if c not in colors]
category_counts = category_counts.reindex(ordered_categories)
axes[1].bar(category_counts.index, category_counts.values, color=[colors.get(c, 'gray') for c in category_counts.index])
axes[1].set_xlabel('Risk Category')
axes[1].set_ylabel('Number of Sections')
axes[1].set_title('Risk Category Distribution')

plt.tight_layout()
plt.show()

## 4. Component Analysis

In [None]:
# Get component scores for top 10 sections
# NOTE(review): head(10) is only the "top 10" if calculate_risk_scores() returns
# rows sorted by descending risk score - confirm against risk_scorer.
top_10 = results.head(10)

# Every '*_score' column except the overall score is treated as a component.
component_cols = [c for c in results.columns if c.endswith('_score') and c != 'risk_score']
if component_cols:
    fig, ax = plt.subplots(figsize=(12, 6))
    x = np.arange(len(top_10))
    n_components = len(component_cols)
    # Each section's group of bars spans 0.75 of its slot, so groups never overlap
    # regardless of how many components there are (0.15 per bar for 5 components).
    width = 0.75 / n_components

    for i, col in enumerate(component_cols):
        ax.bar(x + i*width, top_10[col], width, label=col.replace('_score', ''))

    ax.set_xlabel('Section')
    ax.set_ylabel('Component Score')
    ax.set_title('Risk Component Breakdown - Top 10 Sections')
    # Centre each tick under its group: midway between the first and last bar.
    ax.set_xticks(x + width * (n_components - 1) / 2)
    ax.set_xticklabels([f"S{int(s)}" for s in top_10['SectionID']])
    ax.legend()
    plt.tight_layout()
    plt.show()
else:
    print("Component scores not available in results")

## 5. Weight Sensitivity Analysis

In [None]:
# Test different weight configurations
# Each configuration assigns one weight per risk component; every set sums to 1.0.
weight_configs = {
    'Default': {'frequency': 0.35, 'trend': 0.25, 'mtbf': 0.20, 'age': 0.10, 'recency': 0.10},
    'Frequency-Heavy': {'frequency': 0.60, 'trend': 0.15, 'mtbf': 0.10, 'age': 0.10, 'recency': 0.05},
    'Trend-Heavy': {'frequency': 0.20, 'trend': 0.50, 'mtbf': 0.15, 'age': 0.10, 'recency': 0.05},
    'Balanced': {'frequency': 0.20, 'trend': 0.20, 'mtbf': 0.20, 'age': 0.20, 'recency': 0.20}
}

# Re-score under each configuration and keep the first 10 SectionIDs of each result.
# Loop-local names are used on purpose: reusing `scorer` or `top_10` here would
# overwrite the default-weight scorer and the top-10 DataFrame defined in earlier
# cells, leaving the notebook in a different state after this cell runs.
rankings = {}
for config_name, config_weights in weight_configs.items():
    config_scorer = PMURiskScorer(pmu_df, dist_df, weights=config_weights)
    config_results = config_scorer.calculate_risk_scores()
    rankings[config_name] = config_results.head(10)['SectionID'].tolist()

print("Top 10 Rankings by Weight Configuration:")
for config_name, top_sections in rankings.items():
    print(f"\n{config_name}: {top_sections}")

## 6. Correlation Between Components

In [None]:
# Correlation heatmap
# Overall score first, followed by every other '*_score' column in the results.
component_score_cols = [c for c in results.columns if c.endswith('_score') and c != 'risk_score']
score_cols = ['risk_score'] + component_score_cols

if len(score_cols) <= 1:
    # Only the overall score is present, so there is nothing to correlate it with.
    print("Insufficient component scores for correlation analysis")
else:
    corr = results[score_cols].corr()

    fig, ax = plt.subplots(figsize=(8, 6))
    # center=0 places the midpoint of the diverging colormap at zero correlation.
    sns.heatmap(corr, annot=True, cmap='coolwarm', center=0, ax=ax)
    ax.set_title('Component Score Correlations')
    plt.tight_layout()
    plt.show()

## Summary

The multi-dimensional risk scoring framework:
- Combines 5 risk factors with configurable weights
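- Section 150 consistently ranks #1 across the four weight configurations tested in Section 5 (re-check this against the printed rankings whenever the data or the weights change)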
- The default weights prioritize frequency (35%) and trend (25%)
- Risk categories (Low/Medium/High) provide actionable classification