# PMU Disturbance Analysis - Causality & Pattern Mining

This notebook analyzes disturbance causes and patterns, in the order the sections appear below:
1. Cause frequency distribution and Pareto analysis
2. Cause evolution over time
3. Association rule mining (Apriori algorithm)
4. Co-occurrence matrix
5. Sequential pattern detection
6. Transition probability matrices
7. Reliability metrics (MTBF, MTTR, failure rates)
8. Cause severity analysis
9. Interactive Sankey diagram of sequential cause patterns

**Input**: `outputs/data/cleaned_data.parquet`

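**Output**: `outputs/data/causality_results.csv`, plus `reliability_metrics.csv` and (when any patterns are found) `sequential_patterns.csv` in the output data directory, and visualizations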

In [None]:
# Import libraries
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Import project modules
from src import causality, visualizations as viz
import config

# Apply the project-wide plotting defaults defined in config:
# seaborn theme first, then the default matplotlib figure size.
sns.set_style(config.PLOT_SETTINGS['style'])
plt.rcParams.update({'figure.figsize': config.DEFAULT_FIGSIZE})

print("Libraries loaded successfully!")

## 1. Load Data

In [None]:
# Load the cleaned disturbance records from the path configured in config.py
merged_df = pd.read_parquet(config.CLEANED_DATA)
print(f"Loaded {len(merged_df):,} records")

# Manual overrides: set either one to an exact column name to bypass the
# auto-detection below (leave as None to auto-detect).
CAUSE_COL_OVERRIDE = None
DATETIME_COL_OVERRIDE = None

# Identify cause column: any column whose name contains "cause" (case-insensitive)
potential_cause_cols = [col for col in merged_df.columns if 'cause' in col.lower()]
if CAUSE_COL_OVERRIDE is not None:
    cause_col = CAUSE_COL_OVERRIDE
elif len(potential_cause_cols) > 0:
    cause_col = potential_cause_cols[0]
    if len(potential_cause_cols) > 1:
        print(f"NOTE: several cause-like columns found {potential_cause_cols}; using the first")
else:
    cause_col = None

# Fail here with an actionable message instead of a cryptic KeyError in a later
# cell: a hard-coded 'Cause' fallback could never exist when detection finds nothing.
if cause_col not in merged_df.columns:
    raise KeyError(
        f"No usable cause column (got {cause_col!r}). "
        "Set CAUSE_COL_OVERRIDE in this cell to the correct column name."
    )
print(f"Using cause column: {cause_col}")

# Identify datetime column. 'datetimetz' is included so that timezone-aware
# timestamp columns are detected as well as naive datetime64 ones.
datetime_cols = merged_df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist()
if DATETIME_COL_OVERRIDE is not None:
    datetime_col = DATETIME_COL_OVERRIDE
elif len(datetime_cols) > 0:
    datetime_col = datetime_cols[0]
else:
    # Legacy fallback name; validated immediately below
    datetime_col = 'DateTime'

if datetime_col not in merged_df.columns:
    raise KeyError(
        f"No usable datetime column (got {datetime_col!r}). "
        "Set DATETIME_COL_OVERRIDE in this cell to the correct column name."
    )
print(f"Using datetime column: {datetime_col}")

## 2. Cause Distribution Analysis

In [None]:
# Build the per-cause frequency table with the project helper
cause_dist = causality.analyze_cause_distribution(
    merged_df,
    cause_col=cause_col,
)

print("Top 10 Causes by Frequency:")
display(cause_dist.head(10))

# Pareto (80/20) analysis: the helper returns the selected causes and how many there are
pareto_causes, n_causes = causality.calculate_pareto_80_20(cause_dist)
print(f"\nPareto Analysis: {n_causes} causes account for 80% of disturbances")
display(pareto_causes)

In [None]:
# Pareto chart: bars show counts per cause (left axis),
# the red line shows the cumulative percentage (right axis).
fig, count_ax = plt.subplots(figsize=(14, 6))

# Restrict the chart to the 15 most frequent causes
top_causes = cause_dist.head(15)
bar_positions = np.arange(len(top_causes))

# Left axis: counts
count_ax.bar(bar_positions, top_causes['Count'], color='steelblue', alpha=0.7)
count_ax.set_title('Pareto Chart: Top 15 Disturbance Causes', fontsize=14, fontweight='bold')
count_ax.set_xlabel('Cause')
count_ax.set_ylabel('Count', color='steelblue')
count_ax.tick_params(axis='y', labelcolor='steelblue')
count_ax.set_xticks(bar_positions)
count_ax.set_xticklabels(top_causes.index, rotation=45, ha='right')
count_ax.grid(axis='y', alpha=0.3)

# Right axis: cumulative percentage with the 80% reference line
pct_ax = count_ax.twinx()
pct_ax.plot(
    bar_positions,
    top_causes['Cumulative_Percentage'],
    color='red',
    marker='o',
    linewidth=2,
)
pct_ax.axhline(y=80, color='red', linestyle='--', alpha=0.5, label='80% threshold')
pct_ax.set_ylabel('Cumulative Percentage (%)', color='red')
pct_ax.tick_params(axis='y', labelcolor='red')
pct_ax.set_ylim(0, 105)
pct_ax.legend()

plt.tight_layout()
viz.save_figure(fig, '03_01_pareto_chart')
plt.show()

## 3. Cause Evolution Over Time

In [None]:
# Monthly counts per cause: bucket each record into its calendar month,
# then pivot so rows are months and columns are causes.
merged_df['YearMonth'] = pd.to_datetime(merged_df[datetime_col]).dt.to_period('M')
cause_evolution = (
    merged_df
    .groupby(['YearMonth', cause_col])
    .size()
    .unstack(fill_value=0)
)

# Line plot of the five most frequent causes over time
top_5_causes = cause_dist.head(5).index.tolist()
fig, ax = plt.subplots(figsize=(14, 6))

for cause in top_5_causes:
    # Skip causes that have no column in the monthly pivot
    if cause not in cause_evolution.columns:
        continue
    ax.plot(
        cause_evolution.index.astype(str),
        cause_evolution[cause],
        marker='o',
        label=cause,
        linewidth=2,
    )

ax.set_title('Top 5 Causes: Evolution Over Time', fontsize=14, fontweight='bold')
ax.set_xlabel('Year-Month')
ax.set_ylabel('Disturbance Count')
ax.grid(alpha=0.3)
# Legend sits outside the plotting area, to the right
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
viz.save_figure(fig, '03_02_cause_evolution')
plt.show()

## 4. Association Rule Mining

In [None]:
# Mine association rules with the support/confidence thresholds from config.py
rules = causality.mine_association_rules(
    merged_df,
    cause_col=cause_col,
    section_col='SectionID',
    min_support=config.MIN_SUPPORT,
    min_confidence=config.MIN_CONFIDENCE,
)

if len(rules) == 0:
    print("No association rules found. Try lowering min_support or min_confidence in config.py")
else:
    print(f"Found {len(rules)} association rules")
    print("\nTop 10 Association Rules:")
    # Show only the columns needed to read a rule and judge its strength
    rule_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
    display(rules.head(10)[rule_columns])

## 5. Co-occurrence Matrix

In [None]:
# Create co-occurrence matrix with the project helper
cooccurrence = causality.create_cooccurrence_matrix(
    merged_df,
    cause_col=cause_col,
    section_col='SectionID'
)

# Plot heatmap for top causes
top_10_causes = cause_dist.head(10).index.tolist()
# Use reindex rather than .loc[list, list]: .loc raises KeyError if any top
# cause is absent from the matrix, whereas reindex adds it as a row/column of
# zeros (i.e. no recorded co-occurrence) and keeps the cell running.
cooccur_subset = cooccurrence.reindex(
    index=top_10_causes,
    columns=top_10_causes,
    fill_value=0
)

fig, ax = plt.subplots(figsize=(12, 10))
# NOTE(review): fmt='d' requires integer cell values — confirm the helper returns integer counts
sns.heatmap(cooccur_subset, annot=True, fmt='d', cmap='YlOrRd', ax=ax, 
            cbar_kws={'label': 'Co-occurrence Count'})
ax.set_title('Cause Co-occurrence Matrix (Top 10 Causes)', fontsize=14, fontweight='bold')
ax.set_xlabel('Cause')
ax.set_ylabel('Cause')
plt.tight_layout()
viz.save_figure(fig, '03_03_cooccurrence_matrix')
plt.show()

## 6. Sequential Pattern Detection

In [None]:
# Detect sequential patterns
sequential_patterns = causality.detect_sequential_patterns(
    merged_df,
    datetime_col=datetime_col,
    cause_col=cause_col,
    section_col='SectionID',
    window_days=config.SEQUENTIAL_WINDOW_DAYS
)

if len(sequential_patterns) > 0:
    print(f"Found {len(sequential_patterns)} sequential patterns")
    print(f"\nTop 10 Sequential Patterns (Cause A → Cause B within {config.SEQUENTIAL_WINDOW_DAYS} days):")
    display(sequential_patterns.head(10))
else:
    print("No sequential patterns found")

## 7. Transition Probability Matrix

In [None]:
# Create transition matrix
transition_matrix = causality.create_transition_matrix(
    merged_df,
    datetime_col=datetime_col,
    cause_col=cause_col,
    section_col='SectionID'
)

# Plot for top causes
trans_subset = transition_matrix.loc[top_10_causes, top_10_causes]

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(trans_subset, annot=True, fmt='.2f', cmap='Blues', ax=ax,
            cbar_kws={'label': 'Transition Probability'})
ax.set_title('Transition Probability Matrix: P(Cause B | Cause A)', fontsize=14, fontweight='bold')
ax.set_xlabel('Next Cause (B)')
ax.set_ylabel('Current Cause (A)')
plt.tight_layout()
viz.save_figure(fig, '03_04_transition_matrix')
plt.show()

## 8. Reliability Metrics (MTBF, MTTR)

In [None]:
# Calculate MTBF and MTTR
reliability_metrics = causality.calculate_mtbf_mttr(
    merged_df,
    datetime_col=datetime_col,
    section_col='SectionID'
)

print("Reliability Metrics by Section:")
print("\nTop 10 sections by failure count:")
display(reliability_metrics.head(10))

print("\nOverall Statistics:")
print(f"  Mean MTBF: {reliability_metrics['MTBF_hours'].mean():.2f} hours")
print(f"  Median MTBF: {reliability_metrics['MTBF_hours'].median():.2f} hours")
print(f"  Mean Failure Rate: {reliability_metrics['Failure_Rate'].mean():.6f} failures/hour")

In [None]:
# Plot MTBF distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# MTBF histogram
mtbf_values = reliability_metrics['MTBF_hours'].dropna()
axes[0].hist(mtbf_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
axes[0].set_xlabel('MTBF (hours)')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Distribution of Mean Time Between Failures')
axes[0].grid(axis='y', alpha=0.3)

# Failure count vs MTBF
axes[1].scatter(reliability_metrics['Failure_Count'], reliability_metrics['MTBF_hours'], alpha=0.6)
axes[1].set_xlabel('Number of Failures')
axes[1].set_ylabel('MTBF (hours)')
axes[1].set_title('Failure Count vs MTBF')
axes[1].grid(alpha=0.3)

plt.tight_layout()
viz.save_figure(fig, '03_05_mtbf_analysis')
plt.show()

## 9. Cause Severity Analysis

In [None]:
# Calculate cause severity
# Note: This uses Operations column if available in your data
operations_cols = [col for col in merged_df.columns if 'operation' in col.lower()]
operations_col = operations_cols[0] if len(operations_cols) > 0 else None

severity = causality.calculate_cause_severity(
    merged_df,
    cause_col=cause_col,
    operations_col=operations_col
)

print("Cause Severity Analysis:")
display(severity.head(10))

## 10. Sankey Diagram (Interactive)

In [None]:
# Create Sankey diagram showing cause transitions (From -> To, weighted by Count)
if len(sequential_patterns) > 0:
    # Get top 20 sequential patterns
    top_patterns = sequential_patterns.head(20)
    
    # Create unique node list in first-seen order. dict.fromkeys de-duplicates
    # while keeping order; a plain set() would give a different node order
    # (and therefore different node indices/default colours) on every kernel
    # run, making the saved figure non-reproducible.
    all_causes = list(dict.fromkeys(
        top_patterns['From'].tolist() + top_patterns['To'].tolist()
    ))
    # Sankey links refer to nodes by their position in the label list
    cause_to_idx = {cause: i for i, cause in enumerate(all_causes)}
    
    # Create Sankey data
    fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            label=all_causes
        ),
        link=dict(
            source=[cause_to_idx[f] for f in top_patterns['From']],
            target=[cause_to_idx[t] for t in top_patterns['To']],
            value=top_patterns['Count'].tolist()
        )
    )])
    
    fig.update_layout(
        title='Sequential Cause Patterns (Sankey Diagram)',
        font_size=10,
        height=600
    )
    
    viz.save_figure(fig, '03_06_sankey_diagram', static=False, interactive=True)
    fig.show()
else:
    print("No sequential patterns available for Sankey diagram")

## 11. Save Results

In [None]:
# Make sure the destination folders exist before writing; to_csv does not
# create missing parent directories and would otherwise fail on a fresh checkout.
data_dir = Path(config.OUTPUT_DIR) / 'data'
data_dir.mkdir(parents=True, exist_ok=True)
Path(config.CAUSALITY_RESULTS).parent.mkdir(parents=True, exist_ok=True)

# Save main results (the index is written as well: index=True)
cause_dist.to_csv(config.CAUSALITY_RESULTS, index=True)
print(f"Cause distribution saved to: {config.CAUSALITY_RESULTS}")

# Save reliability metrics
reliability_path = data_dir / 'reliability_metrics.csv'
reliability_metrics.to_csv(reliability_path, index=False)
print(f"Reliability metrics saved to: {reliability_path}")

# Save sequential patterns if found
if len(sequential_patterns) > 0:
    patterns_path = data_dir / 'sequential_patterns.csv'
    sequential_patterns.to_csv(patterns_path, index=False)
    print(f"Sequential patterns saved to: {patterns_path}")

print("\nCausality analysis complete!")

## Summary

This notebook has:
- ✅ Analyzed the cause frequency distribution and applied Pareto (80/20) analysis
- ✅ Examined cause evolution over time
- ✅ Mined association rules between causes
- ✅ Created co-occurrence and transition matrices
- ✅ Detected sequential patterns
- ✅ Calculated reliability metrics (MTBF, MTTR, failure rates)
- ✅ Analyzed cause severity
- ✅ Generated up to 6 visualizations, including an interactive Sankey diagram (drawn only when sequential patterns are found)

**Key Findings**: Review the outputs above for causality insights

**Next Steps**: Proceed to Notebook 04 (Spatial & Network Analysis)