# Step 6: Spatial Analysis & Network Visualization

## Objective
Perform spatial statistical analysis and prepare data for GIS visualization.

### Tasks:
1. Spatial autocorrelation analysis (Moran's I)
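2. Hotspot analysis (Getis-Ord Gi*) — not computed in this notebook; deferred to the ArcGIS hotspot/coldspot mapping listed under Next Steps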
3. Create influence flow matrices
4. Generate network visualization data
5. Export GIS-ready datasets

In [None]:
# Core data handling and standard-library helpers.
import pandas as pd
import numpy as np
import os
import json
import warnings
# Silences every warning category for the rest of the session, including
# deprecation and numerical warnings raised by the libraries below.
warnings.filterwarnings('ignore')

# Spatial analysis
# NOTE(review): cdist and stats are not referenced in the cells visible here —
# confirm whether they are still needed.
from scipy.spatial.distance import cdist
from scipy import stats

# Visualization
# NOTE(review): seaborn is not referenced in the cells visible here — confirm.
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx

print("Libraries loaded successfully!")

In [None]:
# Pollutants analysed throughout the notebook.
TARGET_POLLUTANTS = ['CO', 'NO2', 'PM10']

# Input / output locations (relative to the notebook's working directory).
DATA_PATH = './processed_data/'
SHAP_PATH = './shap_outputs/'
OUTPUT_PATH = './spatial_outputs/'
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Engineered feature table.
# NOTE: unpickling executes arbitrary code — only load files you produced yourself.
data = pd.read_pickle(os.path.join(DATA_PATH, 'features_engineered.pkl'))

# Both matrices are stored with their labels in the first CSV column.
distance_matrix, adjacency_matrix = (
    pd.read_csv(os.path.join(DATA_PATH, csv_name), index_col=0)
    for csv_name in ('distance_matrix.csv', 'adjacency_matrix.csv')
)

# Neighbor lookup loaded from JSON (used later via neighbors_dict.get(country, [])).
with open(os.path.join(DATA_PATH, 'neighbors_dict.json'), 'r') as f:
    neighbors_dict = json.load(f)

print(f"✓ Data loaded: {data.shape}")

## 6.1 Calculate Global Moran's I (Spatial Autocorrelation)

In [None]:
def calculate_morans_i(values, spatial_weights):
    """Calculate the Global Moran's I statistic.

    Computes

        I = (n / W) * sum_ij w_ij (x_i - mean)(x_j - mean) / sum_i (x_i - mean)^2

    where ``n`` is the number of observations and ``W`` is the sum of all
    spatial weights.

    Parameters
    ----------
    values : array-like of length n
        Observed values, one per spatial unit. Lists, NumPy arrays and
        pandas Series are accepted; the input is flattened to 1-D and
        positions (not labels) are used.
    spatial_weights : array-like of shape (n, n)
        Spatial weight matrix whose row/column order matches ``values``.
        Nested lists, NumPy arrays and pandas DataFrames are accepted.

    Returns
    -------
    float or int
        Moran's I. Returns ``0`` when the statistic is undefined: empty
        input, zero variance in ``values``, or weights that sum to zero.

    Raises
    ------
    ValueError
        If ``spatial_weights`` is not an ``n x n`` matrix.
    """
    # Coerce to plain float arrays so indexing is positional regardless of
    # whether the caller passed lists, ndarrays, Series or DataFrames.
    values = np.asarray(values, dtype=float).ravel()
    spatial_weights = np.asarray(spatial_weights, dtype=float)
    n = len(values)

    if n == 0:
        # Nothing to correlate; also avoids a mean-of-empty warning.
        return 0

    if spatial_weights.shape != (n, n):
        raise ValueError(
            f"spatial_weights must have shape ({n}, {n}), "
            f"got {spatial_weights.shape}"
        )

    deviations = values - values.mean()

    # Denominator: total squared deviation from the mean.
    denominator = np.sum(deviations ** 2)

    # Sum of weights
    W = np.sum(spatial_weights)

    if denominator == 0 or W == 0:
        return 0

    # Numerator: sum_ij w_ij * d_i * d_j, written as the quadratic form d' W d
    # instead of an explicit double loop over all pairs.
    numerator = deviations @ spatial_weights @ deviations

    return (n / W) * (numerator / denominator)

# Per-country mean concentration of every target pollutant.
country_avg = data.groupby('country')[TARGET_POLLUTANTS].mean()

# Re-order the adjacency matrix so its rows/columns follow country_avg's index.
spatial_weights = adjacency_matrix.loc[country_avg.index, country_avg.index].values

print("GLOBAL MORANS I (Spatial Autocorrelation)")
print("="*80)

# pollutant -> Global Moran's I
morans_results = {}
for pollutant in TARGET_POLLUTANTS:
    morans_i = calculate_morans_i(country_avg[pollutant].values, spatial_weights)
    morans_results[pollutant] = morans_i

    # Pick the interpretation line: > 0.3 is reported as strong clustering,
    # any other positive value as weak, everything else as negative/none.
    if morans_i > 0.3:
        interpretation = "  → Strong positive spatial autocorrelation (clustered)"
    elif morans_i > 0:
        interpretation = "  → Weak positive spatial autocorrelation"
    else:
        interpretation = "  → Negative or no spatial autocorrelation"

    print(f"{pollutant}: Moran's I = {morans_i:.4f}")
    print(interpretation)

# Persist the statistics as a single-row CSV (one column per pollutant).
results_path = os.path.join(OUTPUT_PATH, 'morans_i_results.csv')
pd.DataFrame([morans_results]).to_csv(results_path, index=False)
print("\n✓ Moran's I calculated")

## 6.2 Create Influence Flow Matrix

In [None]:
# Build one country-by-country influence matrix per pollutant from the
# SHAP importance tables; pollutants without a SHAP file are skipped.
influence_matrices = {}

for pollutant in TARGET_POLLUTANTS:
    shap_file = os.path.join(SHAP_PATH, f'shap_importance_{pollutant}.csv')
    if not os.path.exists(shap_file):
        continue

    shap_importance = pd.read_csv(shap_file)

    # Overall neighbor effect: summed mean |SHAP| of every feature whose
    # name contains 'neighbor'.
    is_neighbor_feature = shap_importance['feature'].str.contains('neighbor')
    neighbor_importance = shap_importance[is_neighbor_feature]['mean_abs_shap'].sum()

    # Square matrix (country → country), initialised to zero.
    countries = adjacency_matrix.index.tolist()
    influence_matrix = pd.DataFrame(0.0, index=countries, columns=countries)

    # Fill only cells for listed neighbors: the shared neighbor importance
    # is scaled inversely by the pairwise distance.
    for country_i in countries:
        for neighbor in neighbors_dict.get(country_i, []):
            if neighbor not in countries:
                continue
            dist = distance_matrix.loc[country_i, neighbor]
            if dist > 0:
                influence_matrix.loc[country_i, neighbor] = neighbor_importance / dist * 1000

    influence_matrices[pollutant] = influence_matrix

    # Save
    matrix_path = os.path.join(OUTPUT_PATH, f'influence_matrix_{pollutant}.csv')
    influence_matrix.to_csv(matrix_path)
    print(f"✓ Influence matrix created for {pollutant}")

print("\n✓ All influence matrices created")

## 6.3 Network Visualization

In [None]:
# Draw one directed influence network per pollutant that has an influence matrix.
for pollutant in TARGET_POLLUTANTS:
    if pollutant not in influence_matrices:
        continue

    influence_matrix = influence_matrices[pollutant]

    # Directed graph with one node per country.
    G = nx.DiGraph()
    G.add_nodes_from(influence_matrix.index)

    # Keep only edges stronger than the mean of the positive entries
    # (threshold is 0 when the matrix has no positive entry).
    positive_entries = influence_matrix.values[influence_matrix.values > 0]
    threshold = positive_entries.mean() if positive_entries.size else 0

    for row_pos, source in enumerate(influence_matrix.index):
        for col_pos, target in enumerate(influence_matrix.columns):
            weight = influence_matrix.iloc[row_pos, col_pos]
            if weight > threshold:
                G.add_edge(source, target, weight=weight)

    # Layout and canvas.
    plt.figure(figsize=(16, 12))
    pos = nx.spring_layout(G, k=2, iterations=50)

    # Nodes.
    nx.draw_networkx_nodes(
        G, pos,
        node_size=1000, node_color='lightblue',
        alpha=0.9, edgecolors='black', linewidths=2,
    )

    # Edges: line width is proportional to weight, the strongest edge gets width 5.
    weights = [edge_weight for _, _, edge_weight in G.edges(data='weight')]
    max_weight = max(weights) if weights else 1
    widths = [5 * (edge_weight / max_weight) for edge_weight in weights]

    nx.draw_networkx_edges(
        G, pos,
        width=widths, alpha=0.5,
        edge_color='red', arrows=True, arrowsize=20,
    )

    # Country labels.
    nx.draw_networkx_labels(G, pos, font_size=10, font_weight='bold')

    plt.title(
        f'Transboundary Pollution Influence Network - {pollutant}',
        fontsize=16, fontweight='bold', pad=20,
    )
    plt.axis('off')
    plt.tight_layout()

    figure_path = os.path.join(OUTPUT_PATH, f'network_{pollutant}.png')
    plt.savefig(figure_path, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"✓ Network visualization created for {pollutant}")

print("\n✓ Network visualizations completed")

## 6.4 Export GIS-Ready Data

In [None]:
# One row per country: first recorded coordinates plus mean pollutant
# levels and mean neighbor-average levels.
country_aggregations = {
    'latitude': 'first',
    'longitude': 'first',
    'CO': 'mean',
    'NO2': 'mean',
    'PM10': 'mean',
    'CO_neighbor_mean': 'mean',
    'NO2_neighbor_mean': 'mean',
    'PM10_neighbor_mean': 'mean'
}
gis_export = data.groupby('country').agg(country_aggregations).reset_index()

# Attach influence totals for every pollutant that has an influence matrix.
for pollutant in TARGET_POLLUTANTS:
    if pollutant not in influence_matrices:
        continue

    pollutant_matrix = influence_matrices[pollutant]

    # Incoming influence = sum of the country's column.
    incoming_totals = []
    for country_name in gis_export['country']:
        incoming_totals.append(pollutant_matrix[country_name].sum())
    gis_export[f'{pollutant}_incoming_influence'] = incoming_totals

    # Outgoing influence = sum of the country's row.
    outgoing_totals = []
    for country_name in gis_export['country']:
        outgoing_totals.append(pollutant_matrix.loc[country_name].sum())
    gis_export[f'{pollutant}_outgoing_influence'] = outgoing_totals

# Write the table in both CSV and Excel form for GIS import.
gis_export.to_csv(os.path.join(OUTPUT_PATH, 'gis_country_data.csv'), index=False)
gis_export.to_excel(os.path.join(OUTPUT_PATH, 'gis_country_data.xlsx'), index=False)

print("✓ GIS-ready data exported")
print(f"\nGIS Export columns: {list(gis_export.columns)}")
print(f"Shape: {gis_export.shape}")
print("\nPreview:")
print(gis_export.head())

banner = "="*80
print("\n" + banner)
print("✓ SPATIAL ANALYSIS COMPLETED")
print(banner)

## Summary

### Completed Tasks:
1. ✓ Calculated Moran's I for spatial autocorrelation
2. ✓ Created influence flow matrices
3. ✓ Generated network visualizations
4. ✓ Exported GIS-ready datasets

### Outputs:
- morans_i_results.csv
- influence_matrix_<pollutant>.csv
- network_<pollutant>.png
- gis_country_data.csv/xlsx

### Next Steps:
**Use in ArcGIS:**
- Import gis_country_data
- Create spatial joins with country shapefiles
- Generate influence flow maps
- Create hotspot/coldspot maps
- Build interactive dashboards