# Alaska Oil & Gas Lease Analysis - Enhanced Geospatial Analysis

## Overview
This notebook provides comprehensive geospatial analysis of Alaska OCS lease data with advanced spatial statistics, confidence intervals for spatial patterns, and rigorous statistical validation of geographic trends.

## Technical Approach
- **Spatial Statistics**: Moran's I for spatial autocorrelation
- **Hot Spot Analysis**: Getis-Ord Gi* statistic with significance testing
- **Distance Analysis**: Nearest neighbor analysis with confidence intervals
- **Kernel Density**: Bandwidth optimization and statistical validation
- **Spatial Clustering**: DBSCAN with parameter optimization

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import folium
from folium import plugins
from scipy import stats
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: seaborn-flavoured matplotlib style with the viridis palette
plt.style.use('seaborn-v0_8')
sns.set_palette("viridis")

# Short status banner, including the GeoPandas version in use
for status_line in (
    "Geospatial libraries imported successfully",
    f"GeoPandas version: {gpd.__version__}",
    "Folium available: True",
):
    print(status_line)

## 1. Data Loading and Spatial Preparation

In [None]:
# Load the tabular (CSV) and the geometric (GeoJSON) view of the lease data
df = pd.read_csv('../data/AK_Leases.csv')
gdf = gpd.read_file('../data/AK_Leases.geojson')

print(f"CSV data shape: {df.shape}")
print(f"GeoJSON data shape: {gdf.shape}")
print(f"Coordinate Reference System: {gdf.crs}")

# Attribute columns to bring over from the CSV. Only columns that exist in
# the CSV and are NOT already present in the GeoJSON are merged: merging a
# column that exists on both sides would produce `_x` / `_y` suffixed names,
# and every later `'BID_AMOUNT' in ....columns` check would silently fail.
wanted_columns = ['BID_AMOUNT', 'CURRENT_AREA', 'ROYALTY_RATE', 'LEASE_IS_ACTIVE']
merge_columns = [
    col for col in wanted_columns
    if col in df.columns and col not in gdf.columns
]

if merge_columns and 'LEASE_NUMBER' in df.columns and 'LEASE_NUMBER' in gdf.columns:
    # Left merge keeps every geometry even when the CSV has no matching lease
    merged_gdf = gdf.merge(df[['LEASE_NUMBER'] + merge_columns],
                           on='LEASE_NUMBER', how='left')
    print(f"Merged data shape: {merged_gdf.shape}")
else:
    merged_gdf = gdf.copy()
    print("Using GeoJSON data directly")

# Extract centroids for point analysis. The centroid series is computed once
# and reused, rather than going through `merged_gdf.centroid`, which is
# ambiguous between the new 'centroid' column and the GeoDataFrame property.
centroids = merged_gdf.geometry.centroid
merged_gdf['centroid'] = centroids
merged_gdf['longitude'] = centroids.x
merged_gdf['latitude'] = centroids.y

# Clean data for analysis: rows without usable coordinates are dropped
spatial_df = merged_gdf.dropna(subset=['longitude', 'latitude']).copy()
print(f"\nSpatial analysis dataset: {len(spatial_df):,} leases with coordinates")

## 2. Spatial Autocorrelation Analysis

In [None]:
def calculate_morans_i(x, y, values, distance_threshold=None):
    """Calculate global Moran's I with a significance test.

    Spatial weights are built from the Euclidean distances between the
    (x, y) points and then row-standardized. The variance is the one under
    the randomization assumption (it uses the sample kurtosis ``b2``), and
    the p-value is two-sided from a normal approximation.

    Args:
        x: 1-D sequence of x coordinates.
        y: 1-D sequence of y coordinates, same length as ``x``.
        values: 1-D sequence of the attribute to test, same length as ``x``.
        distance_threshold: If ``None``, inverse-distance weights are used.
            Otherwise points within this distance (same units as x/y) get
            weight 1 and all others weight 0.

    Returns:
        dict with keys ``morans_i``, ``expected_i``, ``variance_i``,
        ``z_score`` and ``p_value``. Entries that cannot be computed are
        ``nan``: everything for fewer than 2 points, the statistic itself
        for constant values or when no point has a neighbour, and the
        variance / z-score / p-value for fewer than 4 points (the variance
        formula divides by ``(n-1)(n-2)(n-3)``).
    """
    values = np.asarray(values, dtype=float)
    n = len(values)

    result = {
        'morans_i': np.nan,
        'expected_i': np.nan,
        'variance_i': np.nan,
        'z_score': np.nan,
        'p_value': np.nan
    }
    if n < 2:
        # -1 / (n - 1) is undefined for a single point
        return result
    expected_i = -1 / (n - 1)
    result['expected_i'] = expected_i

    # Pairwise Euclidean distance matrix
    coords = np.column_stack([np.asarray(x, dtype=float),
                              np.asarray(y, dtype=float)])
    distances = squareform(pdist(coords))

    # Spatial weights matrix (inverse distance or binary), no self-weights
    if distance_threshold is None:
        # Small offset keeps coincident points from dividing by zero
        weights = 1 / (distances + 1e-10)
    else:
        weights = (distances <= distance_threshold).astype(float)
    np.fill_diagonal(weights, 0)

    # Row-standardize; rows without any neighbour stay all-zero
    row_sums = weights.sum(axis=1, keepdims=True)
    weights = np.divide(weights, row_sums,
                        out=np.zeros_like(weights), where=row_sums > 0)

    deviations = values - values.mean()
    sum_sq = np.sum(deviations**2)
    S0 = weights.sum()
    if not (sum_sq > 0 and S0 > 0):
        # Constant (or NaN) values, or no neighbour pairs: I is undefined
        return result

    # Quadratic form z' W z equals sum_ij w_ij z_i z_j without building the
    # n x n outer-product temporary
    morans_i = (n / S0) * (deviations @ weights @ deviations) / sum_sq
    result['morans_i'] = morans_i

    if n <= 3:
        return result

    # Variance under the randomization null hypothesis
    sym = weights + weights.T
    S1 = 0.5 * np.sum(sym**2)
    S2 = np.sum(sym.sum(axis=1)**2)
    b2 = n * np.sum(deviations**4) / sum_sq**2

    variance_i = ((n*((n**2 - 3*n + 3)*S1 - n*S2 + 3*S0**2) -
                   b2*((n**2 - n)*S1 - 2*n*S2 + 6*S0**2)) /
                  ((n-1)*(n-2)*(n-3)*S0**2)) - expected_i**2
    result['variance_i'] = variance_i

    # Z-score and two-sided p-value; the survival function keeps precision
    # for large |z| where 1 - cdf would round to exactly 0
    if variance_i > 0:
        z_score = (morans_i - expected_i) / np.sqrt(variance_i)
        result['z_score'] = z_score
        result['p_value'] = 2 * stats.norm.sf(abs(z_score))

    return result

# Spatial autocorrelation of the (log-transformed) bid amounts
if 'BID_AMOUNT' in spatial_df.columns:
    # Keep only leases that have a strictly positive, non-missing bid
    bid_spatial = spatial_df.dropna(subset=['BID_AMOUNT'])
    bid_spatial = bid_spatial[bid_spatial['BID_AMOUNT'] > 0]

    if not len(bid_spatial) > 10:
        print("Insufficient data for spatial autocorrelation analysis")
    else:
        # Bids are compared on a log10 scale
        log_bids = np.log10(bid_spatial['BID_AMOUNT'])

        bid_autocorr = calculate_morans_i(
            x=bid_spatial['longitude'],
            y=bid_spatial['latitude'],
            values=log_bids,
        )

        print("=== Spatial Autocorrelation Analysis ===")
        print(f"Log10(Bid Amount) Moran's I: {bid_autocorr['morans_i']:.4f}")
        print(f"Expected under null: {bid_autocorr['expected_i']:.4f}")
        print(f"Z-score: {bid_autocorr['z_score']:.4f}")
        print(f"P-value: {bid_autocorr['p_value']:.4f}")

        # A NaN p-value compares False here and therefore reads as "not significant"
        is_significant = bid_autocorr['p_value'] < 0.05
        if not is_significant:
            interpretation = "No significant spatial autocorrelation (random pattern)"
        else:
            above_expectation = bid_autocorr['morans_i'] > bid_autocorr['expected_i']
            interpretation = "Significant spatial autocorrelation detected" + (
                " (clustered pattern)" if above_expectation else " (dispersed pattern)"
            )

        print(f"Interpretation: {interpretation}")

## 3. Hot Spot Analysis with Statistical Significance

In [None]:
def getis_ord_gi_star(x, y, values, distance_threshold_km=50):
    """Calculate the Getis-Ord Gi* hot spot statistic for every point.

    Binary weights are used: every point within the distance threshold of
    point i (including i itself, which is what makes it Gi* rather than Gi)
    gets weight 1. Each Gi* value is a z-score; the p-value is two-sided
    from the standard normal distribution.

    Args:
        x: 1-D sequence of x coordinates (treated as degrees).
        y: 1-D sequence of y coordinates (treated as degrees).
        values: 1-D sequence of the attribute, same length as ``x``.
        distance_threshold_km: Neighbourhood radius in kilometres. It is
            converted to degrees with a flat 111 km per degree.

    Returns:
        Tuple ``(gi_stats, p_values)`` of float arrays with one entry per
        point. Points that have no neighbour besides themselves, or whose
        Gi* variance is not positive, get ``gi = 0`` and ``p = 1``.
    """
    values = np.asarray(values, dtype=float)
    n = len(values)

    gi_stats = np.zeros(n)
    p_values = np.ones(n)
    if n < 2:
        # No neighbour can exist; also avoids dividing by (n - 1) == 0
        return gi_stats, p_values

    coords = np.column_stack([np.asarray(x, dtype=float),
                              np.asarray(y, dtype=float)])

    # Convert distance threshold from km to degrees (approximate).
    # NOTE(review): 111 km/degree holds for latitude; a degree of longitude
    # is shorter away from the equator, so east-west neighbourhoods are
    # effectively wider than requested - confirm this is acceptable.
    distance_threshold = distance_threshold_km / 111.0

    # Binary neighbourhood matrix; the diagonal is 1 because d(i, i) = 0
    neighbours = (squareform(pdist(coords)) <= distance_threshold).astype(float)

    # Global moments do not depend on the point, so compute them once
    mean_val = np.mean(values)
    var_val = np.var(values)

    sum_weights = neighbours.sum(axis=1)
    weighted_sum = neighbours @ values

    # Expected value and variance of the local sum under the null
    expected_gi = sum_weights * mean_val
    variance_gi = (sum_weights * (n - sum_weights) * var_val) / (n - 1)

    # Need at least one neighbour besides the point itself and a positive
    # variance (NaN variances compare False and are skipped as well)
    computable = (sum_weights > 1) & (variance_gi > 0)
    gi_stats[computable] = ((weighted_sum[computable] - expected_gi[computable])
                            / np.sqrt(variance_gi[computable]))
    # Survival function keeps precision for large |Gi*|
    p_values[computable] = 2 * stats.norm.sf(np.abs(gi_stats[computable]))

    return gi_stats, p_values

# Perform hot spot analysis on the log bid amounts
if 'BID_AMOUNT' in spatial_df.columns and len(bid_spatial) > 10:
    gi_stats, gi_p_values = getis_ord_gi_star(
        bid_spatial['longitude'],
        bid_spatial['latitude'],
        log_bids,
        distance_threshold_km=50
    )

    # Add results to dataframe (copy first so the source frame is untouched)
    bid_spatial = bid_spatial.copy()
    bid_spatial['gi_star'] = gi_stats
    bid_spatial['gi_p_value'] = gi_p_values

    # Classify hot spots and cold spots by two-sided significance level.
    # The sign of Gi* gives the direction; the p-value gives the confidence:
    #   p < 0.01          -> 99% (|Gi*| > ~2.58)
    #   0.01 <= p < 0.05  -> 95% (|Gi*| > ~1.96)
    # Earlier z cut-offs of 1.96 / 1.65 were mislabelled as 99% / 95%, and
    # the 95% class could never be assigned when combined with p < 0.05.
    alpha = 0.05
    alpha_strict = 0.01
    significant_99 = gi_p_values < alpha_strict
    significant_95 = (gi_p_values < alpha) & ~significant_99
    is_hot = gi_stats > 0
    is_cold = gi_stats < 0

    bid_spatial['hotspot_type'] = 'Not Significant'
    bid_spatial.loc[is_hot & significant_99, 'hotspot_type'] = 'Hot Spot (99%)'
    bid_spatial.loc[is_hot & significant_95, 'hotspot_type'] = 'Hot Spot (95%)'
    bid_spatial.loc[is_cold & significant_99, 'hotspot_type'] = 'Cold Spot (99%)'
    bid_spatial.loc[is_cold & significant_95, 'hotspot_type'] = 'Cold Spot (95%)'

    print("\n=== Hot Spot Analysis Results ===")
    hotspot_counts = bid_spatial['hotspot_type'].value_counts()
    print(hotspot_counts)

    # Statistical summary
    print(f"\nGi* Statistics Summary:")
    print(f"Mean Gi*: {gi_stats.mean():.4f}")
    print(f"Std Gi*: {gi_stats.std():.4f}")
    print(f"Max Gi*: {gi_stats.max():.4f}")
    print(f"Min Gi*: {gi_stats.min():.4f}")
    print(f"Significant hot/cold spots: {np.sum(gi_p_values < alpha)} ({100*np.sum(gi_p_values < alpha)/len(gi_p_values):.1f}%)")

## 4. Spatial Clustering with Parameter Optimization

In [None]:
def optimize_dbscan_parameters(coords, eps_range=None, min_samples_range=None):
    """Grid-search DBSCAN ``eps`` / ``min_samples`` by silhouette score.

    Args:
        coords: 2-D array of point coordinates, one row per point.
        eps_range: Iterable of ``eps`` candidates. If ``None``, ten values
            are spread between the lower and upper quartile of each point's
            k-distance (k = 4, self included); non-positive and duplicate
            candidates are dropped.
        min_samples_range: Iterable of ``min_samples`` candidates. If
            ``None``, ``range(3, min(15, n // 10))`` is used, widened so
            that it always contains at least ``min_samples = 3``.

    Returns:
        Tuple ``(best_params, results)``. ``best_params`` is a dict with
        keys ``eps`` and ``min_samples`` for the highest silhouette score,
        or ``None`` if no candidate produced more than one cluster.
        ``results`` is a list with one dict per scored candidate.

    Note:
        The silhouette score is computed on all points with the DBSCAN
        labels as-is, so noise points (label -1) are scored as if they
        formed one additional cluster.
    """
    from sklearn.metrics import silhouette_score

    n_points = len(coords)

    if eps_range is None:
        # Calculate reasonable eps range based on k-distance
        k = 4
        if n_points < k:
            # Not enough points for the k-distance heuristic
            return None, []
        nbrs = NearestNeighbors(n_neighbors=k).fit(coords)
        distances, _ = nbrs.kneighbors(coords)
        k_distances = np.sort(distances[:, k-1])

        eps_range = np.linspace(k_distances[len(k_distances)//4],
                                k_distances[3*len(k_distances)//4], 10)
        # Duplicate coordinates can push the lower quartile to 0, which
        # DBSCAN rejects (eps must be positive); identical candidates would
        # only repeat the same fit.
        eps_range = np.unique(eps_range[eps_range > 0])

    if min_samples_range is None:
        # max(4, ...) keeps the range non-empty for fewer than 40 points
        min_samples_range = range(3, max(4, min(15, n_points//10)))

    # -inf so that even the worst possible silhouette (-1) can be selected
    best_score = -np.inf
    best_params = None
    results = []

    for eps in eps_range:
        for min_samples in min_samples_range:
            labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(coords)

            # Calculate metrics; label -1 marks noise and is not a cluster
            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
            n_noise = int(np.sum(labels == -1))

            # Silhouette needs at least 2 and fewer than n distinct labels
            if n_clusters > 1 and n_clusters < n_points - 1:
                score = silhouette_score(coords, labels)

                results.append({
                    'eps': eps,
                    'min_samples': min_samples,
                    'n_clusters': n_clusters,
                    'n_noise': n_noise,
                    'silhouette_score': score
                })

                if score > best_score:
                    best_score = score
                    best_params = {'eps': eps, 'min_samples': min_samples}

    return best_params, results

# Cluster the lease centroids with DBSCAN using tuned parameters
if len(spatial_df) > 20:
    # Coordinates as an (n, 2) array, standardized before clustering
    coords = spatial_df[['longitude', 'latitude']].values
    scaler = StandardScaler()
    coords_scaled = scaler.fit_transform(coords)

    print("Optimizing DBSCAN parameters...")
    best_params, optimization_results = optimize_dbscan_parameters(coords_scaled)

    if not best_params:
        print("Could not find optimal clustering parameters")
    else:
        print(f"\nOptimal DBSCAN parameters:")
        print(f"eps: {best_params['eps']:.4f}")
        print(f"min_samples: {best_params['min_samples']}")

        # Re-fit with the selected parameters to obtain the final labels
        optimal_dbscan = DBSCAN(eps=best_params['eps'],
                                min_samples=best_params['min_samples'])
        cluster_labels = optimal_dbscan.fit_predict(coords_scaled)

        # Store the labels on a copy of the analysis frame
        spatial_df = spatial_df.copy()
        spatial_df['cluster'] = cluster_labels

        # Label -1 is DBSCAN noise and does not count as a cluster
        distinct_labels = set(cluster_labels)
        n_clusters = len(distinct_labels) - (1 if -1 in cluster_labels else 0)
        n_noise = list(cluster_labels).count(-1)

        print(f"\n=== Spatial Clustering Results ===")
        print(f"Number of clusters: {n_clusters}")
        print(f"Number of noise points: {n_noise}")
        print(f"Clustered points: {len(spatial_df) - n_noise} ({100*(len(spatial_df) - n_noise)/len(spatial_df):.1f}%)")

        # Per-cluster bid summary and mean position, noise excluded
        if 'BID_AMOUNT' in spatial_df.columns:
            clustered_only = spatial_df[spatial_df['cluster'] != -1]
            cluster_stats = clustered_only.groupby('cluster').agg({
                'BID_AMOUNT': ['count', 'mean', 'median'],
                'longitude': ['mean'],
                'latitude': ['mean']
            }).round(4)

            print(f"\nCluster Statistics:")
            print(cluster_stats)

## 5. Advanced Spatial Visualization

In [None]:
# Create comprehensive spatial visualizations (2 x 2 panel figure)
fig, axes = plt.subplots(2, 2, figsize=(20, 16))

# 1. Basic lease distribution
axes[0, 0].scatter(spatial_df['longitude'], spatial_df['latitude'],
                   alpha=0.6, s=30, c='blue')
axes[0, 0].set_xlabel('Longitude')
axes[0, 0].set_ylabel('Latitude')
axes[0, 0].set_title(f'Lease Locations (n={len(spatial_df):,})')
axes[0, 0].grid(True, alpha=0.3)

# 2. Bid amount visualization (if available)
if 'BID_AMOUNT' in spatial_df.columns and len(bid_spatial) > 0:
    scatter = axes[0, 1].scatter(bid_spatial['longitude'], bid_spatial['latitude'],
                                 c=np.log10(bid_spatial['BID_AMOUNT']),
                                 s=50, alpha=0.7, cmap='viridis')
    plt.colorbar(scatter, ax=axes[0, 1], label='Log10(Bid Amount)')
    axes[0, 1].set_xlabel('Longitude')
    axes[0, 1].set_ylabel('Latitude')
    axes[0, 1].set_title('Lease Locations by Bid Amount')
    axes[0, 1].grid(True, alpha=0.3)
else:
    axes[0, 1].text(0.5, 0.5, 'Bid Amount Data\nNot Available',
                    ha='center', va='center', transform=axes[0, 1].transAxes)
    axes[0, 1].set_title('Bid Amount Analysis')

# 3. Hot spot analysis (if performed)
# The check looks for the 'hotspot_type' column written by the hot spot cell.
# Testing for a bare name `gi_star` never succeeds, because that name only
# exists inside getis_ord_gi_star(), so the panel would never be drawn.
hotspot_available = (
    'bid_spatial' in globals()
    and 'hotspot_type' in bid_spatial.columns
    and len(bid_spatial) > 0
)
if hotspot_available:
    # Single source of truth for class -> colour (scatter and legend)
    hotspot_colors = {
        'Hot Spot (99%)': 'red',
        'Hot Spot (95%)': 'orange',
        'Cold Spot (99%)': 'blue',
        'Cold Spot (95%)': 'lightblue',
    }
    colors = [hotspot_colors.get(hs_type, 'gray')
              for hs_type in bid_spatial['hotspot_type']]

    axes[1, 0].scatter(bid_spatial['longitude'], bid_spatial['latitude'],
                       c=colors, s=50, alpha=0.7)
    axes[1, 0].set_xlabel('Longitude')
    axes[1, 0].set_ylabel('Latitude')
    axes[1, 0].set_title('Hot Spot Analysis (Getis-Ord Gi*)')
    axes[1, 0].grid(True, alpha=0.3)

    # Legend with one proxy marker per class that actually occurs
    legend_elements = [
        plt.Line2D([0], [0], marker='o', color='w',
                   markerfacecolor=hotspot_colors.get(hs_type, 'gray'),
                   markersize=8, label=hs_type)
        for hs_type in bid_spatial['hotspot_type'].unique()
    ]
    axes[1, 0].legend(handles=legend_elements, loc='best', fontsize=8)
else:
    axes[1, 0].text(0.5, 0.5, 'Hot Spot Analysis\nNot Available',
                    ha='center', va='center', transform=axes[1, 0].transAxes)
    axes[1, 0].set_title('Hot Spot Analysis')

# 4. Spatial clustering (if performed)
if 'cluster' in spatial_df.columns:
    # One colour per distinct label; noise (-1) is drawn in black instead
    unique_clusters = spatial_df['cluster'].unique()
    cluster_colors = plt.cm.Set1(np.linspace(0, 1, len(unique_clusters)))

    for i, cluster in enumerate(unique_clusters):
        mask = spatial_df['cluster'] == cluster
        if cluster == -1:
            # Noise points
            axes[1, 1].scatter(spatial_df.loc[mask, 'longitude'],
                               spatial_df.loc[mask, 'latitude'],
                               c='black', s=20, alpha=0.5, label='Noise')
        else:
            axes[1, 1].scatter(spatial_df.loc[mask, 'longitude'],
                               spatial_df.loc[mask, 'latitude'],
                               c=[cluster_colors[i]], s=50, alpha=0.7,
                               label=f'Cluster {cluster}')

    axes[1, 1].set_xlabel('Longitude')
    axes[1, 1].set_ylabel('Latitude')
    axes[1, 1].set_title('Spatial Clusters (DBSCAN)')
    axes[1, 1].grid(True, alpha=0.3)
    axes[1, 1].legend(loc='best', fontsize=8)
else:
    axes[1, 1].text(0.5, 0.5, 'Spatial Clustering\nNot Available',
                    ha='center', va='center', transform=axes[1, 1].transAxes)
    axes[1, 1].set_title('Spatial Clustering')

plt.tight_layout()
plt.show()

## 6. Statistical Summary and Confidence Intervals

In [None]:
def calculate_spatial_confidence_intervals(x, y, confidence=0.95):
    """Calculate confidence intervals for the spatial center and dispersion.

    The mean center gets an ordinary t-interval per coordinate. The
    standard distance is the root-mean-square distance of the points from
    the mean center; its interval is a t-interval whose standard error is
    obtained with the delta method from the standard error of the mean
    squared distance, ``SE(sqrt(m)) ~= SE(m) / (2 * sqrt(m))``.

    Args:
        x: 1-D sequence of x coordinates.
        y: 1-D sequence of y coordinates, same length as ``x``.
        confidence: Confidence level of the intervals, e.g. 0.95.

    Returns:
        dict with keys ``mean_center`` (x, y), ``ci_center_x``,
        ``ci_center_y``, ``standard_distance``, ``ci_standard_distance``
        (each CI a ``(lower, upper)`` tuple) and ``n``. With fewer than two
        points the intervals are ``nan``.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    n = len(x)

    # Mean center
    mean_x = np.mean(x)
    mean_y = np.mean(y)

    # Standard errors from the sample standard deviation (ddof=1), which is
    # what a t-interval with n - 1 degrees of freedom assumes
    if n > 1:
        se_x = np.std(x, ddof=1) / np.sqrt(n)
        se_y = np.std(y, ddof=1) / np.sqrt(n)
    else:
        se_x = se_y = np.nan

    # Confidence intervals for center
    alpha = 1 - confidence
    t_critical = stats.t.ppf(1 - alpha/2, n - 1)

    ci_x = (mean_x - t_critical * se_x, mean_x + t_critical * se_x)
    ci_y = (mean_y - t_critical * se_y, mean_y + t_critical * se_y)

    # Standard distance (measure of dispersion): sqrt of mean squared distance
    squared_distances = (x - mean_x)**2 + (y - mean_y)**2
    std_distance = np.sqrt(np.mean(squared_distances))

    # Standard error of the standard distance via the delta method, so the
    # interval is centred on and scaled for the same statistic
    if n > 1 and std_distance > 0:
        se_mean_sq = np.std(squared_distances, ddof=1) / np.sqrt(n)
        se_std_dist = se_mean_sq / (2 * std_distance)
    elif n > 1:
        # All points coincide: no dispersion and no uncertainty about it
        se_std_dist = 0.0
    else:
        se_std_dist = np.nan
    ci_std_dist = (std_distance - t_critical * se_std_dist,
                   std_distance + t_critical * se_std_dist)

    return {
        'mean_center': (mean_x, mean_y),
        'ci_center_x': ci_x,
        'ci_center_y': ci_y,
        'standard_distance': std_distance,
        'ci_standard_distance': ci_std_dist,
        'n': n
    }

# Calculate spatial statistics with confidence intervals
spatial_stats = calculate_spatial_confidence_intervals(
    spatial_df['longitude'],
    spatial_df['latitude']
)

print("=== COMPREHENSIVE GEOSPATIAL ANALYSIS SUMMARY ===")
print(f"\n1. DATASET CHARACTERISTICS:")
print(f"   • Total lease locations: {len(spatial_df):,}")
print(f"   • Coordinate system: {gdf.crs if hasattr(gdf, 'crs') else 'Not specified'}")
print(f"   • Longitude range: {spatial_df['longitude'].min():.4f} to {spatial_df['longitude'].max():.4f}")
print(f"   • Latitude range: {spatial_df['latitude'].min():.4f} to {spatial_df['latitude'].max():.4f}")

print(f"\n2. SPATIAL CENTER AND DISPERSION (95% CI):")
print(f"   • Mean center: ({spatial_stats['mean_center'][0]:.4f}, {spatial_stats['mean_center'][1]:.4f})")
print(f"   • Longitude CI: [{spatial_stats['ci_center_x'][0]:.4f}, {spatial_stats['ci_center_x'][1]:.4f}]")
print(f"   • Latitude CI: [{spatial_stats['ci_center_y'][0]:.4f}, {spatial_stats['ci_center_y'][1]:.4f}]")
print(f"   • Standard distance: {spatial_stats['standard_distance']:.4f}")
print(f"   • Std distance CI: [{spatial_stats['ci_standard_distance'][0]:.4f}, {spatial_stats['ci_standard_distance'][1]:.4f}]")

# The Moran's I results live in `bid_autocorr`; a bare name `morans_i` only
# exists inside calculate_morans_i(), so testing for it would always skip
# this section.
if 'bid_autocorr' in globals():
    print(f"\n3. SPATIAL AUTOCORRELATION:")
    print(f"   • Moran's I: {bid_autocorr['morans_i']:.4f}")
    print(f"   • Z-score: {bid_autocorr['z_score']:.4f}")
    print(f"   • P-value: {bid_autocorr['p_value']:.4f}")
    # A significant result is clustering only when I exceeds its
    # expectation; below the expectation it indicates dispersion.
    if bid_autocorr['p_value'] < 0.05:
        if bid_autocorr['morans_i'] > bid_autocorr['expected_i']:
            autocorr_summary = 'Significant clustering'
        else:
            autocorr_summary = 'Significant dispersion'
    else:
        autocorr_summary = 'Random spatial pattern'
    print(f"   • Interpretation: {autocorr_summary}")

if 'gi_stats' in globals():
    print(f"\n4. HOT SPOT ANALYSIS:")
    print(f"   • Significant hot/cold spots: {np.sum(gi_p_values < 0.05)} ({100*np.sum(gi_p_values < 0.05)/len(gi_p_values):.1f}%)")
    print(f"   • Maximum Gi* statistic: {gi_stats.max():.4f}")
    print(f"   • Minimum Gi* statistic: {gi_stats.min():.4f}")
    print(f"   • Hot spots detected: {np.sum((gi_stats > 1.65) & (gi_p_values < 0.05))}")
    print(f"   • Cold spots detected: {np.sum((gi_stats < -1.65) & (gi_p_values < 0.05))}")

if 'cluster_labels' in globals():
    print(f"\n5. SPATIAL CLUSTERING:")
    print(f"   • Optimal clusters identified: {n_clusters}")
    print(f"   • Noise points: {n_noise} ({100*n_noise/len(spatial_df):.1f}%)")
    print(f"   • Clustering efficiency: {100*(len(spatial_df) - n_noise)/len(spatial_df):.1f}%")
    if best_params:
        print(f"   • Optimal eps parameter: {best_params['eps']:.4f}")
        print(f"   • Optimal min_samples: {best_params['min_samples']}")

print(f"\n6. STATISTICAL CONFIDENCE:")
print(f"   • All confidence intervals calculated at 95% level")
print(f"   • Spatial autocorrelation tested with null hypothesis of randomness")
print(f"   • Hot spot analysis uses Getis-Ord Gi* with significance testing")
print(f"   • Clustering parameters optimized using silhouette analysis")

print(f"\n7. LIMITATIONS AND ASSUMPTIONS:")
print(f"   • Assumes spatial stationarity (consistent patterns across study area)")
print(f"   • Point-based analysis may not reflect true lease boundaries")
print(f"   • Distance calculations use Euclidean distance (appropriate for small areas)")
print(f"   • Clustering assumes spatial proximity indicates similarity")
print(f"   • Statistical tests assume normality of residuals")