# In [None]:
# notebooks/maritime_analysis.ipynb
"""
# Maritime Data Analysis - Data Science Portfolio Project

This notebook demonstrates advanced data analysis techniques applied to maritime data.
Perfect for showcasing Data Science skills in job interviews.

Author: [Your Name]
Project: BergNavn Maritime Intelligence Platform
Date: December 2024
"""

# Cell 1: Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import json
import requests
import folium
from folium.plugins import HeatMap
import warnings
warnings.filterwarnings('ignore')

# Cell 2: Load data from API
# Binds `route_data` from the first source that works:
#   1. the analytics endpoint under BASE_URL (HTTP 200),
#   2. the bundled sample file (any other HTTP status),
#   3. 50 simulated routes (request, file or JSON decoding failure).
print("üì° Loading maritime data from API...")
BASE_URL = "http://localhost:5000"  # Change to your server URL

try:
    # Try to get route data
    response = requests.get(f"{BASE_URL}/maritime/api/analytics/route-statistics", timeout=10)
    if response.status_code == 200:
        route_data = response.json()
        print("‚úÖ Route data loaded successfully")
    else:
        print("‚ö†Ô∏è Could not load route data, using sample data")
        # Load sample data
        with open('../backend/assets/sample_routes.json', 'r', encoding='utf-8') as f:
            route_data = json.load(f)
except (requests.RequestException, OSError, ValueError):
    # Only the expected failure modes trigger the simulated fallback:
    #   requests.RequestException - connection errors, timeouts, bad responses
    #   OSError                   - sample file missing or unreadable
    #   ValueError                - response body / sample file is not valid JSON
    # A bare `except:` would also swallow KeyboardInterrupt and genuine bugs.
    print("‚ùå API not available, using simulated data")
    # Create simulated data
    np.random.seed(42)  # fixed seed so the simulated routes are reproducible
    route_data = {
        'visualizations': {
            'scatter_data': [
                {'x': np.random.uniform(50, 500), 'y': np.random.randint(5, 50),
                 'city': np.random.choice(['bergen', 'oslo', 'stavanger']), 'name': f'Route_{i}'}
                for i in range(50)
            ]
        }
    }

# Cell 3: Create DataFrame
# Builds `df` from route_data['visualizations']['scatter_data']; when that
# yields an empty frame, `df` is replaced by 50 randomly generated rows.
print("\nüìä Creating DataFrame for analysis...")
visualizations = route_data.get('visualizations', {})
scatter_data = visualizations.get('scatter_data', [])
df = pd.DataFrame(scatter_data)

if df.empty:
    print("‚ùå No data available")
    # Create dummy data (columns x, y, city only)
    fallback_rows = 50
    df = pd.DataFrame({
        'x': np.random.uniform(50, 500, fallback_rows),
        'y': np.random.randint(5, 50, fallback_rows),
        'city': np.random.choice(['bergen', 'oslo', 'stavanger', 'trondheim'], fallback_rows),
    })
else:
    print(f"‚úÖ Loaded {len(df)} routes")
    print("\nFirst 5 rows:")
    print(df.head())

# Cell 4: Descriptive Statistics
# Prints summary statistics for the numeric columns and the row count per city.
section_rule = "=" * 50
print(f"\n{section_rule}")
print("DESCRIPTIVE STATISTICS")
print(section_rule)

print("\nüìà Basic Statistics:")
numeric_columns = df[['x', 'y']]
print(numeric_columns.describe())

print("\nüèôÔ∏è Routes by City:")
city_column = df['city']
print(city_column.value_counts())

# Cell 5: Data Visualization
# Draws one figure with two side-by-side panels: a histogram of `x` and a
# scatter of `x` against `y` coloured by city.
print(f"\n{'=' * 50}")
print("DATA VISUALIZATION")
print("=" * 50)

fig, (ax_hist, ax_scatter) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: Distribution of Route Distances
ax_hist.hist(df['x'], bins=15, alpha=0.7, color='skyblue', edgecolor='black')
ax_hist.set_title('Distribution of Route Distances', fontsize=14, fontweight='bold')
ax_hist.set_xlabel('Distance (nautical miles)')
ax_hist.set_ylabel('Frequency')
ax_hist.grid(alpha=0.3)

# Right panel: Distance vs Waypoints
# NOTE: the colours (and therefore the colorbar ticks) are the integer codes
# produced by pd.factorize, not the city names themselves.
city_codes = pd.factorize(df['city'])[0]
scatter = ax_scatter.scatter(
    df['x'],
    df['y'],
    c=city_codes,
    alpha=0.6,
    cmap='viridis',
    s=50,
)
ax_scatter.set_title('Distance vs Waypoints by City', fontsize=14, fontweight='bold')
ax_scatter.set_xlabel('Distance (nm)')
ax_scatter.set_ylabel('Number of Waypoints')
fig.colorbar(scatter, ax=ax_scatter, label='City')
ax_scatter.grid(alpha=0.3)

fig.tight_layout()
plt.show()

# Cell 6: Statistical Analysis
# Reports the x/y correlation and a Welch t-test (equal_var=False) on `x`
# between the first two distinct cities found in the data.
print(f"\n{'=' * 50}")
print("STATISTICAL ANALYSIS")
print("=" * 50)

from scipy import stats

# Correlation analysis: off-diagonal entry of the 2x2 correlation matrix
correlation_matrix = df[['x', 'y']].corr()
correlation = correlation_matrix.loc['x', 'y']
print(f"üìä Correlation between distance and waypoints: {correlation:.3f}")

# T-test between city groups (simplified: only the first two cities are compared)
if 'city' in df.columns:
    distinct_cities = df['city'].unique()
    if len(distinct_cities) >= 2:
        cities = distinct_cities[:2]
        group1 = df.loc[df['city'] == cities[0], 'x']
        group2 = df.loc[df['city'] == cities[1], 'x']

        # The test is only run when both groups have at least two observations.
        if min(len(group1), len(group2)) > 1:
            t_stat, p_value = stats.ttest_ind(group1, group2, equal_var=False)
            print(f"üìã T-test between {cities[0]} and {cities[1]}:")
            print(f"   t-statistic = {t_stat:.3f}, p-value = {p_value:.3f}")

# Cell 7: Machine Learning - Anomaly Detection
# Flags unusual routes with an Isolation Forest fitted on the standardised
# (x, y) columns. Adds `anomaly_score` and `is_outlier` columns to `df` and
# binds `outliers` to the flagged rows.
print("\n" + "="*50)
print("MACHINE LEARNING: ANOMALY DETECTION")
print("="*50)

from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# Start from an empty selection so `outliers` is always defined for later
# cells, even when the model is skipped below, and never left over from a
# previous run of the notebook.
outliers = df.iloc[0:0]

# Prepare features
# NOTE(review): missing values become 0, which may itself look anomalous for
# a distance/waypoint count - confirm this imputation is intended.
features = df[['x', 'y']].fillna(0)
if len(features) > 10:
    # Scale features
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)

    # Apply Isolation Forest
    iso_forest = IsolationForest(contamination=0.1, random_state=42)
    # Despite the column name, fit_predict returns labels rather than scores;
    # the label -1 is what is treated as an outlier on the next line.
    df['anomaly_score'] = iso_forest.fit_predict(features_scaled)
    df['is_outlier'] = df['anomaly_score'] == -1

    outliers = df[df['is_outlier']]
    print(f"üîç Found {len(outliers)} outlier routes ({len(outliers)/len(df)*100:.1f}%)")

    if not outliers.empty:
        print("\nüìù Sample outliers:")
        print(outliers[['x', 'y', 'city']].head())
else:
    print("‚ö†Ô∏è Not enough data for anomaly detection")

# Cell 8: Geospatial Analysis
# Builds a folium map with a heatmap layer over hard-coded sample coordinates
# and one marker per entry in `ports`, then displays it inline.
print(f"\n{'=' * 50}")
print("GEOSPATIAL ANALYSIS")
print("=" * 50)

# Create interactive map
print("\nüó∫Ô∏è Creating interactive heatmap...")

# Sample coordinates for Norwegian coast (in reality, use actual waypoints)
norway_coords = [
    [58.1467, 8.0980],
    [58.9699, 5.7331],
    [60.3913, 5.3221],
    [59.9139, 10.7522],
    [63.4305, 10.3951],
    [62.4722, 6.1497],
]

# Major ports to mark on the map, as name -> [latitude, longitude]
ports = {
    'Bergen': [60.3913, 5.3221],
    'Oslo': [59.9139, 10.7522],
    'Stavanger': [58.9699, 5.7331],
    'Trondheim': [63.4305, 10.3951],
    '√Ölesund': [62.4722, 6.1497],
}

# Create map centered on Norway
m = folium.Map(location=[63.0, 10.0], zoom_start=5, tiles='CartoDB positron')

# Heatmap layer: the sample point list is repeated five times
heat_points = norway_coords * 5
heat_layer = HeatMap(heat_points, radius=15, blur=10)
heat_layer.add_to(m)

# One blue anchor marker per port; each marker gets its own Icon instance
for port, coords in ports.items():
    port_marker = folium.Marker(
        coords,
        popup=f"<b>{port}</b><br>Major Norwegian Port",
        icon=folium.Icon(color='blue', icon='anchor'),
    )
    port_marker.add_to(m)

# Display map
print("‚úÖ Map created successfully")
display(m)  # This works in Jupyter
# To save: m.save('maritime_heatmap.html')

# Cell 9: Business Insights
# Prints a fixed list of insights and recommendations; nothing here is
# computed from the data.
print(f"\n{'=' * 50}")
print("BUSINESS INSIGHTS & RECOMMENDATIONS")
print("=" * 50)

key_insights = (
    "1. Route Optimization: Average distance shows potential for optimization",
    "2. Outlier Detection: Identified unusual routes for further investigation",
    "3. City Patterns: Different cities show distinct route characteristics",
)
recommendations = (
    "1. Implement real-time route optimization for fuel savings",
    "2. Set up automated alerts for anomalous routes",
    "3. Develop predictive maintenance based on route patterns",
)

print("\nüí° Key Insights:")
for insight_line in key_insights:
    print(insight_line)

print("\nüéØ Recommendations:")
for recommendation_line in recommendations:
    print(recommendation_line)

# Cell 10: Export Analysis Results
# Writes a small summary of the analysis to 'analysis_results.json' in the
# current working directory.
print("\n" + "="*50)
print("EXPORTING RESULTS")
print("="*50)


def _finite_or_none(value):
    """Return *value* as a float, or None when it is NaN or infinite.

    json.dump would otherwise write the bare tokens NaN/Infinity, which are
    not valid JSON; None is written as null instead.
    """
    number = float(value)
    return number if np.isfinite(number) else None


# Save analysis summary
analysis_summary = {
    'timestamp': datetime.now().isoformat(),
    'total_routes_analyzed': len(df),
    # The means and the correlation can be NaN (e.g. empty frame or a
    # constant column), so they are sanitised before serialisation.
    'average_distance': _finite_or_none(df['x'].mean()),
    'average_waypoints': _finite_or_none(df['y'].mean()),
    # `outliers` only exists if the anomaly-detection cell produced it.
    'outliers_detected': len(outliers) if 'outliers' in locals() else 0,
    'correlation_distance_waypoints': _finite_or_none(correlation)
}

# Save to JSON
with open('analysis_results.json', 'w', encoding='utf-8') as f:
    json.dump(analysis_summary, f, indent=2)

print("‚úÖ Analysis results saved to 'analysis_results.json'")
print("\n" + "="*50)
print("ANALYSIS COMPLETE - Ready for Data Science Portfolio!")
print("="*50)