# %%
"""
MINE DETECTION AND CLASSIFICATION - COMPLETE ANALYSIS
=====================================================

This notebook demonstrates:
1. Data Loading and Exploration
2. Data Preprocessing
3. Classification (Random Forest + Neural Networks)
4. Clustering Analysis
5. Results Comparison

Authors: [Gaia Luna Acosta, Bujar Cysa]
Date: [01/12/2025]
"""

# %% [markdown]
# # 1. Setup and Imports

# %%
# Standard library
import os
import sys
import warnings

# Third-party
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): this silences *every* warning (including convergence and
# deprecation warnings from the ML libraries). Kept to preserve the existing
# notebook output; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Import custom modules.
# The path is relative to the working directory; the guard keeps re-running
# this cell from appending the same entry again.
if '../src' not in sys.path:
    sys.path.append('../src')

from data_loader import MineDataLoader
from preprocessing import MineDataPreprocessor
from classification import RandomForestOptimizer
from neural_network import NeuralNetworkExperiment
from clustering import ClusteringAnalysis

# Later cells write figures and models below ../results. plt.savefig does not
# create missing directories, so make sure they exist before anything is saved
# (presumably the model-saving helpers need the directory to exist as well).
for output_dir in ('../results/figures', '../results/models'):
    os.makedirs(output_dir, exist_ok=True)

print("✓ All imports successful!")

# %% [markdown]
# # 2. Load and Explore Data

# %%
# Read the cleaned mine dataset through the project loader
clean_csv_path = '../data/processed/mine_data_clean.csv'
loader = MineDataLoader(clean_csv_path)
df = loader.load_data(file_format='csv')

# Loader-provided overview of what was read
loader.print_summary()

# Preview: heading and the first rows, one per line
print("\nFirst 5 samples:", df.head(), sep="\n")

# %%
# Visualize data distribution on a 2x2 grid:
#   [0, 0] voltage per mine type   [0, 1] height histogram
#   [1, 0] soil type counts        [1, 1] mine type counts
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Voltage distribution by mine type
df.boxplot(column='V', by='M', ax=axes[0, 0])
# A grouped pandas boxplot stamps "Boxplot grouped by M" as a suptitle on the
# *whole* figure, which would mislabel the other three panels. Clear it; the
# panel gets its own title on the next line.
fig.suptitle('')
axes[0, 0].set_title('Voltage Distribution by Mine Type')
axes[0, 0].set_xlabel('Mine Type')
axes[0, 0].set_ylabel('Voltage (V)')

# Height distribution
df['H'].hist(bins=30, ax=axes[0, 1], edgecolor='black')
axes[0, 1].set_title('Height Distribution')
axes[0, 1].set_xlabel('Height (cm)')
axes[0, 1].set_ylabel('Frequency')

# Soil type distribution (bars ordered by frequency, as value_counts returns them)
df['S'].value_counts().plot(kind='bar', ax=axes[1, 0], edgecolor='black')
axes[1, 0].set_title('Soil Type Distribution')
axes[1, 0].set_xlabel('Soil Type')
axes[1, 0].set_ylabel('Count')

# Mine type distribution (bars ordered by class label via sort_index)
df['M'].value_counts().sort_index().plot(kind='bar', ax=axes[1, 1], 
                                          color=['red', 'blue', 'green', 'orange', 'purple'],
                                          edgecolor='black')
axes[1, 1].set_title('Mine Type Distribution')
axes[1, 1].set_xlabel('Mine Type')
axes[1, 1].set_ylabel('Count')

plt.tight_layout()
plt.savefig('../results/figures/data_exploration.png', dpi=300, bbox_inches='tight')
plt.show()

# %%
# Pairwise correlation between the feature columns returned by the loader
X, y = loader.get_features_target()
correlation = X.corr()

# Draw the annotated heatmap on an explicitly created axes
fig_corr, ax_corr = plt.subplots(figsize=(8, 6))
sns.heatmap(
    correlation,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    ax=ax_corr,
)
ax_corr.set_title('Feature Correlation Matrix')
fig_corr.tight_layout()
fig_corr.savefig('../results/figures/correlation_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

# %% [markdown]
# # 3. Data Preprocessing

# %%
preprocessor = MineDataPreprocessor()

# Features and target straight from the loader
X, y = loader.get_features_target()

# Report how the samples are spread over the classes
balance = preprocessor.check_class_balance(y)
print("\nClass Balance:", balance, sep="\n")

# Model-specific preparation: Random Forest first, then Neural Network
X_rf, y_rf = preprocessor.prepare_for_random_forest(X, y)
X_nn, y_nn = preprocessor.prepare_for_neural_network(X, y)

print("\n✓ Data prepared for both Random Forest and Neural Network")
for model_label, prepared_features in (("Random Forest", X_rf), ("Neural Network", X_nn)):
    print(f"{model_label} input shape: {prepared_features.shape}")

# %% [markdown]
# # 4. Classification - Random Forest

# %% [markdown]
# ## 4.1 Baseline Random Forest

# %%
from sklearn.model_selection import train_test_split

# Stratified hold-out split (20% test) for the Random Forest experiments
rf_split = train_test_split(
    X_rf,
    y_rf,
    test_size=0.2,
    random_state=42,
    stratify=y_rf,
)
X_train_rf, X_test_rf, y_train_rf, y_test_rf = rf_split

# The optimizer object is reused by the tuning and comparison cells below
rf_optimizer = RandomForestOptimizer(random_state=42)

# Fit the untuned reference model on the training portion
rf_baseline = rf_optimizer.train_baseline(X_train_rf, y_train_rf)

# Score the reference model on the held-out portion
baseline_metrics = rf_optimizer.evaluate_model(
    rf_baseline,
    X_test_rf,
    y_test_rf,
    "Baseline Random Forest",
)

# %% [markdown]
# ## 4.2 Hyperparameter Tuning

# %%
# Search space: 3 * 4 * 3 * 3 * 2 = 216 combinations, each evaluated with cv=5
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 15, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

# Run the search on the training split only (this may take a few minutes)
print("Starting hyperparameter tuning...")
grid_search = rf_optimizer.grid_search_tuning(
    X_train_rf,
    y_train_rf,
    param_grid=param_grid,
    cv=5,
)

# Score the tuned model (read back from the optimizer) on the held-out split
optimized_metrics = rf_optimizer.evaluate_model(
    rf_optimizer.optimized_model,
    X_test_rf,
    y_test_rf,
    "Optimized Random Forest",
)

# %% [markdown]
# ## 4.3 Compare Baseline vs Optimized

# %%
# Compare baseline and tuned models on the held-out split
comparison_rf = rf_optimizer.compare_models(X_test_rf, y_test_rf)

# One row per model, one column per metric
metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
rf_scores = comparison_rf.set_index('Model')[metric_columns]

# Keep the zoomed-in 0.7-1.0 window when every score fits inside it; otherwise
# lower the bound so that no bar is clipped out of the plot.
lowest_score = float(rf_scores.min().min())
y_lower = 0.7 if lowest_score >= 0.7 else max(0.0, lowest_score - 0.05)

# Plot comparison
fig, ax = plt.subplots(figsize=(10, 6))
rf_scores.plot(kind='bar', ax=ax, width=0.8)
plt.title('Baseline vs Optimized Random Forest')
plt.ylabel('Score')
plt.ylim([y_lower, 1.0])
plt.xticks(rotation=0)
plt.legend(loc='lower right')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('../results/figures/rf_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

# %% [markdown]
# ## 4.4 Feature Importance

# %%
# Importance table for the tuned model, labelled with readable feature names
importance_df = rf_optimizer.get_feature_importance(
    feature_names=['Voltage', 'Height', 'Soil Type'],
    model_type='optimized',
)

print("\nFeature Importance:", importance_df, sep="\n")

# Horizontal bar chart, one bar per feature
fig, ax = plt.subplots(figsize=(10, 6))
importance_df.plot.barh(x='Feature', y='Importance', ax=ax, legend=False)
ax.set_xlabel('Importance')
ax.set_title('Feature Importance - Optimized Random Forest')
fig.tight_layout()
fig.savefig('../results/figures/feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

# %% [markdown]
# ## 4.5 Learning Curves

# %%
# Learning curve for the tuned forest, drawn by the project helper
learning_curve_options = dict(model_type='optimized', cv=5)
fig = rf_optimizer.plot_learning_curve(X_rf, y_rf, **learning_curve_options)

# Save pyplot's current figure, then render it
learning_curve_png = '../results/figures/learning_curve_rf.png'
plt.savefig(learning_curve_png, dpi=300, bbox_inches='tight')
plt.show()

# %% [markdown]
# # 5. Neural Networks

# %% [markdown]
# ## 5.1 Prepare Data

# %%
# Three-way split (train / validation / test) of the NN-prepared data
nn_splits = preprocessor.create_train_val_test_split(X_nn, y_nn)
X_train_nn, X_val_nn, X_test_nn, y_train_nn, y_val_nn, y_test_nn = nn_splits

# Report the size of each partition; labels are padded to a common width
for split_label, split_features in (
    ("Training set:", X_train_nn),
    ("Validation set:", X_val_nn),
    ("Test set:", X_test_nn),
):
    print(f"{split_label:<16}{split_features.shape[0]} samples")

# %% [markdown]
# ## 5.2 Create and Train Different Architectures

# %%
# Experiment object shared by all architectures below; the input width is
# taken from the training features, the class count is fixed at 5
n_input_features = X_train_nn.shape[1]
nn_exp = NeuralNetworkExperiment(
    input_dim=n_input_features,
    num_classes=5,
    random_state=42,
)

# %% [markdown]
# ### Simple Model (2 layers)

# %%
# Build the simple architecture and show its layer summary
simple_model = nn_exp.create_simple_model(learning_rate=0.001, name='simple')
print("\nSimple Model Architecture:")
simple_model.summary()

# Fit on the training split, monitoring the validation split
simple_fit_options = dict(epochs=100, batch_size=32, verbose=1)
history_simple = nn_exp.train_model(
    simple_model,
    X_train_nn,
    y_train_nn,
    X_val_nn,
    y_val_nn,
    **simple_fit_options,
)

# %% [markdown]
# ### Medium Model (3 layers with Dropout)

# %%
# Build the medium architecture and show its layer summary
medium_build_options = dict(learning_rate=0.001, dropout_rate=0.3, name='medium')
medium_model = nn_exp.create_medium_model(**medium_build_options)
print("\nMedium Model Architecture:")
medium_model.summary()

# Fit on the training split, monitoring the validation split
medium_fit_options = dict(epochs=100, batch_size=32, verbose=1)
history_medium = nn_exp.train_model(
    medium_model,
    X_train_nn,
    y_train_nn,
    X_val_nn,
    y_val_nn,
    **medium_fit_options,
)

# %% [markdown]
# ### Deep Model (4 layers with Batch Normalization)

# %%
# Build the deep architecture (dropout plus batch normalisation enabled)
deep_build_options = dict(
    learning_rate=0.001,
    dropout_rate=0.3,
    use_batch_norm=True,
    name='deep',
)
deep_model = nn_exp.create_deep_model(**deep_build_options)
print("\nDeep Model Architecture:")
deep_model.summary()

# Fit on the training split, monitoring the validation split
deep_fit_options = dict(epochs=100, batch_size=32, verbose=1)
history_deep = nn_exp.train_model(
    deep_model,
    X_train_nn,
    y_train_nn,
    X_val_nn,
    y_val_nn,
    **deep_fit_options,
)

# %% [markdown]
# ## 5.3 Compare Neural Network Architectures

# %%
# Training curves of the models registered with the experiment
fig = nn_exp.plot_training_history(figsize=(15, 5))
history_png = '../results/figures/nn_training_history.png'
plt.savefig(history_png, dpi=300, bbox_inches='tight')
plt.show()

# Test-set comparison table; reused by the final summary cell
comparison_nn = nn_exp.compare_all_models(X_test_nn, y_test_nn)

# %% [markdown]
# ## 5.4 Confusion Matrices

# %%
# Confusion matrices on the test split, drawn by the experiment helper
fig = nn_exp.plot_confusion_matrices(X_test_nn, y_test_nn)
confusion_png = '../results/figures/nn_confusion_matrices.png'
plt.savefig(confusion_png, dpi=300, bbox_inches='tight')
plt.show()

# %% [markdown]
# # 6. Clustering Analysis

# %% [markdown]
# ## 6.1 Find Optimal Number of Clusters

# %%
# Clustering helper shared by every cell in this section
clustering = ClusteringAnalysis(random_state=42)

# Evaluate candidate cluster counts k = 2 .. 10
candidate_ks = range(2, 11)
fig, optimal_k_results = clustering.find_optimal_k(X_nn, k_range=candidate_ks)
optimal_k_png = '../results/figures/optimal_k_analysis.png'
plt.savefig(optimal_k_png, dpi=300, bbox_inches='tight')
plt.show()

# %% [markdown]
# ## 6.2 K-Means Clustering

# %%
# Perform K-Means with k fixed at 5 to match the number of mine classes.
# NOTE(review): k is hard-coded here; `optimal_k_results` from the previous
# cell is not consulted, so "optimal" only holds if that analysis also points to 5.
kmeans_model = clustering.perform_kmeans(X_nn, n_clusters=5)

# Visualize clusters
# plt.savefig writes pyplot's current figure — presumably the one the helper
# just drew; confirm it is the same object as `fig`.
fig = clustering.visualize_clusters_2d(X_nn, 'kmeans')
plt.savefig('../results/figures/kmeans_clusters.png', dpi=300, bbox_inches='tight')
plt.show()

# Compare with true labels
# NOTE(review): this call receives `y_nn` (labels as prepared for the network)
# while the distribution plot below receives the raw `y`. Confirm both helpers
# expect these respective label formats, or pass the same labels to both.
ari_kmeans = clustering.compare_with_true_labels(y_nn, 'kmeans')

# Plot cluster vs true label distribution
fig = clustering.plot_cluster_distributions(y, 'kmeans')
plt.savefig('../results/figures/kmeans_true_labels.png', dpi=300, bbox_inches='tight')
plt.show()

# %% [markdown]
# ## 6.3 DBSCAN Clustering

# %%
# Density-based clustering with fixed neighbourhood settings
dbscan_options = dict(eps=0.5, min_samples=5)
dbscan_model = clustering.perform_dbscan(X_nn, **dbscan_options)

# 2-D view of the DBSCAN assignment, saved and then shown
fig = clustering.visualize_clusters_2d(X_nn, 'dbscan')
dbscan_png = '../results/figures/dbscan_clusters.png'
plt.savefig(dbscan_png, dpi=300, bbox_inches='tight')
plt.show()

# %% [markdown]
# ## 6.4 Hierarchical Clustering

# %%
# Hierarchical clustering with five clusters
hierarchical_model = clustering.perform_hierarchical(X_nn, n_clusters=5)

# Both figures in this cell are written with the same export settings
hierarchical_save_options = dict(dpi=300, bbox_inches='tight')

# 2-D view of the hierarchical assignment
fig = clustering.visualize_clusters_2d(X_nn, 'hierarchical')
plt.savefig('../results/figures/hierarchical_clusters.png', **hierarchical_save_options)
plt.show()

# Dendrogram on the first 100 rows only, to keep the tree readable
dendrogram_sample = X_nn[:100]
fig = clustering.plot_dendrogram(dendrogram_sample)
plt.savefig('../results/figures/dendrogram.png', **hierarchical_save_options)
plt.show()

# %% [markdown]
# ## 6.5 Compare Clustering Algorithms

# %%
# Collect the comparison of the clustering runs performed above.
# NOTE(review): which algorithms and metrics end up in the result depends on
# ClusteringAnalysis internals not visible here — confirm before relying on it.
comparison_clustering = clustering.compare_all_algorithms()

# %% [markdown]
# # 7. Final Comparison: All Methods

# %%
# Create comprehensive comparison.
# Test accuracy of each network, looked up by model name with a single .loc
# selection (first match, as before).
nn_test_accuracy = {
    model_name: comparison_nn.loc[
        comparison_nn['Model'] == model_name, 'Test Accuracy'
    ].iloc[0]
    for model_name in ('simple', 'medium', 'deep')
}

# NOTE: the classifier rows hold accuracy, whereas the K-Means row holds a
# silhouette score (range -1..1). They share a column for display only and are
# not directly comparable.
final_results = pd.DataFrame({
    'Method': [
        'Random Forest (Baseline)',
        'Random Forest (Optimized)',
        'Neural Network (Simple)',
        'Neural Network (Medium)',
        'Neural Network (Deep)',
        'K-Means (Clustering)',
    ],
    'Accuracy/Performance': [
        baseline_metrics['accuracy'],
        optimized_metrics['accuracy'],
        nn_test_accuracy['simple'],
        nn_test_accuracy['medium'],
        nn_test_accuracy['deep'],
        clustering.metrics['kmeans']['silhouette_score'],
    ],
    'Type': ['Classification', 'Classification', 'Classification', 
             'Classification', 'Classification', 'Clustering']
})

print("\n" + "="*80)
print("FINAL RESULTS SUMMARY")
print("="*80)
print(final_results.to_string(index=False))
print("="*80)

# Visualize: one horizontal bar per method, coloured by method type
fig, ax = plt.subplots(figsize=(12, 6))
colors = ['#FF6B6B' if t == 'Classification' else '#4ECDC4' 
          for t in final_results['Type']]
scores = final_results['Accuracy/Performance']
bars = ax.barh(final_results['Method'], scores, 
               color=colors, edgecolor='black', linewidth=1.5)
ax.set_xlabel('Score')
ax.set_title('Final Performance Comparison - All Methods')

# A silhouette score can be negative; widen the axis to the left in that case
# so the bar is not clipped. Otherwise keep the original 0..1 range.
lowest_score = float(scores.min())
x_lower = lowest_score - 0.05 if lowest_score < 0 else 0.0
ax.set_xlim([x_lower, 1])
ax.grid(True, alpha=0.3, axis='x')

# Add value labels just right of each bar end (right of zero for negative bars)
for bar, value in zip(bars, scores):
    ax.text(max(value, 0) + 0.01, bar.get_y() + bar.get_height()/2, 
            f'{value:.4f}', va='center', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.savefig('../results/figures/final_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

# %% [markdown]
# # 8. Save Models

# %%
# Save Random Forest models (tuned and baseline) under ../results/models
rf_optimizer.save_model('../results/models/rf_optimized.pkl', 'optimized')
rf_optimizer.save_model('../results/models/rf_baseline.pkl', 'baseline')

# Save the 'deep' Neural Network.
# NOTE(review): the model name is hard-coded; it is not selected from
# `comparison_nn`, so this is the best network only if 'deep' scored highest.
nn_exp.save_model('deep', '../results/models/nn_deep.h5')

print("✓ All models saved successfully!")

# %% [markdown]
# # 9. Conclusions
# 
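# ## Key Findings:
# 
# > **TODO:** Every `XX` and `[...]` below is a placeholder. Replace each one with the value produced by the corresponding cell after a full Restart & Run All.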
# 
# 1. **Random Forest Performance:**
#    - Baseline accuracy: XX%
#    - Optimized accuracy: XX%
#    - Improvement: +XX%
#    - Most important feature: [Feature name]
# 
# 2. **Neural Network Performance:**
#    - Best architecture: [Simple/Medium/Deep]
#    - Test accuracy: XX%
#    - Early stopping: [state whether it triggered and whether it prevented overfitting, based on the training-history plot in 5.3]
# 
# 3. **Clustering Analysis:**
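#    - K-Means was run with K=5 to match the number of mine types; silhouette score: XX (check against the optimal-K analysis in 6.1 before claiming K=5 is best)
#    - Clusters align [weakly/moderately/strongly] with true mine types (ARI: XX)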
#    - DBSCAN identified XX noise points
# 
# 4. **Overall Best Model:** [Model name] with XX% accuracy
# 
# ## Recommendations:
# - [Add your recommendations based on results]
# - [Future work suggestions]

# %%
# Final status message, placed in its own code cell so that cell-aware tools
# do not treat it as part of the preceding markdown cell.
print("\n✅ ANALYSIS COMPLETE!")