# Getting Started with RESOLVE

This notebook demonstrates how to use RESOLVE to predict plot-level environmental attributes from species composition data.

## Installation

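```bash
pip install resolve-ml
```

The plotting cell further down also imports `matplotlib`; install it separately if it is not already in your environment.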

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# Import RESOLVE
import resolve

## Prepare Sample Data

RESOLVE requires two CSV files:
1. **Header file**: One row per plot with plot ID, coordinates, covariates, and target variables
2. **Species file**: One row per species-plot occurrence with species ID, plot ID, and abundance

In [None]:
# Example header data (plot-level attributes): one row per plot.
#
# Seed NumPy's global generator first so the synthetic data is identical on
# every Restart & Run All. The species cell below also draws from np.random,
# so it inherits the same reproducible stream when cells run in order.
SEED = 42
N_PLOTS = 100  # number of synthetic plots
np.random.seed(SEED)

header_df = pd.DataFrame({
    'plot_id': [f'P{i:03d}' for i in range(1, N_PLOTS + 1)],
    'longitude': np.random.uniform(4.0, 5.0, N_PLOTS),
    'latitude': np.random.uniform(50.5, 51.0, N_PLOTS),
    'elevation': np.random.uniform(50, 200, N_PLOTS),
    'biomass': np.random.exponential(50, N_PLOTS),  # Target: regression
    'habitat': np.random.choice(['forest', 'grassland', 'wetland'], N_PLOTS)  # Target: classification
})

print("Header data shape:", header_df.shape)
header_df.head()

In [None]:
# Example species data (species occurrences per plot).
# Each catalogue entry is (species name, genus, family).
species_list = [
    ('Quercus robur', 'Quercus', 'Fagaceae'),
    ('Fagus sylvatica', 'Fagus', 'Fagaceae'),
    ('Pinus sylvestris', 'Pinus', 'Pinaceae'),
    ('Betula pendula', 'Betula', 'Betulaceae'),
    ('Acer pseudoplatanus', 'Acer', 'Sapindaceae'),
    ('Fraxinus excelsior', 'Fraxinus', 'Oleaceae'),
    ('Carpinus betulus', 'Carpinus', 'Betulaceae'),
    ('Tilia cordata', 'Tilia', 'Malvaceae'),
]

# For every plot, pick a random subset of the catalogue (without repeats)
# and give each picked species a random cover value.
species_records = []
for pid in header_df['plot_id']:
    richness = np.random.randint(3, 8)  # upper bound is exclusive: 3-7 species
    picks = np.random.choice(len(species_list), richness, replace=False)
    species_records.extend(
        {
            'plot_id': pid,
            'species': name,
            'cover': np.random.uniform(5, 80),
            'genus': genus,
            'family': family,
        }
        for name, genus, family in (species_list[i] for i in picks)
    )

# One row per species-plot occurrence.
species_df = pd.DataFrame(species_records)
print("Species data shape:", species_df.shape)
species_df.head(10)

In [None]:
# Write both tables to CSV files in the current working directory so they
# can be loaded by path later on.
header_path, species_path = Path('temp_header.csv'), Path('temp_species.csv')

# index=False keeps the pandas row index out of the files.
for frame, destination in ((header_df, header_path), (species_df, species_path)):
    frame.to_csv(destination, index=False)

print(f"Saved header to {header_path}")
print(f"Saved species to {species_path}")

## Load Dataset with Role Mapping

RESOLVE uses role mapping to understand your data structure. Map your column names to semantic roles.

In [None]:
# Column role mapping: semantic role (left) -> column name in our CSVs (right).
roles = dict([
    ('plot_id', 'plot_id'),            # Plot identifier in header
    ('species_id', 'species'),         # Species name column in species file
    ('species_plot_id', 'plot_id'),    # Plot identifier in species file
    ('abundance', 'cover'),            # Abundance/cover values
    ('coords_lon', 'longitude'),       # Longitude column
    ('coords_lat', 'latitude'),        # Latitude column
    ('taxonomy_genus', 'genus'),       # Genus column
    ('taxonomy_family', 'family'),     # Family column
])

# Target variables, keyed by target name.
targets = {}
targets['biomass'] = {
    'column': 'biomass',
    'task': 'regression',
    'transform': 'log1p',  # Log-transform for positive values
}

In [None]:
# Build the RESOLVE dataset from the two CSV files, passing the role
# mapping and target definitions so the columns can be interpreted.
dataset = resolve.Dataset.from_csv(
    species_path=str(species_path),
    header_path=str(header_path),
    targets=targets,
    roles=roles,
)

# Summarise what was loaded.
schema = dataset.schema
print("Dataset loaded:")
print(f"  Plots: {schema.n_plots}")
print(f"  Species: {schema.n_species}")
print(f"  Has coordinates: {schema.has_coordinates}")
print(f"  Has taxonomy: {schema.has_taxonomy}")

## Train a Model

In [None]:
# Trainer settings, kept in one place so they are easy to tweak.
trainer_options = dict(
    hidden_dims=[512, 256, 128, 64],
    species_encoding='hash',  # or 'embed' for learned embeddings
    hash_dim=32,
    device='cpu',  # Use 'cuda' for GPU acceleration
)
trainer = resolve.Trainer(dataset, **trainer_options)

# Fit settings for this run.
fit_options = dict(
    max_epochs=100,
    patience=20,
    batch_size=32,
    lr=1e-3,
    verbose=True,
)
result = trainer.fit(**fit_options)

# Report the best epoch and loss recorded on the training result.
print("\nTraining complete!")
print(f"Best epoch: {result.best_epoch}")
print(f"Best loss: {result.best_loss:.4f}")

In [None]:
# Print the final metrics, one indented section per target.
for target, scores in result.final_metrics.items():
    report = [f"\n{target}:"]
    report.extend(f"  {metric}: {score:.4f}" for metric, score in scores.items())
    print("\n".join(report))

## Make Predictions

In [None]:
# Run the trained model on the same dataset it was fit on (demonstration only).
predictions = trainer.predict(dataset)

# Per-target outputs are stored under `.predictions`, keyed by target name.
by_target = predictions.predictions
print("Prediction keys:", list(by_target.keys()))
print("Biomass predictions shape:", by_target['biomass'].shape)

In [None]:
# Compare predictions vs actual
import matplotlib.pyplot as plt

# NOTE(review): this assumes predictions come back in header_df row order and
# on the original (untransformed) biomass scale, even though the target was
# declared with a log1p transform — confirm against the RESOLVE documentation.
actual = header_df['biomass'].values
predicted = predictions.predictions['biomass'].numpy()

# Extend the 1:1 reference line to the larger of the two maxima, so points
# predicted above the largest observed value still have a reference.
upper = max(float(np.max(actual)), float(np.max(predicted)))

# Explicit Figure/Axes interface instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(actual, predicted, alpha=0.6)
ax.plot([0, upper], [0, upper], 'r--', label='1:1 line')
ax.set_xlabel('Actual Biomass')
ax.set_ylabel('Predicted Biomass')
ax.set_title('RESOLVE Predictions vs Actual')
ax.legend()
fig.tight_layout()
plt.show()

## Confidence-Based Filtering

RESOLVE tracks the fraction of unknown species per plot. Use this for confidence-based filtering.

In [None]:
# Unknown fraction per plot (0 = all species known, 1 = all unknown);
# confidence is defined here as its complement.
unknown_frac = predictions.unknown_fraction.numpy()
confidence = 1 - unknown_frac

# Summary statistics of the confidence values.
lowest, highest, average = confidence.min(), confidence.max(), confidence.mean()
print(f"Confidence range: [{lowest:.2f}, {highest:.2f}]")
print(f"Mean confidence: {average:.2f}")

In [None]:
# Boolean mask selecting plots whose confidence reaches the threshold.
threshold = 0.5
high_confidence_mask = confidence >= threshold

# Count the plots that pass versus the total.
n_confident = high_confidence_mask.sum()
n_plots = len(confidence)
print(f"Plots with confidence >= {threshold}: {n_confident} / {n_plots}")

## Save and Load Models

In [None]:
# Write the trained model to a file in the current working directory.
model_path = Path('resolve_model.pt')
model_file = str(model_path)  # passed to save() as a plain string
trainer.save(model_file)
print(f"Model saved to {model_path}")

In [None]:
# Restore a predictor from the saved model file (inference only).
predictor = resolve.Predictor.load(str(model_path))

# Run the restored predictor on the same dataset and inspect the output shape.
new_predictions = predictor.predict(dataset)
reloaded_biomass = new_predictions.predictions['biomass']
print("Predictions from loaded model:", reloaded_biomass.shape)

## Cleanup

In [None]:
# Remove temporary files.
# Path.unlink() takes `missing_ok` (Python 3.8+), not `exist_ok`; passing
# `exist_ok` raises TypeError before any file is deleted. `missing_ok=True`
# makes the cleanup a no-op for files that are already gone, so this cell
# can be re-run safely.
header_path.unlink(missing_ok=True)
species_path.unlink(missing_ok=True)
model_path.unlink(missing_ok=True)
print("Temporary files cleaned up.")

## Next Steps

- **Multi-task learning**: Add multiple targets (regression + classification)
- **GPU acceleration**: Use `device='cuda'` for larger datasets
- **Hyperparameter tuning**: Experiment with `hidden_dims`, `hash_dim`, `species_encoding`
- **Loss configuration**: Try `lossConfig='combined'` for phased training

See the [documentation](https://github.com/gcol33/resolve) for more details.