In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation and convergence
# warnings raised by the libraries imported above — consider narrowing it.
warnings.filterwarnings('ignore')

# Seed NumPy's legacy global RNG. The data generator below draws from
# np.random.*, so this line must run before it for reproducible output.
np.random.seed(42)

# Generate synthetic training data for carbon stock estimation
def generate_carbon_stock_data(n_samples=5000, random_state=None):
    """
    Generate synthetic data for carbon stock estimation.

    Six features are simulated (NDVI, canopy cover, soil carbon, elevation,
    temperature, precipitation) and combined through a fixed formula into a
    noisy carbon sequestration target that is clipped to [0, 150].

    Parameters:
    n_samples: Number of samples to generate
    random_state: Source of randomness.
        - None (default): draw from NumPy's global legacy RNG, i.e. whatever
          np.random.seed() configured. This is the original behaviour.
        - int: seed a private np.random.RandomState. The output is identical
          to calling np.random.seed(<int>) immediately before the function.
        - np.random.RandomState / np.random.Generator instance: used as-is.

    Returns:
    DataFrame with features and target variable
    """

    # Resolve the random source once; every draw below goes through `rng`
    # in a fixed order so a given seed always yields the same dataset.
    if random_state is None:
        rng = np.random
    elif isinstance(random_state, (np.random.RandomState, np.random.Generator)):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Generate base features with realistic ranges
    # NDVI typically ranges from -1 to 1, but for vegetation it's usually 0.1 to 0.9
    ndvi_base = rng.beta(2, 2, n_samples) * 0.8 + 0.1  # Range: 0.1 to 0.9

    # Canopy cover percentage (0 to 100%)
    canopy_base = rng.beta(1.5, 1.5, n_samples) * 100

    # Soil carbon content, shifted by 0.5 and clipped to [0.5, 8.0] percent
    soil_carbon_base = rng.gamma(2, 1.5, n_samples) + 0.5
    soil_carbon_base = np.clip(soil_carbon_base, 0.5, 8.0)

    # One shared noise vector is reused (with different scales) for canopy
    # cover, soil carbon and NDVI, which couples the three features.
    correlation_noise = rng.normal(0, 0.1, n_samples)

    # Higher NDVI scales canopy cover up
    canopy_cover = canopy_base * (0.7 + 0.3 * ndvi_base) + correlation_noise * 10
    canopy_cover = np.clip(canopy_cover, 0, 100)

    # Higher NDVI scales soil carbon up
    soil_carbon = soil_carbon_base * (0.8 + 0.2 * ndvi_base) + correlation_noise * 0.5
    soil_carbon = np.clip(soil_carbon, 0.5, 8.0)

    # NDVI with a small share of the shared noise
    ndvi = ndvi_base + correlation_noise * 0.05
    ndvi = np.clip(ndvi, -1, 1)

    # Additional environmental factors
    # Elevation (meters above sea level), clipped to [0, 3000]
    elevation = rng.normal(500, 300, n_samples)
    elevation = np.clip(elevation, 0, 3000)

    # Temperature (annual average in Celsius) - note: not clipped
    temperature = rng.normal(15, 8, n_samples)

    # Precipitation (annual in mm), clipped to [200, 3000]
    precipitation = rng.gamma(2, 400, n_samples)
    precipitation = np.clip(precipitation, 200, 3000)

    # Vegetation and soil terms of the target formula
    vegetation_factor = (ndvi * 50) + (canopy_cover * 0.3)
    soil_factor = soil_carbon * 8

    # Multiplicative environmental modifiers, each equal to 1 at its
    # reference point (15 C, 1000 mm, 0 m)
    temp_modifier = 1 + 0.02 * (temperature - 15)
    precip_modifier = 1 + 0.0002 * (precipitation - 1000)
    elevation_modifier = 1 - 0.0001 * elevation

    # Weighted sum (weights 0.4 and 0.5 plus a constant 5 - these are
    # coefficients, not percentage shares) scaled by the modifiers
    carbon_sequestration = (
        vegetation_factor * 0.4 +
        soil_factor * 0.5 +
        5
    ) * temp_modifier * precip_modifier * elevation_modifier

    # Noise with a standard deviation of 15% of the signal magnitude.
    # temp_modifier turns negative below -35 C, which makes the signal
    # negative; normal() rejects a negative scale, hence the abs().
    # For non-negative signals abs() changes nothing.
    noise = rng.normal(0, np.abs(carbon_sequestration) * 0.15)
    carbon_sequestration = carbon_sequestration + noise

    # Keep the target inside [0, 150] tCO2e/ha
    carbon_sequestration = np.clip(carbon_sequestration, 0, 150)

    # Create DataFrame
    data = pd.DataFrame({
        'NDVI': ndvi,
        'Canopy_Cover_Percent': canopy_cover,
        'Soil_Carbon_Percent': soil_carbon,
        'Elevation_m': elevation,
        'Temperature_C': temperature,
        'Precipitation_mm': precipitation,
        'Carbon_Sequestration_tCO2e_ha': carbon_sequestration
    })

    return data

# Generate the dataset
print("Generating synthetic carbon stock estimation dataset...")
df = generate_carbon_stock_data(n_samples=5000)

# Display basic information about the dataset
print(f"\nDataset shape: {df.shape}")
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() would append a stray "None" line.
df.info()

print("\nFirst few rows:")
print(df.head())

print("\nBasic statistics:")
print(df.describe())

# Check for any missing values (per-column NaN counts)
print("\nMissing values:")
print(df.isnull().sum())

print("\nDataset generated successfully!")

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [2]:
from metagpt.tools.libs.terminal import Terminal
terminal = Terminal()
await terminal.run('pip install --upgrade pandas numpy scikit-learn matplotlib seaborn joblib')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation and convergence
# warnings raised by the libraries imported above — consider narrowing it.
warnings.filterwarnings('ignore')

# Seed NumPy's legacy global RNG. The data generator below draws from
# np.random.*, so this line must run before it for reproducible output.
np.random.seed(42)

# Generate synthetic training data for carbon stock estimation
def generate_carbon_stock_data(n_samples=5000, random_state=None):
    """
    Generate synthetic data for carbon stock estimation.

    Six features are simulated (NDVI, canopy cover, soil carbon, elevation,
    temperature, precipitation) and combined through a fixed formula into a
    noisy carbon sequestration target that is clipped to [0, 150].

    Parameters:
    n_samples: Number of samples to generate
    random_state: Source of randomness.
        - None (default): draw from NumPy's global legacy RNG, i.e. whatever
          np.random.seed() configured. This is the original behaviour.
        - int: seed a private np.random.RandomState. The output is identical
          to calling np.random.seed(<int>) immediately before the function.
        - np.random.RandomState / np.random.Generator instance: used as-is.

    Returns:
    DataFrame with features and target variable
    """

    # Resolve the random source once; every draw below goes through `rng`
    # in a fixed order so a given seed always yields the same dataset.
    if random_state is None:
        rng = np.random
    elif isinstance(random_state, (np.random.RandomState, np.random.Generator)):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Generate base features with realistic ranges
    # NDVI typically ranges from -1 to 1, but for vegetation it's usually 0.1 to 0.9
    ndvi_base = rng.beta(2, 2, n_samples) * 0.8 + 0.1  # Range: 0.1 to 0.9

    # Canopy cover percentage (0 to 100%)
    canopy_base = rng.beta(1.5, 1.5, n_samples) * 100

    # Soil carbon content, shifted by 0.5 and clipped to [0.5, 8.0] percent
    soil_carbon_base = rng.gamma(2, 1.5, n_samples) + 0.5
    soil_carbon_base = np.clip(soil_carbon_base, 0.5, 8.0)

    # One shared noise vector is reused (with different scales) for canopy
    # cover, soil carbon and NDVI, which couples the three features.
    correlation_noise = rng.normal(0, 0.1, n_samples)

    # Higher NDVI scales canopy cover up
    canopy_cover = canopy_base * (0.7 + 0.3 * ndvi_base) + correlation_noise * 10
    canopy_cover = np.clip(canopy_cover, 0, 100)

    # Higher NDVI scales soil carbon up
    soil_carbon = soil_carbon_base * (0.8 + 0.2 * ndvi_base) + correlation_noise * 0.5
    soil_carbon = np.clip(soil_carbon, 0.5, 8.0)

    # NDVI with a small share of the shared noise
    ndvi = ndvi_base + correlation_noise * 0.05
    ndvi = np.clip(ndvi, -1, 1)

    # Additional environmental factors
    # Elevation (meters above sea level), clipped to [0, 3000]
    elevation = rng.normal(500, 300, n_samples)
    elevation = np.clip(elevation, 0, 3000)

    # Temperature (annual average in Celsius) - note: not clipped
    temperature = rng.normal(15, 8, n_samples)

    # Precipitation (annual in mm), clipped to [200, 3000]
    precipitation = rng.gamma(2, 400, n_samples)
    precipitation = np.clip(precipitation, 200, 3000)

    # Vegetation and soil terms of the target formula
    vegetation_factor = (ndvi * 50) + (canopy_cover * 0.3)
    soil_factor = soil_carbon * 8

    # Multiplicative environmental modifiers, each equal to 1 at its
    # reference point (15 C, 1000 mm, 0 m)
    temp_modifier = 1 + 0.02 * (temperature - 15)
    precip_modifier = 1 + 0.0002 * (precipitation - 1000)
    elevation_modifier = 1 - 0.0001 * elevation

    # Weighted sum (weights 0.4 and 0.5 plus a constant 5 - these are
    # coefficients, not percentage shares) scaled by the modifiers
    carbon_sequestration = (
        vegetation_factor * 0.4 +
        soil_factor * 0.5 +
        5
    ) * temp_modifier * precip_modifier * elevation_modifier

    # Noise with a standard deviation of 15% of the signal magnitude.
    # temp_modifier turns negative below -35 C, which makes the signal
    # negative; normal() rejects a negative scale, hence the abs().
    # For non-negative signals abs() changes nothing.
    noise = rng.normal(0, np.abs(carbon_sequestration) * 0.15)
    carbon_sequestration = carbon_sequestration + noise

    # Keep the target inside [0, 150] tCO2e/ha
    carbon_sequestration = np.clip(carbon_sequestration, 0, 150)

    # Create DataFrame
    data = pd.DataFrame({
        'NDVI': ndvi,
        'Canopy_Cover_Percent': canopy_cover,
        'Soil_Carbon_Percent': soil_carbon,
        'Elevation_m': elevation,
        'Temperature_C': temperature,
        'Precipitation_mm': precipitation,
        'Carbon_Sequestration_tCO2e_ha': carbon_sequestration
    })

    return data

# Generate the dataset
print("Generating synthetic carbon stock estimation dataset...")
df = generate_carbon_stock_data(n_samples=5000)

# Display basic information about the dataset
print(f"\nDataset shape: {df.shape}")
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() would append a stray "None" line.
df.info()

print("\nFirst few rows:")
print(df.head())

print("\nBasic statistics:")
print(df.describe())

# Check for any missing values (per-column NaN counts)
print("\nMissing values:")
print(df.isnull().sum())

print("\nDataset generated successfully!")

2025-08-27 00:42:19.848 | INFO     | metagpt.const:get_metagpt_root:33 - PROJECT_ROOT set from environment variable to /


ModuleNotFoundError: No module named 'pandas'

In [3]:
from metagpt.tools.libs.terminal import Terminal
terminal = Terminal()

# Try multiple installation approaches
try:
    await terminal.run('pip uninstall -y pandas numpy scikit-learn matplotlib seaborn')
    await terminal.run('pip install pandas==1.5.3 numpy==1.24.3 scikit-learn matplotlib seaborn joblib')
except:
    try:
        await terminal.run('conda install -y pandas numpy scikit-learn matplotlib seaborn joblib -c conda-forge')
    except:
        await terminal.run('pip install --no-cache-dir --force-reinstall pandas numpy scikit-learn matplotlib seaborn joblib')

# Import libraries with fallback options.
# Each optional dependency sets a *_AVAILABLE flag so later code can branch on
# it. ValueError is caught alongside ImportError because a compiled package
# built against a different NumPy fails at import time with
# "ValueError: numpy.dtype size changed ...", not with ImportError.
try:
    import pandas as pd
    PANDAS_AVAILABLE = True
except (ImportError, ValueError):
    PANDAS_AVAILABLE = False
    print("Pandas not available, using numpy arrays instead")

# numpy and matplotlib have no fallback: the rest of the cell needs them.
import numpy as np
import matplotlib.pyplot as plt
try:
    import seaborn as sns
    SEABORN_AVAILABLE = True
except (ImportError, ValueError):
    SEABORN_AVAILABLE = False
    print("Seaborn not available, using matplotlib only")

try:
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
    from sklearn.neural_network import MLPRegressor
    from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
    import joblib
    SKLEARN_AVAILABLE = True
except (ImportError, ValueError):
    # Also reached when only joblib is missing, since it shares this try block
    SKLEARN_AVAILABLE = False
    print("Scikit-learn not available, will implement basic functionality")

import warnings
# NOTE(review): a blanket 'ignore' hides every warning category for the rest
# of the session — consider narrowing it.
warnings.filterwarnings('ignore')

# Seed NumPy's legacy global RNG for reproducibility
np.random.seed(42)

# Generate synthetic training data for carbon stock estimation
def generate_carbon_stock_data(n_samples=5000, random_state=None):
    """
    Generate synthetic data for carbon stock estimation.

    Six features are simulated (NDVI, canopy cover, soil carbon, elevation,
    temperature, precipitation) and combined through a fixed formula into a
    noisy carbon sequestration target that is clipped to [0, 150].

    Parameters:
    n_samples: Number of samples to generate
    random_state: Source of randomness.
        - None (default): draw from NumPy's global legacy RNG, i.e. whatever
          np.random.seed() configured. This is the original behaviour.
        - int: seed a private np.random.RandomState. The output is identical
          to calling np.random.seed(<int>) immediately before the function.
        - np.random.RandomState / np.random.Generator instance: used as-is.

    Returns:
    DataFrame with features and target variable when the module-level flag
    PANDAS_AVAILABLE is true, otherwise a dict mapping the same column names
    to NumPy arrays.
    """

    # Resolve the random source once; every draw below goes through `rng`
    # in a fixed order so a given seed always yields the same dataset.
    if random_state is None:
        rng = np.random
    elif isinstance(random_state, (np.random.RandomState, np.random.Generator)):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Generate base features with realistic ranges
    # NDVI typically ranges from -1 to 1, but for vegetation it's usually 0.1 to 0.9
    ndvi_base = rng.beta(2, 2, n_samples) * 0.8 + 0.1  # Range: 0.1 to 0.9

    # Canopy cover percentage (0 to 100%)
    canopy_base = rng.beta(1.5, 1.5, n_samples) * 100

    # Soil carbon content, shifted by 0.5 and clipped to [0.5, 8.0] percent
    soil_carbon_base = rng.gamma(2, 1.5, n_samples) + 0.5
    soil_carbon_base = np.clip(soil_carbon_base, 0.5, 8.0)

    # One shared noise vector is reused (with different scales) for canopy
    # cover, soil carbon and NDVI, which couples the three features.
    correlation_noise = rng.normal(0, 0.1, n_samples)

    # Higher NDVI scales canopy cover up
    canopy_cover = canopy_base * (0.7 + 0.3 * ndvi_base) + correlation_noise * 10
    canopy_cover = np.clip(canopy_cover, 0, 100)

    # Higher NDVI scales soil carbon up
    soil_carbon = soil_carbon_base * (0.8 + 0.2 * ndvi_base) + correlation_noise * 0.5
    soil_carbon = np.clip(soil_carbon, 0.5, 8.0)

    # NDVI with a small share of the shared noise
    ndvi = ndvi_base + correlation_noise * 0.05
    ndvi = np.clip(ndvi, -1, 1)

    # Additional environmental factors
    # Elevation (meters above sea level), clipped to [0, 3000]
    elevation = rng.normal(500, 300, n_samples)
    elevation = np.clip(elevation, 0, 3000)

    # Temperature (annual average in Celsius) - note: not clipped
    temperature = rng.normal(15, 8, n_samples)

    # Precipitation (annual in mm), clipped to [200, 3000]
    precipitation = rng.gamma(2, 400, n_samples)
    precipitation = np.clip(precipitation, 200, 3000)

    # Vegetation and soil terms of the target formula
    vegetation_factor = (ndvi * 50) + (canopy_cover * 0.3)
    soil_factor = soil_carbon * 8

    # Multiplicative environmental modifiers, each equal to 1 at its
    # reference point (15 C, 1000 mm, 0 m)
    temp_modifier = 1 + 0.02 * (temperature - 15)
    precip_modifier = 1 + 0.0002 * (precipitation - 1000)
    elevation_modifier = 1 - 0.0001 * elevation

    # Weighted sum (weights 0.4 and 0.5 plus a constant 5 - these are
    # coefficients, not percentage shares) scaled by the modifiers
    carbon_sequestration = (
        vegetation_factor * 0.4 +
        soil_factor * 0.5 +
        5
    ) * temp_modifier * precip_modifier * elevation_modifier

    # Noise with a standard deviation of 15% of the signal magnitude.
    # temp_modifier turns negative below -35 C, which makes the signal
    # negative; normal() rejects a negative scale, hence the abs().
    # For non-negative signals abs() changes nothing.
    noise = rng.normal(0, np.abs(carbon_sequestration) * 0.15)
    carbon_sequestration = carbon_sequestration + noise

    # Keep the target inside [0, 150] tCO2e/ha
    carbon_sequestration = np.clip(carbon_sequestration, 0, 150)

    # Create data structure
    data = {
        'NDVI': ndvi,
        'Canopy_Cover_Percent': canopy_cover,
        'Soil_Carbon_Percent': soil_carbon,
        'Elevation_m': elevation,
        'Temperature_C': temperature,
        'Precipitation_mm': precipitation,
        'Carbon_Sequestration_tCO2e_ha': carbon_sequestration
    }

    # Wrap in a DataFrame only when pandas imported successfully
    if PANDAS_AVAILABLE:
        return pd.DataFrame(data)
    else:
        return data

# Generate the dataset (DataFrame when pandas imported, dict of arrays otherwise)
print("Generating synthetic carbon stock estimation dataset...")
data = generate_carbon_stock_data(n_samples=5000)

if PANDAS_AVAILABLE:
    df = data
    # Display basic information about the dataset
    print(f"\nDataset shape: {df.shape}")
    print("\nDataset info:")
    # DataFrame.info() writes its report to stdout itself and returns None;
    # wrapping it in print() would append a stray "None" line.
    df.info()

    print("\nFirst few rows:")
    print(df.head())

    print("\nBasic statistics:")
    print(df.describe())

    # Check for any missing values (per-column NaN counts)
    print("\nMissing values:")
    print(df.isnull().sum())
else:
    # Alternative display using numpy arrays
    print(f"\nDataset size: {len(data['NDVI'])} samples")
    print(f"\nFeatures: {list(data.keys())}")

    print("\nFirst 5 samples:")
    for key in data.keys():
        print(f"{key}: {data[key][:5]}")

    print("\nBasic statistics for each feature:")
    for key, values in data.items():
        print(f"{key}: mean={np.mean(values):.3f}, std={np.std(values):.3f}, min={np.min(values):.3f}, max={np.max(values):.3f}")

print("\nDataset generated successfully!")

# Persist the generated dataset under /workspace (best effort: any failure is
# reported and the in-memory copy is kept)
print("\nSaving dataset to /workspace directory...")
try:
    import os
    os.makedirs('/workspace', exist_ok=True)

    if not PANDAS_AVAILABLE:
        # No pandas: bundle the raw arrays into one NPZ archive, keyed by column name
        np.savez('/workspace/carbon_stock_dataset.npz', **data)
        print("Dataset saved as NPZ file to /workspace/carbon_stock_dataset.npz")
    else:
        # pandas present: write a plain CSV without the index column
        df.to_csv('/workspace/carbon_stock_dataset.csv', index=False)
        print("Dataset saved as CSV file to /workspace/carbon_stock_dataset.csv")

except Exception as save_error:
    print(f"Could not save to /workspace: {save_error}")
    print("Dataset is available in memory as 'data' variable")

Pandas not available, using numpy arrays instead
Seaborn not available, using matplotlib only
Scikit-learn not available, will implement basic functionality
Generating synthetic carbon stock estimation dataset...

Dataset size: 5000 samples

Features: ['NDVI', 'Canopy_Cover_Percent', 'Soil_Carbon_Percent', 'Elevation_m', 'Temperature_C', 'Precipitation_mm', 'Carbon_Sequestration_tCO2e_ha']

First 5 samples:
NDVI: [0.59441193 0.49246481 0.59564566 0.35221039 0.82572169]
Canopy_Cover_Percent: [17.71722383 50.78589564 72.46957828 21.03880765 49.40648103]
Soil_Carbon_Percent: [2.2849587  2.04223721 5.37630528 1.79620843 7.75230956]
Elevation_m: [461.68091287 279.67126847 314.72213457 318.04958895   0.        ]
Temperature_C: [ 8.76726682 11.0455822  22.32536447 13.73400516  9.02010654]
Precipitation_mm: [ 653.9033301  1273.53814557  749.0850644  1173.23712437  200.        ]
Carbon_Sequestration_tCO2e_ha: [25.67114456 33.37812763 45.24962012 26.88082773 44.83206878]

Basic statistics for ea