# Random Forest: Predicting ENVO Local Scale from Google Earth Embeddings

This notebook trains a Random Forest classifier to predict Environment Ontology (ENVO) local scale terms (the `env_local_scale` column) from Google Earth Engine satellite embeddings.

**Question**: Can geographic/satellite imagery data predict environmental classification labels?

## Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Reproducibility: one seed, reused for NumPy's global RNG and passed
# explicitly to the split and the model further down
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Plot appearance: start from matplotlib's defaults, then apply the
# seaborn palette and the notebook-wide figure/font sizes
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 11,
})

## Load and Prepare Data

In [None]:
# Read the biosample table (relative path; resolved against the working directory)
data_path = '../data/satisfying_biosamples_normalized_with_google_embeddings_with_envo_embeddings.csv'
df = pd.read_csv(data_path)

# Quick overview: row count, column names, then a rich preview of the head
print(f"Total rows: {len(df)}")
print(f"Columns: {list(df.columns)}")
print(f"\nFirst few rows:")
df.head()

In [None]:
# Report how many rows lack the feature column or the label column
key_columns = ['google_earth_embeddings', 'env_local_scale']
print("Missing values:")
print(df[key_columns].isna().sum())

# Keep only rows where both the embedding and the label are present;
# .copy() so later column assignments do not touch the original frame
has_embedding = df['google_earth_embeddings'].notna()
has_label = df['env_local_scale'].notna()
df_clean = df.loc[has_embedding & has_label].copy()
print(f"\nRows with both Google Earth embeddings and env_local_scale: {len(df_clean)}")

## Parse Embeddings

In [None]:
def parse_embedding(embedding_str):
    """
    Parse an embedding into a flat float32 numpy array.

    Args:
        embedding_str: String representation of an embedding list (parsed
            with ast.literal_eval), or an already-parsed sequence of numbers.

    Returns:
        1-D, non-empty numpy array of dtype float32, or None when the value
        cannot be parsed or is not a flat, non-empty vector (a bare scalar,
        an empty list, or a nested list).
    """
    try:
        if isinstance(embedding_str, str):
            embedding_list = ast.literal_eval(embedding_str)
        else:
            embedding_list = embedding_str
        embedding = np.array(embedding_list, dtype=np.float32)
    except Exception as e:
        # Best-effort parsing: report the problem and let the caller drop the row
        print(f"Error parsing embedding: {e}")
        return None

    # A scalar ("5"), an empty list ("[]") or a nested list converts without
    # raising, but cannot be stacked into a feature matrix with the other
    # rows, so treat it as a parse failure too.
    if embedding.ndim != 1 or embedding.size == 0:
        print(f"Error parsing embedding: expected a flat, non-empty list, got shape {embedding.shape}")
        return None

    return embedding

# Convert the raw embedding column into numpy arrays (None where parsing fails)
print("Parsing Google Earth embeddings...")
df_clean['ge_embedding'] = df_clean['google_earth_embeddings'].apply(parse_embedding)

# Discard rows whose embedding could not be parsed
parsed_ok = df_clean['ge_embedding'].notna()
df_clean = df_clean.loc[parsed_ok].copy()

print(f"Rows with valid embeddings: {len(df_clean)}")
if not df_clean.empty:
    # Dimension is read from the first remaining row only
    embedding_dim = len(df_clean['ge_embedding'].iloc[0])
    print(f"Google Earth embedding dimension: {embedding_dim}")

## Explore Target Variable

In [None]:
# Summarise the label column: number of distinct terms and their frequencies
target_column = df_clean['env_local_scale']
print("=== TARGET VARIABLE: env_local_scale ===")
print(f"\nUnique values: {target_column.nunique()}")
print(f"\nValue counts:")
value_counts = target_column.value_counts()
print(value_counts)

# List up to ten distinct labels, in order of first appearance
print(f"\nExample values:")
for val in target_column.unique()[:10]:
    print(f"  - {val}")

In [None]:
# Bar chart of how many samples carry each label
fig, ax = plt.subplots(figsize=(14, 6))
value_counts.plot.bar(ax=ax)
ax.set_title('Distribution of ENVO Local Scale Terms', fontsize=14, fontweight='bold')
ax.set(xlabel='ENVO Local Scale Term', ylabel='Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# value_counts is sorted by descending frequency, so the first/last entries
# are the largest and smallest classes
largest_count = value_counts.iloc[0]
smallest_count = value_counts.iloc[-1]
n_rows = len(df_clean)
print(f"\nClass balance analysis:")
print(f"Most common class: {largest_count} samples ({largest_count/n_rows*100:.1f}%)")
print(f"Least common class: {smallest_count} samples ({smallest_count/n_rows*100:.1f}%)")

## Prepare Features and Labels

In [None]:
# Feature matrix: stack the per-row embedding arrays into one 2-D array
X = np.vstack(list(df_clean['ge_embedding'].values))
print(f"Feature matrix shape: {X.shape}")

# Label vector: the ENVO local scale term of each row, in the same row order
y = df_clean['env_local_scale'].values
print(f"Target vector shape: {y.shape}")

n_samples, n_features = X.shape
print(f"\nNumber of samples: {n_samples}")
print(f"Number of features: {n_features}")
print(f"Number of classes: {len(np.unique(y))}")

## Train/Test Split

In [None]:
# Decide whether a stratified split is feasible.
# Class counts are taken from y itself so this cell does not depend on the
# exploration cells having been run.
test_size = 0.2
class_counts = np.unique(y, return_counts=True)[1]
n_classes = len(class_counts)
min_class_count = class_counts.min()

# Partition sizes as train_test_split derives them from a float test_size
n_test = int(np.ceil(test_size * len(y)))
n_train = len(y) - n_test

# Stratification needs at least 2 samples in every class AND room for at
# least one sample of every class in each partition; otherwise scikit-learn
# raises ValueError instead of splitting.
use_stratify = bool(
    min_class_count >= 2 and n_test >= n_classes and n_train >= n_classes
)

print(f"Minimum class count: {min_class_count}")
print(f"Using stratified split: {use_stratify}")

# Split the data (plain random split when stratification is not possible)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=RANDOM_STATE,
    stratify=y if use_stratify else None,
)

print(f"\nTraining set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")
print(f"\nTraining set class distribution:")
print(pd.Series(y_train).value_counts())

## Train Random Forest Classifier

In [None]:
# Hyperparameters gathered in one place so they are easy to read and tweak
rf_params = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=RANDOM_STATE,  # reproducible forest
    n_jobs=-1,                  # use all available cores
)

# Fit the forest on the training partition only
print("Training Random Forest Classifier...")
rf_classifier = RandomForestClassifier(**rf_params)
rf_classifier.fit(X_train, y_train)
print("Training complete!")

## Model Evaluation

In [None]:
# Predict labels for both partitions with the fitted forest
y_train_pred, y_test_pred = (
    rf_classifier.predict(features) for features in (X_train, X_test)
)

# Fraction of exactly-matching labels on each partition
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# A large train-minus-test gap indicates the forest memorised the training set
print("=== MODEL PERFORMANCE ===")
print(f"Training accuracy: {train_accuracy:.3f}")
print(f"Test accuracy: {test_accuracy:.3f}")
print(f"\nDifference (overfitting check): {train_accuracy - test_accuracy:.3f}")

In [None]:
# Cross-validation on training set
# An integer `cv` on a classifier uses stratified folds, which raise
# ValueError when the fold count exceeds the size of every class. Cap the
# fold count (at most 5) by the largest training class so that case falls
# back to fewer folds, or is skipped, instead of crashing.
if len(X_train) >= 5:  # Need enough samples for CV
    largest_class_count = np.unique(y_train, return_counts=True)[1].max()
    cv_folds = int(min(5, largest_class_count))
else:
    cv_folds = 0

if cv_folds >= 2:  # At least 2 folds are required
    print(f"\n=== CROSS-VALIDATION ({cv_folds}-fold) ===")
    cv_scores = cross_val_score(rf_classifier, X_train, y_train, cv=cv_folds, scoring='accuracy')
    print(f"CV Accuracy scores: {cv_scores}")
    # Spread reported as two standard deviations of the per-fold scores
    print(f"Mean CV accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
else:
    print("\nDataset too small for cross-validation")

In [None]:
# Per-class precision / recall / F1 on the held-out test set.
# zero_division=0 reports 0 for classes whose metric would divide by zero.
test_report = classification_report(y_test, y_test_pred, zero_division=0)
print("\n=== CLASSIFICATION REPORT (Test Set) ===")
print(test_report)

## Confusion Matrix

In [None]:
# Labels that actually occur in the test evaluation (true or predicted),
# sorted. The same array drives both the matrix and the tick labels, so the
# axes always line up with the rows/columns. The training-time
# rf_classifier.classes_ may contain labels absent from the test split (or
# miss labels that only occur in it), which would mislabel the heatmap.
class_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_test_pred)]))

# Compute confusion matrix over exactly those labels (rows = true, cols = predicted)
cm = confusion_matrix(y_test, y_test_pred, labels=class_labels)

# Plot confusion matrix
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=class_labels, yticklabels=class_labels,
            ax=ax, cbar_kws={'label': 'Count'})
ax.set_title('Confusion Matrix: Predicted vs Actual ENVO Local Scale', 
             fontsize=14, fontweight='bold')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

## Feature Importance

In [None]:
# Importance of each embedding dimension as reported by the fitted forest
feature_importances = rf_classifier.feature_importances_
feature_names = [f"GE_dim_{i}" for i in range(len(feature_importances))]

# One row per dimension, most important first
importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': feature_importances}
)
importance_df = importance_df.sort_values(by='importance', ascending=False)

print("=== TOP 20 MOST IMPORTANT FEATURES ===")
print(importance_df.head(20))

In [None]:
# Horizontal bar chart of the highest-ranked embedding dimensions
top_n = 20
top_features = importance_df.head(top_n)

fig, ax = plt.subplots(figsize=(10, 8))
top_features.plot.barh(x='feature', y='importance', ax=ax, legend=False)
ax.set_title(f'Top {top_n} Most Important Google Earth Embedding Dimensions', 
             fontsize=14, fontweight='bold')
ax.set(xlabel='Importance', ylabel='Feature (Embedding Dimension)')
ax.invert_yaxis()  # put the most important dimension at the top
plt.tight_layout()
plt.show()

## Prediction Analysis

In [None]:
# Class-probability estimates for every test sample; the largest probability
# in each row is used as that prediction's confidence
y_test_proba = rf_classifier.predict_proba(X_test)
max_probabilities = np.max(y_test_proba, axis=1)

print("=== PREDICTION CONFIDENCE ===")
print(f"Mean prediction confidence: {max_probabilities.mean():.3f}")
print(f"Std prediction confidence: {max_probabilities.std():.3f}")
print(f"Min prediction confidence: {max_probabilities.min():.3f}")
print(f"Max prediction confidence: {max_probabilities.max():.3f}")

In [None]:
# Histogram of per-sample confidence, with the mean marked by a dashed line
mean_confidence = max_probabilities.mean()

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(max_probabilities, bins=20, alpha=0.7, edgecolor='black')
ax.axvline(mean_confidence, color='red', linestyle='--', linewidth=2, 
           label=f'Mean: {mean_confidence:.3f}')
ax.set_title('Distribution of Prediction Confidence Scores', fontsize=14, fontweight='bold')
ax.set(xlabel='Maximum Probability (Confidence)', ylabel='Frequency')
ax.legend()
plt.tight_layout()
plt.show()

In [None]:
# Print the first few test samples side by side: true label, predicted label,
# and the confidence of the prediction
print("\n=== EXAMPLE PREDICTIONS ===")
n_examples = min(10, len(y_test))
examples = zip(y_test[:n_examples], y_test_pred[:n_examples], max_probabilities[:n_examples])
for i, (true_label, pred_label, confidence) in enumerate(examples):
    if true_label == pred_label:
        correct = "✓"
    else:
        correct = "✗"
    print(f"\n{correct} Sample {i+1}:")
    print(f"  True: {true_label}")
    print(f"  Predicted: {pred_label} (confidence: {confidence:.3f})")

## Summary and Conclusions

In [None]:
# Majority-class baseline: the test accuracy obtained by always predicting
# the most frequent training label. Absolute accuracy thresholds are only
# meaningful relative to this number when classes are imbalanced.
majority_class = pd.Series(y_train).value_counts().idxmax()
baseline_accuracy = float(np.mean(np.asarray(y_test) == majority_class))

# Train-minus-test accuracy gap, used for the overfitting verdict below
generalization_gap = train_accuracy - test_accuracy

print("=== EXPERIMENT SUMMARY ===")
print(f"\nDataset:")
print(f"  - Total samples: {len(df_clean)}")
print(f"  - Training samples: {len(X_train)}")
print(f"  - Test samples: {len(X_test)}")
print(f"  - Number of classes: {len(np.unique(y))}")
print(f"  - Feature dimensions: {X.shape[1]}")

print(f"\nModel:")
print(f"  - Algorithm: Random Forest Classifier")
print(f"  - Number of trees: {rf_classifier.n_estimators}")
print(f"  - Max depth: {rf_classifier.max_depth}")

print(f"\nPerformance:")
print(f"  - Training accuracy: {train_accuracy:.3f}")
print(f"  - Test accuracy: {test_accuracy:.3f}")
print(f"  - Majority-class baseline accuracy: {baseline_accuracy:.3f}")
print(f"  - Mean prediction confidence: {max_probabilities.mean():.3f}")

print(f"\nKey Findings:")
if test_accuracy > 0.7:
    print("  ✓ Google Earth embeddings show STRONG predictive power for ENVO local scale")
elif test_accuracy > 0.5:
    print("  ~ Google Earth embeddings show MODERATE predictive power for ENVO local scale")
else:
    print("  ✗ Google Earth embeddings show LIMITED predictive power for ENVO local scale")

# Caveat for imbalanced labels: a model that does not beat the trivial
# baseline has learned nothing useful, whatever the threshold above says
if test_accuracy <= baseline_accuracy:
    print("  ⚠ Test accuracy does not exceed the majority-class baseline - the rating above may only reflect class imbalance")

if generalization_gap > 0.2:
    print("  ⚠ Significant overfitting detected - model may not generalize well")
elif generalization_gap > 0.1:
    print("  ⚠ Moderate overfitting - consider regularization")
else:
    print("  ✓ Good generalization - minimal overfitting")