# Solar Activity Classifier
## A Cyberinfrastructure-Enabled Machine Learning Tool

**Authors:** Christopher Cruz & Ameer Hassan
**Date:** December 2025

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
import warnings
# Suppress every warning category so cell output stays uncluttered.
# NOTE: this also hides deprecation notices from the libraries used below.
warnings.filterwarnings(action='ignore')

# Plot defaults shared by all figures in this notebook
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

print("Libraries imported successfully")

In [None]:
def generate_solar_data(n_samples=1000, seed=42, medium_threshold=50, high_threshold=100):
    """Generate a synthetic, labelled solar-activity dataset.

    Random draws come from a private ``np.random.RandomState(seed)`` so that
    calling this function does not reseed or advance NumPy's global random
    state. With the default ``seed=42`` the values are the same as those
    produced by ``np.random.seed(42)`` followed by the legacy ``np.random.*``
    calls.

    Parameters
    ----------
    n_samples : int, default 1000
        Number of rows to generate.
    seed : int or None, default 42
        Seed for the private random generator. ``None`` lets NumPy pick a
        seed, giving different data on each call.
    medium_threshold : float, default 50
        Samples whose sunspot number is strictly greater than this value are
        labelled at least 1 (medium).
    high_threshold : float, default 100
        Samples whose sunspot number is strictly greater than this value are
        labelled 2 (high).

    Returns
    -------
    X : numpy.ndarray of shape (n_samples, 5)
        Feature matrix; columns follow the order of ``feature_names``.
    y : numpy.ndarray of shape (n_samples,), integer dtype
        Class labels: 0 = low, 1 = medium, 2 = high.
    feature_names : list of str
        Human-readable name for each column of ``X``.

    Notes
    -----
    The label is a deterministic function of the sunspot number, which is
    also the first feature column, so a classifier can recover the labels
    from that single column.
    """
    # Local generator: same stream as the legacy global functions, but it
    # leaves the caller's np.random state untouched.
    rng = np.random.RandomState(seed)

    # The draw order below is part of the contract: changing it changes the data.
    sunspot_number = rng.gamma(shape=2, scale=50, size=n_samples)
    sunspot_area = sunspot_number * rng.uniform(0.8, 1.2, n_samples)
    new_regions = rng.poisson(lam=sunspot_number/50, size=n_samples)
    solar_flux = 65 + sunspot_number * 0.5 + rng.normal(0, 10, n_samples)
    # Drawn independently of every other column and of the label
    prev_day = rng.choice([0, 1, 2], size=n_samples, p=[0.5, 0.3, 0.2])

    X = np.column_stack([sunspot_number, sunspot_area, new_regions, solar_flux, prev_day])

    # Start everything at 0 (low), then promote; the second mask overrides the first.
    y = np.zeros(n_samples, dtype=int)
    y[sunspot_number > medium_threshold] = 1
    y[sunspot_number > high_threshold] = 2

    feature_names = ['Sunspot Number', 'Sunspot Area', 'New Active Regions',
                    'Solar Flux (10.7cm)', 'Previous Day Activity']

    return X, y, feature_names

# Build the dataset used by every later cell
X, y, feature_names = generate_solar_data(n_samples=1000)

print(f"Dataset shape: {X.shape}")
print("\nClass distribution:")

# Labels are pre-padded so the counts line up in one column
class_rows = [
    ("Low Activity:   ", 0),
    ("Medium Activity:", 1),
    ("High Activity:  ", 2),
]
for padded_label, class_id in class_rows:
    count = np.sum(y == class_id)
    print(f"  {padded_label} {count} ({100*count/len(y):.1f}%)")

In [None]:
# Train the model
# Hold out 20% of the rows for testing; stratify so both splits keep the
# same class proportions as the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Training samples: {X_train.shape[0]}")
print(f"Testing samples:  {X_test.shape[0]}")

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fixed random_state keeps the fitted forest reproducible across runs
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
)
model.fit(X_train_scaled, y_train)

print("\nTraining complete!")

In [None]:
# Evaluate the model
# Pin the class order once. Without explicit `labels`, scikit-learn infers the
# classes from the data, so a class missing from the test split would shrink
# the matrix/report and no longer match the three display names below.
CLASS_IDS = [0, 1, 2]
CLASS_NAMES = ['Low', 'Medium', 'High']

y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=CLASS_IDS)

print(f"Test Accuracy: {accuracy:.4f}\n")
print("Classification Report:")
print(classification_report(y_test, y_pred, labels=CLASS_IDS, target_names=CLASS_NAMES))

# Rows are true classes and columns are predicted classes, both in CLASS_IDS order
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=CLASS_NAMES,
            yticklabels=CLASS_NAMES,
            ax=ax)
ax.set_title('Confusion Matrix')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
fig.tight_layout()
plt.show()

In [None]:
# Feature importance
importance = model.feature_importances_
indices = np.argsort(importance)[::-1]  # most important feature first

# Reorder names and scores once so the plot and the printout share them
ranked_names = [feature_names[idx] for idx in indices]
ranked_scores = importance[indices]
positions = range(len(importance))

fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('Feature Importance')
ax.bar(positions, ranked_scores)
ax.set_xticks(positions)
ax.set_xticklabels(ranked_names, rotation=45, ha='right')
ax.set_ylabel('Importance')
fig.tight_layout()
plt.show()

print("Feature Ranking:")
for rank, (name, score) in enumerate(zip(ranked_names, ranked_scores), start=1):
    print(f"{rank}. {name:25s} {score:.4f}")