# Notebook 1 - Using a TSI from Scripps Pier to Train a GOES RGB Cloud Model


In [1]:
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.tree import DecisionTreeClassifier, export_text
import sys

sys.path.append('../')
from analysis_utils import combine_daily_rgb_to_monthly

## Preprocess TSI data

In [2]:
start_time = "2023-04-01"
end_time = "2023-09-01"

# Convert start_time and end_time to numpy.datetime64
start_time = np.datetime64(start_time)
end_time = np.datetime64(end_time)

tsi_ds = xr.open_dataset('/storage/cdalden/goes/surface_obs/scripps_total_sky_imager/scripps_total_sky_imager_20230215_20240214.nc')

# Select time range and filter between 8am local and 6pm local (MDT) in one step
tsi_ds = tsi_ds.sel(
    time=tsi_ds['time'].where(
        (tsi_ds['time'] >= start_time) & 
        (tsi_ds['time'] <= end_time) & 
        (tsi_ds['time'].dt.hour >= 14)
    ).dropna('time')
)

da_percent_opaque =  tsi_ds['percent_opaque']

# Create a new cloud binary variable
tsi_ds['cloud_binary'] = xr.where(
    tsi_ds['percent_opaque'] > 75, 1,  # Cloudy: Set to 1 when > 75
    xr.where(tsi_ds['percent_opaque'] < 25, 0,  # Clear: Set to 0 when >= 0 and < 25
        np.nan  # Otherwise, set to NaN (nighttime and mixed cloud cover)
    ))

In [3]:
combo_ds = combine_daily_rgb_to_monthly('scripps', 'goes18', '08', '2023')

Error: [Errno 2] No such file or directory: '/storage/cdalden/goes/scripps/goes18/rgb_composite/goes18_C02_C05_C13_rgb_scripps_20230826.nc' for file 20230826
Error: [Errno 2] No such file or directory: '/storage/cdalden/goes/scripps/goes18/rgb_composite/goes18_C02_C05_C13_rgb_scripps_20230827.nc' for file 20230827
Error: [Errno 2] No such file or directory: '/storage/cdalden/goes/scripps/goes18/rgb_composite/goes18_C02_C05_C13_rgb_scripps_20230828.nc' for file 20230828
Error: [Errno 2] No such file or directory: '/storage/cdalden/goes/scripps/goes18/rgb_composite/goes18_C02_C05_C13_rgb_scripps_20230829.nc' for file 20230829
Error: [Errno 2] No such file or directory: '/storage/cdalden/goes/scripps/goes18/rgb_composite/goes18_C02_C05_C13_rgb_scripps_20230830.nc' for file 20230830
Error: [Errno 2] No such file or directory: '/storage/cdalden/goes/scripps/goes18/rgb_composite/goes18_C02_C05_C13_rgb_scripps_20230831.nc' for file 20230831
done with combo
processed and saved RGB file to comb

In [4]:
path = '/storage/cdalden/goes/scripps/goes18/rgb_composite/'
file_template = 'combined_goes18_C02_C05_C13_rgb_scripps_{yearmonth}.nc'
datasets = {}
files = ['202305', '202306', '202307', '202308']

for file in files:
    datasets[file] = xr.open_dataset(path + file_template.format(yearmonth=file))
    print('opened {i}'.format(i=file))

if len(files) > 1:
    ds = xr.concat([datasets[file] for file in files], dim='t', combine_attrs='override')
else:
    ds = (datasets[files[0]])
ds = ds.rename({'t': 'time'})

opened 202305
opened 202306
opened 202307
opened 202308


In [5]:
# Compute spatial averages for red, green, and blue bands over x and y dimensions
spatial_avg = ds[['red', 'green', 'blue']].mean(dim=['latitude', 'longitude'])

# Select time range and filter between hour 14 and 21 UTC in one step
spatial_avg = spatial_avg.sel(
    time=spatial_avg['time'].where(
        (spatial_avg['time'] >= start_time) & 
        (spatial_avg['time'] <= end_time) & 
        (spatial_avg['time'].dt.hour >= 14)
    ).dropna('time')
)

# Select the nearest TSI timesteps for each spatial_avg timestep
nearest_tsi = tsi_ds['cloud_binary'].sel(time=spatial_avg['time'], method='nearest')
# Ensure no duplicate timestamps in the nearest_tsi dataset
nearest_tsi = nearest_tsi.drop_duplicates(dim='time')

spatial_avg['cloud_binary'] = nearest_tsi

# for testing, let's slice the training ds a little
goes_tsi_ds = spatial_avg.sel(time=slice('2023-05-01', '2023-06-30'))

# Create a mask where all variables are non-NaN
mask = ~goes_tsi_ds.to_array().isnull().any(dim='variable')

# Apply the mask to filter the dataset
goes_tsi_ds = goes_tsi_ds.sel(time=goes_tsi_ds['time'][mask])

goes_tsi_ds

In [6]:
# Prepare the data
X = goes_tsi_ds[['red', 'green', 'blue']].to_dataframe().dropna()
y = goes_tsi_ds['cloud_binary'].to_dataframe().loc[X.index]

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Train the decision tree classifier
clf = DecisionTreeClassifier(max_depth=2, min_samples_split=10, class_weight='balanced', random_state=42)  # Limit depth for interpretability
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Output the decision rules
tree_rules = export_text(clf, feature_names=['red', 'green', 'blue'])
print(tree_rules)

# Print a detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['No Cloud', 'Cloud']))

# Print feature importance
feature_importances = clf.feature_importances_
for feature, importance in zip(['red', 'green', 'blue'], feature_importances):
    print(f"Feature: {feature}, Importance: {importance:.4f}")

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Print the training and evaluation sample sizes
print(f"Training sample size: {len(X_train)}")
print(f"Evaluation sample size: {len(X_test)}")

|--- green <= 0.15
|   |--- green <= 0.12
|   |   |--- class: 0.0
|   |--- green >  0.12
|   |   |--- class: 0.0
|--- green >  0.15
|   |--- green <= 0.18
|   |   |--- class: 1.0
|   |--- green >  0.18
|   |   |--- class: 1.0


Classification Report:
              precision    recall  f1-score   support

    No Cloud       0.99      0.98      0.98       625
       Cloud       0.99      0.99      0.99      1239

    accuracy                           0.99      1864
   macro avg       0.99      0.99      0.99      1864
weighted avg       0.99      0.99      0.99      1864

Feature: red, Importance: 0.0000
Feature: green, Importance: 1.0000
Feature: blue, Importance: 0.0000
Training sample size: 4349
Evaluation sample size: 1864


## Test San Juan Islands-derived RGB Thresholds

In [7]:
red = goes_tsi_ds['red']
green = goes_tsi_ds['green']
blue = goes_tsi_ds['blue']

# Combine conditions (example: all thresholds must be met)
combined_condition = ((red > 0) & (green<=0.13)) | (green>0.13)

# Create predictions (1 for positive class, 0 for negative class)
predictions = combined_condition.astype(int)

# Extract ground truth labels
ground_truth = goes_tsi_ds['cloud_binary'].values.flatten()  # Flatten to 1D array
predictions = predictions.values.flatten()  # Flatten to 1D array

# Calculate F1 score
f1 = f1_score(ground_truth, predictions)

print(f"F1 Score: {f1}")

F1 Score: 0.9818831942789035
