<a href="https://colab.research.google.com/github/corricelli/NGSS-Dynamic-Data-Gem/blob/main/ddg_engine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**GENERATE SYNTHETIC DATA ALIGNED WITH NGSS TOPICS**

This Colab handles two steps:
1) generate precise theoretical values based on scientific laws
2) introduce controlled, realistic variability

In [68]:
#CODE CELL 1: Setup and imports

import numpy as np
import pandas as pd
import gspread #library for interacting with Google Sheets
from scipy.optimize import curve_fit #useful to for advanced model fitting (HS students)
from google.colab import auth

print("Libraries loaded. Ready for scientific modeling.")

Libraries loaded. Ready for scientific modeling.


In [69]:
# CODE CELL 6 (CORRECTED): Google Authentication

# 1. Import module for credential defaults
from google.auth import default

# 2. Authenticate Colab to access Google services (this will prompt a sign-in in the browser)
# This uses the credentials of the logged-in Google account (your Google School account).
from google.colab import auth
auth.authenticate_user()

# 3. Retrieve default credentials (token)
creds, _ = default()

# 4. Use the authenticated credentials to create the gspread client instance.
# 'gc' is the authorized client object we will use in CODE CELL 7 to open the sheet.
gc = gspread.authorize(creds)

print("Google Authentication complete. Ready to access sheets.")

Google Authentication complete. Ready to access sheets.


In [70]:
# CODE CELL 7: Sheet Reading Logic

# --------------------------------------------------------------------------
# CRITICAL: PASTE YOUR UNIQUE SPREADSHEET ID HERE
# This ID is taken from the URL you provided: 1othRGkeNPnWQUpepGsNaM1F_OS-MWlDUdRND9-s7ul8
SPREADSHEET_ID = '1othRGkeNPnWQUpepGsNaM1F_OS-MWlDUdRND9-s7ul8'
# --------------------------------------------------------------------------


def get_latest_params_from_sheet():
    """
    Connects to the specified Google Sheet via its ID and reads the latest
    parameters from the first worksheet (Sheet1).

    Returns: A dictionary of clean parameters, or None if reading fails.
    """
    # NOTE: 'gc' is the authorized client object defined in CODE CELL 6
    global gc, SPREADSHEET_ID

    try:
        # 1. Open the sheet by its unique ID
        worksheet = gc.open_by_key(SPREADSHEET_ID).sheet1

        # 2. Get all records as a list of dictionaries (using header names)
        list_of_records = worksheet.get_all_records()

        if not list_of_records:
            print("Error: Control Sheet is empty besides the header row.")
            return None

        # 3. The latest entry is the last dictionary in the list
        latest_params = list_of_records[-1]

        # 4. Standardize and type-convert the parameters (CRITICAL STEP)
        return {
            'PE_ID': latest_params.get('PE_ID'),
            # Convert string inputs from the sheet to floats for calculation.
            'L_param': float(latest_params.get('Param_L', 0)),
            'Noise_Sigma': float(latest_params.get('Noise_Sigma', 0)),
            'k_param': float(latest_params.get('k_param (Secondary DCI Parameter)', 0.7)),
            't_range': int(latest_params.get('t_range (Simulation Time/Range)', 60)),
            'Mass_Const': float(latest_params.get('Mass_Const (Kinetic Energy)', 10.0))
        }

    except gspread.exceptions.SpreadsheetNotFound:
        print(f"Error: Spreadsheet with ID '{SPREADSHEET_ID}' not found. Check the ID and sharing permissions.")
        return None
    except Exception as e:
        print(f"An unexpected error occurred during sheet reading: {e}")
        # This occurs if, for example, the PE_ID column header name is wrong.
        return None

In [71]:
# CODE CELL 2: Scientific Models (DCI Functions)
# logistic growth is related to population dynamics required in MS/HS-LS2-1

def logistic_growth_model(t, L, k, t0):
    """
    Calculates population count based on the Logistic Growth (S-curve) function.
    t: time array
    L: Carrying Capacity (max population)
    k: Growth Rate parameter
    t0: Time shift parameter (midpoint)
    """
    # Mathematical expression for the S-curve: D(t) = L / (1 + e^(-k * (t - t0))) [3]
    return L / (1.0 + np.exp(-k * (t - t0)))


def kinetic_energy_model(mass, velocity):
    """
    Calculates Kinetic Energy (KE) based on Newtonian mechanics (MS-PS3-1). [5]
    KE = 0.5 * m * v^2
    """
    # Calculate Kinetic Energy: KE is proportional to mass and the square of velocity
    return 0.5 * mass * (velocity**2)

In [72]:
# CODE CELL 9: Model for MS-ESS1-3 (Solar System Scaling)

def model_solar_system_scaling(params):
    """Generates comparative data for solar system objects (Mass, Radius) for MS-ESS1-3. [1, 2]"""

    scale_factor = params.get('L_param', 100.0)
    num_data_points = int(params.get('t_range', 10))
    sigma = params.get('Noise_Sigma')

    # Base data: (8 planetary objects)
    mass_raw = np.array([1, 0.82, 0.11, 318, 95, 14.5, 17, 0.012])
    radii_raw = np.array([1, 0.95, 0.53, 11.2, 9.4, 4.0, 3.8, 0.27])
    density_raw = np.array([5.5, 5.2, 3.9, 1.3, 0.7, 1.3, 1.6, 3.3])

    # Apply user's scaling factor
    mass_scaled = (mass_raw / scale_factor) * 1000
    radii_scaled = (radii_raw / scale_factor) * 100

    num_entries = len(mass_raw)

    data = pd.DataFrame({
        'Object_ID': np.arange(1, num_entries + 1),
        'Mass_Scaled (x1000)': mass_scaled,
        'Radius_Scaled (x100)': radii_scaled,
        'Actual_Density (g/cm3)': density_raw
    }).head(num_data_points)

    # Apply scaled noise for realism
    # Applying noise to the relevant numeric columns (excluding 'Object_ID')
    data.iloc[:, 1:] += np.random.normal(0, sigma / 5000.0, (len(data), data.shape[1] - 1))

    # --- FINAL CRASH-SAFE METADATA ASSIGNMENT (CORRECTED) ---
    # Assigns metadata to new columns instead of overwriting the DataFrame.
    data['PE_ID'] = params.get('PE_ID')
    data['Noise_Sigma_Used'] = round(sigma,1)
    data['Scale_Factor_Used'] = round(scale_factor,1)

    return data

In [73]:
# CODE CELL 10: Model for HS-PS3-1 (Energy Conservation)

def model_energy_conservation(params):
    """
    Generates data for energy transfer in a closed system (E_in = E_out + E_loss)
    for HS-PS3-1. [3, 4]
    """

    E_in = params.get('L_param', 1000.0)
    efficiency = params.get('k_param', 0.85)
    num_trials = int(params.get('t_range', 30))
    sigma = params.get('Noise_Sigma')

    trials = np.arange(1, num_trials + 1)

    # Theoretical Values
    E_out_theoretical = E_in * efficiency
    E_loss_theoretical = E_in * (1.0 - efficiency)

    # Inject Noise
    noise_magnitude = sigma / 20.0
    E_out_noisy = E_out_theoretical + np.random.normal(0, noise_magnitude, num_trials)
    E_loss_noisy = E_loss_theoretical + np.random.normal(0, noise_magnitude, num_trials)

    # Calculate the measured change
    E_change_measured = E_in - E_out_noisy - E_loss_noisy

    data = pd.DataFrame({
        'Trial': trials,
        'E_Input (J)': E_in,
        'E_Output_Measured (J)': E_out_noisy.round(2),
        'E_Loss_Thermal_Measured (J)': E_loss_noisy.round(2),
        'E_Net_Change_Measured (J)': E_change_measured.round(2)
    })

    # --- FINAL CRASH-SAFE METADATA ASSIGNMENT (CORRECTED) ---
    # Assigns metadata to new columns instead of overwriting the DataFrame.
    data['PE_ID'] = params.get('PE_ID')
    data['Efficiency_Used'] = round(efficiency,2)
    data['Noise_Sigma_Used'] = round(sigma,1)

    return data

In [74]:
# CODE CELL 11: Model for HS-PS1-5 (Reaction Kinetics - Multivariate Enzyme)

def model_enzyme_kinetics_multivariate(params):
    """
    Generates multivariate data simulating three distinct enzyme activity profiles vs. pH. [5]
    """
    opt_temp = params.get('L_param', 40.0)
    stability_factor = params.get('k_param', 3.0)
    data_points = int(params.get('t_range', 140))
    sigma = params.get('Noise_Sigma')

    pH_values = np.linspace(1, 14, data_points)

    std_dev = stability_factor / 2.0

    activity_A = 90 * np.exp(-0.5 * ((pH_values - 2.5) / std_dev)**2)
    activity_B = 90 * np.exp(-0.5 * ((pH_values - 7.5) / std_dev)**2)
    activity_C = 90 * np.exp(-0.5 * ((pH_values - 12.5) / std_dev)**2)

    noise = np.random.normal(0, sigma / 150.0, len(pH_values))

    data = pd.DataFrame({
        'pH': pH_values.round(1),
        'Enzyme A (% Activity)': np.clip(activity_A + noise, 0, 100).round(0).astype(int),
        'Enzyme B (% Activity)': np.clip(activity_B + noise, 0, 100).round(0).astype(int),
        'Enzyme C (% Activity)': np.clip(activity_C + noise, 0, 100).round(0).astype(int)
    })

    # --- FINAL CRASH-SAFE METADATA ASSIGNMENT (CORRECTED) ---
    # Assigns metadata to new columns instead of overwriting the DataFrame.
    data['PE_ID'] = params.get('PE_ID')
    data['Optimum_Temp'] = round(opt_temp,1)
    data['Stability_Factor'] = round(stability_factor,1)
    data['Noise_Sigma_Used'] = round(sigma,1)

    return data

In [75]:
# CODE CELL 12: Model for LS2-1 (Population Dynamics)

def model_logistic_growth(params):
    """
    Generates population data using the logistic growth curve (MS/HS-LS2-1).
    This model shows initial exponential growth slowing down to a carrying capacity (L).
    """

    # --- 1. PARAMETER EXTRACTION (From Streamlit/Sheet) ---
    L_param = params.get('L_param', 8000.0)      # Carrying Capacity
    k_param = params.get('k_param', 0.7)         # Growth Rate
    t_range = int(params.get('t_range', 60))     # Time Steps
    sigma_noise = params.get('Noise_Sigma', 200.0)
    t_array = np.arange(0, t_range, 1)

    # --- 2. THEORETICAL CALCULATION (DCI Core Idea) ---
    # Calculates clean S-curve using logistic_growth_model (from CODE CELL 2)
    t0_estimate = t_range / 3.0
    D_theoretical = logistic_growth_model(t_array, L_param, k_param, t0_estimate)

    # --- 3. NOISE INJECTION (SEP Practice) ---
    # Simulates error in annual census counts. [1]
    noise = np.random.normal(loc=0.0, scale=sigma_noise, size=len(t_array))
    # Apply noise and ensure populations are non-negative, then convert to integer counts
    D_synthetic = np.clip((D_theoretical + noise), a_min=0, a_max=None).astype(int)

    # --- 4. DATAFRAME CREATION AND METADATA ---
    final_data = pd.DataFrame({
        'Time_Step': t_array,
        'Population_Count': D_synthetic,
    })

    # FINAL CRASH-SAFE FIX: Assign metadata to new column names (CORRECTED)
    final_data['PE_ID'] = params.get('PE_ID')
    final_data['Noise_Sigma_Used'] = round(sigma_noise,1)
    final_data['Carrying_Capacity_Used'] = round(L_param,0)

    return final_data

In [76]:
# CODE CELL 13: Model for PS3-1_KE (Kinetic Energy)

def model_kinetic_energy(params):
    """
    Generates Kinetic Energy data vs. Speed, supporting quantitative analysis of MS/HS-PS3-1. [2]
    """

    # --- 1. PARAMETER EXTRACTION (From Streamlit/Sheet) ---
    mass_const = params.get('Mass_Const')    # Mass (m) - held constant
    v_max = int(params.get('t_range', 60))         # Max Velocity (V_max) - independent variable
    sigma_noise = params.get('Noise_Sigma', 200.0)

    # Independent variable array (Velocity)
    v_array = np.arange(0, v_max, 1)

    # --- 2. THEORETICAL CALCULATION (DCI Core Idea) ---
    # Calculates clean KE using kinetic_energy_model (from CODE CELL 2)
    D_theoretical = kinetic_energy_model(mass_const, v_array)

    # --- 3. NOISE INJECTION (SEP Practice) ---
    # Simulates measurement error in the KE sensor/reading. [3]
    noise = np.random.normal(loc=0.0, scale=sigma_noise, size=len(v_array))
    D_synthetic = np.clip((D_theoretical + noise), a_min=0, a_max=None).astype(float)

    # --- 4. DATAFRAME CREATION AND METADATA ---
    final_data = pd.DataFrame({
        'Velocity (m/s)': v_array,
        'Kinetic_Energy (J)': D_synthetic.round(1),
    })

    # FINAL CRASH-SAFE FIX: Assign metadata to new column names (CORRECTED)
    final_data['PE_ID'] = params.get('PE_ID')
    final_data['Noise_Sigma_Used'] = round(sigma_noise,1)
    final_data['Mass_Used'] = round(mass_const,1)

    return final_data

In [77]:
# CODE CELL 14: Model for HS-ESS2-2 (Climate Feedback Loop)

def model_climate_feedback(params):
    """
    Generates time-series data for a climate feedback loop (Ice-Albedo).
    Students analyze how changes in one variable (GHG/Temp) are amplified
    by changes in another (Albedo/Ice).
    """

    # --- 1. PARAMETER EXTRACTION (From Streamlit/Sheet) ---
    Initial_GHG_Increase = params.get('L_param', 10.0) # Initial temp shock
    Feedback_Strength = params.get('k_param', 0.1)     # Albedo feedback multiplier
    Time_Steps_Years = int(params.get('t_range', 100))
    sigma = params.get('Noise_Sigma')

    years = np.arange(0, Time_Steps_Years)

    # --- 2. THEORETICAL CALCULATION (DCI Core Idea) ---

    # Initialize arrays with starting conditions
    temp = np.zeros(Time_Steps_Years)
    ice = np.zeros(Time_Steps_Years)
    albedo = np.zeros(Time_Steps_Years)

    # Set initial values for Year 0
    temp[0] = 0.5 + Initial_GHG_Increase / 10.0 # Initial warming
    ice[0] = 100.0
    albedo[0] = 0.6
    # Note: Added [0] index to initial assignments to match the array initialization

    # Run the iterative feedback model
    for i in range(1, Time_Steps_Years):
        # 1. Temperature drives ice melt
        ice_melt = temp[i-1] * 0.5
        ice[i] = np.clip(ice[i-1] - ice_melt, 0, 100)

        # 2. Ice affects albedo (reflectivity)
        albedo[i] = 0.3 + 0.3 * (ice[i] / 100.0)

        # 3. Albedo change feeds back into temperature (amplification)
        temp_change = 0.05 + (albedo[i-1] - albedo[i]) * Feedback_Strength * 50
        temp[i] = temp[i-1] + temp_change

        # Add noise to temperature measurement [2]
        temp[i] += np.random.normal(0, sigma / 500.0)

    # --- 4. DATAFRAME CREATION AND METADATA ---
    data = pd.DataFrame({
        'Year': years,
        'Global_Temp_Anomaly_C': temp.round(2),
        'Ice_Volume_Index': ice.round(1),
        'Albedo_Reflectivity': albedo.round(2)
    })

    # FINAL CRASH-SAFE FIX: Assign metadata to new column names (CORRECTED)
    data['PE_ID'] = params.get('PE_ID')
    data['Noise_Sigma_Used'] = round(sigma, 1)
    data['Feedback_Strength_Used'] = round(Feedback_Strength,2)

    return data

In [78]:
# CODE CELL 3: DDG Core Engine - Model Dispatcher

def generate_ddg_data(pe_id, params):
    """
    Routes the request to the correct modeling function based on the PE_ID.
    """
    pe_id_str = params.get('PE_ID')

    # 1. Model Dispatcher: Route request to the appropriate dedicated function

    # --- Life Science Models ---
    if pe_id_str == "LS2-1 (Population Dynamics)":
        # Calls CODE CELL 12 function (Logistic Growth)
        return model_logistic_growth(params)

    elif pe_id_str == "HS-PS1-5 (Reaction Kinetics)":
        # Calls CODE CELL 11 function (Enzyme Kinetics)
        return model_enzyme_kinetics_multivariate(params)

    # --- Physical Science Models ---
    elif pe_id_str == "PS3-1_KE (Kinetic Energy)":
        # Calls CODE CELL 13 function (Kinetic Energy)
        return model_kinetic_energy(params)

    elif pe_id_str == "HS-PS3-1 (Energy Conservation)":
        # Calls CODE CELL 10 function
        return model_energy_conservation(params)

    # --- Earth Science Models ---
    elif pe_id_str == "MS-ESS1-3 (Solar System Scaling)":
        # Calls CODE CELL 9 function
        return model_solar_system_scaling(params)

    # CRITICAL ADDITION: HS-ESS2-2 (Climate Feedback Loop)
    elif pe_id_str == "HS-ESS2-2 (Climate Feedback Loop)":
        # Calls CODE CELL 14 function
        return model_climate_feedback(params)

    # --- Error Handling ---
    else:
        # Error handling for unrecognized PE_ID
        print(f"ERROR: Unrecognized PE ID: {pe_id_str}. Returning empty data.")
        # This syntax is confirmed to work in your Colab environment
        return pd.DataFrame({"Error":''})

In [79]:
# CODE CELL 4 (REVISED): Live Sheet Reading and Data Generation Test

# 1. Read the latest input data from the Google Sheet
# This calls the function defined in CODE CELL 7, which uses your SHEET_ID and gc client.
live_params = get_latest_params_from_sheet()

# 2. Check if the read was successful and proceed
if live_params:
    # Extract the PE ID (e.g., 'LS2-1' or 'PS3-1_KE')
    pe_to_run = live_params.get('PE_ID')

    # Print a summary of the parameters successfully read
    print("--- LIVE RUN START ---")
    print(f"Target Phenomenon: {pe_to_run}")
    print(f"DCI Parameter (L/Mass): {live_params['L_param']}")
    print(f"SEP Parameter (Noise): {live_params}")

    # 3. Generate the data using the live parameters.
    # The generate_ddg_data function is defined in CODE CELL 3.
    synthetic_df = generate_ddg_data(pe_to_run, live_params)

    # 4. Display the results
    print("\n--- SYNTHETIC DATA GENERATED ---")
    print(synthetic_df.head(15))
    print(f"\nData Shape: {synthetic_df.shape} (Number of rows generated)")

else:
    print("Could not retrieve parameters from Google Sheet. Aborting generation test.")

# The generated DataFrame 'synthetic_df' is now available for the next step (CODE CELL 5).

--- LIVE RUN START ---
Target Phenomenon: PS3-1_KE (Kinetic Energy)
DCI Parameter (L/Mass): 0.0
SEP Parameter (Noise): {'PE_ID': 'PS3-1_KE (Kinetic Energy)', 'L_param': 0.0, 'Noise_Sigma': 200.0, 'k_param': 0.7, 't_range': 60, 'Mass_Const': 10.0}

--- SYNTHETIC DATA GENERATED ---
    Velocity (m/s)  Kinetic_Energy (J)                      PE_ID  \
0                0               197.0  PS3-1_KE (Kinetic Energy)   
1                1                 0.0  PS3-1_KE (Kinetic Energy)   
2                2               302.9  PS3-1_KE (Kinetic Energy)   
3                3                 0.0  PS3-1_KE (Kinetic Energy)   
4                4                14.8  PS3-1_KE (Kinetic Energy)   
5                5                 0.0  PS3-1_KE (Kinetic Energy)   
6                6               144.9  PS3-1_KE (Kinetic Energy)   
7                7               318.4  PS3-1_KE (Kinetic Energy)   
8                8               388.7  PS3-1_KE (Kinetic Energy)   
9                9           

In [80]:
# CODE CELL 5: Prepare Final Output (CSV String)

def prepare_csv_output(dataframe):
    """
    Converts the Pandas DataFrame to a CSV formatted string, ready for Apps Script transfer.
    """
    # Convert the DataFrame to a CSV string.
    # index=False ensures we do not include the Python row numbers in the output data.
    csv_string = dataframe.to_csv(index=False)

    return csv_string

# Run the preparation step using the DataFrame generated in CODE CELL 4
csv_data_string = prepare_csv_output(synthetic_df)

# Uncomment the line below if you want to visually inspect the raw CSV output string:
# print(csv_data_string)

print("\nData prepared as CSV string for Apps Script delivery.")


Data prepared as CSV string for Apps Script delivery.


In [81]:
# CODE CELL 8 (NEW): Final Data Write to Output Sheet

# --------------------------------------------------------------------------
# CRITICAL: PASTE YOUR UNIQUE OUTPUT SHEET ID HERE
# This ID is taken from the URL you provided: 1nWEXvgeKDfa1gQG0iFBK0mMiraWvu73QtJjMtGYD9oI
OUTPUT_SHEET_ID = '1nWEXvgeKDfa1gQG0iFBK0mMiraWvu73QtJjMtGYD9oI'
# --------------------------------------------------------------------------


def import_csv_to_sheet(sheet_id, csv_string):
    """
    Writes the generated CSV data string directly to the specified Google Sheet.
    This overwrites the content of the first worksheet ('Sheet1').
    """
    global gc
    try:
        # Open the target Output Sheet by its unique ID
        spreadsheet = gc.open_by_key(sheet_id)
        worksheet = spreadsheet.sheet1 # Assuming the first tab is where data goes

        # 1. Clear existing content to ensure a clean slate
        worksheet.clear()

        # 2. Convert the CSV string into a list of lists (required by gspread's batch update)
        import csv
        from io import StringIO

        data_io = StringIO(csv_string)
        reader = csv.reader(data_io)
        data_list = list(reader)

        # 3. Write the new data list (including headers) back to the sheet
        # This uses the A1 notation to start writing from the top-left corner
        worksheet.update('A1', data_list)

        print(f"\nSUCCESS: Data written to Output Sheet (ID: {sheet_id})")
        print(f"Data contains {len(data_list) - 1} rows.")
        return True

    except Exception as e:
        print(f"\nERROR during data write: {e}")
        return False

# Run the final write operation
import_csv_to_sheet(OUTPUT_SHEET_ID, csv_data_string)

  worksheet.update('A1', data_list)



SUCCESS: Data written to Output Sheet (ID: 1nWEXvgeKDfa1gQG0iFBK0mMiraWvu73QtJjMtGYD9oI)
Data contains 60 rows.


True