In [1]:
import openai
import json
import pandas as pd
import time
import os

In [2]:
from dotenv import load_dotenv

In [3]:
from openai import OpenAI

In [4]:
# Load environment variables from a .env file. Set DOTENV_PATH to point at your
# own file instead of editing this cell; when it is unset, the original
# author's local path is used so existing behaviour is unchanged.
load_dotenv(dotenv_path=os.getenv("DOTENV_PATH", "/Users/wan_aubel/Desktop/focus-detection/focus-detection-using-physiological-data/.env"))

True

In [5]:
# Read the API key from the GROK_API_KEY environment variable (expected to be
# supplied by the .env file loaded earlier). os.getenv returns None when the
# variable is not set, so api_key may be None at this point.
api_key = os.getenv("GROK_API_KEY")

In [6]:
def build_prompt(subject_id):
    """Build the user prompt asking the model for one subject's synthetic data.

    Args:
        subject_id: Identifier of the subject to generate. It is interpolated
            into the column specification and the sample row so that the
            returned rows carry this id in their "subject" field (previously
            the parameter was ignored and every prompt mentioned subject 18).

    Returns:
        str: The full prompt text, ending with a one-row JSON sample that
        shows the expected output format.
    """
    return f"""
🎯 Objective:
Generate realistic physiological data for one synthetic subject across 70 rows (each row = 30 seconds of monitoring). Data should be biologically plausible and reflect both inter-subject and intra-subject variability.

📌 Key Constraints:
- Each subject has a unique **baseline physiology**:
    - HR_baseline ∈ [55, 100] bpm (mean heart rate)
    - RESP_rate_baseline ∈ [12, 20] breaths/min
    - EDA_baseline ∈ [0.5, 2.5] µS

- All other signals should be derived from those baseline values using realistic physiological relationships, plus small stochastic variation (±5–15%).

📊 Columns per row (generate all):
- ACC_x_mean, ACC_y_mean, ACC_z_mean
- net_acc_mean, net_acc_std
- EDA_mean, EDA_std, EDA_slope
- RESP_rate, RESP_regularity
- HR
- IBI, RMSSD, SDNN, pNN50
- lf/hf
- label (1 = baseline, 2 = stress, 3 = amusement, 4 = meditation)
- focus_label (0 = not focused, 1 = focused)
- subject (must be {subject_id} in every row)

🧠 Signal Relationships (enforce them softly):
- HR ⬆ → IBI ⬇ (HR = 60000 / IBI)
- HR ⬆ → HRV ⬇ (lower RMSSD, SDNN, pNN50)
- RESP_regularity ⬆ in focused states
- EDA_std ⬇ and EDA_slope ≈ 0 in focused states
- lf/hf tends to be **lower** in focused states, **higher** under stress

🧪 Label Notes:
- Ensure label is varied realistically among [1, 2, 3, 4]
- Assign focus_label = 1 during flow/meditation or high regularity, low noise
- Assign focus_label = 0 when stress or large variability in EDA, HRV

📎 Output Format:
Return a valid **JSON list of 70 dictionaries**, one per row. Do not include markdown, explanation, or formatting.

📌 Reminder:
Use the style/format of this sample below, but DO NOT copy values (except "subject").
[
  {{
    "ACC_x_mean": -34.7,
    "ACC_y_mean": 26.4,
    "ACC_z_mean": 9.1,
    "net_acc_mean": 60.1,
    "net_acc_std": 9.5,
    "EDA_mean": 1.21,
    "EDA_std": 0.006,
    "EDA_slope": -0.000002,
    "RESP_rate": 19.5,
    "RESP_regularity": 2.8,
    "HR": 78.2,
    "IBI": 767.0,
    "RMSSD": 125.3,
    "SDNN": 90.2,
    "pNN50": 48.1,
    "lf/hf": 2.7,
    "label": 1,
    "subject": {subject_id},
    "focus_label": 1
  }}
]
"""


In [7]:
# Accumulator for the rows generated for all subjects; starts out empty.
all_subject = list()

In [None]:
# Refuse to run without a key.
# NOTE(review): when api_key is None the OpenAI SDK is documented to fall back
# to the OPENAI_API_KEY environment variable, which would then be sent to the
# x.ai endpoint — confirm against the SDK version in use.
if not api_key:
    raise RuntimeError("GROK_API_KEY is not set; check the .env file before generating data.")

# open(..., "w") does not create parent directories. Without this, a missing
# raw/ or parsed/ folder raises FileNotFoundError, which the broad handler
# below would swallow, discarding every response that was already fetched.
for out_dir in ("raw", "parsed"):
    os.makedirs(out_dir, exist_ok=True)

# The client is loop-invariant, so build it once instead of once per subject.
client = OpenAI(
    api_key=api_key,
    base_url="https://api.x.ai/v1",
)

for subject_id in range(18, 53):
    print(f"Generating subject {subject_id}...")

    try:
        response = client.chat.completions.create(
            model="grok-3-mini-latest",
            temperature=1.0,
            messages=[
                {"role": "system", "content": "You are a physiological data simulator generating synthetic data for a focus detection experiment."},
                {"role": "user", "content": build_prompt(subject_id)}
            ]
        )

        # The message content may be None; normalise it so the raw dump and
        # the JSON parse below fail in a predictable, reported way.
        json_text = response.choices[0].message.content or ""

        # Always keep the raw reply so an unparsable response can be inspected
        # or repaired by hand later.
        with open(f"raw/subject_{subject_id}.txt", "w", encoding="utf-8") as f:
            f.write(json_text)

        try:
            data = json.loads(json_text)
        except json.JSONDecodeError as e_json:
            # Only genuine parse failures are reported as such; file errors
            # fall through to the outer handler with an accurate message.
            print(f"JSON parsing error (subject {subject_id}): {e_json}")
        else:
            with open(f"parsed/subject_{subject_id}.json", "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2)
            # Extend only after the parsed file is written, so the in-memory
            # rows match what is on disk.
            all_subject.extend(data if isinstance(data, list) else [data])

    except Exception as e:
        # Best-effort: report the failure and move on to the next subject.
        print(f"Error with subject {subject_id}: {e}")

    # Throttle every iteration, including failed ones, so that an API error
    # (e.g. rate limiting) is not followed immediately by another request.
    time.sleep(1.5)

🧠 Generating subject 18...


KeyboardInterrupt: 

In [None]:
# Assemble all accumulated rows into a single DataFrame in one call.
df = pd.DataFrame(data=all_subject)

In [None]:
# Write the combined dataset to CSV in the working directory, leaving out the
# DataFrame index column.
df.to_csv(path_or_buf="synthetic_focus_dataset.csv", index=False)