In [1]:
import json
import os
import random
import time

import pandas as pd

In [2]:
from dotenv import load_dotenv

In [3]:
from openai import OpenAI

In [4]:
# Location of the .env file to load. The default is the original hard-coded
# path; set FOCUS_DOTENV_PATH to run the notebook from another machine or
# checkout without editing this cell.
DOTENV_PATH = os.getenv(
    "FOCUS_DOTENV_PATH",
    "/Users/wan_aubel/Desktop/focus-detection/focus-detection-using-physiological-data/.env",
)
load_dotenv(dotenv_path=DOTENV_PATH)

True

In [5]:
# API key read from the environment; this is None when AUBEL_GROK_KEY is unset.
api_key = os.environ.get("AUBEL_GROK_KEY")

In [6]:
def build_prompt(subject_id):
    """Return the user prompt requesting 70 rows of synthetic data for one subject.

    Args:
        subject_id: Value substituted into the "Use subject_id = ... as a seed"
            line of the prompt.

    Returns:
        The complete prompt text, ending with a one-row JSON sample that
        illustrates the expected output format.
    """
    # Doubled braces are format escapes: they render as the literal { and }
    # surrounding the JSON sample row.
    prompt_template = """🎯 Objective:
Generate realistic physiological data for one synthetic subject across 70 rows (each row = 30 seconds of monitoring). Data should be biologically plausible and reflect both inter-subject and intra-subject variability.

Include a realistic distribution of attentional states (focused vs. not focused), reflecting transitions across meditation, stress, and amusement. Your goal is to simulate when and why focus increases or degrades, using signal patterns.

📌 Key Constraints:
- Each subject has a unique **baseline physiology**:
    - HR_baseline ∈ [55, 100] bpm (mean heart rate)
    - RESP_rate_baseline ∈ [12, 20] breaths/min
    - EDA_baseline ∈ [0.5, 2.5] µS
- Use subject_id = {subject_id} as a seed for variability

📊 Columns per row (generate all):
- ACC_x_mean, ACC_y_mean, ACC_z_mean
- net_acc_mean, net_acc_std
- EDA_mean, EDA_std, EDA_slope
- RESP_rate, RESP_regularity
- HR
- IBI, RMSSD, SDNN, pNN50
- lf/hf
- label (1 = baseline, 2 = stress, 3 = amusement, 4 = meditation)
- focus_label (0 = not focused, 1 = focused)
- subject (e.g., 18)

🧠 Signal Relationships (enforce them softly):
- HR ⬆ → IBI ⬇ (HR = 60000 / IBI)
- HR ⬆ → HRV ⬇ (lower RMSSD, SDNN, pNN50)
- RESP_regularity ⬆ in focused states
- EDA_std ⬇ and EDA_slope ≈ 0 in focused states
- lf/hf tends to be **lower** in focused states, **higher** under stress

🔍 Focus-specific patterns:
- In focused states (e.g., during meditation or deep task engagement):
    - RESP_regularity ↑
    - EDA_std ↓ and EDA_slope ≈ 0
    - HR slightly ↓
    - HRV metrics (SDNN, RMSSD) ↑ if relaxed, ↓ if intensely focused
    - lf/hf ↓ (except in meditation where it might rise slightly)

- In non-focused states (e.g., during stress or distraction):
    - HR ↑ and IBI ↓
    - HRV ↓
    - EDA_std ↑, EDA_slope fluctuates
    - RESP_rate ↑ and RESP_regularity ↓
    - lf/hf ↑

🧪 Label Notes:
- Ensure label is varied realistically among [1, 2, 3, 4]
- Assign focus_label = 1 during flow/meditation or high regularity, low noise
- Assign focus_label = 0 when stress or large variability in EDA, HRV

📎 Output Format:
Return a valid **JSON list of 70 dictionaries**, one per row. Do not include markdown, explanation, or formatting.

📌 Reminder:
Use the style/format of this sample below, but DO NOT copy values.
[
  {{
    "ACC_x_mean": -34.7,
    "ACC_y_mean": 26.4,
    "ACC_z_mean": 9.1,
    "net_acc_mean": 60.1,
    "net_acc_std": 9.5,
    "EDA_mean": 1.21,
    "EDA_std": 0.006,
    "EDA_slope": -0.000002,
    "RESP_rate": 19.5,
    "RESP_regularity": 2.8,
    "HR": 78.2,
    "IBI": 767.0,
    "RMSSD": 125.3,
    "SDNN": 90.2,
    "pNN50": 48.1,
    "lf/hf": 2.7,
    "label": 1,
    "subject": 18,
    "focus_label": 1
  }}
]
"""
    return prompt_template.format(subject_id=subject_id)


In [7]:
def safe_generate(subject_id, api_key, max_retries=3):
    """Request one subject's synthetic rows from the chat API, retrying on failure.

    Args:
        subject_id: Identifier passed to ``build_prompt`` and shown in log lines.
        api_key: Key given to the ``OpenAI`` client (base URL ``https://api.x.ai/v1``).
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The parsed JSON reply as a list (a non-list payload is wrapped in a
        one-element list), or ``[]`` when every attempt failed.
    """
    client = None
    for attempt in range(max_retries):
        try:
            # Build the client once and reuse it across retries. Construction
            # stays inside the try so a client error is still logged and retried.
            if client is None:
                client = OpenAI(api_key=api_key, base_url="https://api.x.ai/v1")
            response = client.chat.completions.create(
                model="grok-3-latest",
                temperature=0.7,
                messages=[
                    {"role": "system", "content": "You are a physiological data simulator generating synthetic data for a focus detection experiment."},
                    {"role": "user", "content": build_prompt(subject_id)}
                ]
            )
            json_text = (response.choices[0].message.content or "").strip()
            # Models sometimes wrap JSON in a Markdown fence despite the prompt;
            # drop the opening fence line and the closing fence before parsing.
            if json_text.startswith("```"):
                json_text = json_text.split("\n", 1)[1] if "\n" in json_text else ""
                json_text = json_text.rsplit("```", 1)[0]
            data = json.loads(json_text)
            return data if isinstance(data, list) else [data]
        except Exception as e:
            print(f"[Subject {subject_id}] Attempt {attempt+1} failed: {e}")
            # Jittered pause between attempts; skipped after the last one since
            # there is nothing left to wait for.
            if attempt < max_retries - 1:
                time.sleep(1.5 + random.uniform(0, 1.0))
    print(f"[Subject {subject_id}] Failed after {max_retries} retries.")
    return []

In [8]:
# Accumulator for the generated rows of every subject (filled by the loop cell).
all_subject = list()

In [9]:
# Make sure the output folder for the per-subject JSON files exists.
os.makedirs(name="parsed", exist_ok=True)

In [10]:
# Start from an empty accumulator so re-running this cell does not append a
# second copy of every subject to the list (and therefore to the CSV).
all_subject.clear()
failed_subjects = []

for subject_id in range(18, 32):
    print(f"Generating subject {subject_id}...")
    data = safe_generate(subject_id, api_key)
    if not data:
        # safe_generate returns [] once its retries are exhausted.
        failed_subjects.append(subject_id)
        continue
    all_subject.extend(data)
    # Keep a per-subject copy of the parsed reply next to the combined CSV.
    with open(f"parsed/subject_{subject_id}.json", "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)

# Report gaps so a partially generated dataset is not mistaken for a full one.
if failed_subjects:
    print(f"No data generated for subjects: {failed_subjects}")

pd.DataFrame(all_subject).to_csv("synthetic_focus_dataset.csv", index=False)

Generating subject 18...
Generating subject 19...
Generating subject 20...
Generating subject 21...
Generating subject 22...
Generating subject 23...
Generating subject 24...
Generating subject 25...
Generating subject 26...
Generating subject 27...
Generating subject 28...
Generating subject 29...
Generating subject 30...
Generating subject 31...
