In [5]:
import pandas as pd
import numpy as np

# Registry columns that the simulation reads for every sampled patient.
REQUIRED_REGISTRY_COLUMNS = ('RA_Severity_Score', 'Consent_Status')

# Fixed seed so that re-running this cell regenerates the same CSV.
RANDOM_SEED = 42


def simulate_crf_entries(registry, n_patients=100, visit_days=(0, 30, 60), seed=None):
    """Build simulated CRF (case report form) rows from a mock registry.

    For each simulated patient one registry row is drawn uniformly at random
    (the same registry row may be drawn for several simulated patients), and
    one CRF entry is produced per visit day.

    Args:
        registry: DataFrame with at least the columns listed in
            ``REQUIRED_REGISTRY_COLUMNS``.
        n_patients: Number of simulated patients.
        visit_days: Iterable of visit days; one entry is created per day.
        seed: Seed for ``numpy.random.default_rng``. Pass an int for
            reproducible output; ``None`` draws fresh entropy on each call.

    Returns:
        A list of dicts, ordered by patient and then by visit day, with keys
        ``Patient_ID``, ``Visit_Day``, ``RA_Severity_Score``,
        ``Symptom_Notes``, ``Drug_Dosage_mg`` and ``Consent_Status``.

    Raises:
        KeyError: If a required registry column is missing.
        ValueError: If patients are requested from an empty registry.
    """
    missing = [col for col in REQUIRED_REGISTRY_COLUMNS if col not in registry.columns]
    if missing:
        raise KeyError(f"Registry is missing required column(s): {missing}")
    if n_patients > 0 and len(registry) == 0:
        raise ValueError("Cannot simulate patients from an empty registry")

    # A local generator keeps the run reproducible without touching the
    # global numpy random state that other cells may rely on.
    rng = np.random.default_rng(seed)

    entries = []
    for i in range(n_patients):
        # Randomly pick a patient from the registry (with replacement).
        row = registry.iloc[rng.integers(len(registry))]
        for day in visit_days:
            entries.append({
                # Zero-padded sequential ID: RA000, RA001, RA002, ...
                'Patient_ID': f"RA{i:03d}",
                'Visit_Day': day,
                # Registry baseline shifted by an integer in [-2, 2],
                # clamped to the 0-10 range.
                'RA_Severity_Score': np.clip(
                    row['RA_Severity_Score'] + rng.integers(-2, 3), 0, 10
                ),
                'Symptom_Notes': rng.choice(['Stable', 'Improved', 'Worsened']),
                'Drug_Dosage_mg': rng.choice([10, 20, 30], p=[0.33, 0.33, 0.34]),
                # Consent is copied from the registry row, so it is constant
                # across all visits of one simulated patient.
                'Consent_Status': row['Consent_Status'],
            })
    return entries


# Notebook cells and directly executed scripts run with __name__ set to
# "__main__", so the I/O below still runs there but is skipped on import.
if __name__ == "__main__":
    # Load registry
    df = pd.read_csv('../data/ra_registry.csv')

    # Define visit days
    visit_days = [0, 30, 60]

    # Simulate 100 patients with 3 visits each
    crf_data = simulate_crf_entries(
        df, n_patients=100, visit_days=visit_days, seed=RANDOM_SEED
    )

    # Convert to DataFrame
    crf_df = pd.DataFrame(crf_data)

    # Save to CSV
    crf_df.to_csv('../data/crf_simulated.csv', index=False)

    # Preview
    print("✅ Simulated CRF entries:", crf_df.shape)
    print(crf_df.head())


✅ Simulated CRF entries: (300, 6)
  Patient_ID  Visit_Day  RA_Severity_Score Symptom_Notes  Drug_Dosage_mg  \
0      RA000          0                  7        Stable              20   
1      RA000         30                  8        Stable              30   
2      RA000         60                  5        Stable              20   
3      RA001          0                  7      Worsened              30   
4      RA001         30                  6        Stable              10   

  Consent_Status  
0            Yes  
1            Yes  
2            Yes  
3            Yes  
4            Yes  


**Ethics Note:**  
This CRF simulation is based on mock registry data and does not represent real patient records. Visit days are fixed (days 0, 30, and 60); severity scores are randomly perturbed around each sampled registry baseline, and symptom notes and drug dosage are assigned at random, all for educational purposes. Consent status is retained to reflect GCP-compliant trial structure. All outputs are documented for reproducibility and ethical transparency.
