In [1]:
# Import libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from dotenv import load_dotenv, find_dotenv
from redcap import Project #PyCap
import os

In [49]:
# Load the REDCap API URL and key from the .env file
load_dotenv(find_dotenv())
api_url = os.getenv('REDCAP_API_URL')
api_key = os.getenv('REDCAP_API_KEY')

# Fail fast with a clear message: os.getenv returns None for unset variables,
# which would otherwise only surface later as an obscure error when the
# REDCap client is created from these values.
missing_vars = [
    name
    for name, value in (('REDCAP_API_URL', api_url), ('REDCAP_API_KEY', api_key))
    if not value
]
if missing_vars:
    raise RuntimeError(
        "Missing REDCap settings: " + ", ".join(missing_vars)
        + ". Define them in a .env file or in the environment."
    )

In [52]:
# Create the PyCap client for the REDCap project (this line only builds the
# Project object from the URL and key; it does not export any records).
# NOTE(review): depending on the PyCap version the constructor may already
# contact the API to load project metadata — confirm.
project = Project(api_url, api_key)

In [60]:
# Build a synthetic (fake) patient dataset that will be uploaded to REDCap

# How many cases to generate
amount = 10_000

# Fix NumPy's global random seed so the generated data is reproducible
np.random.seed(seed=42)
# Helper function to generate random dates
def random_date(start_date, end_date):
    """Return a random date in [start_date, end_date) as a 'YYYY-MM-DD' string.

    A whole-day offset is drawn from NumPy's global random state
    (np.random.randint), so results are reproducible after np.random.seed.

    Parameters
    ----------
    start_date : datetime
        Earliest date that can be returned (inclusive).
    end_date : datetime
        Upper bound (exclusive). If it falls on the same day as start_date,
        start_date itself is returned.

    Returns
    -------
    str
        The drawn date formatted as 'YYYY-MM-DD'.

    Raises
    ------
    ValueError
        If end_date is earlier than start_date.
    """
    days_between = (end_date - start_date).days
    if days_between < 0:
        raise ValueError(
            f"end_date ({end_date}) must not be earlier than start_date ({start_date})"
        )
    if days_between == 0:
        # np.random.randint(0, 0) would raise "low >= high"; with less than one
        # full day between the bounds there is only one day to choose from.
        return start_date.strftime('%Y-%m-%d')
    # randint's upper bound is exclusive, so the offset is 0 .. days_between - 1
    random_days = int(np.random.randint(0, days_between))
    # Convert to a string in the format 'YYYY-MM-DD'
    return (start_date + timedelta(days=random_days)).strftime('%Y-%m-%d')

# Generate unique patient IDs (sampled without replacement from 0-99999)
patient_ids = np.random.choice(range(0, 100000), size=amount, replace=False)

# Generate random dates of birth between 1930-2000
dob_start = datetime(1930, 1, 1)
dob_end = datetime(2000, 1, 1)
dobs = [random_date(dob_start, dob_end) for _ in range(amount)]

# Generate random dates of death between 2000-2024
dod_start = datetime(2000, 1, 1)
dod_end = datetime(2024, 1, 1)
dods = [random_date(dod_start, dod_end) for _ in range(amount)]
# Set roughly 80% of patients to be alive (set dod to blanks)
dods = np.where(np.random.rand(amount) < 0.8, "", dods)

# Generate random diagnoses (random int between 1 and 5)
diagnoses = np.random.choice(range(1, 6), size=amount)

# Generate random sex (1 or 2)
sexes = np.random.choice([1, 2], size=amount)

# Generate random handedness (1, 2, or 3)
handedness = np.random.choice([1, 2, 3], size=amount)

# Generate random visit dates
# NOTE(review): visit dates are drawn independently of the birth and death
# dates, so a visit can fall before the birth date or after the death date.
# Fine for load testing; confirm if chronologically consistent data is needed.
visit_start = datetime(1990, 1, 1)
visit_end = datetime(2024, 1, 1)
visit_dates = [random_date(visit_start, visit_end) for _ in range(amount)]

# Generate random mmse scores (0-30)
# People with diagnosis 5 have the higher scores (28-30), whereas people
# with any other diagnosis have lower scores (0-27).
# Scores for the other diagnoses are drawn from a normal distribution
# (mean 15, SD 5) and clipped to 0-27 *before* being combined with the
# diagnosis-5 scores, so they can never land in the 28-30 band.
# NOTE(review): an earlier comment asked for a distribution centered around
# 20, but the code has always used a mean of 15 — confirm which is intended.
patient_scores = np.clip(np.random.normal(15, 5, amount), 0, 27)
control_scores = np.random.randint(28, 31, amount)
# astype(int) truncates the (non-negative) float scores to whole numbers
mmse_scores = np.where(diagnoses == 5, control_scores, patient_scores).astype(int)


# Create the dataframe
df = pd.DataFrame({
    'patient_id': patient_ids,
    'birth_date': dobs,
    'diagnosis': diagnoses,
    'sex': sexes,
    'handedness': handedness,
    'visit_date': visit_dates,
    'death_date': dods,
    'mmse_tot': mmse_scores
})

# Sort by patient_id for better readability
df = df.sort_values('patient_id').reset_index(drop=True)

# Convert all values to strings and randomly set a fraction of them
# (10% by default) to blanks, leaving the ID column(s) untouched:
def random_naner(df, frac=0.1, exclude=('patient_id',)):
    """Stringify columns and blank out a random fraction of each one.

    Every column not listed in ``exclude`` is converted to ``str`` and then
    has a freshly sampled ``frac`` of its rows replaced by the empty string.
    Rows are sampled independently per column with ``DataFrame.sample``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is changed IN PLACE and also returned.
    frac : float, default 0.1
        Fraction of rows to blank in each non-excluded column.
    exclude : str or iterable of str, default ('patient_id',)
        Column name(s) that are neither converted to strings nor blanked.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # A bare string would otherwise be treated as a substring test below
    if isinstance(exclude, str):
        exclude = (exclude,)
    excluded = set(exclude)

    for col in df.columns:
        if col in excluded:
            continue
        df[col] = df[col].astype(str)
        # Rows are selected by index label, so this assumes a unique index
        df.loc[df.sample(frac=frac).index, col] = ""
    return df

# Stringify the non-ID columns and blank out ~10% of the values in each.
# random_naner modifies df in place and returns the same object.
df = random_naner(df)

# Show the resulting frame
display(df)

Unnamed: 0,patient_id,birth_date,diagnosis,sex,handedness,visit_date,death_date,mmse_tot
0,23,,2,2,3,1998-02-12,,2
1,53,1989-04-04,4,2,1,1991-10-08,,9
2,70,1969-06-01,5,,3,1994-04-05,,28
3,87,1945-11-02,1,2,1,2010-10-10,,11
4,97,1974-05-19,4,2,3,,,13
...,...,...,...,...,...,...,...,...
9995,99943,1950-10-13,2,,1,2016-06-06,,15
9996,99954,1999-03-24,5,2,3,1995-01-22,2002-02-21,30
9997,99971,1990-08-27,2,,1,2011-11-07,,22
9998,99991,1930-05-16,5,1,1,2020-12-09,,28


In [63]:
# Import the generated data into REDCap:
# Convert the df to a list of dicts (one dict per row), which is the
# structure handed to PyCap's import_records below:
import_data = df.to_dict(orient='records')
# NOTE(review): overwrite='overwrite' presumably makes the blank strings clear
# any existing values in REDCap instead of being ignored — confirm in PyCap docs.
response = project.import_records(import_data, overwrite='overwrite')
# The response reports the number of imported records (e.g. {'count': ...})
print(response)

{'count': 10000}
