In [1]:
import random
import csv
from pathlib import Path
from datetime import datetime, timedelta

# Predefined possible values for some fields
genders = ["Male", "Female"]
vaccination_doses = ["1st Dose", "2nd Dose", "Booster"]
conditions = ["None", "Hypertension", "Diabetes", "Heart Disease", "Asthma"]
us_locations = [
    "New York", "Los Angeles", "Chicago", "Houston", "Phoenix", 
    "Philadelphia", "San Antonio", "San Diego", "Dallas", "San Jose",
    "Austin", "Jacksonville", "Fort Worth", "Columbus", "Charlotte",
    "Indianapolis", "Seattle", "Denver", "Washington", "Boston"
]
blood_pressure_levels = ["Very Low", "Low", "Normal", "High", "Very High"]
cholesterol_levels = [180, 190, 200, 210, 220, 250, 275, 300, 350]
smoking_history = ["Yes", "No"]
diabetes_status = ["Yes", "No"]

# Helper function to generate random dates
def random_date(start_date, end_date):
    return start_date + timedelta(days=random.randint(0, (end_date - start_date).days))

# Helper function to generate random values for each field
def generate_random_data():
    # Randomly generate values for each field
    patient_id = str(random.randint(100, 999))
    age = str(random.randint(18, 90))  # Random age between 18 and 90
    gender = random.choice(genders)
    vaccination_date = random_date(datetime(2020, 1, 1), datetime(2023, 12, 31)).strftime("%Y-%m-%d")
    vaccine_dose = random.choice(vaccination_doses)
    pre_existing_conditions = random.choice(conditions)
    
    # 10% chance of having a heart attack
    heart_attack_date = ""  
    heart_attack_related_to_vaccine = False

    # 30% chance to have heart attack
    if random.random() < 0.3:  # 10% chance
        # 50% chance that the heart attack is related to vaccination
        heart_attack_related_to_vaccine = random.random() < 0.5
        
        if heart_attack_related_to_vaccine:
            # Heart attack occurs within 1-6 weeks after vaccination
            heart_attack_date = random_date(
                datetime.strptime(vaccination_date, "%Y-%m-%d"),
                datetime.strptime(vaccination_date, "%Y-%m-%d") + timedelta(weeks=6)
            ).strftime("%Y-%m-%d")
        else:
            # Random heart attack date
            heart_attack_date = random_date(datetime(2000, 1, 1), datetime(2023, 12, 31)).strftime("%Y-%m-%d")

    location = random.choice(us_locations)
    blood_pressure = random.choice(blood_pressure_levels)
    cholesterol_level = str(random.choice(cholesterol_levels))
    bmi = round(random.uniform(18.5, 40.0), 1)  # Random BMI between 18.5 and 40.0
    smoking_history_status = random.choice(smoking_history)
    diabetes_status_value = random.choice(diabetes_status)

    # Construct the random data dictionary
    random_data = {
        "Patient ID": patient_id,
        "Age": age,
        "Gender": gender,
        "Vaccination Date": vaccination_date,
        "Vaccine Dose": vaccine_dose,
        "Pre-existing Conditions": pre_existing_conditions,
        "Heart Attack Date": heart_attack_date,
        "Heart Attack Related to Vaccine": heart_attack_related_to_vaccine,
        "Location": location,
        "Blood Pressure": blood_pressure,
        "Cholesterol Level": cholesterol_level,
        "BMI": bmi,
        "Smoking History": smoking_history_status,
        "Diabetes Status": diabetes_status_value
    }

    return random_data

# Generate a list of random patient data
def generate_multiple_patients(num_patients=10000):
    patient_data_list = []
    for _ in range(num_patients):
        patient_data_list.append(generate_random_data())
    return patient_data_list

# Generate data for 10000 random patients
random_patients = generate_multiple_patients(10000)

# Define the CSV file path
csv_file_path = Path("../artifacts/data.csv")

# Define the CSV header (columns)
csv_header = [
    "Patient ID", "Age", "Gender", "Vaccination Date", "Vaccine Dose", 
    "Pre-existing Conditions", "Heart Attack Date", "Heart Attack Related to Vaccine",
    "Location", "Blood Pressure", "Cholesterol Level", "BMI", 
    "Smoking History", "Diabetes Status"
]

# Save the data to a CSV file
with open(csv_file_path, "w", newline="") as csvfile:
    csv_writer = csv.DictWriter(csvfile, fieldnames=csv_header)
    csv_writer.writeheader()  # Write the header
    csv_writer.writerows(random_patients)  # Write the patient data

print(f"10000 patient records generated with 10% heart attack condition and vaccine-heart attack relationship saved to '{csv_file_path}'")


10000 patient records generated with 10% heart attack condition and vaccine-heart attack relationship saved to '../artifacts/data.csv'


In [2]:
random_patients

[{'Patient ID': '879',
  'Age': '82',
  'Gender': 'Male',
  'Vaccination Date': '2022-06-06',
  'Vaccine Dose': '2nd Dose',
  'Pre-existing Conditions': 'Asthma',
  'Heart Attack Date': '',
  'Heart Attack Related to Vaccine': False,
  'Location': 'San Diego',
  'Blood Pressure': 'Very Low',
  'Cholesterol Level': '180',
  'BMI': 35.0,
  'Smoking History': 'No',
  'Diabetes Status': 'Yes'},
 {'Patient ID': '492',
  'Age': '48',
  'Gender': 'Female',
  'Vaccination Date': '2020-09-11',
  'Vaccine Dose': 'Booster',
  'Pre-existing Conditions': 'Heart Disease',
  'Heart Attack Date': '',
  'Heart Attack Related to Vaccine': False,
  'Location': 'Charlotte',
  'Blood Pressure': 'Low',
  'Cholesterol Level': '220',
  'BMI': 22.5,
  'Smoking History': 'No',
  'Diabetes Status': 'Yes'},
 {'Patient ID': '146',
  'Age': '80',
  'Gender': 'Male',
  'Vaccination Date': '2023-02-03',
  'Vaccine Dose': '1st Dose',
  'Pre-existing Conditions': 'Hypertension',
  'Heart Attack Date': '',
  'Heart Atta