## Patient Class

##### Final combination

In [1]:
import pandas as pd
import json
from collections import defaultdict

In [2]:
# ============ STEP 1: BUILD THE DICTIONARY ============
def build_patient_dictionary(heart_csv, micro_csv):
    """Build the core patient-admission dictionary structure."""
    
    df1 = pd.read_csv(heart_csv)
    df3 = pd.read_csv(micro_csv)
    
    print(f"Original microbiology shape: {df3.shape}")
    print(f"Original heart shape: {df1.shape}")
    
    # Filter microbiology
    df3_filtered = df3[df3['qc_flag'] != 'QC_FAIL'].copy()
    print(f"After removing QC_FAIL: {df3_filtered.shape}")
    
    patient_dict = {}
    heart_cols = df1.columns.tolist()
    micro_cols = df3_filtered.columns.tolist()
    demo_cols = ['gender', 'age', 'anchor_year', 'dod']
    
    # Process heart dataset
    print("\nProcessing heart dataset...")
    for idx, row in df1.iterrows():
        subject_id = row['subject_id']
        hadm_id = row['hadm_id']
        
        if subject_id not in patient_dict:
            patient_dict[subject_id] = {
                'demographics': {'subject_id': subject_id},
                'admissions': {}
            }
            for col in demo_cols:
                if col in heart_cols:
                    patient_dict[subject_id]['demographics'][col] = row[col]
        
        if hadm_id not in patient_dict[subject_id]['admissions']:
            patient_dict[subject_id]['admissions'][hadm_id] = {
                'hadm_id': hadm_id,
                'heart_notes': [],
                'microbiology': []
            }
        
        cols_to_exclude = ['note_type', 'note_seq', 'charttime'] + demo_cols + ['subject_id', 'hadm_id']
        heart_record = {col: row[col] for col in heart_cols if col not in cols_to_exclude}
        patient_dict[subject_id]['admissions'][hadm_id]['heart_notes'].append(heart_record)
    
    # Process microbiology dataset
    print("Processing microbiology dataset...")
    for idx, row in df3_filtered.iterrows():
        subject_id = row['subject_id']
        hadm_id = row['hadm_id']
        
        if subject_id not in patient_dict:
            patient_dict[subject_id] = {
                'demographics': {'subject_id': subject_id},
                'admissions': {}
            }
        
        if hadm_id not in patient_dict[subject_id]['admissions']:
            patient_dict[subject_id]['admissions'][hadm_id] = {
                'hadm_id': hadm_id,
                'heart_notes': [],
                'microbiology': []
            }
        
        micro_record = {col: row[col] for col in micro_cols if col not in ['subject_id', 'hadm_id']}
        patient_dict[subject_id]['admissions'][hadm_id]['microbiology'].append(micro_record)
    
    return patient_dict

In [3]:
# ============ STEP 2: PATIENT DATABASE CLASS ============
class PatientDatabase:
    """Wrapper class for patient dictionary with useful query methods."""
    
    def __init__(self, patient_dict):
        self.data = patient_dict
    
    def get_patient(self, subject_id):
        """Get all data for a specific patient."""
        return self.data.get(subject_id, None)
    
    def get_admission(self, subject_id, hadm_id):
        """Get data for a specific admission."""
        patient = self.get_patient(subject_id)
        if patient:
            return patient['admissions'].get(hadm_id, None)
        return None
    
    def get_demographics(self, subject_id):
        """Get demographics for a patient."""
        patient = self.get_patient(subject_id)
        return patient['demographics'] if patient else None
    
    def get_all_admissions(self, subject_id):
        """Get all admissions for a patient."""
        patient = self.get_patient(subject_id)
        return patient['admissions'] if patient else {}
    
    def search_patients(self, **criteria):
        """
        Search patients by criteria.
        Examples:
            search_patients(gender='F')
            search_patients(age_min=60, age_max=80)
            search_patients(has_microbiology=True)
        """
        results = []
        
        for subject_id, patient in self.data.items():
            match = True
            
            # Check demographics criteria
            if 'gender' in criteria:
                if patient['demographics'].get('gender') != criteria['gender']:
                    match = False
            
            if 'age_min' in criteria:
                age = patient['demographics'].get('age')
                if age is None or age < criteria['age_min']:
                    match = False
            
            if 'age_max' in criteria:
                age = patient['demographics'].get('age')
                if age is None or age > criteria['age_max']:
                    match = False
            
            # Check data availability criteria
            if 'has_heart' in criteria and criteria['has_heart']:
                has_heart = any(len(adm['heart_notes']) > 0 for adm in patient['admissions'].values())
                if not has_heart:
                    match = False
            
            if 'has_microbiology' in criteria and criteria['has_microbiology']:
                has_micro = any(len(adm['microbiology']) > 0 for adm in patient['admissions'].values())
                if not has_micro:
                    match = False
            
            if match:
                results.append(subject_id)
        
        return results
    
    def get_statistics(self):
        """Get comprehensive statistics about the database."""
        stats = {
            'total_patients': len(self.data),
            'total_admissions': 0,
            'patients_with_heart': 0,
            'patients_with_microbiology': 0,
            'patients_with_both': 0,
            'admissions_with_both': 0,
            'avg_admissions_per_patient': 0,
            'gender_distribution': defaultdict(int),
            'age_distribution': {'min': None, 'max': None, 'avg': 0}
        }
        
        total_age = 0
        age_count = 0
        ages = []
        
        for patient in self.data.values():
            num_admissions = len(patient['admissions'])
            stats['total_admissions'] += num_admissions
            
            has_heart = any(len(adm['heart_notes']) > 0 for adm in patient['admissions'].values())
            has_micro = any(len(adm['microbiology']) > 0 for adm in patient['admissions'].values())
            
            if has_heart:
                stats['patients_with_heart'] += 1
            if has_micro:
                stats['patients_with_microbiology'] += 1
            if has_heart and has_micro:
                stats['patients_with_both'] += 1
            
            for adm in patient['admissions'].values():
                if len(adm['heart_notes']) > 0 and len(adm['microbiology']) > 0:
                    stats['admissions_with_both'] += 1
            
            # Demographics
            gender = patient['demographics'].get('gender')
            if gender:
                stats['gender_distribution'][gender] += 1
            
            age = patient['demographics'].get('age')
            if age is not None:
                ages.append(age)
                total_age += age
                age_count += 1
        
        if stats['total_patients'] > 0:
            stats['avg_admissions_per_patient'] = stats['total_admissions'] / stats['total_patients']
        
        if ages:
            stats['age_distribution']['min'] = min(ages)
            stats['age_distribution']['max'] = max(ages)
            stats['age_distribution']['avg'] = total_age / age_count
        
        stats['gender_distribution'] = dict(stats['gender_distribution'])
        
        return stats
    
    def export_patient_list(self, subject_ids=None):
        """Export list of patients with key info."""
        if subject_ids is None:
            subject_ids = self.data.keys()
        
        patient_list = []
        for subject_id in subject_ids:
            if subject_id in self.data:
                patient = self.data[subject_id]
                patient_list.append({
                    'subject_id': subject_id,
                    'demographics': patient['demographics'],
                    'num_admissions': len(patient['admissions']),
                    'has_heart': any(len(adm['heart_notes']) > 0 for adm in patient['admissions'].values()),
                    'has_microbiology': any(len(adm['microbiology']) > 0 for adm in patient['admissions'].values())
                })
        
        return patient_list
    
    def save(self, filename='patient_database.json'):
        """Save database to JSON file."""
        with open(filename, 'w') as f:
            json_dict = {str(k): v for k, v in self.data.items()}
            json.dump(json_dict, f, indent=2, default=str)
        print(f"Database saved to '{filename}'")
    
    def get_patient_count(self):
        """Get total number of patients."""
        return len(self.data)
    
    def get_admission_count(self):
        """Get total number of admissions."""
        return sum(len(p['admissions']) for p in self.data.values())

In [4]:
# ============ USAGE ============
print("Building patient dictionary...")
patient_dict = build_patient_dictionary("heart_diagnoses_1.csv", "microbiology_events_codes_3.csv")

print("\nInitializing PatientDatabase...")
db = PatientDatabase(patient_dict)

print("\nPatientDatabase ready! Use db.method() to query the data.")

Building patient dictionary...
Original microbiology shape: (15587, 14)
Original heart shape: (4864, 25)
After removing QC_FAIL: (14808, 14)

Processing heart dataset...
Processing microbiology dataset...

Initializing PatientDatabase...

PatientDatabase ready! Use db.method() to query the data.


In [5]:
print("\n=== DATABASE STATISTICS ===")
stats = db.get_statistics()
for key, value in stats.items():
    print(f"{key}: {value}")

# Saving database
db.save('patient_database.json')


=== DATABASE STATISTICS ===
total_patients: 4679
total_admissions: 5151
patients_with_heart: 4392
patients_with_microbiology: 2572
patients_with_both: 2285
admissions_with_both: 2420
avg_admissions_per_patient: 1.1008762556101732
gender_distribution: {'F': 533, nan: 3128, 'M': 731}
age_distribution: {'min': 18.0, 'max': 95.0, 'avg': nan}
Database saved to 'patient_database.json'


In [6]:
# Specific patient (10000980)
patient = db.get_patient(10000980)
print(f"Patient data: {patient['demographics']}")

Patient data: {'subject_id': 10000980, 'gender': 'F', 'age': 75.0, 'anchor_year': 2186.0, 'dod': '2193-08-26'}


In [7]:
# Searching for female patients aged 60-80
female_seniors = db.search_patients(gender='F', age_min=60, age_max=80)
print(f"Found {len(female_seniors)} female patients aged 60-80")

Found 256 female patients aged 60-80


In [8]:
# Get patients with both datasets
complete_patients = db.search_patients(has_heart=True, has_microbiology=True)
print(f"Patients with complete data: {len(complete_patients)}")

Patients with complete data: 2285


In [9]:
# Export patient list
patient_list = db.export_patient_list(complete_patients[:10])
print(f"Exported {len(patient_list)} patients")

Exported 10 patients


In [10]:
all_patients = list(db.data.keys())

In [11]:
for patient_id, patient in db.data.items():
    print(f"\n===== Patient {patient_id} =====")
    
    admissions = patient.get("admissions", {})
    
    for hadm_id, adm in admissions.items():
        print(f"  Admission {hadm_id}:")
        
        # Loop through everything inside the admission
        for key, records in adm.items():
            # Skip if value is not a list (e.g., timestamps)
            if isinstance(records, list):
                print(f"    {key}: {len(records)} record(s)")



===== Patient 10000980 =====
  Admission 29654838:
    heart_notes: 1 record(s)
    microbiology: 0 record(s)
  Admission 26913865:
    heart_notes: 1 record(s)
    microbiology: 1 record(s)

===== Patient 10002013 =====
  Admission 24760295:
    heart_notes: 1 record(s)
    microbiology: 0 record(s)

===== Patient 10002155 =====
  Admission 23822395:
    heart_notes: 1 record(s)
    microbiology: 12 record(s)

===== Patient 10004457 =====
  Admission 28723315:
    heart_notes: 1 record(s)
    microbiology: 0 record(s)

===== Patient 10007058 =====
  Admission 22954658:
    heart_notes: 1 record(s)
    microbiology: 2 record(s)

===== Patient 10010424 =====
  Admission 28388172:
    heart_notes: 1 record(s)
    microbiology: 0 record(s)

===== Patient 10012343 =====
  Admission 27658045:
    heart_notes: 1 record(s)
    microbiology: 0 record(s)

===== Patient 10013569 =====
  Admission 22891949:
    heart_notes: 1 record(s)
    microbiology: 1 record(s)

===== Patient 10014651 =====


---

<hr>
<p style="font-size:60px; text-align:center;">FIN</p>
<hr>

---

## SafeGuard Code

In [12]:
import pandas as pd
import json

# Read the datasets
df3 = pd.read_csv("microbiology_events_codes_3.csv")
df1 = pd.read_csv("heart_diagnoses_1.csv")

print(f"Original microbiology shape: {df3.shape}")
print(f"Original heart shape: {df1.shape}")

# ============ MICROBIOLOGY PROCESSING ============
df3_filtered = df3[df3['qc_flag'] != 'QC_FAIL'].copy()
print(f"After removing QC_FAIL: {df3_filtered.shape}")

def create_subject_dict(df):
    """Creates dictionary with subject_id as keys, microbiology data as lists of lists."""
    subject_dict = {}
    grouped = df.groupby('subject_id')
    
    for subject_id, group in grouped:
        rows_as_lists = group.values.tolist()
        subject_dict[subject_id] = {
            "microbiology": rows_as_lists
        }
    
    return subject_dict

# Create the dictionary with microbiology data
microbiology_dict = create_subject_dict(df3_filtered)
print(f"Unique subjects in microbiology: {len(microbiology_dict)}")

# ============ HEART DATASET PROCESSING ============
columns_to_drop = ['note_type', 'note_seq', 'charttime']
existing_cols_to_drop = [col for col in columns_to_drop if col in df1.columns]
df1_processed = df1.drop(columns=existing_cols_to_drop)

print(f"Heart shape after dropping columns: {df1_processed.shape}")

def add_heart_to_dict(subject_dict, df):
    """Adds 'heart' data to existing subject_dict."""
    grouped = df.groupby('subject_id')
    
    for subject_id, group in grouped:
        rows_as_lists = group.values.tolist()
        
        if subject_id in subject_dict:
            subject_dict[subject_id]["heart"] = rows_as_lists
        else:
            subject_dict[subject_id] = {
                "heart": rows_as_lists
            }
    
    return subject_dict

# Add heart data to the dictionary
microbiology_dict = add_heart_to_dict(microbiology_dict, df1_processed)

# ============ SUMMARY ============
print(f"\nTotal unique subjects: {len(microbiology_dict)}")

subjects_with_both = sum(1 for sid, data in microbiology_dict.items() 
                         if "microbiology" in data and "heart" in data)
subjects_only_micro = sum(1 for sid, data in microbiology_dict.items() 
                          if "microbiology" in data and "heart" not in data)
subjects_only_heart = sum(1 for sid, data in microbiology_dict.items() 
                          if "heart" in data and "microbiology" not in data)

print(f"Subjects with both: {subjects_with_both}")
print(f"Subjects with only microbiology: {subjects_only_micro}")
print(f"Subjects with only heart: {subjects_only_heart}")

# # Example usage (commented):
# first_subject_id = list(microbiology_dict.keys())[0]
# print(f"\nExample - Subject ID: {first_subject_id}")
# print(f"Keys: {list(microbiology_dict[first_subject_id].keys())}")
# if "microbiology" in microbiology_dict[first_subject_id]:
#     print(f"Microbiology records: {len(microbiology_dict[first_subject_id]['microbiology'])}")
# if "heart" in microbiology_dict[first_subject_id]:
#     print(f"Heart records: {len(microbiology_dict[first_subject_id]['heart'])}")

# Save to JSON
with open('combined_dict.json', 'w') as f:
    json_dict = {str(k): v for k, v in microbiology_dict.items()}
    json.dump(json_dict, f, indent=2, default=str)
print("\nSaved to 'combined_dict.json'")

Original microbiology shape: (15587, 14)
Original heart shape: (4864, 25)
After removing QC_FAIL: (14808, 14)
Unique subjects in microbiology: 2572
Heart shape after dropping columns: (4864, 22)

Total unique subjects: 4679
Subjects with both: 2285
Subjects with only microbiology: 287
Subjects with only heart: 2107

Saved to 'combined_dict.json'
