In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os
import warnings
from pathlib import Path
import re
import pytz


In [2]:
print("Loading dataset...")
df = pd.read_csv('../data/patient_records.csv', low_memory=False)
print(f"Dataset loaded: {df.shape[0]:,} rows, {df.shape[1]} columns")

# Hungarian timezone
hungarian_tz = pytz.timezone('Europe/Budapest')

# Convert date columns with Hungarian timezone
# created: full datetime with time - localize to Hungarian timezone
# birth_date: date only - parse and localize at midnight
# ultrasound3_date: date/datetime - localize to Hungarian timezone
date_columns = ['created', 'birth_date', 'ultrasound3_date']
for col in date_columns:
    if col in df.columns:
        # Parse as datetime first
        df[col] = pd.to_datetime(df[col], errors='coerce')
        # Localize naive datetimes to Hungarian timezone
        # Note: If datetime is already timezone-aware, this will raise an error
        # so we check if it's naive first
        if df[col].dtype == 'datetime64[ns]':
            # Check if any non-null values exist
            if df[col].notna().any():
                # Localize naive datetimes to Hungarian timezone
                df[col] = df[col].dt.tz_localize(hungarian_tz, ambiguous='NaT', nonexistent='NaT')

# Parse time columns if they exist
time_columns = ['ultrasound4_time1', 'ultrasound5_time2']
for col in time_columns:
    if col in df.columns:
        # Try to parse as time, but keep as string if parsing fails
        # These might be time-only values, so we'll handle them separately if needed
        pass

# Calculate age if birth_date is available
# Age calculation works with timezone-aware datetimes
if 'birth_date' in df.columns and 'created' in df.columns:
    df['age'] = (df['created'] - df['birth_date']).dt.days / 365.25
    df['age'] = df['age'].round(1)

print(f"Data types: {df.dtypes.value_counts().to_dict()}\n")
print(f"Datetime columns timezone info:")
for col in date_columns:
    if col in df.columns:
        print(f"  {col}: {df[col].dtype}")

df.head()


Loading dataset...
Dataset loaded: 26,551 rows, 177 columns
Data types: {dtype('float64'): 120, dtype('O'): 55, datetime64[ns, Europe/Budapest]: 3}

Datetime columns timezone info:
  created: datetime64[ns, Europe/Budapest]
  birth_date: datetime64[ns, Europe/Budapest]
  ultrasound3_date: datetime64[ns, Europe/Budapest]


Unnamed: 0,created,birth_name,mothers_name,patient_name,birth_date,birth_place,clinic_name,mep,settlement,mep_region,...,extended_bp8_unknown,extended_bp9_unknown,extended_bp10_unknown,measurements_otoscope_data,measurements_diabetes_data,measurements_cov2_data,icd3_code,pid,taj_present,age
0,2024-07-09 12:27:18+02:00,Varga Medárd Rikárdó,Varga Dzsenifer,Varga Medárd Rikárdó,2019-02-04 00:00:00+01:00,Pécs,Drávaiványi rendelő,,Drávaiványi,Hirics MEP régió,...,,,,,,,A09,8c4a1e6788d9f0dbe446ae0878093a771c1f2d76035d61...,yes,5.4
1,2024-07-09 21:15:48+02:00,Kelemen Mária,Kürti Mária,Ferkó Istvánné,1955-06-11 00:00:00+02:00,Kiskunhalas,Nógrádszakál rendelő,,Nógrádszakál,Litke MEP régió,...,,,,,,,E74,279159d05b4d9db0bc548ac4db6a934cfbcc6715646876...,yes,69.1
2,2023-03-30 05:07:50+02:00,Kiss Sándor,Fehér Mária,Kiss Sándor,1971-07-09 00:00:00+01:00,Siklós,Hirics rendelő,,Hirics,Hirics MEP régió,...,,,,,,,I10,7c2a4b0d57e732af7252a5304c1f12718e0d063a199f3b...,yes,51.7
3,2023-03-08 11:13:58+01:00,,Szilágyi Ilona,Talabos Dávid Csabáné Jéri Edina,1996-05-01 00:00:00+02:00,Nyiregyháza,Nyírkáta rendelő,,Nyírkáta,Nyírkáta MEP régió,...,,,,,,,Z13,8cee43857a01ec5adfb64eba5bf1b57b19bbeb85090d14...,yes,26.9
4,2024-11-22 14:32:28+01:00,,Bedők Mária,Varjú Lászlóné,1947-07-24 00:00:00+02:00,Zebegény,Központ rendelő,,Központ,Egyéb,...,,,,,,,N30,241502f62bc78e65a13679ea5e193c8b9de3b932105847...,yes,77.3


In [3]:
# Drop duplicate rows
print(f"\nChecking for duplicate rows...")
print(f"Shape before removing duplicates: {df.shape}")

# Count duplicates
duplicate_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count:,}")

# Drop duplicates (keep first occurrence)
df = df.drop_duplicates()

print(f"Shape after removing duplicates: {df.shape}")
print(f"Rows removed: {duplicate_count:,}")



Checking for duplicate rows...
Shape before removing duplicates: (26551, 178)
Number of duplicate rows: 5
Shape after removing duplicates: (26546, 178)
Rows removed: 5


In [4]:
# Ensure created is valid datetime for every row
print(f"\nValidating 'created' datetime column...")
print(f"Shape before validation: {df.shape}")

# Check for invalid datetime values (NaT/null)
invalid_created = df['created'].isna().sum()
print(f"Rows with invalid 'created' datetime: {invalid_created:,}")

if invalid_created > 0:
    # Optionally show sample of invalid rows
    if invalid_created <= 10:
        print(f"\nRows with invalid 'created':")
        print(df[df['created'].isna()][['created', 'pid']].head())
    
    # Drop rows with invalid created datetime
    print(f"\nRows removed: {invalid_created:,}")
else:
    print("All rows have valid 'created' datetime ✓")

print(f"Shape after validation: {df.shape}")



Validating 'created' datetime column...
Shape before validation: (26546, 178)
Rows with invalid 'created' datetime: 0
All rows have valid 'created' datetime ✓
Shape after validation: (26546, 178)


In [5]:
# Parse bp_systolic_temp to extract numeric systolic BP values
print(f"\nParsing bp_systolic_temp to extract numeric values...")

if 'bp_systolic_temp' in df.columns:
    def extract_systolic_bp(value):
        """
        Extract systolic BP from bp_systolic_temp string.
        Handles formats like:
        - '110 / 76' -> 110 (extract first number)
        - '133' -> 133 (single number)
        - NaN -> NaN
        """
        if pd.isna(value):
            return np.nan
        
        # Convert to string if not already
        value_str = str(value).strip()
        
        # Check if it contains a slash (format: "SBP / DBP")
        if '/' in value_str:
            # Extract first number before the slash
            parts = value_str.split('/')
            if len(parts) > 0:
                first_part = parts[0].strip()
                try:
                    return float(first_part)
                except ValueError:
                    return np.nan
        else:
            # Single number, try to parse directly
            try:
                return float(value_str)
            except ValueError:
                return np.nan
    
    # Apply extraction function
    parsed_systolic = df['bp_systolic_temp'].apply(extract_systolic_bp)
    
    # Count how many values we can extract
    non_null_parsed = parsed_systolic.notna().sum()
    print(f"  Extracted {non_null_parsed:,} numeric systolic BP values from bp_systolic_temp")
    
    # Update bp_systolic_temp column with parsed numeric values
    df['bp_systolic_temp'] = parsed_systolic
    print(f"  Updated bp_systolic_temp column with parsed numeric values")
    
    # Fill missing values in bp_systolic where we have parsed values
    if 'bp_systolic' in df.columns:
        # Count how many missing bp_systolic values we can fill
        missing_bp_systolic = df['bp_systolic'].isna()
        can_fill = (missing_bp_systolic & parsed_systolic.notna()).sum()
        
        if can_fill > 0:
            df.loc[missing_bp_systolic & parsed_systolic.notna(), 'bp_systolic'] = \
                parsed_systolic[missing_bp_systolic & parsed_systolic.notna()]
            print(f"  Filled {can_fill:,} missing bp_systolic values from bp_systolic_temp")
        
        # Optionally, also update existing values if bp_systolic_temp is more complete
        # For now, we'll keep existing bp_systolic values and only fill missing ones
    else:
        # If bp_systolic doesn't exist, create it from parsed values
        df['bp_systolic'] = parsed_systolic
        print(f"  Created bp_systolic column from bp_systolic_temp")
    
    print(f"  bp_systolic now has {df['bp_systolic'].notna().sum():,} non-null values")
else:
    print("  bp_systolic_temp column not found, skipping parsing")



Parsing bp_systolic_temp to extract numeric values...
  Extracted 8,306 numeric systolic BP values from bp_systolic_temp
  Updated bp_systolic_temp column with parsed numeric values
  bp_systolic now has 8,306 non-null values


In [6]:
print(df['pid'].isna().sum())

38


In [7]:
## drop missing pid

df_seg = df[df['pid'].notna()].copy()

In [8]:
df_seg['pid'].isna().sum()

np.int64(0)

In [9]:
print(df_seg.columns)

Index(['created', 'birth_name', 'mothers_name', 'patient_name', 'birth_date',
       'birth_place', 'clinic_name', 'mep', 'settlement', 'mep_region',
       ...
       'extended_bp8_unknown', 'extended_bp9_unknown', 'extended_bp10_unknown',
       'measurements_otoscope_data', 'measurements_diabetes_data',
       'measurements_cov2_data', 'icd3_code', 'pid', 'taj_present', 'age'],
      dtype='object', length=178)


In [10]:
## drop columns not needed
columns_to_drop = [
"birth_name",
"mothers_name",
"patient_name",
"birth_date",
"birth_place",
"clinic_name",
"mep",
"settlement",
"icd_code_name",
"prescribed_medication",
"prescribed_medication_active_ingredient",
"prescribed_medication_ttt",
"visit_reason",
"doctor_stamp",
"doctor_name",
"screening_administrative",
"ultrasound_description",
"ultrasound1_area_code_id",
"ultrasound2_machine_code_id",
"ultrasound4_time1",
"ultrasound5_time2",
"referral_institution",
"referral_specialty",
"cv_screening1_unknown",
"cv_screening2_bp",
"cv_screening3_unknown",
"cv_screening4_unknown",
"cv_screening9_unknown",
"telemedicine1_unknown",
"telemedicine2_unknown",
"telemedicine3_unknown",
"telemedicine4_unknown",
"telemedicine5_unknown",
"telemedicine6_unknown",
"ekg1_description",
"ekg2_unknown",
"ekg3_unknown",
"vision1_unknown",
"vision2_unknown",
"vision3_unknown",
"vision4_unknown",
"vision5_unknown",
"vision6_unknown",
"vision7_unknown",
"vision8_unknown",
"vision9_unknown",
"vision10_unknown",
"vision11_unknown",
"vision12_unknown",
"vision13_unknown",
"vision14_unknown",
"vision15_unknown",
"vision16_unknown",
"vision17_unknown",
"vision18_unknown",
"vision19_unknown",
"vision20_unknown",
"vision21_unknown",
"vision22_unknown",
"vision23_unknown",
"vision24_unknown",
"vision25_unknown",
"vision26_unknown",
"vision27_unknown",
"vision28_unknown",
"vision29_unknown",
"vision30_unknown",
"vision31_unknown",
"vision32_unknown",
"vision33_unknown",
"vision34_unknown",
"vision35_unknown",
"vision36_unknown",
"vision37_unknown",
"vision38_unknown",
"vision39_unknown",
"vision40_unknown",
"vision41_unknown",
"vision42_unknown",
"vision43_unknown",
"vision44_unknown",
"vision45_unknown",
"vision46_unknown",
"vision47_unknown",
"vision48_unknown",
"vision49_unknown",
"vision50_unknown",
"vision51_unknown",
"vision52_unknown",
"vision53_unknown",
"vision54_unknown",
"vision55_unknown",
"vision56_unknown",
"vision57_unknown",
"vision58_unknown",
"vision59_unknown",
"vision60_unknown",
"vision61_unknown",
"vision62_unknown",
"vision63_unknown",
"vision64_unknown",
"vision65_unknown",
"vision66_unknown",
"vision67_unknown",
"vision68_unknown",
"vision69_unknown",
"vision70_unknown",
"vision71_unknown",
"vision72_unknown",
"vision73_unknown",
"vision74_unknown",
"vision75_unknown",
"vision76_unknown",
"vision77_unknown",
"vision78_unknown",
"vision79_unknown",
"vision80_unknown",
"vision81_unknown",
"vision82_unknown",
"vision83_unknown",
"vision84_unknown",
"vision85_unknown",
"vision86_unknown",
"vision87_unknown",
"vision88_unknown",
"vision89_unknown",
"vision90_unknown",
"vision91_unknown",
"vision93_unknown",
"vision94_unknown",
"vision95_unknown",
"vision96_unknown",
"children_dentistry1_unknown",
"children_dentistry2_unknown",
"osas1_unknown",
"osas2_unknown",
"measurements_respiratory_function_data",
"measurements_arteriography_data",
"extended_bp1_unknown",
"extended_bp2_unknown",
"extended_bp3_unknown",
"extended_bp4_unknown",
"extended_bp5_unknown",
"extended_bp6_unknown",
"extended_bp7_unknown",
"extended_bp8_unknown",
"extended_bp9_unknown",
"extended_bp10_unknown",
"measurements_otoscope_data",
"measurements_cov2_data",
"taj_present",
"measurements_diabetes_data"
]



In [11]:
## drop columns not needed

df_seg.drop(columns=columns_to_drop, axis=1,inplace=True)

In [12]:
df_seg.shape

(26508, 26)

In [13]:
df_seg.columns

Index(['created', 'mep_region', 'patient_gender', 'prescribed_medication_atc',
       'specialty_name', 'measurements_ultrasound_category',
       'ultrasound3_date', 'bp_systolic_temp', 'bp_systolic', 'bp_diastolic',
       'bp_systolic2', 'bp_diastolic2', 'pulse', 'cv_screening5_height',
       'cv_screening6_weight', 'cv_screening7_bmi',
       'cv_screening8_waist_circumference', 'physical1_height',
       'physical2_weight', 'physical3_bmi', 'physical4_waist_circumference',
       'pulse_oximetry1_saturation', 'pulse_oximetry2_pulse', 'icd3_code',
       'pid', 'age'],
      dtype='object')

In [14]:
df_seg.info()
df_seg.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 26508 entries, 0 to 26550
Data columns (total 26 columns):
 #   Column                             Non-Null Count  Dtype                          
---  ------                             --------------  -----                          
 0   created                            26508 non-null  datetime64[ns, Europe/Budapest]
 1   mep_region                         26508 non-null  object                         
 2   patient_gender                     26508 non-null  object                         
 3   prescribed_medication_atc          6681 non-null   object                         
 4   specialty_name                     26508 non-null  object                         
 5   measurements_ultrasound_category   4076 non-null   object                         
 6   ultrasound3_date                   3794 non-null   datetime64[ns, Europe/Budapest]
 7   bp_systolic_temp                   8294 non-null   float64                        
 8   bp_systolic

Unnamed: 0,bp_systolic_temp,bp_systolic,bp_diastolic,bp_systolic2,bp_diastolic2,pulse,cv_screening7_bmi,physical3_bmi,age
count,8294.0,8294.0,8229.0,4990.0,4885.0,8499.0,3242.0,6311.0,26506.0
mean,134.598987,134.598987,84.976303,133.658918,84.979939,79.660313,347.227,30.81549,48.294763
std,20.010447,20.010447,23.036153,21.72852,25.062968,12.773219,17914.33,126.198131,20.369219
min,77.0,77.0,4.0,1.0,2.0,7.0,11.46,1.46,0.0
25%,121.0,121.0,77.0,119.0,76.0,71.0,24.61,24.135,35.9
50%,133.0,133.0,84.0,132.0,84.0,79.0,28.84,28.48,51.0
75%,147.0,147.0,92.0,147.0,92.0,88.0,33.67,33.31,64.6
max,230.0,230.0,830.0,222.0,780.0,165.0,1020000.0,10000.0,96.3


In [15]:
df_seg['mep_region'].value_counts()


mep_region
Nyírkáta MEP régió     7053
Litke MEP régió        5326
Zalakomár MEP régió    4755
Szalonna MEP régió     4264
Hirics MEP régió       2837
Egyéb                  1717
Heves praxis            285
Máltai Iskolák          146
Ápolási intézmények     125
Name: count, dtype: int64

In [16]:
df_seg['patient_gender'].value_counts()

patient_gender
nő       18440
férfi     8068
Name: count, dtype: int64

In [17]:
df_seg['prescribed_medication_atc'].unique()



array([nan, 'J01XX01', 'S01AE05, S01AA12', ..., 'C08GA02',
       'C02AC06, C10AA07, C09DB04, C07AB12, C03BA11, C02CA04',
       'C10AA07, N02AJ14'], shape=(2160,), dtype=object)

In [18]:
# Parse cv_screening5_height to extract numeric height values
print(f"\nParsing cv_screening5_height to extract numeric values...")

if 'cv_screening5_height' in df_seg.columns:
    def extract_height(value):
        """
        Extract numeric height from cv_screening5_height string.
        Handles formats like:
        - '153' -> 153 (simple number)
        - '158,8' -> 158.8 (comma decimal separator - Hungarian format)
        - '165.0' -> 165.0 (dot decimal separator)
        - '108 76' -> 108 (multiple numbers, take first)
        - '\xa0173 ' -> 173 (whitespace/special characters, clean and extract)
        - NaN -> NaN
        """
        if pd.isna(value):
            return np.nan
        
        # Convert to string if not already and strip whitespace
        value_str = str(value).strip()
        
        # Remove any non-breaking spaces or special characters
        value_str = re.sub(r'[\xa0\u200b\u200c\u200d\u2060]', '', value_str)
        value_str = value_str.strip()
        
        # Handle multiple numbers separated by space (take first)
        if ' ' in value_str:
            parts = value_str.split()
            if len(parts) > 0:
                value_str = parts[0]
        
        # Replace comma decimal separator with dot (Hungarian format: 158,8 -> 158.8)
        value_str = value_str.replace(',', '.')
        
        # Try to parse as float
        try:
            return float(value_str)
        except ValueError:
            return np.nan
    
    # Apply extraction function to all values
    parsed_height = df_seg['cv_screening5_height'].apply(extract_height)
    
    # Count how many values we can extract
    non_null_parsed = parsed_height.notna().sum()
    print(f"  Extracted {non_null_parsed:,} numeric height values from cv_screening5_height")
    
    # Replace the entire column with parsed numeric values
    # This ensures ALL string values are converted to numbers (NaN for unparseable)
    df_seg['cv_screening5_height'] = parsed_height.astype('float64')
    
    print(f"  Updated cv_screening5_height column with parsed numeric values (dtype: {df_seg['cv_screening5_height'].dtype})")
    
    # Fill missing values in physical1_height if it exists and we have parsed values
    if 'physical1_height' in df_seg.columns:
        missing_physical_height = df_seg['physical1_height'].isna()
        can_fill = (missing_physical_height & parsed_height.notna()).sum()
        
        if can_fill > 0:
            df_seg.loc[missing_physical_height & parsed_height.notna(), 'physical1_height'] = \
                parsed_height[missing_physical_height & parsed_height.notna()]
            print(f"  Filled {can_fill:,} missing physical1_height values from cv_screening5_height")
    
    print(f"  cv_screening5_height now has {df_seg['cv_screening5_height'].notna().sum():,} non-null values")
    print(f"  Final dtype: {df_seg['cv_screening5_height'].dtype}")
else:
    print("  cv_screening5_height column not found, skipping parsing")



Parsing cv_screening5_height to extract numeric values...
  Extracted 3,241 numeric height values from cv_screening5_height
  Updated cv_screening5_height column with parsed numeric values (dtype: float64)
  Filled 28 missing physical1_height values from cv_screening5_height
  cv_screening5_height now has 3,241 non-null values
  Final dtype: float64


In [19]:
df_seg['cv_screening8_waist_circumference'].unique()

array([nan, '98.0', '109.0', '100.0', '107.0', '121.0', '93.0', '70.0',
       '94.0', '73.0', '106.0', '97.0', '96.0', '112.0', '90.0', '95.0',
       '130.0', '83.0', '129.0', '82.0', '91.0', '105.0', '110.0', '92.0',
       '76.0', '67.0', '99.0', '115.0', '89.0', '111.0', '84.0', '77.0',
       '88.0', '101.0', '119.0', '104.0', '85.0', '116.0', '87.0', '79.0',
       '80.0', '108.0', '132.0', '103.0', '72.0', '102.0', '124.0',
       '118.0', '62.0', '136.0', '74.0', '135.0', '113.0', '68.0', '81.0',
       '117.0', '122.0', '71.0', '123.0', '86.0', '69.0', '120.0',
       '114.0', '134.0', '60.0', '78.0', '140.0', '128.0', '127.0',
       '126.0', '57.0', '75.0', '147.0', '148.0', '139.0', '133.0',
       '131.0', '66.0', '141.0', '63.0', '96', '105', '86', '115', '123',
       '136', '121', '103', '94', '84', '130', '91', '81', '98', '80',
       '111', '97', '87', '85', '109', '72', '88', '92', '104', '112',
       '116', '108', '110', '118', '89', '113', '107', '100', '95', '9

In [20]:
# Parse cv_screening6_weight to extract numeric weight values
print(f"\nParsing cv_screening6_weight to extract numeric values...")

if 'cv_screening6_weight' in df_seg.columns:
    def extract_weight(value):
        """
        Extract numeric weight from cv_screening6_weight string.
        Handles formats like:
        - '68' -> 68 (simple number)
        - '76,5' -> 76.5 (comma decimal separator - Hungarian format)
        - '80.3' -> 80.3 (dot decimal separator)
        - '62,5 ' -> 62.5 (whitespace/special characters, clean and extract)
        - '\xa0104,3' -> 104.3 (non-breaking space, clean and extract)
        - '5300' or '5700' -> NaN (likely data entry errors, too high for kg)
        - NaN -> NaN
        """
        if pd.isna(value):
            return np.nan
        
        # Convert to string if not already and strip whitespace
        value_str = str(value).strip()
        
        # Remove any non-breaking spaces or special characters
        value_str = re.sub(r'[\xa0\u200b\u200c\u200d\u2060]', '', value_str)
        value_str = value_str.strip()
        
        # Handle multiple numbers separated by space (take first)
        if ' ' in value_str:
            parts = value_str.split()
            if len(parts) > 0:
                value_str = parts[0]
        
        # Replace comma decimal separator with dot (Hungarian format: 76,5 -> 76.5)
        value_str = value_str.replace(',', '.')
        
        # Try to parse as float
        try:
            weight = float(value_str)
            # Filter out obvious data entry errors (weights > 1000 kg are likely in grams, not kg)
            if weight > 1000:
                return np.nan
            return weight
        except ValueError:
            return np.nan
    
    # Apply extraction function to all values
    parsed_weight = df_seg['cv_screening6_weight'].apply(extract_weight)
    
    # Count how many values we can extract
    non_null_parsed = parsed_weight.notna().sum()
    print(f"  Extracted {non_null_parsed:,} numeric weight values from cv_screening6_weight")
    
    # Replace the entire column with parsed numeric values
    # This ensures ALL string values are converted to numbers (NaN for unparseable)
    df_seg['cv_screening6_weight'] = parsed_weight.astype('float64')
    
    print(f"  Updated cv_screening6_weight column with parsed numeric values (dtype: {df_seg['cv_screening6_weight'].dtype})")
    
    # Fill missing values in physical2_weight if it exists and we have parsed values
    if 'physical2_weight' in df_seg.columns:
        missing_physical_weight = df_seg['physical2_weight'].isna()
        can_fill = (missing_physical_weight & parsed_weight.notna()).sum()
        
        if can_fill > 0:
            df_seg.loc[missing_physical_weight & parsed_weight.notna(), 'physical2_weight'] = \
                parsed_weight[missing_physical_weight & parsed_weight.notna()]
            print(f"  Filled {can_fill:,} missing physical2_weight values from cv_screening6_weight")
    
    print(f"  cv_screening6_weight now has {df_seg['cv_screening6_weight'].notna().sum():,} non-null values")
    print(f"  Final dtype: {df_seg['cv_screening6_weight'].dtype}")
else:
    print("  cv_screening6_weight column not found, skipping parsing")



Parsing cv_screening6_weight to extract numeric values...
  Extracted 3,230 numeric weight values from cv_screening6_weight
  Updated cv_screening6_weight column with parsed numeric values (dtype: float64)
  Filled 752 missing physical2_weight values from cv_screening6_weight
  cv_screening6_weight now has 3,230 non-null values
  Final dtype: float64


In [21]:
df_seg['cv_screening8_waist_circumference'].unique()

array([nan, '98.0', '109.0', '100.0', '107.0', '121.0', '93.0', '70.0',
       '94.0', '73.0', '106.0', '97.0', '96.0', '112.0', '90.0', '95.0',
       '130.0', '83.0', '129.0', '82.0', '91.0', '105.0', '110.0', '92.0',
       '76.0', '67.0', '99.0', '115.0', '89.0', '111.0', '84.0', '77.0',
       '88.0', '101.0', '119.0', '104.0', '85.0', '116.0', '87.0', '79.0',
       '80.0', '108.0', '132.0', '103.0', '72.0', '102.0', '124.0',
       '118.0', '62.0', '136.0', '74.0', '135.0', '113.0', '68.0', '81.0',
       '117.0', '122.0', '71.0', '123.0', '86.0', '69.0', '120.0',
       '114.0', '134.0', '60.0', '78.0', '140.0', '128.0', '127.0',
       '126.0', '57.0', '75.0', '147.0', '148.0', '139.0', '133.0',
       '131.0', '66.0', '141.0', '63.0', '96', '105', '86', '115', '123',
       '136', '121', '103', '94', '84', '130', '91', '81', '98', '80',
       '111', '97', '87', '85', '109', '72', '88', '92', '104', '112',
       '116', '108', '110', '118', '89', '113', '107', '100', '95', '9

In [22]:
# Parse cv_screening8_waist_circumference to extract numeric waist circumference values
print(f"\nParsing cv_screening8_waist_circumference to extract numeric values...")

if 'cv_screening8_waist_circumference' in df_seg.columns:
    def extract_waist_circumference(value):
        """
        Extract numeric waist circumference from cv_screening8_waist_circumference string.
        Handles formats like:
        - '96' -> 96 (simple number)
        - '98.0' -> 98.0 (dot decimal separator)
        - '82,5' -> 82.5 (comma decimal separator - Hungarian format)
        - '106 ' -> 106 (whitespace, clean and extract)
        - '103\xa0' -> 103 (non-breaking space, clean and extract)
        - 'gravidaként nem mértünk' -> NaN (text values indicating not measured)
        - NaN -> NaN
        """
        if pd.isna(value):
            return np.nan
        
        # Convert to string if not already and strip whitespace
        value_str = str(value).strip()
        
        # Remove any non-breaking spaces or special characters
        value_str = re.sub(r'[\xa0\u200b\u200c\u200d\u2060]', '', value_str)
        value_str = value_str.strip()
        
        # Check if it's a text value indicating not measured (e.g., "gravidaként nem mértünk")
        # Convert to lowercase for comparison
        value_lower = value_str.lower()
        if any(keyword in value_lower for keyword in ['nem', 'not', 'mért', 'measured', 'n/a', 'na']):
            return np.nan
        
        # Handle multiple numbers separated by space (take first)
        if ' ' in value_str:
            parts = value_str.split()
            if len(parts) > 0:
                value_str = parts[0]
        
        # Replace comma decimal separator with dot (Hungarian format: 82,5 -> 82.5)
        value_str = value_str.replace(',', '.')
        
        # Try to parse as float
        try:
            return float(value_str)
        except ValueError:
            return np.nan
    
    # Apply extraction function to all values
    parsed_waist = df_seg['cv_screening8_waist_circumference'].apply(extract_waist_circumference)
    
    # Count how many values we can extract
    non_null_parsed = parsed_waist.notna().sum()
    print(f"  Extracted {non_null_parsed:,} numeric waist circumference values from cv_screening8_waist_circumference")
    
    # Replace the entire column with parsed numeric values
    # This ensures ALL string values are converted to numbers (NaN for unparseable)
    df_seg['cv_screening8_waist_circumference'] = parsed_waist.astype('float64')
    
    print(f"  Updated cv_screening8_waist_circumference column with parsed numeric values (dtype: {df_seg['cv_screening8_waist_circumference'].dtype})")
    
    # Fill missing values in physical4_waist_circumference if it exists and we have parsed values
    if 'physical4_waist_circumference' in df_seg.columns:
        missing_physical_waist = df_seg['physical4_waist_circumference'].isna()
        can_fill = (missing_physical_waist & parsed_waist.notna()).sum()
        
        if can_fill > 0:
            df_seg.loc[missing_physical_waist & parsed_waist.notna(), 'physical4_waist_circumference'] = \
                parsed_waist[missing_physical_waist & parsed_waist.notna()]
            print(f"  Filled {can_fill:,} missing physical4_waist_circumference values from cv_screening8_waist_circumference")
    
    print(f"  cv_screening8_waist_circumference now has {df_seg['cv_screening8_waist_circumference'].notna().sum():,} non-null values")
    print(f"  Final dtype: {df_seg['cv_screening8_waist_circumference'].dtype}")
else:
    print("  cv_screening8_waist_circumference column not found, skipping parsing")



Parsing cv_screening8_waist_circumference to extract numeric values...


  Extracted 3,027 numeric waist circumference values from cv_screening8_waist_circumference
  Updated cv_screening8_waist_circumference column with parsed numeric values (dtype: float64)
  Filled 1,067 missing physical4_waist_circumference values from cv_screening8_waist_circumference
  cv_screening8_waist_circumference now has 3,027 non-null values
  Final dtype: float64


In [23]:
df_seg.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26508 entries, 0 to 26550
Data columns (total 26 columns):
 #   Column                             Non-Null Count  Dtype                          
---  ------                             --------------  -----                          
 0   created                            26508 non-null  datetime64[ns, Europe/Budapest]
 1   mep_region                         26508 non-null  object                         
 2   patient_gender                     26508 non-null  object                         
 3   prescribed_medication_atc          6681 non-null   object                         
 4   specialty_name                     26508 non-null  object                         
 5   measurements_ultrasound_category   4076 non-null   object                         
 6   ultrasound3_date                   3794 non-null   datetime64[ns, Europe/Budapest]
 7   bp_systolic_temp                   8294 non-null   float64                        
 8   bp_systolic

In [24]:
##show all unique values pf physical2_weight
# Display all unique values without truncation
unique_values = df_seg['pulse_oximetry2_pulse'].unique()
print(f"Total unique values: {len(unique_values)}")
print("\nAll unique values:")
# Convert to list and print to avoid numpy array truncation
print(list(unique_values))

Total unique values: 131

All unique values:
[nan, '66', '100', '64', '75', '80', '71', '74', '79', '90', '63', '89', '70', '56', '87', '82', '129', '65', '97', '76', '66/min', '110', '130', '85', '54', '72', '101', '58', '62', '120', '108', '84', '92', '60', '95', '81', '98', '78', '83', '67', '77', '61', '91', '86', '88', '104', '68', '51', '50', '105', '57', '106', '69', '59', '73', '93', '36.8', '96', '138', '52', '116', '99', '79.0', '97.0', '78.0', '96.0', '55.0', '88.0', '102.0', '87.0', '76.0', '112.0', '84.0', '89.0', '95.0', '9064.0', '71.0', '64.0', '83.0', '98.0', '80.0', '68.0', '107.0', '86.0', '53.0', '70.0', '67.0', '62.0', '77.0', '118.0', '85.0', '92.0', '82.0', '94.0', '91.0', '73.0', '50.0', '72.0', '58.0', '66.0', '115.0', '100.0', '101.0', '81.0', '65.0', '104.0', '59.0', '60.0', '90.0', '103.0', '63.0', '74.0', '52.0', '108.0', '54.0', '99.0', '75.0', '61.0', '169.0', '69.0', '110.0', '57.0', '47.0', '105.0', '93.0', '51.0', '116.0', '120.0', '106.0', '56.0', '12

In [25]:
# Parse physical1_height to extract numeric height values
print(f"\nParsing physical1_height to extract numeric values...")

if 'physical1_height' in df_seg.columns:
    def extract_physical_height(value):
        """
        Extract numeric height from physical1_height string.
        Handles formats like:
        - '153' -> 153 (simple number)
        - 155.0 -> 155.0 (already numeric)
        - '162,5' -> 162.5 (comma decimal separator - Hungarian format)
        - '162.5' -> 162.5 (dot decimal separator)
        - '153 ' -> 153 (whitespace, clean and extract)
        - '\xa0173 ' -> 173 (non-breaking space, clean and extract)
        - '108 76' -> 108 (multiple numbers, take first)
        - '138 cm' -> 138 (remove unit)
        - '160.-' -> 160 (remove trailing dash)
        - 'nem mérhető' -> NaN (text indicating not measurable)
        - NaN -> NaN
        """
        if pd.isna(value):
            return np.nan
        
        # If already numeric, return as is
        if isinstance(value, (int, float)):
            return float(value)
        
        # Convert to string if not already and strip whitespace
        value_str = str(value).strip()
        
        # Remove any non-breaking spaces or special characters
        value_str = re.sub(r'[\xa0\u200b\u200c\u200d\u2060]', '', value_str)
        value_str = value_str.strip()
        
        # Check if it's a text value indicating not measurable (e.g., "nem mérhető")
        value_lower = value_str.lower()
        if any(keyword in value_lower for keyword in ['nem', 'not', 'mérhető', 'measurable', 'n/a', 'na']):
            return np.nan
        
        # Remove units (cm, etc.)
        value_str = re.sub(r'\s*(cm|kg|g)\s*$', '', value_str, flags=re.IGNORECASE)
        value_str = value_str.strip()
        
        # Remove trailing dashes or dots (e.g., '160.-' -> '160')
        value_str = re.sub(r'[.-]+$', '', value_str)
        value_str = value_str.strip()
        
        # Handle multiple numbers separated by space (take first)
        if ' ' in value_str:
            parts = value_str.split()
            if len(parts) > 0:
                value_str = parts[0]
        
        # Replace comma decimal separator with dot (Hungarian format: 162,5 -> 162.5)
        value_str = value_str.replace(',', '.')
        
        # Try to parse as float
        try:
            return float(value_str)
        except ValueError:
            return np.nan
    
    # Apply extraction function to all values
    parsed_height = df_seg['physical1_height'].apply(extract_physical_height)
    
    # Count how many values we can extract
    non_null_parsed = parsed_height.notna().sum()
    print(f"  Extracted {non_null_parsed:,} numeric height values from physical1_height")
    
    # Replace the entire column with parsed numeric values
    # This ensures ALL string values are converted to numbers (NaN for unparseable)
    df_seg['physical1_height'] = parsed_height.astype('float64')
    
    print(f"  Updated physical1_height column with parsed numeric values (dtype: {df_seg['physical1_height'].dtype})")
    
    # Fill missing values in cv_screening5_height if it exists and we have parsed values
    if 'cv_screening5_height' in df_seg.columns:
        missing_cv_height = df_seg['cv_screening5_height'].isna()
        can_fill = (missing_cv_height & parsed_height.notna()).sum()
        
        if can_fill > 0:
            df_seg.loc[missing_cv_height & parsed_height.notna(), 'cv_screening5_height'] = \
                parsed_height[missing_cv_height & parsed_height.notna()]
            print(f"  Filled {can_fill:,} missing cv_screening5_height values from physical1_height")
    
    print(f"  physical1_height now has {df_seg['physical1_height'].notna().sum():,} non-null values")
    print(f"  Final dtype: {df_seg['physical1_height'].dtype}")
else:
    print("  physical1_height column not found, skipping parsing")



Parsing physical1_height to extract numeric values...
  Extracted 6,343 numeric height values from physical1_height
  Updated physical1_height column with parsed numeric values (dtype: float64)
  Filled 3,102 missing cv_screening5_height values from physical1_height
  physical1_height now has 6,343 non-null values
  Final dtype: float64


In [26]:
df_seg['physical1_height'].unique()

array([  nan, 153. , 167. , 168. , 164. , 161. , 160. , 152. , 155. ,
       156. , 183. , 171. , 150. , 165. , 134. , 157. , 176. , 175. ,
       162. , 177. , 159. , 163. , 154. , 169. , 170. ,  91. , 110. ,
       146. , 112. , 131. , 102. , 180. , 172. , 158. , 137. , 124. ,
       173. , 119. ,  68. , 149. , 120. , 101. , 143. , 130. , 116. ,
       118. , 114. , 185. , 144. , 108. , 178. , 151. , 145. , 186. ,
       174. , 166. , 122. ,  64. , 182. ,  67. , 135. ,  78. , 142. ,
       140. , 181. , 147. , 148. , 184. , 179. , 187. , 141. , 192. ,
       109. , 100. , 188. , 196. ,  99. , 198. , 197. , 162.5, 189. ,
       138. ,   1. , 195. , 185.5,  86. , 191. ,  80. , 160.5, 177.5,
       113. ,  72. ,  63. ,  83. ,  88.7, 105. , 117. ,  98. , 135.5,
        93. , 133. , 129. , 190. , 158.8, 194. ,  53. , 127. , 165.5,
       128. , 155.5, 139. , 167.5, 174.5, 136. ,  62. ,  71. , 103. ,
       183.5, 173.5,  74. ,  84. ,  96. , 104. , 178.5, 107. ,  42.5,
       193. , 132. ,

In [27]:
# Parse physical2_weight to extract numeric weight values
print(f"\nParsing physical2_weight to extract numeric values...")

if 'physical2_weight' in df_seg.columns:
    def extract_physical_weight(value):
        """
        Extract numeric weight from physical2_weight string.
        Handles formats like:
        - '68' -> 68 (simple number)
        - 65.0 -> 65.0 (already numeric)
        - '76,5' -> 76.5 (comma decimal separator - Hungarian format)
        - '76.5' -> 76.5 (dot decimal separator)
        - '62,5 ' -> 62.5 (whitespace, clean and extract)
        - '44.7\xa0' -> 44.7 (non-breaking space, clean and extract)
        - '\xa0104,3' -> 104.3 (non-breaking space at start)
        - '31,8 kg' -> 31.8 (remove unit)
        - '50,5 kg' -> 50.5 (remove unit)
        - '54,7kg' -> 54.7 (remove unit, no space)
        - '5300', '6190', '5700' -> NaN (likely data entry errors, too high for kg)
        - 'nem mérhető' -> NaN (text indicating not measurable)
        - NaN -> NaN
        """
        if pd.isna(value):
            return np.nan
        
        # If already numeric, return as is
        if isinstance(value, (int, float)):
            return float(value)
        
        # Convert to string if not already and strip whitespace
        value_str = str(value).strip()
        
        # Remove any non-breaking spaces or special characters
        value_str = re.sub(r'[\xa0\u200b\u200c\u200d\u2060]', '', value_str)
        value_str = value_str.strip()
        
        # Check if it's a text value indicating not measurable (e.g., "nem mérhető")
        value_lower = value_str.lower()
        if any(keyword in value_lower for keyword in ['nem', 'not', 'mérhető', 'measurable', 'n/a', 'na']):
            return np.nan
        
        # Remove units (kg, g, etc.) - handle both with and without space
        value_str = re.sub(r'\s*(kg|g)\s*$', '', value_str, flags=re.IGNORECASE)
        value_str = value_str.strip()
        
        # Handle multiple numbers separated by space (take first)
        if ' ' in value_str:
            parts = value_str.split()
            if len(parts) > 0:
                value_str = parts[0]
        
        # Replace comma decimal separator with dot (Hungarian format: 76,5 -> 76.5)
        value_str = value_str.replace(',', '.')
        
        # Try to parse as float
        try:
            weight = float(value_str)
            # Filter out obvious data entry errors (weights > 1000 kg are likely in grams, not kg)
            if weight > 1000:
                return np.nan
            return weight
        except ValueError:
            return np.nan
    
    # Apply extraction function to all values
    parsed_weight = df_seg['physical2_weight'].apply(extract_physical_weight)
    
    # Count how many values we can extract
    non_null_parsed = parsed_weight.notna().sum()
    print(f"  Extracted {non_null_parsed:,} numeric weight values from physical2_weight")
    
    # Replace the entire column with parsed numeric values
    # This ensures ALL string values are converted to numbers (NaN for unparseable)
    df_seg['physical2_weight'] = parsed_weight.astype('float64')
    
    print(f"  Updated physical2_weight column with parsed numeric values (dtype: {df_seg['physical2_weight'].dtype})")
    
    # Fill missing values in cv_screening6_weight if it exists and we have parsed values
    if 'cv_screening6_weight' in df_seg.columns:
        missing_cv_weight = df_seg['cv_screening6_weight'].isna()
        can_fill = (missing_cv_weight & parsed_weight.notna()).sum()
        
        if can_fill > 0:
            df_seg.loc[missing_cv_weight & parsed_weight.notna(), 'cv_screening6_weight'] = \
                parsed_weight[missing_cv_weight & parsed_weight.notna()]
            print(f"  Filled {can_fill:,} missing cv_screening6_weight values from physical2_weight")
    
    print(f"  physical2_weight now has {df_seg['physical2_weight'].notna().sum():,} non-null values")
    print(f"  Final dtype: {df_seg['physical2_weight'].dtype}")
else:
    print("  physical2_weight column not found, skipping parsing")



Parsing physical2_weight to extract numeric values...
  Extracted 6,539 numeric weight values from physical2_weight
  Updated physical2_weight column with parsed numeric values (dtype: float64)
  Filled 3,309 missing cv_screening6_weight values from physical2_weight
  physical2_weight now has 6,539 non-null values
  Final dtype: float64


In [28]:
# Parse physical4_waist_circumference to extract numeric waist circumference values
print(f"\nParsing physical4_waist_circumference to extract numeric values...")

if 'physical4_waist_circumference' in df_seg.columns:
    def extract_physical_waist(value):
        """
        Extract numeric waist circumference from physical4_waist_circumference string.
        Handles formats like:
        - '98' -> 98 (simple number)
        - 98.0 -> 98.0 (already numeric)
        - '82,5' -> 82.5 (comma decimal separator - Hungarian format)
        - '106,5' -> 106.5 (comma decimal)
        - '106 ' -> 106 (whitespace, clean and extract)
        - ' 112' -> 112 (leading whitespace)
        - '103\xa0' -> 103 (non-breaking space, clean and extract)
        - '113\xa0' -> 113 (non-breaking space)
        - 'gravidaként nem mértünk' -> NaN (text indicating not measured)
        - NaN -> NaN
        """
        if pd.isna(value):
            return np.nan
        
        # If already numeric, return as is
        if isinstance(value, (int, float)):
            return float(value)
        
        # Convert to string if not already and strip whitespace
        value_str = str(value).strip()
        
        # Remove any non-breaking spaces or special characters
        value_str = re.sub(r'[\xa0\u200b\u200c\u200d\u2060]', '', value_str)
        value_str = value_str.strip()
        
        # Check if it's a text value indicating not measured (e.g., "gravidaként nem mértünk")
        value_lower = value_str.lower()
        if any(keyword in value_lower for keyword in ['nem', 'not', 'mért', 'measured', 'n/a', 'na', 'gravidaként']):
            return np.nan
        
        # Handle multiple numbers separated by space (take first)
        if ' ' in value_str:
            parts = value_str.split()
            if len(parts) > 0:
                value_str = parts[0]
        
        # Replace comma decimal separator with dot (Hungarian format: 82,5 -> 82.5)
        value_str = value_str.replace(',', '.')
        
        # Try to parse as float
        try:
            return float(value_str)
        except ValueError:
            return np.nan
    
    # Apply extraction function to all values
    parsed_waist = df_seg['physical4_waist_circumference'].apply(extract_physical_waist)
    
    # Count how many values we can extract
    non_null_parsed = parsed_waist.notna().sum()
    print(f"  Extracted {non_null_parsed:,} numeric waist circumference values from physical4_waist_circumference")
    
    # Replace the entire column with parsed numeric values
    # This ensures ALL string values are converted to numbers (NaN for unparseable)
    df_seg['physical4_waist_circumference'] = parsed_waist.astype('float64')
    
    print(f"  Updated physical4_waist_circumference column with parsed numeric values (dtype: {df_seg['physical4_waist_circumference'].dtype})")
    
    # Fill missing values in cv_screening8_waist_circumference if it exists and we have parsed values
    if 'cv_screening8_waist_circumference' in df_seg.columns:
        missing_cv_waist = df_seg['cv_screening8_waist_circumference'].isna()
        can_fill = (missing_cv_waist & parsed_waist.notna()).sum()
        
        if can_fill > 0:
            df_seg.loc[missing_cv_waist & parsed_waist.notna(), 'cv_screening8_waist_circumference'] = \
                parsed_waist[missing_cv_waist & parsed_waist.notna()]
            print(f"  Filled {can_fill:,} missing cv_screening8_waist_circumference values from physical4_waist_circumference")
    
    print(f"  physical4_waist_circumference now has {df_seg['physical4_waist_circumference'].notna().sum():,} non-null values")
    print(f"  Final dtype: {df_seg['physical4_waist_circumference'].dtype}")
else:
    print("  physical4_waist_circumference column not found, skipping parsing")



Parsing physical4_waist_circumference to extract numeric values...
  Extracted 5,538 numeric waist circumference values from physical4_waist_circumference
  Updated physical4_waist_circumference column with parsed numeric values (dtype: float64)
  Filled 2,511 missing cv_screening8_waist_circumference values from physical4_waist_circumference
  physical4_waist_circumference now has 5,538 non-null values
  Final dtype: float64


In [29]:
# Parse pulse_oximetry1_saturation to extract numeric saturation values
print(f"\nParsing pulse_oximetry1_saturation to extract numeric values...")

if 'pulse_oximetry1_saturation' in df_seg.columns:
    def extract_saturation(value):
        """
        Extract numeric oxygen saturation from pulse_oximetry1_saturation string.
        Handles formats like:
        - '99' -> 99 (simple number)
        - '95%' -> 95 (remove percent sign)
        - '97%' -> 97 (remove percent sign)
        - ' 97' -> 97 (whitespace, clean and extract)
        - '9886' -> NaN (data entry error, too high)
        - '101', '104' -> NaN (above 100%, likely errors)
        - NaN -> NaN
        """
        if pd.isna(value):
            return np.nan
        
        # Convert to string if not already and strip whitespace
        value_str = str(value).strip()
        
        # Remove percent signs
        value_str = value_str.replace('%', '')
        value_str = value_str.strip()
        
        # Handle multiple numbers separated by space (take first)
        if ' ' in value_str:
            parts = value_str.split()
            if len(parts) > 0:
                value_str = parts[0]
        
        # Try to parse as float
        try:
            saturation = float(value_str)
            # Filter out obvious errors:
            # - Values > 100 (saturation can't exceed 100%)
            # - Values > 1000 (likely data entry errors like '9886')
            if saturation > 100:
                return np.nan
            return saturation
        except ValueError:
            return np.nan
    
    # Apply extraction function to all values
    parsed_saturation = df_seg['pulse_oximetry1_saturation'].apply(extract_saturation)
    
    # Count how many values we can extract
    non_null_parsed = parsed_saturation.notna().sum()
    print(f"  Extracted {non_null_parsed:,} numeric saturation values from pulse_oximetry1_saturation")
    
    # Replace the entire column with parsed numeric values
    # This ensures ALL string values are converted to numbers (NaN for unparseable)
    df_seg['pulse_oximetry1_saturation'] = parsed_saturation.astype('float64')
    
    print(f"  Updated pulse_oximetry1_saturation column with parsed numeric values (dtype: {df_seg['pulse_oximetry1_saturation'].dtype})")
    print(f"  pulse_oximetry1_saturation now has {df_seg['pulse_oximetry1_saturation'].notna().sum():,} non-null values")
    print(f"  Final dtype: {df_seg['pulse_oximetry1_saturation'].dtype}")
else:
    print("  pulse_oximetry1_saturation column not found, skipping parsing")



Parsing pulse_oximetry1_saturation to extract numeric values...
  Extracted 2,687 numeric saturation values from pulse_oximetry1_saturation
  Updated pulse_oximetry1_saturation column with parsed numeric values (dtype: float64)
  pulse_oximetry1_saturation now has 2,687 non-null values
  Final dtype: float64


In [30]:
# Parse pulse_oximetry2_pulse to extract numeric pulse values
print(f"\nParsing pulse_oximetry2_pulse to extract numeric values...")

if 'pulse_oximetry2_pulse' in df_seg.columns:
    def extract_pulse(value):
        """
        Extract numeric pulse rate from pulse_oximetry2_pulse string.
        Handles formats like:
        - '66' -> 66 (simple number)
        - 79.0 -> 79.0 (already numeric)
        - '66/min' -> 66 (remove unit)
        - '9064.0' -> NaN (data entry error, too high for pulse rate)
        - NaN -> NaN
        """
        if pd.isna(value):
            return np.nan
        
        # If already numeric, return as is
        if isinstance(value, (int, float)):
            pulse = float(value)
            # Filter out obvious errors (pulse > 220 bpm based on outlier clipping range)
            if pulse > 220:
                return np.nan
            return pulse
        
        # Convert to string if not already and strip whitespace
        value_str = str(value).strip()
        
        # Remove units (/min, /minute, bpm, etc.)
        value_str = re.sub(r'\s*(/min|/minute|bpm|/h|/hr)\s*$', '', value_str, flags=re.IGNORECASE)
        value_str = value_str.strip()
        
        # Handle multiple numbers separated by space (take first)
        if ' ' in value_str:
            parts = value_str.split()
            if len(parts) > 0:
                value_str = parts[0]
        
        # Try to parse as float
        try:
            pulse = float(value_str)
            # Filter out obvious errors (pulse > 220 bpm based on outlier clipping range)
            if pulse > 220:
                return np.nan
            return pulse
        except ValueError:
            return np.nan
    
    # Apply extraction function to all values
    parsed_pulse = df_seg['pulse_oximetry2_pulse'].apply(extract_pulse)
    
    # Count how many values we can extract
    non_null_parsed = parsed_pulse.notna().sum()
    print(f"  Extracted {non_null_parsed:,} numeric pulse values from pulse_oximetry2_pulse")
    
    # Replace the entire column with parsed numeric values
    # This ensures ALL string values are converted to numbers (NaN for unparseable)
    df_seg['pulse_oximetry2_pulse'] = parsed_pulse.astype('float64')
    
    print(f"  Updated pulse_oximetry2_pulse column with parsed numeric values (dtype: {df_seg['pulse_oximetry2_pulse'].dtype})")
    
    # Fill missing values in pulse column if it exists and we have parsed values
    if 'pulse' in df_seg.columns:
        missing_pulse = df_seg['pulse'].isna()
        can_fill = (missing_pulse & parsed_pulse.notna()).sum()
        
        if can_fill > 0:
            df_seg.loc[missing_pulse & parsed_pulse.notna(), 'pulse'] = \
                parsed_pulse[missing_pulse & parsed_pulse.notna()]
            print(f"  Filled {can_fill:,} missing pulse values from pulse_oximetry2_pulse")
    
    print(f"  pulse_oximetry2_pulse now has {df_seg['pulse_oximetry2_pulse'].notna().sum():,} non-null values")
    print(f"  Final dtype: {df_seg['pulse_oximetry2_pulse'].dtype}")
else:
    print("  pulse_oximetry2_pulse column not found, skipping parsing")



Parsing pulse_oximetry2_pulse to extract numeric values...
  Extracted 1,553 numeric pulse values from pulse_oximetry2_pulse
  Updated pulse_oximetry2_pulse column with parsed numeric values (dtype: float64)
  Filled 102 missing pulse values from pulse_oximetry2_pulse
  pulse_oximetry2_pulse now has 1,553 non-null values
  Final dtype: float64


In [31]:
df_seg.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26508 entries, 0 to 26550
Data columns (total 26 columns):
 #   Column                             Non-Null Count  Dtype                          
---  ------                             --------------  -----                          
 0   created                            26508 non-null  datetime64[ns, Europe/Budapest]
 1   mep_region                         26508 non-null  object                         
 2   patient_gender                     26508 non-null  object                         
 3   prescribed_medication_atc          6681 non-null   object                         
 4   specialty_name                     26508 non-null  object                         
 5   measurements_ultrasound_category   4076 non-null   object                         
 6   ultrasound3_date                   3794 non-null   datetime64[ns, Europe/Budapest]
 7   bp_systolic_temp                   8294 non-null   float64                        
 8   bp_systolic

In [32]:
# Drop rows where icd3_code is empty
print(f"\nDropping rows with empty icd3_code...")
print(f"Shape before dropping: {df_seg.shape}")

if 'icd3_code' in df_seg.columns:
    # Count rows with empty/missing icd3_code
    empty_icd3 = df_seg['icd3_code'].isna().sum()
    empty_icd3_str = (df_seg['icd3_code'] == '').sum()
    total_empty = empty_icd3 + empty_icd3_str
    
    print(f"Rows with missing/empty icd3_code: {total_empty:,} (NaN: {empty_icd3:,}, empty string: {empty_icd3_str:,})")
    
    if total_empty > 0:
        # Drop rows where icd3_code is NaN or empty string
        df_seg = df_seg[(df_seg['icd3_code'].notna()) & (df_seg['icd3_code'] != '')].copy()
        print(f"Rows removed: {total_empty:,}")
    else:
        print("No rows with empty icd3_code found ✓")
else:
    print("icd3_code column not found, skipping")

print(f"Shape after dropping: {df_seg.shape}")



Dropping rows with empty icd3_code...
Shape before dropping: (26508, 26)
Rows with missing/empty icd3_code: 14 (NaN: 14, empty string: 0)
Rows removed: 14
Shape after dropping: (26494, 26)


In [33]:
# Outlier clipping with QA flags
print(f"\nOutlier clipping with QA flags...")
print(f"Shape before clipping: {df_seg.shape}")

# Define clipping ranges for each measurement type
clipping_config = {
    'SBP': {'columns': ['bp_systolic', 'bp_systolic2', 'bp_systolic_temp'], 'min': 70, 'max': 260},
    'DBP': {'columns': ['bp_diastolic', 'bp_diastolic2'], 'min': 40, 'max': 150},
    'Pulse': {'columns': ['pulse', 'pulse_oximetry2_pulse'], 'min': 30, 'max': 220},
    'Height': {'columns': ['cv_screening5_height', 'physical1_height'], 'min': 120, 'max': 220},
    'Weight': {'columns': ['cv_screening6_weight', 'physical2_weight'], 'min': 30, 'max': 300},
    'BMI': {'columns': ['cv_screening7_bmi', 'physical3_bmi'], 'min': 10, 'max': 80}
}

# Track clipping statistics
clipping_stats = {}

# Process each measurement type
for measurement, config in clipping_config.items():
    measurement_clipped = 0
    measurement_clipped_min = 0
    measurement_clipped_max = 0
    
    for col in config['columns']:
        if col in df_seg.columns:
            # Convert column to numeric first (handles string values, converts to NaN if not numeric)
            df_seg[col] = pd.to_numeric(df_seg[col], errors='coerce')
            
            # Count values outside range (excluding NaN)
            valid_mask = df_seg[col].notna()
            if valid_mask.any():
                below_min = (df_seg[col] < config['min']) & valid_mask
                above_max = (df_seg[col] > config['max']) & valid_mask
                
                clipped_min_count = below_min.sum()
                clipped_max_count = above_max.sum()
                
                # Clip values to boundaries (only for valid numeric values)
                df_seg.loc[valid_mask, col] = np.clip(df_seg.loc[valid_mask, col], config['min'], config['max'])
                
                # Create QA flag column
                flag_col = f'{col}_clipped'
                df_seg[flag_col] = False
                
                # Mark clipped values in flag column
                if clipped_min_count > 0:
                    df_seg.loc[below_min, flag_col] = True
                    measurement_clipped_min += clipped_min_count
                
                if clipped_max_count > 0:
                    df_seg.loc[above_max, flag_col] = True
                    measurement_clipped_max += clipped_max_count
                
                total_clipped = clipped_min_count + clipped_max_count
                measurement_clipped += total_clipped
                
                if total_clipped > 0:
                    print(f"  {col}: {total_clipped:,} values clipped ({clipped_min_count:,} below min, {clipped_max_count:,} above max)")
    
    if measurement_clipped > 0:
        clipping_stats[measurement] = {
            'total': measurement_clipped,
            'min': measurement_clipped_min,
            'max': measurement_clipped_max
        }

print(f"\nClipping summary:")
for measurement, stats in clipping_stats.items():
    print(f"  {measurement}: {stats['total']:,} values clipped ({stats['min']:,} to min, {stats['max']:,} to max)")

if not clipping_stats:
    print("  No values required clipping ✓")

print(f"\nShape after clipping: {df_seg.shape}")
print(f"New QA flag columns created: {sum(1 for col in df_seg.columns if col.endswith('_clipped'))}")



Outlier clipping with QA flags...
Shape before clipping: (26494, 26)
  bp_systolic2: 10 values clipped (10 below min, 0 above max)
  bp_diastolic: 22 values clipped (14 below min, 8 above max)
  bp_diastolic2: 12 values clipped (6 below min, 6 above max)
  pulse: 2 values clipped (2 below min, 0 above max)
  cv_screening5_height: 75 values clipped (75 below min, 0 above max)
  physical1_height: 75 values clipped (75 below min, 0 above max)
  cv_screening6_weight: 134 values clipped (134 below min, 0 above max)
  physical2_weight: 134 values clipped (134 below min, 0 above max)
  cv_screening7_bmi: 3 values clipped (0 below min, 3 above max)
  physical3_bmi: 4 values clipped (1 below min, 3 above max)

Clipping summary:
  SBP: 10 values clipped (10 to min, 0 to max)
  DBP: 34 values clipped (20 to min, 14 to max)
  Pulse: 2 values clipped (2 to min, 0 to max)
  Height: 150 values clipped (150 to min, 0 to max)
  Weight: 268 values clipped (268 to min, 0 to max)
  BMI: 7 values clipped 

In [34]:
# Canonicalize per encounter - create new canonical columns with source tracking
print(f"\nCanonicalizing vital signs per encounter...")
print(f"Shape before canonicalization: {df_seg.shape}")

# SBP priority: bp_systolic → bp_systolic2 → bp_systolic_temp
df_seg['sbp_enc'] = np.nan
df_seg['sbp_enc_source'] = ''

if all(col in df_seg.columns for col in ['bp_systolic', 'bp_systolic2', 'bp_systolic_temp']):
    # Priority 1: bp_systolic
    mask1 = df_seg['bp_systolic'].notna()
    df_seg.loc[mask1, 'sbp_enc'] = df_seg.loc[mask1, 'bp_systolic']
    df_seg.loc[mask1, 'sbp_enc_source'] = 'bp_systolic'
    
    # Priority 2: bp_systolic2 (if still missing)
    mask2 = df_seg['sbp_enc'].isna() & df_seg['bp_systolic2'].notna()
    df_seg.loc[mask2, 'sbp_enc'] = df_seg.loc[mask2, 'bp_systolic2']
    df_seg.loc[mask2, 'sbp_enc_source'] = 'bp_systolic2'
    
    # Priority 3: bp_systolic_temp (if still missing)
    mask3 = df_seg['sbp_enc'].isna() & df_seg['bp_systolic_temp'].notna()
    df_seg.loc[mask3, 'sbp_enc'] = df_seg.loc[mask3, 'bp_systolic_temp']
    df_seg.loc[mask3, 'sbp_enc_source'] = 'bp_systolic_temp'
    
    print(f"  SBP: {mask1.sum():,} from bp_systolic, {mask2.sum():,} from bp_systolic2, {mask3.sum():,} from bp_systolic_temp")

# DBP priority: bp_diastolic → bp_diastolic2
df_seg['dbp_enc'] = np.nan
df_seg['dbp_enc_source'] = ''

if all(col in df_seg.columns for col in ['bp_diastolic', 'bp_diastolic2']):
    # Priority 1: bp_diastolic
    mask1 = df_seg['bp_diastolic'].notna()
    df_seg.loc[mask1, 'dbp_enc'] = df_seg.loc[mask1, 'bp_diastolic']
    df_seg.loc[mask1, 'dbp_enc_source'] = 'bp_diastolic'
    
    # Priority 2: bp_diastolic2 (if still missing)
    mask2 = df_seg['dbp_enc'].isna() & df_seg['bp_diastolic2'].notna()
    df_seg.loc[mask2, 'dbp_enc'] = df_seg.loc[mask2, 'bp_diastolic2']
    df_seg.loc[mask2, 'dbp_enc_source'] = 'bp_diastolic2'
    
    print(f"  DBP: {mask1.sum():,} from bp_diastolic, {mask2.sum():,} from bp_diastolic2")

# Pulse priority: pulse → pulse_oximetry2_pulse
df_seg['pulse_enc'] = np.nan
df_seg['pulse_enc_source'] = ''

if all(col in df_seg.columns for col in ['pulse', 'pulse_oximetry2_pulse']):
    # Priority 1: pulse
    mask1 = df_seg['pulse'].notna()
    df_seg.loc[mask1, 'pulse_enc'] = df_seg.loc[mask1, 'pulse']
    df_seg.loc[mask1, 'pulse_enc_source'] = 'pulse'
    
    # Priority 2: pulse_oximetry2_pulse (if still missing)
    mask2 = df_seg['pulse_enc'].isna() & df_seg['pulse_oximetry2_pulse'].notna()
    df_seg.loc[mask2, 'pulse_enc'] = df_seg.loc[mask2, 'pulse_oximetry2_pulse']
    df_seg.loc[mask2, 'pulse_enc_source'] = 'pulse_oximetry2_pulse'
    
    print(f"  Pulse: {mask1.sum():,} from pulse, {mask2.sum():,} from pulse_oximetry2_pulse")

# Height priority: cv_screening5_height → physical1_height (in cm)
df_seg['height_cm_enc'] = np.nan
df_seg['height_cm_enc_source'] = ''

if all(col in df_seg.columns for col in ['cv_screening5_height', 'physical1_height']):
    # Priority 1: cv_screening5_height
    mask1 = df_seg['cv_screening5_height'].notna()
    df_seg.loc[mask1, 'height_cm_enc'] = df_seg.loc[mask1, 'cv_screening5_height']
    df_seg.loc[mask1, 'height_cm_enc_source'] = 'cv_screening5_height'
    
    # Priority 2: physical1_height (if still missing)
    mask2 = df_seg['height_cm_enc'].isna() & df_seg['physical1_height'].notna()
    df_seg.loc[mask2, 'height_cm_enc'] = df_seg.loc[mask2, 'physical1_height']
    df_seg.loc[mask2, 'height_cm_enc_source'] = 'physical1_height'
    
    print(f"  Height: {mask1.sum():,} from cv_screening5_height, {mask2.sum():,} from physical1_height")

# Weight priority: cv_screening6_weight → physical2_weight (in kg)
df_seg['weight_kg_enc'] = np.nan
df_seg['weight_kg_enc_source'] = ''

if all(col in df_seg.columns for col in ['cv_screening6_weight', 'physical2_weight']):
    # Priority 1: cv_screening6_weight
    mask1 = df_seg['cv_screening6_weight'].notna()
    df_seg.loc[mask1, 'weight_kg_enc'] = df_seg.loc[mask1, 'cv_screening6_weight']
    df_seg.loc[mask1, 'weight_kg_enc_source'] = 'cv_screening6_weight'
    
    # Priority 2: physical2_weight (if still missing)
    mask2 = df_seg['weight_kg_enc'].isna() & df_seg['physical2_weight'].notna()
    df_seg.loc[mask2, 'weight_kg_enc'] = df_seg.loc[mask2, 'physical2_weight']
    df_seg.loc[mask2, 'weight_kg_enc_source'] = 'physical2_weight'
    
    print(f"  Weight: {mask1.sum():,} from cv_screening6_weight, {mask2.sum():,} from physical2_weight")

# BMI priority: cv_screening7_bmi → physical3_bmi → (recompute if both missing but height+weight exist)
df_seg['bmi_enc'] = np.nan
df_seg['bmi_enc_source'] = ''

if all(col in df_seg.columns for col in ['cv_screening7_bmi', 'physical3_bmi']):
    # Priority 1: cv_screening7_bmi
    mask1 = df_seg['cv_screening7_bmi'].notna()
    df_seg.loc[mask1, 'bmi_enc'] = df_seg.loc[mask1, 'cv_screening7_bmi']
    df_seg.loc[mask1, 'bmi_enc_source'] = 'cv_screening7_bmi'
    
    # Priority 2: physical3_bmi (if still missing)
    mask2 = df_seg['bmi_enc'].isna() & df_seg['physical3_bmi'].notna()
    df_seg.loc[mask2, 'bmi_enc'] = df_seg.loc[mask2, 'physical3_bmi']
    df_seg.loc[mask2, 'bmi_enc_source'] = 'physical3_bmi'
    
    # Priority 3: Recompute from height and weight (if both missing but height+weight exist)
    mask3 = (df_seg['bmi_enc'].isna() & 
             df_seg['height_cm_enc'].notna() & 
             df_seg['weight_kg_enc'].notna())
    
    if mask3.sum() > 0:
        # BMI = weight (kg) / (height (m))^2
        # Height is in cm, so convert to meters: height_m = height_cm / 100
        df_seg.loc[mask3, 'bmi_enc'] = (
            df_seg.loc[mask3, 'weight_kg_enc'] / 
            (df_seg.loc[mask3, 'height_cm_enc'] / 100) ** 2
        )
        df_seg.loc[mask3, 'bmi_enc_source'] = 'computed_from_height_weight'
    
    print(f"  BMI: {mask1.sum():,} from cv_screening7_bmi, {mask2.sum():,} from physical3_bmi, {mask3.sum():,} recomputed")

# Create basic quality flags (True if value exists, False if missing)
df_seg['sbp_enc_quality'] = df_seg['sbp_enc'].notna()
df_seg['dbp_enc_quality'] = df_seg['dbp_enc'].notna()
df_seg['pulse_enc_quality'] = df_seg['pulse_enc'].notna()
df_seg['height_cm_enc_quality'] = df_seg['height_cm_enc'].notna()
df_seg['weight_kg_enc_quality'] = df_seg['weight_kg_enc'].notna()
df_seg['bmi_enc_quality'] = df_seg['bmi_enc'].notna()

print(f"\nShape after canonicalization: {df_seg.shape}")
print(f"Canonical columns created:")
print(f"  - sbp_enc: {df_seg['sbp_enc'].notna().sum():,} non-null values")
print(f"  - dbp_enc: {df_seg['dbp_enc'].notna().sum():,} non-null values")
print(f"  - pulse_enc: {df_seg['pulse_enc'].notna().sum():,} non-null values")
print(f"  - height_cm_enc: {df_seg['height_cm_enc'].notna().sum():,} non-null values")
print(f"  - weight_kg_enc: {df_seg['weight_kg_enc'].notna().sum():,} non-null values")
print(f"  - bmi_enc: {df_seg['bmi_enc'].notna().sum():,} non-null values")



Canonicalizing vital signs per encounter...
Shape before canonicalization: (26494, 39)
  SBP: 8,294 from bp_systolic, 818 from bp_systolic2, 0 from bp_systolic_temp
  DBP: 8,229 from bp_diastolic, 820 from bp_diastolic2
  Pulse: 8,601 from pulse, 0 from pulse_oximetry2_pulse
  Height: 6,343 from cv_screening5_height, 0 from physical1_height
  Weight: 6,539 from cv_screening6_weight, 0 from physical2_weight
  BMI: 3,242 from cv_screening7_bmi, 3,098 from physical3_bmi, 1 recomputed

Shape after canonicalization: (26494, 57)
Canonical columns created:
  - sbp_enc: 9,112 non-null values
  - dbp_enc: 9,049 non-null values
  - pulse_enc: 8,601 non-null values
  - height_cm_enc: 6,343 non-null values
  - weight_kg_enc: 6,539 non-null values
  - bmi_enc: 6,341 non-null values


In [35]:
df_seg.columns

Index(['created', 'mep_region', 'patient_gender', 'prescribed_medication_atc',
       'specialty_name', 'measurements_ultrasound_category',
       'ultrasound3_date', 'bp_systolic_temp', 'bp_systolic', 'bp_diastolic',
       'bp_systolic2', 'bp_diastolic2', 'pulse', 'cv_screening5_height',
       'cv_screening6_weight', 'cv_screening7_bmi',
       'cv_screening8_waist_circumference', 'physical1_height',
       'physical2_weight', 'physical3_bmi', 'physical4_waist_circumference',
       'pulse_oximetry1_saturation', 'pulse_oximetry2_pulse', 'icd3_code',
       'pid', 'age', 'bp_systolic_clipped', 'bp_systolic2_clipped',
       'bp_systolic_temp_clipped', 'bp_diastolic_clipped',
       'bp_diastolic2_clipped', 'pulse_clipped',
       'pulse_oximetry2_pulse_clipped', 'cv_screening5_height_clipped',
       'physical1_height_clipped', 'cv_screening6_weight_clipped',
       'physical2_weight_clipped', 'cv_screening7_bmi_clipped',
       'physical3_bmi_clipped', 'sbp_enc', 'sbp_enc_source', '

In [36]:
# Patient-level summary aggregation
print(f"\n{'='*60}")
print("Creating patient-level summary...")
print(f"{'='*60}")

# Helper functions for aggregations
def get_latest_value(group, col, index_date):
    """Get most recent non-null value at or before index_date"""
    valid = group[group[col].notna() & (group['created'] <= index_date)]
    if len(valid) > 0:
        return valid.sort_values('created', ascending=False).iloc[0][col]
    return np.nan

def get_baseline_value(group, col):
    """Get earliest non-null value"""
    valid = group[group[col].notna()]
    if len(valid) > 0:
        return valid.sort_values('created', ascending=True).iloc[0][col]
    return np.nan

def get_most_recent(group, col):
    """Get most recent non-null value"""
    valid = group[group[col].notna()]
    if len(valid) > 0:
        return valid.sort_values('created', ascending=False).iloc[0][col]
    return np.nan

# Group by patient
print(f"\nGrouping by pid...")
print(f"Total encounters: {len(df_seg):,}")
print(f"Unique patients: {df_seg['pid'].nunique():,}")

# Initialize summary DataFrame
patient_summary = []

# Process each patient group
for pid, group in df_seg.groupby('pid'):
    # Sort by created date
    group = group.sort_values('created')
    
    # Timeline & Utilization Metrics
    first_visit_date = group['created'].min()
    last_visit_date = group['created'].max()
    index_date = last_visit_date
    span_days = (last_visit_date - first_visit_date).days if first_visit_date != last_visit_date else 0
    encounter_count_total = len(group)
    
    # Count encounters in last 12 months
    date_12m_ago = index_date - pd.Timedelta(days=365)
    encounter_count_12m = len(group[group['created'] >= date_12m_ago])
    
    # Visits per year
    if span_days > 0:
        visits_per_year = encounter_count_total / (span_days / 365.25)
    else:
        visits_per_year = encounter_count_total if encounter_count_total > 0 else 0
    
    # Latest state values (at index_date)
    sbp_latest = get_latest_value(group, 'sbp_enc', index_date)
    dbp_latest = get_latest_value(group, 'dbp_enc', index_date)
    pulse_latest = get_latest_value(group, 'pulse_enc', index_date)
    bmi_latest = get_latest_value(group, 'bmi_enc', index_date)
    height_cm_latest = get_latest_value(group, 'height_cm_enc', index_date)
    weight_kg_latest = get_latest_value(group, 'weight_kg_enc', index_date)
    
    # Baseline values (earliest non-null)
    sbp_baseline = get_baseline_value(group, 'sbp_enc')
    dbp_baseline = get_baseline_value(group, 'dbp_enc')
    pulse_baseline = get_baseline_value(group, 'pulse_enc')
    bmi_baseline = get_baseline_value(group, 'bmi_enc')
    
    # Delta values (latest - baseline)
    sbp_delta = sbp_latest - sbp_baseline if (pd.notna(sbp_latest) and pd.notna(sbp_baseline)) else np.nan
    dbp_delta = dbp_latest - dbp_baseline if (pd.notna(dbp_latest) and pd.notna(dbp_baseline)) else np.nan
    pulse_delta = pulse_latest - pulse_baseline if (pd.notna(pulse_latest) and pd.notna(pulse_baseline)) else np.nan
    bmi_delta = bmi_latest - bmi_baseline if (pd.notna(bmi_latest) and pd.notna(bmi_baseline)) else np.nan
    
    # Demographics (most recent known)
    sex = get_most_recent(group, 'patient_gender')
    mep_region = get_most_recent(group, 'mep_region')
    
    # Age at index_date
    age_at_index = get_most_recent(group[group['created'] == index_date], 'age')
    if pd.isna(age_at_index):
        # Try to get age from any encounter and adjust
        age_any = get_most_recent(group, 'age')
        if pd.notna(age_any):
            # Approximate age at index (using average encounter date)
            avg_date = group['created'].mean()
            days_diff = (index_date - avg_date).days
            age_at_index = age_any + (days_diff / 365.25)
    
    # ICD-3 codes
    icd3_codes = group['icd3_code'].dropna()
    if len(icd3_codes) > 0:
        # Count distinct codes
        icd3_count = icd3_codes.nunique()
        
        # Most frequent ICD-3 (tie-break by most recent)
        icd3_counts = icd3_codes.value_counts()
        max_count = icd3_counts.max()
        most_frequent_codes = icd3_counts[icd3_counts == max_count].index.tolist()
        
        if len(most_frequent_codes) == 1:
            primary_icd3 = most_frequent_codes[0]
        else:
            # Tie-break: most recent
            tie_group = group[group['icd3_code'].isin(most_frequent_codes)]
            primary_icd3 = get_most_recent(tie_group, 'icd3_code')
        
        # Top 5 ICD-3 codes for list
        top_5_codes = icd3_counts.head(5).index.tolist()
        icd3_list = ', '.join(top_5_codes)
    else:
        primary_icd3 = np.nan
        icd3_count = 0
        icd3_list = ''
    
    # Ultrasound indicators
    # Check if measurements_ultrasound_category contains "thyroid" (case-insensitive)
    thyroid_ultrasound_rows = group[
        group['measurements_ultrasound_category'].notna() & 
        group['measurements_ultrasound_category'].str.contains('thyroid', case=False, na=False)
    ]
    
    thyroid_ultrasound_done = 1 if len(thyroid_ultrasound_rows) > 0 else 0
    thyroid_ultrasound_count = len(thyroid_ultrasound_rows)
    
    if len(thyroid_ultrasound_rows) > 0:
        thyroid_ultrasound_last_date = thyroid_ultrasound_rows['ultrasound3_date'].max()
    else:
        thyroid_ultrasound_last_date = pd.NaT
    
    # Interpretability bins
    # BP Stage
    if pd.notna(sbp_latest) and pd.notna(dbp_latest):
        if sbp_latest < 120 and dbp_latest < 80:
            bp_stage = "Normal"
        elif 120 <= sbp_latest < 130 and dbp_latest < 80:
            bp_stage = "Elevated"
        elif (130 <= sbp_latest < 140 or 80 <= dbp_latest < 90) and not (sbp_latest >= 140 or dbp_latest >= 90):
            bp_stage = "Stage-1"
        elif sbp_latest >= 140 or dbp_latest >= 90:
            bp_stage = "Stage-2"
        else:
            bp_stage = "Missing"
    else:
        bp_stage = "Missing"
    
    # BMI Class
    if pd.notna(bmi_latest):
        if 18.5 <= bmi_latest < 25:
            bmi_class = "Normal"
        elif 25 <= bmi_latest < 30:
            bmi_class = "Overweight"
        elif 30 <= bmi_latest < 35:
            bmi_class = "Obesity I"
        elif bmi_latest >= 35:
            bmi_class = "Obesity II+"
        else:
            bmi_class = "Missing"
    else:
        bmi_class = "Missing"
    
    # Age Bracket
    if pd.notna(age_at_index):
        if age_at_index < 18:
            age_bracket = "0-18"
        elif 18 <= age_at_index < 25:
            age_bracket = "18-25"
        elif 25 <= age_at_index < 40:
            age_bracket = "25-40"
        elif 40 <= age_at_index < 60:
            age_bracket = "40-59"
        elif 60 <= age_at_index < 80:
            age_bracket = "60-79"
        elif age_at_index >= 80:
            age_bracket = "80+"
        else:
            age_bracket = "Missing"
    else:
        age_bracket = "Missing"
    
    # Data quality indicators (missing flags)
    sbp_missing = 1 if pd.isna(sbp_latest) else 0
    dbp_missing = 1 if pd.isna(dbp_latest) else 0
    bmi_missing = 1 if pd.isna(bmi_latest) else 0
    pulse_missing = 1 if pd.isna(pulse_latest) else 0
    age_missing = 1 if pd.isna(age_at_index) else 0
    
    # Store patient summary
    patient_summary.append({
        'pid': pid,
        # Timeline & Utilization
        'first_visit_date': first_visit_date,
        'last_visit_date': last_visit_date,
        'index_date': index_date,
        'span_days': span_days,
        'encounter_count_total': encounter_count_total,
        'encounter_count_12m': encounter_count_12m,
        'visits_per_year': round(visits_per_year, 2),
        # Latest state
        'sbp_latest': sbp_latest,
        'dbp_latest': dbp_latest,
        'pulse_latest': pulse_latest,
        'bmi_latest': bmi_latest,
        'height_cm_latest': height_cm_latest,
        'weight_kg_latest': weight_kg_latest,
        # Baseline
        'sbp_baseline': sbp_baseline,
        'dbp_baseline': dbp_baseline,
        'pulse_baseline': pulse_baseline,
        'bmi_baseline': bmi_baseline,
        # Delta
        'sbp_delta': sbp_delta,
        'dbp_delta': dbp_delta,
        'pulse_delta': pulse_delta,
        'bmi_delta': bmi_delta,
        # Demographics
        'sex': sex,
        'age': age_at_index,
        'mep_region': mep_region,
        # ICD-3
        'primary_icd3': primary_icd3,
        'icd3_count': icd3_count,
        'icd3_list': icd3_list,
        # Ultrasound
        'thyroid_ultrasound_done': thyroid_ultrasound_done,
        'thyroid_ultrasound_count': thyroid_ultrasound_count,
        'thyroid_ultrasound_last_date': thyroid_ultrasound_last_date,
        # Interpretability bins
        'bp_stage': bp_stage,
        'bmi_class': bmi_class,
        'age_bracket': age_bracket,
        # Data quality
        'sbp_missing': sbp_missing,
        'dbp_missing': dbp_missing,
        'bmi_missing': bmi_missing,
        'pulse_missing': pulse_missing,
        'age_missing': age_missing
    })

# Create DataFrame
df_patient_summary = pd.DataFrame(patient_summary)

print(f"\nPatient summary created:")
print(f"  Shape: {df_patient_summary.shape}")
print(f"  Columns: {len(df_patient_summary.columns)}")
print(f"\nFirst few rows:")
print(df_patient_summary.head())



Creating patient-level summary...

Grouping by pid...
Total encounters: 26,494
Unique patients: 6,356

Patient summary created:
  Shape: (6356, 39)
  Columns: 39

First few rows:
                                                 pid  \
0  0022a311df715e9740427755bbc194533776e03242c47b...   
1  00343c575db967dc335a85dab4ede661c58786aa3180df...   
2  003d1426b37bb9341ba25c89156fee5ff70b6e0d402b36...   
3  0045f4e1273379f7d0161bc42ce8f9028bdc7e7fa46581...   
4  0048df6b40355e471b46ee1232b2d01c10c764a2be52fd...   

           first_visit_date           last_visit_date  \
0 2024-08-16 09:49:24+02:00 2024-08-16 09:49:24+02:00   
1 2023-08-15 10:51:17+02:00 2025-10-03 09:10:15+02:00   
2 2025-04-02 09:51:05+02:00 2025-06-12 05:07:40+02:00   
3 2023-08-07 13:09:16+02:00 2025-06-12 10:45:51+02:00   
4 2023-08-09 12:12:50+02:00 2025-10-13 08:38:59+02:00   

                 index_date  span_days  encounter_count_total  \
0 2024-08-16 09:49:24+02:00          0                      1   
1 2025-10-

In [37]:
# Add icd3_count: number of distinct ICD-3 codes the patient ever had
print(f"\n{'='*60}")
print("Adding icd3_count to df_patient_summary...")
print(f"{'='*60}")

# Ensure icd3_count exists (it should already be in the DataFrame)
# If for some reason it doesn't exist, we'll calculate it
if 'icd3_count' not in df_patient_summary.columns:
    print("Warning: icd3_count not found, calculating from original data...")
    # Calculate icd3_count from df_seg if needed
    icd3_counts = df_seg.groupby('pid')['icd3_code'].nunique()
    df_patient_summary['icd3_count'] = df_patient_summary['pid'].map(icd3_counts).fillna(0).astype(int)
    print(f"✓ icd3_count calculated and added")
else:
    print(f"✓ icd3_count already exists in DataFrame")

print(f"  Statistics: min={df_patient_summary['icd3_count'].min()}, max={df_patient_summary['icd3_count'].max()}, mean={df_patient_summary['icd3_count'].mean():.2f}")
print(f"  Patients with 0 ICD-3 codes: {(df_patient_summary['icd3_count'] == 0).sum():,}")
print(f"  Patients with 1-2 ICD-3 codes: {((df_patient_summary['icd3_count'] >= 1) & (df_patient_summary['icd3_count'] < 3)).sum():,}")
print(f"  Patients with 3+ ICD-3 codes: {(df_patient_summary['icd3_count'] >= 3).sum():,}")



Adding icd3_count to df_patient_summary...
✓ icd3_count already exists in DataFrame
  Statistics: min=1, max=25, mean=3.03
  Patients with 0 ICD-3 codes: 0
  Patients with 1-2 ICD-3 codes: 3,560
  Patients with 3+ ICD-3 codes: 2,796


In [38]:
# Add multimorbidity_flag: 1 if icd3_count >= 3, else 0
print(f"\n{'='*60}")
print("Adding multimorbidity_flag to df_patient_summary...")
print(f"{'='*60}")

# Ensure icd3_count exists before creating the flag
if 'icd3_count' not in df_patient_summary.columns:
    raise ValueError("icd3_count must exist in df_patient_summary before creating multimorbidity_flag")

# Add multimorbidity_flag: 1 if icd3_count >= 3, else 0
df_patient_summary['multimorbidity_flag'] = (df_patient_summary['icd3_count'] >= 3).astype(int)

print(f"✓ multimorbidity_flag added")
print(f"  Patients with multimorbidity (icd3_count >= 3): {df_patient_summary['multimorbidity_flag'].sum():,} ({df_patient_summary['multimorbidity_flag'].mean()*100:.1f}%)")
print(f"  Patients without multimorbidity (icd3_count < 3): {(df_patient_summary['multimorbidity_flag'] == 0).sum():,} ({(df_patient_summary['multimorbidity_flag'] == 0).mean()*100:.1f}%)")
print(f"\nUpdated DataFrame shape: {df_patient_summary.shape}")
print(f"Updated columns: {len(df_patient_summary.columns)}")



Adding multimorbidity_flag to df_patient_summary...
✓ multimorbidity_flag added
  Patients with multimorbidity (icd3_count >= 3): 2,796 (44.0%)
  Patients without multimorbidity (icd3_count < 3): 3,560 (56.0%)

Updated DataFrame shape: (6356, 40)
Updated columns: 40


In [39]:
# Save patient summary to CSV
print(f"\n{'='*60}")
print("Saving patient summary to CSV...")
print(f"{'='*60}")

# Ensure proper column order
column_order = [
    'pid',
    # Timeline & Utilization
    'first_visit_date', 'last_visit_date', 'index_date', 'span_days',
    'encounter_count_total', 'encounter_count_12m', 'visits_per_year',
    # Latest state
    'sbp_latest', 'dbp_latest', 'pulse_latest', 
    'bmi_latest', 'height_cm_latest', 'weight_kg_latest',
    # Baseline
    'sbp_baseline', 'dbp_baseline', 'pulse_baseline', 'bmi_baseline',
    # Delta
    'sbp_delta', 'dbp_delta', 'pulse_delta', 'bmi_delta',
    # Demographics
    'sex', 'age', 'mep_region',
    # ICD-3
    'primary_icd3', 'icd3_count', 'icd3_list',
    # Ultrasound
    'thyroid_ultrasound_done', 'thyroid_ultrasound_count', 'thyroid_ultrasound_last_date',
    # Interpretability bins
    'bp_stage', 'bmi_class', 'age_bracket',
    # Data quality
    'sbp_missing', 'dbp_missing', 'bmi_missing', 'pulse_missing', 'age_missing'
]

# Reorder columns (only include columns that exist)
df_patient_summary_final = df_patient_summary[[col for col in column_order if col in df_patient_summary.columns]]

# Save to CSV
output_path = Path('../data/patient_summary.csv')
df_patient_summary_final.to_csv(output_path, index=False)

print(f"\n✓ Patient summary saved to: {output_path}")
print(f"  Shape: {df_patient_summary_final.shape}")
print(f"  Total patients: {len(df_patient_summary_final):,}")
print(f"\nSummary statistics:")
print(f"  Patients with SBP: {df_patient_summary_final['sbp_latest'].notna().sum():,}")
print(f"  Patients with DBP: {df_patient_summary_final['dbp_latest'].notna().sum():,}")
print(f"  Patients with BMI: {df_patient_summary_final['bmi_latest'].notna().sum():,}")
print(f"  Patients with thyroid ultrasound: {df_patient_summary_final['thyroid_ultrasound_done'].sum():,}")
print(f"\nColumn names:")
for i, col in enumerate(df_patient_summary_final.columns, 1):
    print(f"  {i:2d}. {col}")



Saving patient summary to CSV...

✓ Patient summary saved to: ../data/patient_summary.csv
  Shape: (6356, 39)
  Total patients: 6,356

Summary statistics:
  Patients with SBP: 4,003
  Patients with DBP: 3,982
  Patients with BMI: 3,924
  Patients with thyroid ultrasound: 0

Column names:
   1. pid
   2. first_visit_date
   3. last_visit_date
   4. index_date
   5. span_days
   6. encounter_count_total
   7. encounter_count_12m
   8. visits_per_year
   9. sbp_latest
  10. dbp_latest
  11. pulse_latest
  12. bmi_latest
  13. height_cm_latest
  14. weight_kg_latest
  15. sbp_baseline
  16. dbp_baseline
  17. pulse_baseline
  18. bmi_baseline
  19. sbp_delta
  20. dbp_delta
  21. pulse_delta
  22. bmi_delta
  23. sex
  24. age
  25. mep_region
  26. primary_icd3
  27. icd3_count
  28. icd3_list
  29. thyroid_ultrasound_done
  30. thyroid_ultrasound_count
  31. thyroid_ultrasound_last_date
  32. bp_stage
  33. bmi_class
  34. age_bracket
  35. sbp_missing
  36. dbp_missing
  37. bmi_missin

In [40]:
# Check for duplicates and outliers in patient summary
print(f"\n{'='*60}")
print("Checking for duplicates in patient summary...")
print(f"{'='*60}")

# Check for duplicate pids (should be unique - one row per patient)
duplicate_pids = df_patient_summary_final['pid'].duplicated().sum()
total_patients = len(df_patient_summary_final)
unique_pids = df_patient_summary_final['pid'].nunique()

print(f"\nTotal rows: {total_patients:,}")
print(f"Unique pids: {unique_pids:,}")
print(f"Duplicate pids: {duplicate_pids:,}")

if duplicate_pids > 0:
    print(f"\n⚠️ WARNING: Found {duplicate_pids} duplicate patient IDs!")
    print("\nDuplicate pids:")
    duplicate_pid_list = df_patient_summary_final[df_patient_summary_final['pid'].duplicated(keep=False)]['pid'].unique()
    print(duplicate_pid_list)
    
    # Show the duplicate rows
    print("\nDuplicate rows:")
    print(df_patient_summary_final[df_patient_summary_final['pid'].duplicated(keep=False)].sort_values('pid'))
else:
    print("\n✓ No duplicate pids found - each patient has exactly one row")

# Also check for completely duplicate rows (all columns the same)
duplicate_rows = df_patient_summary_final.duplicated().sum()
print(f"\nCompletely duplicate rows (all columns identical): {duplicate_rows:,}")

if duplicate_rows > 0:
    print("\n⚠️ WARNING: Found completely duplicate rows!")
    print(df_patient_summary_final[df_patient_summary_final.duplicated(keep=False)].sort_values('pid'))
else:
    print("✓ No completely duplicate rows found")

# Check for outliers in numerical fields
print(f"\n{'='*60}")
print("Checking for outliers in numerical fields...")
print(f"{'='*60}")

# Define numerical columns to check for outliers
numerical_cols = [
    'span_days', 'encounter_count_total', 'encounter_count_12m', 'visits_per_year',
    'sbp_latest', 'dbp_latest', 'pulse_latest', 'bmi_latest', 
    'height_cm_latest', 'weight_kg_latest',
    'sbp_baseline', 'dbp_baseline', 'pulse_baseline', 'bmi_baseline',
    'sbp_delta', 'dbp_delta', 'pulse_delta', 'bmi_delta',
    'age', 'icd3_count', 'thyroid_ultrasound_count'
]

# Filter to columns that exist in the DataFrame
numerical_cols = [col for col in numerical_cols if col in df_patient_summary_final.columns]

outlier_summary = []

for col in numerical_cols:
    # Skip if column is all NaN
    if df_patient_summary_final[col].notna().sum() == 0:
        continue
    
    # Calculate IQR-based outliers
    Q1 = df_patient_summary_final[col].quantile(0.25)
    Q3 = df_patient_summary_final[col].quantile(0.75)
    IQR = Q3 - Q1
    
    # Define outlier bounds (1.5 * IQR rule)
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Find outliers
    outliers_low = df_patient_summary_final[df_patient_summary_final[col] < lower_bound]
    outliers_high = df_patient_summary_final[df_patient_summary_final[col] > upper_bound]
    total_outliers = len(outliers_low) + len(outliers_high)
    
    if total_outliers > 0:
        outlier_summary.append({
            'column': col,
            'total_outliers': total_outliers,
            'outliers_low': len(outliers_low),
            'outliers_high': len(outliers_high),
            'lower_bound': lower_bound,
            'upper_bound': upper_bound,
            'min_value': df_patient_summary_final[col].min(),
            'max_value': df_patient_summary_final[col].max(),
            'Q1': Q1,
            'Q3': Q3,
            'median': df_patient_summary_final[col].median()
        })

if len(outlier_summary) > 0:
    print(f"\n⚠️ Found outliers in {len(outlier_summary)} numerical fields:\n")
    
    for item in outlier_summary:
        print(f"  {item['column']}:")
        print(f"    Total outliers: {item['total_outliers']:,} ({item['total_outliers']/total_patients*100:.2f}%)")
        print(f"    Low outliers: {item['outliers_low']:,} (values < {item['lower_bound']:.2f})")
        print(f"    High outliers: {item['outliers_high']:,} (values > {item['upper_bound']:.2f})")
        print(f"    Range: [{item['min_value']:.2f}, {item['max_value']:.2f}]")
        print(f"    Q1: {item['Q1']:.2f}, Median: {item['median']:.2f}, Q3: {item['Q3']:.2f}")
        print()
    
    # Show sample of extreme outliers (top 5 highest and lowest for each field with outliers)
    print("\nSample of extreme outliers (top 5 per field):")
    for item in outlier_summary[:5]:  # Show first 5 fields to avoid too much output
        col = item['column']
        print(f"\n  {col}:")
        
        # High outliers
        high_outliers = df_patient_summary_final[df_patient_summary_final[col] > item['upper_bound']].nlargest(5, col)
        if len(high_outliers) > 0:
            print(f"    Top 5 high outliers:")
            for idx, row in high_outliers.iterrows():
                print(f"      pid={row['pid']}: {col}={row[col]:.2f}")
        
        # Low outliers
        low_outliers = df_patient_summary_final[df_patient_summary_final[col] < item['lower_bound']].nsmallest(5, col)
        if len(low_outliers) > 0:
            print(f"    Top 5 low outliers:")
            for idx, row in low_outliers.iterrows():
                print(f"      pid={row['pid']}: {col}={row[col]:.2f}")
else:
    print("\n✓ No outliers detected using IQR method (1.5 * IQR rule)")



Checking for duplicates in patient summary...

Total rows: 6,356
Unique pids: 6,356
Duplicate pids: 0

✓ No duplicate pids found - each patient has exactly one row

Completely duplicate rows (all columns identical): 0
✓ No completely duplicate rows found

Checking for outliers in numerical fields...

⚠️ Found outliers in 19 numerical fields:

  span_days:
    Total outliers: 616 (9.69%)
    Low outliers: 0 (values < -334.88)
    High outliers: 616 (values > 558.12)
    Range: [0.00, 975.00]
    Q1: 0.00, Median: 62.00, Q3: 223.25

  encounter_count_total:
    Total outliers: 415 (6.53%)
    Low outliers: 0 (values < -5.00)
    High outliers: 415 (values > 11.00)
    Range: [1.00, 93.00]
    Q1: 1.00, Median: 3.00, Q3: 5.00

  encounter_count_12m:
    Total outliers: 421 (6.62%)
    Low outliers: 0 (values < -3.50)
    High outliers: 421 (values > 8.50)
    Range: [1.00, 69.00]
    Q1: 1.00, Median: 2.00, Q3: 4.00

  visits_per_year:
    Total outliers: 734 (11.55%)
    Low outliers: 0