In [None]:
# %matplotlib inline
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import re

# ==========================================
# 1. DICTIONARIES
# ==========================================
# Source columns to keep, grouped by clinical theme. The group names are only
# labels; the column names are flattened and matched against the raw frame.
columns_selected = dict(
    demographics=['Age', 'Gender', 'Weight', 'Height'],
    mitral_metrics=[
        'MR gradient', 'MS PHT', 'PHT area', 'MSA planimetry', 'MS MG (mmHg)',
    ],
    remodeling=['Left atrium', 'RV size', 'RV function', 'RVSP', 'TR Grading'],
    lv_function=['LVEF (categorical)', 'LVIDd', 'LVIDs'],
    clinical_context=[
        'Pre-procedure diagnosis', 'Post-procedure diagnosis',
        'Year(s) since procedure', 'Medications',
    ],
    vitals=['SBP L', 'DBP L', 'SBP R', 'DBP R', 'Heart rate', 'OSat'],
)

# Drug class -> member drug names.
# NOTE(review): 'Aspirin' and 'Clopidogrel' are antiplatelet agents but are filed
# under 'Anticoagulants' - confirm this grouping is intended.
# NOTE(review): the usual spellings are 'Dapagliflozin' / 'Empagliflozin'; confirm
# how the names are written in the data before changing them here.
drug_classes = dict(
    Beta_Blockers=['Carvedilol', 'Bisoprolol', 'Metoprolol', 'Propranolol', 'Atenolol'],
    RAAS_Inhibitors=[
        'Enalapril', 'Lisinopril', 'Losartan', 'Irbesartan', 'Captopril',
        'Sacubitril-valsartan',
    ],
    Diuretics_Loop=['Furosemide'],
    Diuretics_Other=['Spironolactone', 'Hydrochlorothiazide'],
    Anticoagulants=['Warfarin', 'Rivaroxaban', 'Apixaban', 'Aspirin', 'Clopidogrel'],
    Rate_Control=['Digoxin', 'Amiodarone'],
    SGLT2_Inhibitors=['Dapaglifozin', 'Empaglifozin'],
    Pulm_Vasodilators=['Sildenafil', 'Tadalafil'],
)

# Dosing abbreviation -> integer 1..4 (presumably doses per day - TODO confirm).
drug_freq = dict(zip(('qd', 'bid', 'tid', 'qid'), range(1, 5)))

# Severity label -> ordinal index. Keys are case-sensitive as written
# ('critical' is lower-case, the others are capitalised).
index_severity = dict(Mild=1, Moderate=2, Severe=3, critical=3)

# ==========================================
# 2. FUNCTIONS
# ==========================================
class data_transformer:
    """Holds a private copy of a DataFrame restricted to the configured columns."""

    def __init__(self, dataframe):
        # Copy so that later edits to self.df never reach the caller's frame.
        self.df = dataframe.copy()

    @classmethod
    def raw_data(cls, raw_df, config_dict):
        """Select the configured columns from ``raw_df`` and print an accounting report.

        Args:
            raw_df: Source DataFrame.
            config_dict: Mapping of group name -> list of column names. Only the
                values are used; they are flattened in mapping order.

        Returns:
            A new instance whose ``df`` contains the configured columns that exist
            in ``raw_df``. Configured columns absent from ``raw_df`` are reported
            on stdout and skipped rather than raising.
        """
        wanted = []
        for group in config_dict.values():
            wanted.extend(group)
        present = [name for name in wanted if name in raw_df.columns]

        n_rows, n_source_cols = raw_df.shape
        rule = '=' * 30

        print(rule)
        print("DATA ACCOUNTING REPORT")
        print(rule)
        print(f"Rows processed:     {n_rows}")
        print(f"Columns in source:  {n_source_cols}")
        print(f"Columns selected:   {len(present)}")

        absent = set(wanted) - set(present)
        if not absent:
            print("Status: All expected columns matched successfully.")
        else:
            print(f"Status: Missing {len(absent)} expected columns.")
            print(f"Dropped: {list(absent)}")

        return cls(raw_df[present])

In [None]:
# ==========================================
# 3. EXECUTION
# ==========================================
# Read the raw export (relative path, resolved against the kernel's working
# directory), then keep only the columns listed in columns_selected.
df = pd.read_csv(filepath_or_buffer='phl_2025.csv')
transformer = data_transformer.raw_data(raw_df=df, config_dict=columns_selected)

DATA ACCOUNTING REPORT
Rows processed:     152
Columns in source:  67
Columns selected:   27
Status: All expected columns matched successfully.


In [25]:
import pandas as pd
import numpy as np
import re

# ==========================================
# 1. ENHANCED CONFIGURATION
# ==========================================
# We use floats to respect the "Gray Areas" for more precise regressions
# Lower-case severity label -> numeric grade. The "X to Y" labels sit half a step
# between their neighbours; 'critical' shares the top grade with 'severe'.
SEVERITY_GRADES = dict([
    ('mild', 1.0),
    ('mild to moderate', 1.5),
    ('moderate', 2.0),
    ('moderate to severe', 2.5),
    ('severe', 3.0),
    ('critical', 3.0),
])

# Output-column prefix -> lower-case diagnosis phrase searched for in the
# free-text diagnosis fields (four valve lesions plus pulmonary hypertension).
PATHOLOGY_TARGETS = dict(zip(
    ('ms', 'mr', 'as', 'ar', 'ph'),
    (
        'mitral stenosis',
        'mitral regurgitation',
        'aortic stenosis',
        'aortic regurgitation',
        'pulmonary hypertension',
    ),
))

# ==========================================
# 2. DATA TRANSFORMER
# ==========================================
class DataTransformer:
    """Column-selecting wrapper around a DataFrame with chainable EDA feature builders.

    The instance owns a private copy of the frame in ``self.df``. Each builder
    method adds columns to that copy and returns ``self`` so calls can be chained.
    """

    # The two free-text columns that are concatenated into 'diag_text'.
    _DIAGNOSIS_COLUMNS = ('Pre-procedure diagnosis', 'Post-procedure diagnosis')

    def __init__(self, dataframe):
        # Copy so that feature columns added later never leak into the caller's frame.
        self.df = dataframe.copy()

    @classmethod
    def raw_data(cls, raw_df, config_dict):
        """Select the configured columns from ``raw_df`` and print a short report.

        Args:
            raw_df: Source DataFrame.
            config_dict: Mapping of group name -> list of column names. Only the
                values are used; they are flattened in mapping order.

        Returns:
            A new ``DataTransformer`` holding the configured columns that exist in
            ``raw_df``. Configured columns that are absent are listed on stdout
            and skipped rather than raising.
        """
        # dict.fromkeys de-duplicates while preserving config order, so a column
        # listed in two groups is not selected twice (which would make
        # self.df[col] return a DataFrame instead of a Series).
        selected_columns = list(dict.fromkeys(
            col for group in config_dict.values() for col in group
        ))
        available_columns = [col for col in selected_columns if col in raw_df.columns]

        print(f"{'='*30}\nDATA ACCOUNTING REPORT\n{'='*30}")
        print(f"Rows: {raw_df.shape[0]} | Columns Matched: {len(available_columns)}")

        # Kept as a list in config order so the report is deterministic.
        missing = [col for col in selected_columns if col not in raw_df.columns]
        if missing:
            print(f"Dropped/Missing: {missing}")

        # __init__ already takes a defensive copy; no second .copy() is needed.
        return cls(raw_df[available_columns])

    def _build_diag_text(self):
        """(Re)create the lower-cased 'diag_text' column.

        The pre- and post-procedure diagnosis strings are joined with ';' (missing
        values become empty strings) so both can be scanned in a single pass.
        """
        pre_col, post_col = self._DIAGNOSIS_COLUMNS
        self.df['diag_text'] = (
            self.df[pre_col].fillna('') + ';' + self.df[post_col].fillna('')
        ).str.lower()

    def _parse_severity(self, text, target, grades=None):
        """Return the highest severity grade stated for ``target`` in ``text``.

        ``text`` is lower-cased and split on ';'. Only segments that contain
        ``target`` are inspected, and the result is the maximum grade over every
        severity label found in those segments.

        Labels are matched longest-first and without overlap, so
        'moderate to severe' yields its own grade instead of also being counted
        as the 'moderate' and 'severe' it contains.

        Args:
            text: Free-text diagnosis string, or a missing value.
            target: Lower-case phrase identifying the lesion, e.g. 'mitral stenosis'.
            grades: Mapping of lower-case severity label -> numeric grade.
                Defaults to the module-level ``SEVERITY_GRADES``.

        Returns:
            The highest matching grade, or 0.0 when ``text`` is missing, the
            target is not mentioned, or no severity label accompanies it.
        """
        if pd.isna(text):
            return 0.0
        if grades is None:
            grades = SEVERITY_GRADES
        if not grades:
            return 0.0

        # Regex alternation tries alternatives left to right, so listing longer
        # labels first lets a compound label consume its whole span.
        labels = sorted(grades, key=len, reverse=True)
        label_pattern = re.compile('|'.join(re.escape(label) for label in labels))

        found_grade = 0.0
        for seg in text.lower().split(';'):
            if target not in seg:
                continue
            for label in label_pattern.findall(seg):
                found_grade = max(found_grade, grades[label])
        return found_grade

    def build_pathology_matrix(self, targets=None, grades=None):
        """Add one ``<prefix>_sev`` severity column per pathology target.

        Rebuilds 'diag_text' from both diagnosis columns, then grades each target
        with ``_parse_severity``. Because pre- and post-procedure text are scanned
        together, a column holds the highest grade mentioned in either field.

        Args:
            targets: Mapping of column prefix -> lower-case search phrase.
                Defaults to the module-level ``PATHOLOGY_TARGETS``.
            grades: Optional severity mapping forwarded to ``_parse_severity``.

        Returns:
            ``self``, for chaining.
        """
        if targets is None:
            targets = PATHOLOGY_TARGETS
        self._build_diag_text()

        for col_prefix, search_term in targets.items():
            self.df[f'{col_prefix}_sev'] = self.df['diag_text'].apply(
                lambda x, term=search_term: self._parse_severity(x, term, grades)
            )
        return self

    def extract_medication_burden(self, drug_classes):
        """Add 'med_count': the number of non-blank '|'-separated medication entries.

        Args:
            drug_classes: Accepted for interface compatibility; this method does
                not use it (only a raw count is produced).

        Returns:
            ``self``, for chaining.
        """
        self.df['med_count'] = self.df['Medications'].fillna('').apply(
            lambda meds: sum(1 for entry in meds.split('|') if entry.strip())
        )
        return self

    def flag_complex_congenital(self):
        """Add 'is_complex' (0/1): whether the diagnosis text mentions a congenital term.

        Matching is a plain substring search on 'diag_text', so short terms such
        as 'asd' also match inside longer words. 'diag_text' is built on demand,
        which lets this method run without ``build_pathology_matrix`` first.

        Returns:
            ``self``, for chaining.
        """
        congenital_terms = ['asd', 'vsd', 'coarctation', 'ductus', 'bicuspid', 'septal defect']
        if 'diag_text' not in self.df.columns:
            self._build_diag_text()
        self.df['is_complex'] = (
            self.df['diag_text']
            .str.contains('|'.join(congenital_terms), na=False)
            .astype(int)
        )
        return self

    def run_eda_pipeline(self, drug_classes):
        """Run the pathology, medication and congenital steps; return the resulting frame.

        Args:
            drug_classes: Forwarded to ``extract_medication_burden``.

        Returns:
            ``self.df`` itself (not a copy) with the derived columns added.
        """
        return (self.build_pathology_matrix()
                .extract_medication_burden(drug_classes)
                .flag_complex_congenital()
                .df)

# ==========================================
# 3. EXECUTION & EDA AUDIT
# ==========================================
# Needs `df`, `columns_selected` and `drug_classes` from the earlier cells.
transformer = DataTransformer.raw_data(raw_df=df, config_dict=columns_selected)
eda_df = transformer.run_eda_pipeline(drug_classes=drug_classes)

# Lesion landscape: number of rows with a non-zero grade, per lesion column.
print(f"\n{'='*30}\nLESION LANDSCAPE AUDIT\n{'='*30}")
severity_columns = [f'{prefix}_sev' for prefix in PATHOLOGY_TARGETS]
lesion_counts = eda_df[severity_columns].gt(0).sum()
print(lesion_counts)

# Per-lesion presence flags, combined below into co-occurrence masks.
has_ms = eda_df['ms_sev'].gt(0)
has_mr = eda_df['mr_sev'].gt(0)
has_as = eda_df['as_sev'].gt(0)
has_ar = eda_df['ar_sev'].gt(0)

mmd = has_ms & has_mr                              # both mitral lesions
mad = has_as & has_ar                              # both aortic lesions
mvd_mask = (has_ms | has_mr) & (has_as | has_ar)   # any mitral AND any aortic lesion
print(f"\nDouble valvular (Mitral) Patients: {mmd.sum()}")
print(f"Double valvular (Aortic) Patients: {mad.sum()}")
print(f"Multivalvular (Mitral + Aortic) Patients: {mvd_mask.sum()}")

DATA ACCOUNTING REPORT
Rows: 152 | Columns Matched: 27

LESION LANDSCAPE AUDIT
ms_sev    47
mr_sev    19
as_sev    34
ar_sev    15
ph_sev    16
dtype: int64

Double valvular (Mitral) Patients: 8
Double valvular (Aortic) Patients: 2
Multivalvular (Mitral + Aortic) Patients: 9
