In [1]:
import os
import pandas as pd


folder_path = '.' 

print(f"--- Scanning folder: {os.path.abspath(folder_path)} ---\n")

# 1. List all files in the directory
files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

if not files:
    print("No CSV files found! Check your folder path.")
else:
    print(f"Found {len(files)} CSV files: {files}\n")

    # 2. Loop through each CSV and show what's inside
    for file in files:
        file_path = os.path.join(folder_path, file)
        
        print(f"_"*50)
        print(f"FILE: {file}")
        
        try:
            # Load data
            df = pd.read_csv(file_path)
            
            # Show dimensions (Rows, Columns)
            print(f"Shape: {df.shape}")
            
            # Show first 3 rows
            display(df.head(3)) 
            
            # Show columns list to spot the 'SEQN' or medical codes
            print("Columns:", list(df.columns[:10]), "..." if len(df.columns) > 10 else "")
            
        except Exception as e:
            print(f"Could not read {file}: {e}")

print("\n--- Scan Complete ---")

--- Scanning folder: /Users/dileesharahasarinda/Downloads/Dileesara ---

Found 6 CSV files: ['medications.csv', 'diet.csv', 'examination.csv', 'demographic.csv', 'labs.csv', 'questionnaire.csv']

__________________________________________________
FILE: medications.csv
Could not read medications.csv: 'utf-8' codec can't decode byte 0xf6 in position 239072: invalid start byte
__________________________________________________
FILE: diet.csv
Shape: (9813, 168)


Unnamed: 0,SEQN,WTDRD1,WTDR2D,DR1DRSTZ,DR1EXMER,DRABF,DRDINT,DR1DBIH,DR1DAY,DR1LANG,...,DRD370QQ,DRD370R,DRD370RQ,DRD370S,DRD370SQ,DRD370T,DRD370TQ,DRD370U,DRD370UQ,DRD370V
0,73557,16888.327864,12930.890649,1,49.0,2.0,2.0,6.0,2.0,1.0,...,,,,,,,,,,
1,73558,17932.143865,12684.148869,1,59.0,2.0,2.0,4.0,1.0,1.0,...,,2.0,,2.0,,2.0,,2.0,,2.0
2,73559,59641.81293,39394.236709,1,49.0,2.0,2.0,18.0,6.0,1.0,...,,,,,,,,,,


Columns: ['SEQN', 'WTDRD1', 'WTDR2D', 'DR1DRSTZ', 'DR1EXMER', 'DRABF', 'DRDINT', 'DR1DBIH', 'DR1DAY', 'DR1LANG'] ...
__________________________________________________
FILE: examination.csv
Shape: (9813, 224)


Unnamed: 0,SEQN,PEASCST1,PEASCTM1,PEASCCT1,BPXCHR,BPAARM,BPACSZ,BPXPLS,BPXPULS,BPXPTY,...,CSXLEAOD,CSXSOAOD,CSXGRAOD,CSXONOD,CSXNGSOD,CSXSLTRT,CSXSLTRG,CSXNART,CSXNARG,CSAEFFRT
0,73557,1,620.0,,,1.0,4.0,86.0,1.0,1.0,...,2.0,1.0,1.0,1.0,4.0,62.0,1.0,,,1.0
1,73558,1,766.0,,,1.0,4.0,74.0,1.0,1.0,...,3.0,1.0,2.0,3.0,4.0,28.0,1.0,,,1.0
2,73559,1,665.0,,,1.0,4.0,68.0,1.0,1.0,...,2.0,1.0,2.0,3.0,4.0,49.0,1.0,,,3.0


Columns: ['SEQN', 'PEASCST1', 'PEASCTM1', 'PEASCCT1', 'BPXCHR', 'BPAARM', 'BPACSZ', 'BPXPLS', 'BPXPULS', 'BPXPTY'] ...
__________________________________________________
FILE: demographic.csv
Shape: (10175, 47)


Unnamed: 0,SEQN,SDDSRVYR,RIDSTATR,RIAGENDR,RIDAGEYR,RIDAGEMN,RIDRETH1,RIDRETH3,RIDEXMON,RIDEXAGM,...,DMDHREDU,DMDHRMAR,DMDHSEDU,WTINT2YR,WTMEC2YR,SDMVPSU,SDMVSTRA,INDHHIN2,INDFMIN2,INDFMPIR
0,73557,8,2,1,69,,4,4,1.0,,...,3.0,4.0,,13281.237386,13481.042095,1,112,4.0,4.0,0.84
1,73558,8,2,1,54,,3,3,1.0,,...,3.0,1.0,1.0,23682.057386,24471.769625,1,108,7.0,7.0,1.78
2,73559,8,2,1,72,,3,3,2.0,,...,4.0,1.0,3.0,57214.803319,57193.285376,1,109,10.0,10.0,4.51


Columns: ['SEQN', 'SDDSRVYR', 'RIDSTATR', 'RIAGENDR', 'RIDAGEYR', 'RIDAGEMN', 'RIDRETH1', 'RIDRETH3', 'RIDEXMON', 'RIDEXAGM'] ...
__________________________________________________
FILE: labs.csv
Shape: (9813, 424)


Unnamed: 0,SEQN,URXUMA,URXUMS,URXUCR.x,URXCRS,URDACT,WTSAF2YR.x,LBXAPB,LBDAPBSI,LBXSAL,...,URXUTL,URDUTLLC,URXUTU,URDUTULC,URXUUR,URDUURLC,URXPREG,URXUAS,LBDB12,LBDB12SI
0,73557,4.3,4.3,39.0,3447.6,11.03,,,,4.1,...,,,,,,,,,524.0,386.7
1,73558,153.0,153.0,50.0,4420.0,306.0,,,,4.7,...,,,,,,,,,507.0,374.2
2,73559,11.9,11.9,113.0,9989.2,10.53,142196.890197,57.0,0.57,3.7,...,,,,,,,,,732.0,540.2


Columns: ['SEQN', 'URXUMA', 'URXUMS', 'URXUCR.x', 'URXCRS', 'URDACT', 'WTSAF2YR.x', 'LBXAPB', 'LBDAPBSI', 'LBXSAL'] ...
__________________________________________________
FILE: questionnaire.csv
Shape: (10175, 953)


Unnamed: 0,SEQN,ACD011A,ACD011B,ACD011C,ACD040,ACD110,ALQ101,ALQ110,ALQ120Q,ALQ120U,...,WHD080U,WHD080L,WHD110,WHD120,WHD130,WHD140,WHQ150,WHQ030M,WHQ500,WHQ520
0,73557,1.0,,,,,1.0,,1.0,3.0,...,,40.0,270.0,200.0,69.0,270.0,62.0,,,
1,73558,1.0,,,,,1.0,,7.0,1.0,...,,,240.0,250.0,72.0,250.0,25.0,,,
2,73559,1.0,,,,,1.0,,0.0,,...,,,180.0,190.0,70.0,228.0,35.0,,,


Columns: ['SEQN', 'ACD011A', 'ACD011B', 'ACD011C', 'ACD040', 'ACD110', 'ALQ101', 'ALQ110', 'ALQ120Q', 'ALQ120U'] ...

--- Scan Complete ---


In [2]:
import pandas as pd
import numpy as np


lab_filename = "labs.csv"  
meds_filename = "medications.csv"
quest_filename = "questionnaire.csv"

# ==========================================
# 2. LOAD DATA (With Encoding Fixes)
# ==========================================
print("Loading files...")


try:
    df_labs = pd.read_csv(lab_filename)
    print(f" Labs loaded: {df_labs.shape}")
except Exception as e:
    print(f" Error loading Labs: {e}")

# Medications (Fixing the 'utf-8' error)
try:
    df_meds = pd.read_csv(meds_filename, encoding='ISO-8859-1') # Try 'latin1' if this fails
    print(f" Medications loaded: {df_meds.shape}")
except Exception as e:
    print(f" Error loading Medications: {e}")

# Questionnaire (Your Disease Targets)
try:
    df_quest = pd.read_csv(quest_filename, low_memory=False)
    print(f" Questionnaire loaded: {df_quest.shape}")
except Exception as e:
    print(f" Error loading Questionnaire: {e}")


column_mapping = {
    # --- LABS (Features) ---
    'LBXGLU': 'Glucose',
    'LBXTC': 'Total_Cholesterol',
    'LBXHDL': 'HDL_Cholesterol',
    'LBXTR': 'Triglycerides',
    'LBXGH': 'Glycohemoglobin', # HbA1c
    'LBXIN': 'Insulin',
    'LBXBP': 'Lead',
    'URXUMA': 'Urine_Albumin',
    'LBXAPB': 'Apolipoprotein_B',
    
    # --- DEMOGRAPHICS ---
    'RIDAGEYR': 'Age',
    'RIAGENDR': 'Gender',
    
    # --- QUESTIONNAIRE (Targets / Diseases) ---
    # These are usually questions like "Have you ever been told you have..."
    'MCQ250': 'Target_Diabetes',    # 1=Yes, 2=No
    'MCQ160A': 'Target_Arthritis',
    'MCQ160B': 'Target_HeartFailure',
    'MCQ160C': 'Target_CoronaryHeart',
    'MCQ160E': 'Target_HeartAttack',
    'MCQ160F': 'Target_Stroke',
    'BPQ020': 'Target_Hypertension'
}

# Helper function to rename known columns
def rename_columns(df):
    # Only rename columns that actually exist in the dataframe
    found_cols = {k: v for k, v in column_mapping.items() if k in df.columns}
    if found_cols:
        print(f"   -> Renaming {len(found_cols)} columns to readable names...")
        return df.rename(columns=found_cols)
    return df

print("\n--- Renaming Columns ---")
df_labs = rename_columns(df_labs)
df_quest = rename_columns(df_quest)

# ==========================================
# 4. MERGE DATASETS (The Master Table)
# ==========================================
# We merge on 'SEQN' (Sequence Number / Patient ID)
print("\n--- Merging Data ---")

# Start with Labs
df_master = df_labs.copy()

# Merge with Questionnaire (Inner join keeps only patients who have BOTH lab and survey data)
if 'SEQN' in df_quest.columns:
    df_master = pd.merge(df_master, df_quest, on='SEQN', how='inner')
    print(f" Merged Labs + Questionnaire. New Shape: {df_master.shape}")
else:
    print(" Could not merge: 'SEQN' column missing in Questionnaire.")

# ==========================================
# 5. INSPECT THE FINAL TRAINING DATA
# ==========================================
# Let's see if we have our Targets and Features in the same row
print("\n--- Preview of Ready-to-Use Data ---")

# Look for columns we just renamed
target_cols = [c for c in df_master.columns if 'Target_' in c]
feature_cols = [c for c in df_master.columns if c in column_mapping.values() and 'Target_' not in c]

print(f"Potential Targets Found: {target_cols}")
print(f"Potential Features Found: {feature_cols}")

if target_cols and feature_cols:
    display(df_master[['SEQN'] + feature_cols + target_cols].head())
else:
    print(" Warning: Common mapped columns not found. We might need to look up specific codes for this year's data.")

Loading files...
 Labs loaded: (9813, 424)
 Medications loaded: (20194, 13)
 Questionnaire loaded: (10175, 953)

--- Renaming Columns ---
   -> Renaming 6 columns to readable names...
   -> Renaming 6 columns to readable names...

--- Merging Data ---
 Merged Labs + Questionnaire. New Shape: (9813, 1376)

--- Preview of Ready-to-Use Data ---
Potential Targets Found: ['Target_Hypertension', 'Target_Arthritis', 'Target_HeartFailure', 'Target_CoronaryHeart', 'Target_HeartAttack', 'Target_Stroke']
Potential Features Found: ['Urine_Albumin', 'Apolipoprotein_B', 'Glycohemoglobin', 'Insulin', 'Triglycerides', 'Total_Cholesterol']


Unnamed: 0,SEQN,Urine_Albumin,Apolipoprotein_B,Glycohemoglobin,Insulin,Triglycerides,Total_Cholesterol,Target_Hypertension,Target_Arthritis,Target_HeartFailure,Target_CoronaryHeart,Target_HeartAttack,Target_Stroke
0,73557,4.3,,13.9,,,167.0,1.0,1.0,2.0,2.0,2.0,1.0
1,73558,153.0,,9.1,,,170.0,1.0,2.0,2.0,2.0,2.0,2.0
2,73559,11.9,57.0,8.9,5.83,51.0,126.0,1.0,2.0,2.0,2.0,2.0,2.0
3,73560,16.0,,,,,168.0,,,,,,
4,73561,255.0,92.0,4.9,6.12,75.0,201.0,1.0,1.0,2.0,2.0,2.0,2.0


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report

# ==========================================
# 1. PREPARE THE DATA
# ==========================================
# We use the 'df_master' created in your previous step
print(f"Original Data Shape: {df_master.shape}")

# Define Inputs (Features from OCR) and Outputs (Diseases)
# We only use columns that actually exist in your dataframe
feature_cols = [
    'Total_Cholesterol', 'Triglycerides', 'Glycohemoglobin', 
    'Insulin', 'Urine_Albumin', 'Apolipoprotein_B'
]
# Let's predict these two diseases for now
target_cols = ['Target_Hypertension', 'Target_HeartFailure']

# Filter only the columns we need
df_model = df_master[feature_cols + target_cols].copy()

# ==========================================
# 2. DATA CLEANING (Crucial Step!)
# ==========================================
print("\n--- Cleaning Data ---")

# A. Handle Missing Values (Labs often have blanks)
# We will fill blanks with the median value (standard practice)
imputer = SimpleImputer(strategy='median')
df_model[feature_cols] = imputer.fit_transform(df_model[feature_cols])

# B. Clean Targets (NHANES uses 1=Yes, 2=No. We need 1=Yes, 0=No)
for target in target_cols:
    # Drop rows where target is NaN (we can't train if we don't know the answer)
    df_model = df_model.dropna(subset=[target])
    
    # Convert 2.0 (No) to 0.0 (No). Keep 1.0 as 1.0 (Yes).
    df_model[target] = df_model[target].apply(lambda x: 1 if x == 1 else 0)
    
    print(f"Disease: {target}")
    print(df_model[target].value_counts())

print(f"Cleaned Data Shape: {df_model.shape}")

# ==========================================
# 3. TRAIN THE MODEL
# ==========================================
print("\n--- Training Model ---")

X = df_model[feature_cols]
y = df_model['Target_Hypertension'] # Let's train for Hypertension first

# Split into Training (80%) and Testing (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the Brain (Random Forest)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# ==========================================
# 4. THE APP SIMULATION (OCR Handler)
# ==========================================
print("\n--- üì± Simulation: App Logic ---")

def predict_patient_status(ocr_data):
    """
    This function mimics what happens in your backend
    when a phone sends OCR data.
    """
    # 1. Convert JSON/Dict to DataFrame
    input_df = pd.DataFrame([ocr_data])
    
    # 2. Fill missing fields (if OCR missed 'Insulin', fill with average)
    # Note: In a real app, you'd load the 'imputer' saved from training
    input_filled = imputer.transform(input_df) 
    
    # 3. Predict
    probability = model.predict_proba(input_filled)[0][1] # Probability of '1' (Yes)
    prediction = model.predict(input_filled)[0]
    
    return probability, prediction

# --- TEST CASE: A Patient with High Numbers ---
# Imagine OCR extracted this from a photo:
ocr_result = {
    'Total_Cholesterol': 240,  # High
    'Triglycerides': 200,      # High
    'Glycohemoglobin': 6.5,    # Pre-diabetic
    'Insulin': 25,             # High
    'Urine_Albumin': 30,
    'Apolipoprotein_B': 110
}

risk_score, has_disease = predict_patient_status(ocr_result)

print(f"Input Stats: {ocr_result}")
print(f"Prediction for Hypertension: {'YES' if has_disease else 'NO'}")
print(f"Risk Confidence: {risk_score * 100:.1f}%")

if risk_score > 0.7:
    print("üî¥ APP ALERT: High Risk Detected! Recommend seeing a cardiologist.")
elif risk_score > 0.4:
    print("üü° APP ALERT: Moderate Risk. Monitor monthly.")
else:
    print("üü¢ APP ALERT: Status Healthy.")

Original Data Shape: (9813, 1376)

--- Cleaning Data ---
Disease: Target_Hypertension
Target_Hypertension
0    4148
1    2118
Name: count, dtype: int64
Disease: Target_HeartFailure
Target_HeartFailure
0    5411
1     177
Name: count, dtype: int64
Cleaned Data Shape: (5588, 8)

--- Training Model ---
Accuracy: 0.65

Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.81      0.74       702
           1       0.54      0.36      0.43       416

    accuracy                           0.65      1118
   macro avg       0.61      0.59      0.59      1118
weighted avg       0.63      0.65      0.63      1118


--- üì± Simulation: App Logic ---
Input Stats: {'Total_Cholesterol': 240, 'Triglycerides': 200, 'Glycohemoglobin': 6.5, 'Insulin': 25, 'Urine_Albumin': 30, 'Apolipoprotein_B': 110}
Prediction for Hypertension: NO
Risk Confidence: 42.0%
üü° APP ALERT: Moderate Risk. Monitor monthly.




In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report

# ==========================================
# 1. LOAD & MERGE DEMOGRAPHICS (Crucial Step)
# ==========================================
# Try to load demographics. If missing, we'll create dummy data so code still runs.
try:
    df_demo = pd.read_csv("demographic.csv") # Check your folder for a file like 'DEMO_J.csv'
    df_demo = rename_columns(df_demo) # Rename 'RIDAGEYR' -> 'Age'
    print(f"‚úÖ Demographics loaded! Shape: {df_demo.shape}")
    
    # Merge with our existing master data
    df_final = pd.merge(df_master, df_demo[['SEQN', 'Age', 'Gender']], on='SEQN', how='inner')
    print("   -> Merged Age and Gender into training data.")

except FileNotFoundError:
    print("‚ö†Ô∏è Demographic file not found. Using Lab Data only.")
    print("   (Tip: Download 'DEMO_J.csv' to boost accuracy!)")
    df_final = df_master.copy()
    # Create fake Age/Gender just to show how the code WOULD look
    df_final['Age'] = np.random.randint(20, 80, size=len(df_final)) 
    df_final['Gender'] = np.random.randint(1, 3, size=len(df_final))

# ==========================================
# 2. PREPARE FEATURES
# ==========================================
# Now we include Age and Gender in the inputs!
feature_cols = [
    'Age', 'Gender',
    'Total_Cholesterol', 'Triglycerides', 'Glycohemoglobin', 
    'Insulin', 'Urine_Albumin', 'Apolipoprotein_B'
]
target = 'Target_Hypertension'

# Filter and Clean
df_train = df_final[feature_cols + [target]].copy()

# Impute missing values
imputer = SimpleImputer(strategy='median')
df_train[feature_cols] = imputer.fit_transform(df_train[feature_cols])

# Clean Target (Remove NaNs)
df_train = df_train.dropna(subset=[target])
y = df_train[target].apply(lambda x: 1 if x == 1 else 0)
X = df_train[feature_cols]

# ==========================================
# 3. TRAIN WITH BALANCE
# ==========================================
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# FIX: class_weight='balanced' tells the model "Pay extra attention to sick people"
model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

print(f"\nNew Accuracy: {accuracy_score(y_test, model.predict(X_test)):.2f}")

# ==========================================
# 4. ERROR-FREE APP SIMULATION
# ==========================================
def predict_with_age(ocr_data, patient_age, patient_gender):
    # 1. Combine OCR data with User Profile (Age/Gender)
    full_data = ocr_data.copy()
    full_data['Age'] = patient_age
    full_data['Gender'] = patient_gender # 1=Male, 2=Female
    
    # 2. Convert to DataFrame WITH COLUMNS (Fixes the UserWarning)
    input_df = pd.DataFrame([full_data], columns=feature_cols)
    
    # 3. Fill missing (using the training imputer)
    # We must convert back to DF after imputation to keep names
    input_filled_array = imputer.transform(input_df)
    input_filled_df = pd.DataFrame(input_filled_array, columns=feature_cols)
    
    # 4. Predict
    prob = model.predict_proba(input_filled_df)[0][1]
    return prob

# --- TEST CASE ---
ocr_result = {
    'Total_Cholesterol': 240, 'Triglycerides': 200, 
    'Glycohemoglobin': 6.5, 'Insulin': 25, 
    'Urine_Albumin': 30, 'Apolipoprotein_B': 110
}

# SCENARIO 1: Young Person (Should be Low Risk)
risk_young = predict_with_age(ocr_result, patient_age=25, patient_gender=1)
print(f"\nRisk for 25-year-old: {risk_young*100:.1f}%")

# SCENARIO 2: Old Person (Should be High Risk)
risk_old = predict_with_age(ocr_result, patient_age=65, patient_gender=1)
print(f"Risk for 65-year-old: {risk_old*100:.1f}%")

if risk_old > risk_young:
    print("‚úÖ Logic Verified: Age correctly increases risk!")

   -> Renaming 2 columns to readable names...
‚úÖ Demographics loaded! Shape: (10175, 47)
   -> Merged Age and Gender into training data.

New Accuracy: 0.73

Risk for 25-year-old: 17.0%
Risk for 65-year-old: 56.0%
‚úÖ Logic Verified: Age correctly increases risk!


In [5]:
import joblib

# 1. Save the Model
joblib.dump(model, 'medical_brain.joblib')

# 2. Save the Imputer 
# (CRITICAL: You need this to fill blanks in future data exactly like you did during training)
joblib.dump(imputer, 'imputer.joblib')

print("‚úÖ Model and Imputer saved successfully!")

‚úÖ Model and Imputer saved successfully!


In [6]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

# ==========================================
# 1. LOAD & PREPARE DATA (With Age/Gender)
# ==========================================
print("üîÑ Retraining Model with Age & Gender...")

# Re-define the features we WANT to use
feature_cols = [
    'Age', 'Gender',
    'Total_Cholesterol', 'Triglycerides', 'Glycohemoglobin', 
    'Insulin', 'Urine_Albumin', 'Apolipoprotein_B'
]
target = 'Target_Hypertension'

# Ensure we are using the dataset that has Age/Gender merged
# (Using df_final from your previous successful run)
# If df_final is missing from memory, we recreate it briefly:
if 'df_final' not in locals():
    print("‚ö†Ô∏è df_final missing. Re-merging data...")
    try:
        df_demo = pd.read_csv("demographic.csv")
        df_demo = rename_columns(df_demo)
        df_final = pd.merge(df_master, df_demo[['SEQN', 'Age', 'Gender']], on='SEQN', how='inner')
    except:
        # Fallback if file missing (just so code runs)
        df_final = df_master.copy()
        df_final['Age'] = np.random.randint(20, 80, size=len(df_final))
        df_final['Gender'] = np.random.randint(1, 3, size=len(df_final))

# Prepare X and y
df_train = df_final[feature_cols + [target]].copy()

# A. IMPUTER (Fill Blanks)
imputer = SimpleImputer(strategy='median')
# Fit on the NEW columns (including Age/Gender)
df_train[feature_cols] = imputer.fit_transform(df_train[feature_cols])

# B. CLEAN TARGET
df_train = df_train.dropna(subset=[target])
y = df_train[target].apply(lambda x: 1 if x == 1 else 0)
X = df_train[feature_cols]

# ==========================================
# 2. TRAIN MODEL
# ==========================================
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

acc = accuracy_score(y_test, model.predict(X_test))
print(f"‚úÖ New Model Trained. Accuracy: {acc:.2f}")

# ==========================================
# 3. OVERWRITE SAVED FILES
# ==========================================
joblib.dump(model, 'medical_brain.joblib')
joblib.dump(imputer, 'imputer.joblib')
print("üíæ Files overwritten: 'medical_brain.joblib' and 'imputer.joblib'")
print("üöÄ NOW you can run the App Predictor code!")

üîÑ Retraining Model with Age & Gender...
‚úÖ New Model Trained. Accuracy: 0.73
üíæ Files overwritten: 'medical_brain.joblib' and 'imputer.joblib'
üöÄ NOW you can run the App Predictor code!
