In [1]:
import os
import re
import warnings

import pandas as pd
from openai import OpenAI
from sklearn.metrics import accuracy_score

In [None]:
# Read the API key from the environment instead of embedding it in the notebook,
# so the secret never ends up in the saved file, its outputs, or version control.
# Raises KeyError when OPENAI_API_KEY is not set.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [5]:
# Naive first load with pandas' default comma separator; printing the column
# index shows how the header line was parsed.
df = pd.read_csv('cardio_train.csv', sep=',')
print(df.columns)

Index(['id;age;gender;height;weight;ap_hi;ap_lo;cholesterol;gluc;smoke;alco;active;cardio'], dtype='object')


In [6]:
# Re-read the file using ';' as the field separator so each field becomes its own column.
df = pd.read_csv('cardio_train.csv', sep=';')

# Split the frame into predictors and the 'cardio' label column.
features = df.drop('cardio', axis=1)
target = df['cardio']


In [7]:
# Reproducible 60-row subsample of the predictors (fixed seed); the labels are
# looked up by the sampled index so rows and targets stay aligned.
features_sample = features.sample(60, random_state=42)
target_sample = target.loc[features_sample.index]

In [None]:
def _format_number(value):
    """Return ``value`` as an ``int`` when it is a whole number, otherwise unchanged.

    A row produced by a row-wise ``DataFrame.apply`` may have had its integer
    codes upcast to floats (when the frame also has float columns), which would
    render e.g. ``2.0`` in the prompt instead of ``2``.
    """
    try:
        as_float = float(value)
    except (TypeError, ValueError):
        return value
    return int(as_float) if as_float.is_integer() else value


def _parse_prediction(output):
    """Extract the 0/1 label from the model's reply, or ``None`` if absent.

    The explicit ``FINAL_PREDICTION`` label is preferred; a generic
    ``Prediction:`` / ``Predict:`` label is the fallback. In both cases the
    LAST occurrence wins, so a label restated in the reasoning does not
    override the final answer.
    """
    # Characters that may wrap the label or digit: whitespace, '[', '(',
    # and markdown emphasis/code markers.
    wrappers = r"[\s\[(*_`]*"
    # Reject "10", "1.5" and an echoed instruction such as "[0 or 1]".
    not_a_label = r"(?!\d|\.\d|\s*or\b)"
    patterns = (
        rf"FINAL_PREDICTION{wrappers}:{wrappers}([01]){not_a_label}",
        rf"predict(?:ion)?{wrappers}:{wrappers}([01]){not_a_label}",
    )
    for pattern in patterns:
        found = re.findall(pattern, output, flags=re.IGNORECASE)
        if found:
            return int(found[-1])
    return None


def llm_predict_cardiovascular(row, model="gpt-4o"):
    """Ask the chat model whether a patient has cardiovascular disease.

    Builds a few-shot prompt from one patient record, enriched with a computed
    BMI and a simple additive risk score, sends it through the module-level
    ``client`` and parses the label from the reply.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or dict)
        Must provide the keys 'age', 'gender', 'height', 'weight', 'ap_hi',
        'ap_lo', 'cholesterol', 'gluc', 'smoke', 'alco' and 'active'.
        The prompt treats age as days and height as centimetres.
    model : str, optional
        Chat model name passed to the API. Defaults to "gpt-4o".

    Returns
    -------
    int
        1 if the model predicts cardiovascular disease, otherwise 0. When no
        label can be parsed from the reply a ``UserWarning`` is issued and 0
        is returned.
    """
    age_years = row['age'] / 365  # the prompt states age is given in days

    # Guard against a zero/negative height so a bad record yields NaN instead
    # of raising ZeroDivisionError.
    height_m = row['height'] / 100
    bmi = row['weight'] / (height_m * height_m) if height_m > 0 else float('nan')

    # Simple additive risk score that is shown to the model as a hint.
    risk_score = 0
    if age_years > 55:  # older patients have higher risk
        risk_score += 2
    if row['ap_hi'] >= 140 or row['ap_lo'] >= 90:  # elevated blood pressure
        risk_score += 2
    if row['cholesterol'] == 3:
        risk_score += 2
    elif row['cholesterol'] == 2:
        risk_score += 1
    if bmi >= 30:  # obesity threshold used in the prompt
        risk_score += 1
    if row['gluc'] == 3:
        risk_score += 1

    # Render integer-valued fields without a trailing ".0" so the patient block
    # looks like the few-shot examples.
    age = _format_number(row['age'])
    gender = _format_number(row['gender'])
    height = _format_number(row['height'])
    ap_hi = _format_number(row['ap_hi'])
    ap_lo = _format_number(row['ap_lo'])
    cholesterol = _format_number(row['cholesterol'])
    gluc = _format_number(row['gluc'])
    smoke = _format_number(row['smoke'])
    alco = _format_number(row['alco'])
    active = _format_number(row['active'])

    prompt = f"""You are given clinical data of a patient and need to predict their risk of cardiovascular disease. 
Your task is to predict whether the patient has cardiovascular disease (binary classification: 0 = no, 1 = yes).

Analyze the patient's full clinical profile carefully, considering all these factors:

- Age: Higher age increases cardiovascular risk significantly (age is given in days, divide by 365 for years).
- Gender: 1 = female, 2 = male. Men typically have higher cardiovascular risk.
- Height and weight: Used to calculate BMI. BMI > 25 indicates overweight, > 30 indicates obesity.
- Blood pressure: ap_hi = systolic, ap_lo = diastolic. Elevated BP (≥140/90) is a major risk factor.
- Cholesterol: 1 = normal, 2 = above normal, 3 = well above normal. Higher levels increase risk.
- Glucose: 1 = normal, 2 = above normal, 3 = well above normal. Elevated glucose increases risk.
- Smoking: 1 = smoker, 0 = non-smoker. Smoking significantly increases cardiovascular risk.
- Alcohol intake: 1 = drinks alcohol, 0 = doesn't drink. Excessive alcohol increases risk.
- Physical activity: 1 = active, 0 = inactive. Lack of physical activity increases risk.

Additional calculated risk factors for this patient:
- BMI: {bmi:.1f} kg/m² (Underweight < 18.5, Normal 18.5-24.9, Overweight 25-29.9, Obese ≥ 30)
- Overall risk score: {risk_score} (Based on age, blood pressure, cholesterol, BMI and glucose)

Follow this step-by-step reasoning process:

1. Calculate the patient's age in years (days/365).
2. Identify and assess the patient's key cardiovascular risk factors.
3. Evaluate whether the combination of factors suggests high or low risk of cardiovascular disease.
4. Make a final prediction: 0 (no cardiovascular disease) or 1 (cardiovascular disease).

Examples:

---

Patient Data:
- Age: 23725 days (65 years)
- Gender: 1 (female)
- Height: 160 cm
- Weight: 85 kg
- Systolic BP: 155 mmHg
- Diastolic BP: 95 mmHg
- Cholesterol: 3 (well above normal)
- Glucose: 2 (above normal)
- Smoking: 0 (non-smoker)
- Alcohol: 0 (no)
- Physical activity: 0 (inactive)

Reasoning:
- Age is 65 years which is elevated risk.
- BMI is 33.2 kg/m² indicating obesity.
- Blood pressure is 155/95, showing hypertension.
- Cholesterol is well above normal (level 3).
- Glucose is above normal (level 2).
- Patient doesn't smoke or drink alcohol.
- Patient is inactive physically.
- Multiple major risk factors are present: advanced age, obesity, hypertension, hypercholesterolemia, hyperglycemia, and physical inactivity.

Prediction: 1 (cardiovascular disease present)

---

Patient Data:
- Age: 14600 days (40 years)
- Gender: 2 (male)
- Height: 175 cm
- Weight: 70 kg
- Systolic BP: 120 mmHg
- Diastolic BP: 80 mmHg
- Cholesterol: 1 (normal)
- Glucose: 1 (normal)
- Smoking: 0 (non-smoker)
- Alcohol: 0 (no)
- Physical activity: 1 (active)

Reasoning:
- Age is 40 years, which is relatively young.
- BMI is 22.9 kg/m², within normal range.
- Blood pressure is 120/80, which is optimal.
- Cholesterol and glucose levels are normal.
- Patient doesn't smoke or drink alcohol.
- Patient is physically active.
- No major risk factors are present.

Prediction: 0 (no cardiovascular disease)

---

Now, here is the patient's data you need to predict:

- Age: {age} days (approximately {age_years:.1f} years)
- Gender: {gender}
- Height: {height} cm
- Weight: {row['weight']} kg
- Systolic BP: {ap_hi} mmHg
- Diastolic BP: {ap_lo} mmHg
- Cholesterol: {cholesterol}
- Glucose: {gluc}
- Smoking: {smoke}
- Alcohol: {alco}
- Physical activity: {active}
- BMI: {bmi:.1f} kg/m²

Please provide your step-by-step reasoning. At the end, on a separate line, provide ONLY your final prediction formatted exactly as: "FINAL_PREDICTION: [0 or 1]"
"""

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a medical assistant specialized in analyzing cardiovascular health data to predict cardiovascular disease."},
            {"role": "user", "content": prompt}
        ],
        temperature=0
    )

    # The message content can be None; treat that as an empty reply rather
    # than crashing on .strip().
    output = (response.choices[0].message.content or "").strip()

    prediction = _parse_prediction(output)
    if prediction is None:
        # Free-text keyword matching is deliberately not used here: phrases
        # like "cardiovascular disease" also occur in negative answers.
        warnings.warn(
            "Could not parse a prediction from the model reply; defaulting to 0.",
            stacklevel=2,
        )
        return 0
    return prediction

In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def evaluate_model(y_true, y_pred):
    """Print and return binary-classification metrics for 0/1 labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (0 or 1).
    y_pred : array-like
        Predicted labels (0 or 1), aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, confusion_matrix)``. The confusion
        matrix is always 2x2 with rows/columns ordered as labels ``[0, 1]``.
    """
    acc = accuracy_score(y_true, y_pred)
    # zero_division=0 keeps the score at 0 (without an undefined-metric warning)
    # when a class is never predicted or never present.
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    # Fix the label order so the matrix stays 2x2 even when only one class
    # appears in the sample.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    print(f"Accuracy: {acc:.2f}")
    print(f"Precision: {prec:.2f}")
    print(f"Recall: {rec:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print("Confusion Matrix:")
    print(cm)

    return acc, prec, rec, f1, cm

In [10]:
# Step 1: Generate predictions using the improved function
# Query the model once per sampled patient (row-wise apply).
preds_improved = features_sample.apply(func=llm_predict_cardiovascular, axis="columns")

# Score the predictions against the sampled labels; the returned tuple is the cell output.
evaluate_model(y_true=target_sample, y_pred=preds_improved)

Accuracy: 0.70
Precision: 0.79
Recall: 0.52
F1 Score: 0.62
Confusion Matrix:
[[27  4]
 [14 15]]


(0.7,
 0.7894736842105263,
 0.5172413793103449,
 0.625,
 array([[27,  4],
        [14, 15]], dtype=int64))