In [9]:
import pandas as pd
from sklearn.metrics import accuracy_score
from langchain_ollama import ChatOllama
import time
from concurrent.futures import ThreadPoolExecutor

In [10]:
df = pd.read_csv('healthcare-dataset-stroke-data.csv')

# Drop rows with missing values
df = df.dropna()
target = df['stroke']
features = df.drop(columns=['stroke', 'id'])


In [11]:
features_sample = features.head(50)
target_sample = target.head(50)

In [12]:
from langchain_ollama import ChatOllama

llm = ChatOllama(
    model="mistral:latest",
    temperature=0.1,  # Lower temperature for faster, more consistent responses
    num_ctx=512,  # Reduced context window for faster processing
    num_thread=4  # Utilize multiple threads for inference
)

def llm_predict(row):
    prompt = f"""You are a highly experienced medical expert specialized in stroke prediction.


Given a patient's medical information, predict whether the patient is likely to experience a stroke.


Here is the patient's information:
- Gender: {row['gender']}
- Age: {row['age']}
- Hypertension (0: No, 1: Yes): {row['hypertension']}
- Heart Disease (0: No, 1: Yes): {row['heart_disease']}
- Ever Married (Yes/No): {row['ever_married']}
- Work Type (Private, Self-employed, Govt_job, Children, Never_worked): {row['work_type']}
- Residence Type (Urban/Rural): {row['Residence_type']}
- Average Glucose Level: {row['avg_glucose_level']}
- BMI (Body Mass Index): {row['bmi']}
- Smoking Status (formerly smoked / never smoked / smokes / unknown): {row['smoking_status']}


Based on this information, please answer strictly with one of the following two options:
- "Stroke"
- "No Stroke"


Do not add any extra explanation. Only return "Stroke" or "No Stroke"."""
   
    response = llm.invoke([
        {"role": "system", "content": "You are a medical AI specialized in predicting stroke risk. Assess based only on the provided data."},
        {"role": "user", "content": prompt}
    ])
   
    output = response.content.strip().lower()
    return 1 if "stroke" in output and "no" not in output else 0

def predict_with_improved_approach(row):
    """Zero-shot with medical guidelines"""
    prompt = f"""As a stroke prediction specialist, analyze the following patient data:

Patient Information:
- Gender: {row['gender']}
- Age: {int(row['age'])}
- Hypertension: {row['hypertension']}
- Heart Disease: {row['heart_disease']}
- Ever Married: {row['ever_married']}
- Work Type: {row['work_type']}
- Residence Type: {row['Residence_type']}
- Average Glucose Level: {row['avg_glucose_level']:.2f}
- BMI: {row['bmi']:.1f}
- Smoking Status: {row['smoking_status']}

Medical Stroke Risk Factors:
1. Advanced age (especially >65)
2. Presence of hypertension
3. History of heart disease
4. High glucose levels (>140 mg/dL)
5. High BMI (>30)
6. Smoking history

Your assessment:
1. Systematically evaluate each risk factor above
2. Indicate the risk level for each factor
3. Conduct an overall risk assessment
4. Answer with ONLY "Stroke" or "No Stroke"

Your prediction:"""
    
    response = llm.invoke([
        {"role": "system", "content": "You are a stroke risk prediction specialist. Answer with ONLY 'Stroke' or 'No Stroke', nothing else."},
        {"role": "user", "content": prompt}
    ])
    
    output = response.content.strip().lower()
    if "stroke" in output and not any(neg in output for neg in ["no stroke", "no-stroke", "not stroke"]):
        return 1
    else:
        return 0


In [None]:
def efficient_predict(row):
    """Efficient approach with focused medical guidelines"""
    prompt = f"""As a stroke specialist, analyze:

Patient Information:
- Age: {int(row['age'])} | Gender: {row['gender']}
- Hypertension: {row['hypertension']} | Heart Disease: {row['heart_disease']}
- Glucose: {row['avg_glucose_level']:.1f} | BMI: {row['bmi']:.1f}
- Smoking: {row['smoking_status']}

Risk factors analysis:
- Age >65: {int(row['age']) > 65}
- Hypertension: {row['hypertension'] == 1}
- Heart Disease: {row['heart_disease'] == 1}
- High Glucose (>140): {row['avg_glucose_level'] > 140}
- High BMI (>30): {row['bmi'] > 30}
- Smoking risk: {"Yes" if row['smoking_status'] in ["formerly smoked", "smokes"] else "No"}

Based on these factors, answer ONLY "Stroke" or "No Stroke".
"""
    
    response = llm.invoke([
        {"role": "system", "content": "You are a stroke risk specialist. Answer with ONLY 'Stroke' or 'No Stroke'."},
        {"role": "user", "content": prompt}
    ])
    
    output = response.content.strip().lower()
    return 1 if "stroke" in output and not any(neg in output for neg in ["no stroke", "no-stroke", "not stroke"]) else 0

def batch_predict(dataframe, batch_size=5):
    """Process predictions in parallel batches"""
    predictions = []
    total = len(dataframe)
    
    # Process in batches using multiple threads
    for i in range(0, total, batch_size):
        batch = dataframe.iloc[i:min(i+batch_size, total)]
        
        with ThreadPoolExecutor(max_workers=batch_size) as executor:
            batch_results = list(executor.map(efficient_predict, [row for _, row in batch.iterrows()]))
            
        predictions.extend(batch_results)
        print(f"Processed {min(i+batch_size, total)}/{total} samples")
    
    return predictions

In [19]:
#preds = features_sample.apply(llm_predict, axis=1)
#preds = features_sample.apply(predict_with_improved_approach, axis=1)
preds = batch_predict(features_sample, batch_size=5)


Processed 5/50 samples
Processed 10/50 samples
Processed 15/50 samples
Processed 20/50 samples
Processed 25/50 samples
Processed 30/50 samples
Processed 35/50 samples
Processed 40/50 samples
Processed 45/50 samples
Processed 50/50 samples


In [20]:
acc = accuracy_score(target_sample, preds)
print(f"Accuracy: {acc:.2f}")

Accuracy: 0.96
