In [49]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_csv('stroke_prediction_dataset.csv')
df.describe()

Unnamed: 0,Patient ID,Age,Hypertension,Heart Disease,Average Glucose Level,Body Mass Index (BMI),Stroke History,Stress Levels
count,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0
mean,49715.802867,54.035667,0.249,0.502933,129.445209,27.474302,0.500267,5.022694
std,29000.656642,21.063111,0.432448,0.500008,40.487792,7.230201,0.500017,2.873223
min,1.0,18.0,0.0,0.0,60.0,15.01,0.0,0.0
25%,24562.0,36.0,0.0,0.0,94.5175,21.16,0.0,2.54
50%,49448.0,54.0,0.0,1.0,128.9,27.42,1.0,5.05
75%,75112.0,72.0,0.0,1.0,164.5925,33.72,1.0,7.52
max,99975.0,90.0,1.0,1.0,200.0,40.0,1.0,10.0


In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    roc_auc_score,
    accuracy_score
)

In [51]:
df = df.drop(columns=["Patient ID", "Patient Name"])

df["stroke"] = df["Diagnosis"].apply(lambda x: 1 if x == "Stroke" else 0)
df = df.drop(columns=["Diagnosis"])

categorical_cols = df.select_dtypes(include='object').columns.tolist()

label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

X = df.drop("stroke", axis=1)
y = df["stroke"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, stratify=y, random_state=42
)

rf = RandomForestClassifier(class_weight='balanced', random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
if rf.n_classes_ == 2:
    y_prob = rf.predict_proba(X_test)[:, 1]
else:
    y_prob = np.zeros_like(y_test)  # fallback

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_prob))
print("Accuracy:", accuracy_score(y_test, y_pred))

Confusion Matrix:
 [[830 676]
 [801 693]]

Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.55      0.53      1506
           1       0.51      0.46      0.48      1494

    accuracy                           0.51      3000
   macro avg       0.51      0.51      0.51      3000
weighted avg       0.51      0.51      0.51      3000

ROC-AUC Score: 0.5082110202652133
Accuracy: 0.5076666666666667


<h2>Logistic Regression</h2>

In [52]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [55]:

df = pd.read_csv("stroke_prediction_dataset.csv")

df[["Systolic", "Diastolic"]] = df["Blood Pressure Levels"].str.split("/", expand=True).astype(float)

df["HDL"] = df["Cholesterol Levels"].str.extract(r"HDL:\s*(\d+)").astype(float)
df["LDL"] = df["Cholesterol Levels"].str.extract(r"LDL:\s*(\d+)").astype(float)

df.drop(columns=["Patient ID", "Patient Name", "Symptoms", "Blood Pressure Levels", "Cholesterol Levels"], inplace=True)

label_encoders = {}
categorical_cols = df.select_dtypes(include=["object"]).columns

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

target_encoder = LabelEncoder()
df["Diagnosis"] = target_encoder.fit_transform(df["Diagnosis"])  # Stroke=1, No Stroke=0

X = df.drop("Diagnosis", axis=1)
y = df["Diagnosis"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

print("\n🧾 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))
print("📈 Accuracy:", accuracy_score(y_test, y_pred))



🧾 Confusion Matrix:
 [[863 618]
 [856 663]]

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.58      0.54      1481
           1       0.52      0.44      0.47      1519

    accuracy                           0.51      3000
   macro avg       0.51      0.51      0.51      3000
weighted avg       0.51      0.51      0.51      3000

📈 Accuracy: 0.5086666666666667
