Sample

In [None]:
import numpy as np
import pandas as pd
# pyplot (not the top-level matplotlib package) is what provides the
# figure/title/tight_layout/show calls used by the plotting code below.
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Dataset
# The CSV location can be overridden without editing the notebook by setting the
# STROKE_DATA_CSV environment variable; when it is not set, the original local
# path is used so existing runs keep working.
import os

DEFAULT_DATA_PATH = "C:/Users/daxab/py projects/ml_projects/datasets/healthcare-dataset-stroke-data.csv"
DATA_PATH = os.environ.get("STROKE_DATA_CSV", DEFAULT_DATA_PATH)

dataset = pd.read_csv(DATA_PATH)

In [None]:
# Count the missing entries in every column and print the per-column totals

missValues = dataset.isna().sum()
print("Missing Values:", missValues)

In [None]:
# True when at least one cell anywhere in the frame is missing
has_missing = dataset.isna().to_numpy().any()
has_missing

Data Pre-Processing

In [None]:
# Summary statistics of the dataset, displayed as the cell output
summary_stats = dataset.describe()
summary_stats

In [None]:
# Columns inspected for literal 0 values.
# NOTE(review): this list also contains columns that the imputation step later in
# the notebook treats as categorical and never zero-replaces — confirm that a 0
# really is invalid for all of them.
cols_with_zero_invalid = [
    'age',
    'hypertension',
    'heart_disease',
    'ever_married',
    'work_type',
    'Residence_type',
    'avg_glucose_level',
    'bmi',
    'smoking_status',
]

# Keep every row in which at least one of the inspected columns equals 0
has_zero = dataset[cols_with_zero_invalid].eq(0).any(axis=1)
zero_rows = dataset[has_zero]
print(zero_rows)

In [None]:
# Number of exact-zero entries per inspected column
zero_counts = dataset[cols_with_zero_invalid].eq(0).sum()
print(zero_counts)

In [None]:
from sklearn.impute import SimpleImputer

# Column groups, split by how their gaps are filled
numeric_cols = ['age', 'avg_glucose_level', 'bmi']
categorical_cols = ['hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

# Numeric columns: a 0 is treated as missing, then every gap is filled with the column median.
# NOTE(review): both imputers are fitted on the whole dataset, i.e. before the
# train/test split performed later in the notebook — confirm this is acceptable.
numeric_imputer = SimpleImputer(strategy="median")
numeric_with_gaps = dataset[numeric_cols].replace(0, np.nan)
dataset[numeric_cols] = numeric_imputer.fit_transform(numeric_with_gaps)

# Categorical columns: gaps are filled with the most frequent value of each column
categorical_imputer = SimpleImputer(strategy="most_frequent")
categorical_filled = categorical_imputer.fit_transform(dataset[categorical_cols])
dataset[categorical_cols] = categorical_filled

Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

# Show each column's dtype, then its distinct values, before any encoding happens
print("Data types:")
print(dataset.dtypes)
print("\nUnique values in each column:")
for column_name, column_values in dataset.items():
    print(f"{column_name}: {column_values.unique()}")

# Encode ALL categorical variables
categorical_cols_all = ['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

# Integer-encode every listed column that is present and not already numeric.
# Testing for "not numeric" instead of dtype == 'object' also catches pandas
# string/category dtypes, which would otherwise be skipped and reach the
# correlation matrix and the scaler as raw text.
label_encoders = {}
for col in categorical_cols_all:
    if col not in dataset.columns or pd.api.types.is_numeric_dtype(dataset[col]):
        continue
    le = LabelEncoder()
    dataset[col] = le.fit_transform(dataset[col])
    label_encoders[col] = le  # kept so integer codes can be mapped back to the original labels
    print(f"Encoded {col}: {le.classes_}")

# Pairwise correlations between all columns, drawn as an annotated heatmap
correlation_matrix = dataset.corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f', ax=ax)
ax.set_title('Feature Correlation Matrix Heatmap')
fig.tight_layout()
plt.show()

# Prepare features and target
# A row identifier carries no predictive information, so an 'id' column (if the
# loaded CSV has one) is excluded from the features together with the target.
non_feature_cols = ['stroke'] + [c for c in ['id'] if c in dataset.columns]
X = dataset.drop(columns=non_feature_cols)  # Features
y = dataset['stroke']  # Target variable

# Check if there are still any object columns
print("\nData types after encoding:")
print(X.dtypes)

# Hold out 20% of the rows for testing; stratify on the target so both splits
# keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the logistic regression model
log_reg = LogisticRegression(random_state=42, max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

# Make predictions: hard labels, plus the predicted probability taken from
# column index 1 of predict_proba
y_pred = log_reg.predict(X_test_scaled)
y_pred_proba = log_reg.predict_proba(X_test_scaled)[:, 1]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
# zero_division=0 matches the SVM reports later in the notebook: a class that is
# never predicted is reported with precision 0 instead of triggering a warning.
print(classification_report(y_test, y_pred, zero_division=0))

# Rank features by the absolute value of their logistic-regression coefficient
coef_magnitudes = np.abs(log_reg.coef_[0])
feature_importance = (
    pd.DataFrame({'feature': X.columns, 'importance': coef_magnitudes})
    .sort_values('importance', ascending=False)
)

# One-column heatmap: one row per feature, largest magnitude on top
importance_column = feature_importance.set_index('feature')[['importance']]
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(importance_column, annot=True, cmap='viridis', fmt='.3f', ax=ax)
ax.set_title('Feature Importance Heatmap (Logistic Regression Coefficients)')
fig.tight_layout()
plt.show()

SVM

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

# Reuses the split and scaled data produced in the logistic regression section:
# X_train_scaled, X_test_scaled, y_train and y_test must already exist.

# Print how many samples of each class ended up in each split
for header, labels in (
    ("Class distribution in training set:", y_train),
    ("\nClass distribution in test set:", y_test),
):
    print(header)
    print(labels.value_counts())

# Build the class-balanced RBF SVM and fit it on the scaled training data
svm_model = SVC(
    kernel='rbf',
    class_weight='balanced',
    probability=True,
    random_state=42,
).fit(X_train_scaled, y_train)

# Predictions: hard labels, plus the probability from column index 1 of predict_proba
y_pred_svm = svm_model.predict(X_test_scaled)
svm_class_probabilities = svm_model.predict_proba(X_test_scaled)
y_pred_proba_svm = svm_class_probabilities[:, 1]

# Score the model on the held-out split
accuracy_svm = accuracy_score(y_test, y_pred_svm)
svm_report = classification_report(y_test, y_pred_svm, zero_division=0)
print(f"SVM Accuracy: {accuracy_svm:.4f}")
print("\nSVM Classification Report:")
print(svm_report)

# Hyperparameter tuning with GridSearchCV - reduced grid for faster execution
print("\nPerforming hyperparameter tuning...")
# gamma is a kernel coefficient that the linear kernel does not use, so the grid
# is split per kernel: sweeping gamma for 'linear' would only refit identical
# models once per gamma value.
param_grid = [
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto', 0.01, 0.1],
        'class_weight': ['balanced', None],
    },
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10],
        'class_weight': ['balanced', None],
    },
]

# NOTE(review): candidates are ranked by plain accuracy; with class_weight=None in
# the grid this may favour models that rarely predict the minority class —
# confirm accuracy is the intended selection metric.
# probability=True is kept so the tuned model exposes predict_proba like svm_model does.
grid_search = GridSearchCV(SVC(random_state=42, probability=True),
                          param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

# best_estimator_ has already been refit on the full training data by
# GridSearchCV (refit defaults to True), so it can be used for prediction directly
best_svm = grid_search.best_estimator_
y_pred_best = best_svm.predict(X_test_scaled)

# Evaluate best model
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f"\nBest SVM Accuracy: {accuracy_best:.4f}")
print("\nBest SVM Classification Report:")
print(classification_report(y_test, y_pred_best, zero_division=0))

# Collect the test-set accuracy of the three models into one table
comparison = pd.DataFrame(
    [
        ('Logistic Regression', accuracy),
        ('SVM (default)', accuracy_svm),
        ('SVM (tuned)', accuracy_best),
    ],
    columns=['Model', 'Accuracy'],
)

# Bar chart of the accuracies, each bar labelled with its value just above the top
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=comparison, x='Model', y='Accuracy', ax=ax)
ax.set_title('Model Comparison - Accuracy')
ax.set_ylim(0, 1)
for bar_index, score in enumerate(comparison['Accuracy']):
    ax.text(bar_index, score + 0.01, f'{score:.4f}', ha='center')
fig.tight_layout()
plt.show()

print("\nModel Comparison:")
print(comparison)