---
*19CSE495 G063 Phase 1 Implementation*
---

---
***Dynamic AI-Powered Healthcare Prediction for Multiple Diseases***
---

---
*Team Members*

---
*Adithi Balaji - CB.EN.U4CSE20303*

*Aksita G - CB.EN.U4CSE20304*

*Dharaneish V C - CB.EN.U4CSE20315*

*Shanjaikumar VM - CB.EN.U4CSE20655*


---

## Pre processing

In [9]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load the datasets
heart_failure_data = pd.read_csv('datasets/heart.csv')
hepatitis_c_data = pd.read_csv('datasets/HepatitisC.csv')
cirrhosis_data = pd.read_csv('datasets/cirrhosis.csv')
stroke_data = pd.read_csv('datasets/stroke.csv')
framingham_data = pd.read_csv('datasets/framingham-heart.csv')
diabetes_pimaIndian_data = pd.read_csv('datasets/diabetes.csv')
lung_cancer_data = pd.read_csv('datasets/lung-cancer.csv')


### Handling Missing Values (KNN Imputer)

In [10]:
# Display the number of missing values in each dataset for all columns
datasets = {
    'Heart Failure': heart_failure_data,
    'Hepatitis C': hepatitis_c_data,
    'Cirrhosis': cirrhosis_data,
    'Stroke': stroke_data,
    'Framingham': framingham_data,
    'Diabetes': diabetes_pimaIndian_data,
    'Lung Cancer':lung_cancer_data
}

for dataset_name, dataset in datasets.items():

    print(f" \nDataset name: {dataset_name}")

    # Store column name, dtype, no of null rows in Column Info 
    column_info = pd.DataFrame({
        'Column Name': dataset.columns,
        'Data Type': dataset.dtypes,
        'No of Nulls': dataset.isnull().sum()
    })

    # Print the information as a table
    print("\nFeature Information:\n", column_info)


 
Dataset name: Heart Failure

Feature Information:
                    Column Name Data Type  No of Nulls
Age                        Age     int64            0
Sex                        Sex    object            0
ChestPainType    ChestPainType    object            0
RestingBP            RestingBP     int64            0
Cholesterol        Cholesterol     int64            0
FastingBS            FastingBS     int64            0
RestingECG          RestingECG    object            0
MaxHR                    MaxHR     int64            0
ExerciseAngina  ExerciseAngina    object            0
Oldpeak                Oldpeak   float64            0
ST_Slope              ST_Slope    object            0
HeartDisease      HeartDisease     int64            0
 
Dataset name: Hepatitis C

Feature Information:
          Column Name Data Type  No of Nulls
Category    Category    object            0
Age              Age     int64            0
Sex              Sex    object            0
ALB              A

In [11]:
from sklearn.impute import KNNImputer

# Function to handle datasets having missing values using KNN Imputer

def Missing_values_handler(dataset):
    # Identify numerical and categorical columns
    numerical_columns = dataset.select_dtypes(include=['float64', 'int64']).columns
    categorical_columns = dataset.select_dtypes(include=['object']).columns

    # Check if numerical columns exist
    if not numerical_columns.empty:
        # Create a mask for missing values
        numerical_mask = dataset[numerical_columns].isnull()

        # Initialize KNNImputer for numerical values
        numerical_imputer = KNNImputer(n_neighbors=5, missing_values=np.nan)

        # Fill missing values in numerical columns
        dataset[numerical_columns] = numerical_imputer.fit_transform(dataset[numerical_columns])

        # Apply the mask to keep original non-null values
        dataset[numerical_columns] = dataset[numerical_columns].where(~numerical_mask, dataset[numerical_columns])

    # Check if categorical columns exist
    if not categorical_columns.empty:
        # Convert categorical columns to numeric using label encoding
        label_encoder = {}
        for col in categorical_columns:
            label_encoder[col] = pd.Categorical(dataset[col])
            dataset[col] = label_encoder[col].codes.astype(float)  # Convert to float

        # Create a mask for missing values
        categorical_mask = dataset[categorical_columns].isnull()

        # Initialize KNNImputer for categorical values with 'nan_euclidean' metric
        categorical_imputer = KNNImputer(n_neighbors=5, missing_values=np.nan, metric='nan_euclidean')

        # Fill missing values in categorical columns
        dataset[categorical_columns] = categorical_imputer.fit_transform(dataset[categorical_columns])

        # Convert back categorical columns to their original type
        for col in categorical_columns:
            dataset[col] = label_encoder[col].categories.take(np.round(dataset[col]).astype(int))

        # Apply the mask to keep original non-null values
        dataset[categorical_columns] = dataset[categorical_columns].where(~categorical_mask, dataset[categorical_columns])

    #Return the dataset
    return dataset


In [12]:
## Handling Missing Values using the function

datasets = {
    'Heart Failure': heart_failure_data,
    'Hepatitis C': hepatitis_c_data,
    'Cirrhosis': cirrhosis_data,
    'Stroke': stroke_data,
    'Framingham': framingham_data,
    'Diabetes': diabetes_pimaIndian_data,
    'Lung Cancer':lung_cancer_data
}

for dataset_name, dataset in datasets.items():

    dataset = Missing_values_handler(dataset)

    print(f"Dataset name: {dataset_name}")

    # Store column name, dtype, no of null rows in Column Info 
    column_info = pd.DataFrame({
        'Column Name': dataset.columns,
        'Data Type': dataset.dtypes,
        'No of Nulls': dataset.isnull().sum()
    })

    # Print the information as a table
    print("\nFeature Information:\n", column_info)

    # Print general statistics
    print("\nGeneral Statistics:\n", dataset.describe())

    # Sample 5 rows
    print("\nSample:\n", dataset.head())

Dataset name: Heart Failure

Feature Information:
                    Column Name Data Type  No of Nulls
Age                        Age   float64            0
Sex                        Sex    object            0
ChestPainType    ChestPainType    object            0
RestingBP            RestingBP   float64            0
Cholesterol        Cholesterol   float64            0
FastingBS            FastingBS   float64            0
RestingECG          RestingECG    object            0
MaxHR                    MaxHR   float64            0
ExerciseAngina  ExerciseAngina    object            0
Oldpeak                Oldpeak   float64            0
ST_Slope              ST_Slope    object            0
HeartDisease      HeartDisease   float64            0

General Statistics:
               Age   RestingBP  Cholesterol   FastingBS       MaxHR  \
count  918.000000  918.000000   918.000000  918.000000  918.000000   
mean    53.510893  132.396514   198.799564    0.233115  136.809368   
std      9.4326

Dataset name: Cirrhosis

Feature Information:
                  Column Name Data Type  No of Nulls
N_Days                N_Days   float64            0
Status                Status    object            0
Drug                    Drug    object            0
Age                      Age   float64            0
Sex                      Sex    object            0
Ascites              Ascites    object            0
Hepatomegaly    Hepatomegaly    object            0
Spiders              Spiders    object            0
Edema                  Edema    object            0
Bilirubin          Bilirubin   float64            0
Cholesterol      Cholesterol   float64            0
Albumin              Albumin   float64            0
Copper                Copper   float64            0
Alk_Phos            Alk_Phos   float64            0
SGOT                    SGOT   float64            0
Tryglicerides  Tryglicerides   float64            0
Platelets          Platelets   float64            0
Prothrombin      

### Encoding Categorical Columns

In [13]:
# Encoding the categorical data
heart_failure_data = pd.get_dummies(heart_failure_data, columns=["Sex", "ChestPainType", "FastingBS", "RestingECG", "ExerciseAngina", "ST_Slope"])
hepatitis_c_data = pd.get_dummies(hepatitis_c_data, columns=["Sex"])
cirrhosis_data = pd.get_dummies(cirrhosis_data, columns=['Drug', 'Sex', 'Ascites','Hepatomegaly','Spiders','Edema'])
stroke_data = pd.get_dummies(stroke_data, columns=['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'])
framingham_data = pd.get_dummies(framingham_data, columns=['male', 'currentSmoker', 'prevalentStroke', 'prevalentHyp', 'diabetes'])
lung_cancer_data = pd.get_dummies(lung_cancer_data, columns=['Gender'])

### Feature Scaling Numerical Columns

In [14]:
# Feature Scaling
scaler = StandardScaler()

num_cols_heart_failure = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
heart_failure_data[num_cols_heart_failure] = scaler.fit_transform(heart_failure_data[num_cols_heart_failure])

num_cols_hepatitis_c = ['Age', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT']
hepatitis_c_data[num_cols_hepatitis_c] = scaler.fit_transform(hepatitis_c_data[num_cols_hepatitis_c])

num_cols_cirrhosis = ['N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin', 'Stage']
cirrhosis_data[num_cols_cirrhosis] = scaler.fit_transform(cirrhosis_data[num_cols_cirrhosis])

num_cols_stroke = ['age', 'avg_glucose_level', 'bmi']
stroke_data[num_cols_stroke] = scaler.fit_transform(stroke_data[num_cols_stroke])

num_cols_framingham = ['age', 'education', 'cigsPerDay', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']
framingham_data[num_cols_framingham] = scaler.fit_transform(framingham_data[num_cols_framingham])

num_cols_pima_indians = ['Age', 'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin','BMI', 'DiabetesPedigreeFunction']
diabetes_pimaIndian_data[num_cols_pima_indians] = scaler.fit_transform(diabetes_pimaIndian_data[num_cols_pima_indians])

num_cols_lung_cancer = ['Age', 'Air Pollution', 'Alcohol use', 'Dust Allergy', 'OccuPational Hazards', 'Genetic Risk', 'chronic Lung Disease', 'Balanced Diet', 'Obesity', 'Smoking', 'Passive Smoker', 'Chest Pain', 'Coughing of Blood', 'Fatigue', 'Weight Loss', 'Shortness of Breath', 'Wheezing', 'Swallowing Difficulty', 'Clubbing of Finger Nails', 'Frequent Cold', 'Dry Cough', 'Snoring']
lung_cancer_data[num_cols_lung_cancer] = scaler.fit_transform(lung_cancer_data[num_cols_lung_cancer])

### Test, Train Split

In [15]:
# Splitting the dataset
from sklearn.model_selection import train_test_split

heart_failure_X = heart_failure_data.drop('HeartDisease', axis=1)
heart_failure_y = heart_failure_data['HeartDisease']
heart_failure_X_train, heart_failure_X_test, heart_failure_y_train, heart_failure_y_test = train_test_split(heart_failure_X, heart_failure_y, test_size=0.2, random_state=42)

hepatitis_c_X = hepatitis_c_data.drop('Category', axis=1)
hepatitis_c_y = hepatitis_c_data['Category']
hepatitis_c_X_train, hepatitis_c_X_test, hepatitis_c_y_train, hepatitis_c_y_test = train_test_split(hepatitis_c_X, hepatitis_c_y, test_size=0.2, random_state=42)

cirrhosis_X = cirrhosis_data.drop('Status', axis=1)
cirrhosis_y = cirrhosis_data['Status']
cirrhosis_X_train, cirrhosis_X_test, cirrhosis_y_train, cirrhosis_y_test = train_test_split(cirrhosis_X, cirrhosis_y, test_size=0.2, random_state=42)

stroke_X = stroke_data.drop('stroke', axis=1)
stroke_y = stroke_data['stroke'].astype('int64')
stroke_X_train, stroke_X_test, stroke_y_train, stroke_y_test = train_test_split(stroke_X, stroke_y, test_size=0.2, random_state=42)

framingham_X = framingham_data.drop('TenYearCHD', axis=1)
framingham_y = framingham_data['TenYearCHD'].astype('int64')
framingham_X_train, framingham_X_test, framingham_y_train, framingham_y_test = train_test_split(framingham_X, framingham_y, test_size=0.2, random_state=42)

diabetes_X = diabetes_pimaIndian_data.drop('Outcome', axis=1)
diabetes_y = diabetes_pimaIndian_data['Outcome']
diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test = train_test_split(diabetes_X, diabetes_y, test_size=0.2, random_state=42)

lung_cancer_X = lung_cancer_data.drop('Level', axis=1)
lung_cancer_Y = lung_cancer_data['Level']
lung_cancer_X_train, lung_cancer_X_test, lung_cancer_y_train, lung_cancer_y_test = train_test_split(lung_cancer_X, lung_cancer_Y, test_size=0.2, random_state=42)


In [16]:
datasets = {
    'Heart Failure': heart_failure_data,
    'Hepatitis C': hepatitis_c_data,
    'Cirrhosis': cirrhosis_data,
    'Stroke': stroke_data,
    'Framingham': framingham_data,
    'Diabetes': diabetes_pimaIndian_data,
    'Lung Cancer': lung_cancer_data
}

for dataset_name, dataset in datasets.items():

    print(f"Dataset name: {dataset_name}")

    # Sample 5 rows
    print("\nSample:\n", dataset.head())

Dataset name: Heart Failure

Sample:
         Age  RestingBP  Cholesterol     MaxHR   Oldpeak  HeartDisease  Sex_F  \
0 -1.433140   0.410909     0.825070  1.382928 -0.832432           0.0      0   
1 -0.478484   1.491752    -0.171961  0.754157  0.105664           1.0      1   
2 -1.751359  -0.129513     0.770188 -1.525138 -0.832432           0.0      0   
3 -0.584556   0.302825     0.139040 -1.132156  0.574711           1.0      1   
4  0.051881   0.951331    -0.034755 -0.581981 -0.832432           0.0      0   

   Sex_M  ChestPainType_ASY  ChestPainType_ATA  ...  FastingBS_0.0  \
0      1                  0                  1  ...              1   
1      0                  0                  0  ...              1   
2      1                  0                  1  ...              1   
3      0                  1                  0  ...              1   
4      1                  0                  0  ...              1   

   FastingBS_1.0  RestingECG_LVH  RestingECG_Normal  Resting

4 Datasets - Binary Classification
3 Datasets - Multi Classification
Hepatisis & Lung Cancer are Multinomial classification

## ML Model Pipeline

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.mixture import GaussianMixture
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix

In [58]:
from sklearn.preprocessing import LabelEncoder

In [66]:
def display_classification_metrics(y_true, predicted_probabilities, multi_class=False):
    if multi_class:
        # Convert string labels to numeric labels
        label_encoder = LabelEncoder()
        y_true = label_encoder.fit_transform(y_true)

        # Choose the class with the highest probability for each row
        predicted_labels = np.argmax(predicted_probabilities, axis=1)
    else:
        predicted_labels = (predicted_probabilities > 0.5).astype(int)  # Threshold for binary classification

    classification_rep = classification_report(y_true, predicted_labels, zero_division=1)
    confusion_mat = confusion_matrix(y_true, predicted_labels)
    
    accuracy = accuracy_score(y_true, predicted_labels)
    
    if (not multi_class):
        precision = precision_score(y_true, predicted_labels)
        recall = recall_score(y_true, predicted_labels)
        f1 = f1_score(y_true, predicted_labels)
        auc = roc_auc_score(y_true, predicted_probabilities)
    
    print(f"Classification Report:")
    print(classification_rep)
    
    print(f"Confusion Matrix:")
    print(confusion_mat)

    print(f"Accuracy: {accuracy:.4f}")
    
    if (not multi_class):
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")
        print(f"AUC-ROC: {auc:.4f}")


In [73]:
def disease_prediction(X_train, y_train, X_test, y_test, multi_class=False):

   #Defining models
    models = {
        'Logistic Regression': LogisticRegression(),
        'Gradient Boosting': GradientBoostingClassifier(),
        'K-Nearest Neighbors': KNeighborsClassifier(),
    #    'Gaussian Mixture Models': GaussianMixture(),
    #    'Neural Networks': MLPClassifier(random_state=42),
        'Random Forest': RandomForestClassifier(),
        'Gaussian Naive Bayes': GaussianNB(),
        'Decision Tree': DecisionTreeClassifier(),
        'SVM': SVC(probability=True)
    #   'Markov Random Fields': (MRF) - Not a direct implementation in scikit-learn. Need to do,
    #   'Hidden Markov Models;: (HMM) - Not a direct implementation in scikit-learn. Need to do
    }

    #GM, NN are showing errors/warnings in voting. Needs to be resolved
    
    #Parameters that are to be tuned
    param_grids = {
    'Logistic Regression': {
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'solver':['lbfgs'],
        'penalty': ['l2'],
        'max_iter' :[100,500,1000,5000]
    },
    'Gradient Boosting': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 4, 5]
    },
    'K-Nearest Neighbors (KNN)': {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance'],
        'max_iter': [100,200,300,500,1000,5000]
    },
    'Gaussian Mixture Models': {
        'n_components': [2, 3, 4],
        'covariance_type': ['full', 'tied', 'diag', 'spherical']
    },
    'Markov Random Fields': {
        # Add parameters specific to Markov Random Fields
    },
    'Hidden Markov Models': {
        # Add parameters specific to Hidden Markov Models
    },
    'Neural Networks': {
        'hidden_layer_sizes': [(50, 50), (100,)],
        'activation': ['logistic', 'relu'],
        'solver': ['sgd', 'adam'],
        'alpha': [0.0001, 0.001, 0.01],
    },
    'Random Forests': {
        'n_estimators': [50, 100, 200, 500],
        'max_depth': [3, 4, 5, 7, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'Decision Trees': {
        'max_depth': [3, 4, 5, 7, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'Support Vector Machines (SVM)': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto']
    },
    'Gaussian Naive Bayes': {
        # No hyperparameters to tune for Gaussian Naive Bayes
    }
}
    
    # Hyperparameter tuning using GridSearchCV
    for model_name, model in models.items():
        param_grid = param_grids.get(model_name, {})  # Get the corresponding parameter grid
        grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy')
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        models[model_name] = best_model

        # Print optimized parameters
        print(f"Optimized parameters for {model_name}: {grid_search.best_params_}")
        
        # Predict probabilities
        if multi_class:
            predicted_probabilities = best_model.predict_proba(X_test)
        else:
            predicted_probabilities = best_model.predict_proba(X_test)[:, 1]

        print("Sample(5) Prediction Actual Test & Predicted: ", y_test.values[0:5], predicted_probabilities[0:5])

        # Print classification metrics
        print(f"Classification Report for {model_name}:")
        display_classification_metrics(y_test, predicted_probabilities, multi_class)
        print("-" * 100)
    
    # Ensemble Method: Voting Classifiers, Defining and estimators parameters Set to best models
    voting_clf = VotingClassifier(
        estimators=[(name, model) for name, model in models.items()],
        voting='soft'
    )

    # Fit the Voting Classifier
    voting_clf.fit(X_train, y_train)
    
    # Predict probabilities for the ensemble method
    if multi_class:
        voting_probs = voting_clf.predict_proba(X_test)
    else:
        voting_probs = voting_clf.predict_proba(X_test)[:, 1]

    # Display classification metrics for the ensemble method
    print(f"Classification Report for viting_clf Ensemble Model of all above models:")
    display_classification_metrics(y_test, voting_probs, multi_class)


### Predicting each of 7 diseases

In [20]:
disease_prediction(heart_failure_X_train, heart_failure_y_train, heart_failure_X_test, heart_failure_y_test, multi_class=False)

Optimized parameters for Logistic Regression: {'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
Sample(5) Prediction Actual Test & Predicted:  [0. 1. 1. 1. 0.] [0.04010284 0.18516203 0.95578813 0.96375719 0.06046206]
Classification Report for Logistic Regression:
 Classification Report:
              precision    recall  f1-score   support

         0.0       0.80      0.88      0.84        77
         1.0       0.91      0.84      0.87       107

    accuracy                           0.86       184
   macro avg       0.85      0.86      0.86       184
weighted avg       0.86      0.86      0.86       184

Confusion Matrix:
[[68  9]
 [17 90]]
Accuracy: 0.8587
Precision: 0.9091
Recall: 0.8411
F1 Score: 0.8738
AUC-ROC: 0.9262
----------------------------------------------------------------------------------------------------
Optimized parameters for Gradient Boosting: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50}
Sample(5) Prediction Actual Test & Predicted:  

In [75]:
disease_prediction(hepatitis_c_X_train, hepatitis_c_y_train, hepatitis_c_X_test, hepatitis_c_y_test, multi_class=True)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Optimized parameters for Logistic Regression: {'C': 1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
Sample(5) Prediction Actual Test & Predicted:  ['0=Blood Donor' '0=Blood Donor' '0=Blood Donor' '3=Cirrhosis'
 '0=Blood Donor'] [[9.75417132e-01 1.29160991e-02 1.69618500e-04 4.75173291e-04
  1.10219772e-02]
 [9.95963124e-01 3.37346439e-04 2.09911458e-04 1.10246479e-05
  3.47859305e-03]
 [9.80587221e-01 2.19017593e-05 9.10434367e-03 9.04591561e-03
  1.24061827e-03]
 [1.25080197e-03 9.96724230e-01 1.19114057e-07 1.60597679e-12
  2.02484880e-03]
 [9.98269394e-01 4.31456977e-04 4.29485991e-04 3.46543052e-05
  8.35008971e-04]]
Classification Report for Logistic Regression:
 Classification Report:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95        96
           1       0.67      0.67      0.67         3
           2       1.00      0.33      0.50         9
           3       0.50      0.33      0.40         6
           4       0.



Optimized parameters for Gradient Boosting: {'learning_rate': 0.2, 'max_depth': 4, 'n_estimators': 100}
Sample(5) Prediction Actual Test & Predicted:  ['0=Blood Donor' '0=Blood Donor' '0=Blood Donor' '3=Cirrhosis'
 '0=Blood Donor'] [[9.99999997e-01 2.04791966e-10 1.04335980e-09 4.82578363e-10
  7.79356362e-10]
 [1.00000000e+00 3.64409769e-11 1.47568563e-10 6.73266773e-11
  1.43235477e-10]
 [1.00000000e+00 3.64409769e-11 1.47568563e-10 7.96743564e-11
  5.48357715e-11]
 [7.45453468e-06 2.57633002e-07 1.42861709e-06 2.04325316e-04
  9.99786534e-01]
 [1.00000000e+00 2.89317317e-11 1.45063222e-10 6.32561828e-11
  1.09086524e-10]]
Classification Report for Gradient Boosting:
 Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.99      0.94        96
           1       1.00      0.33      0.50         3
           2       0.67      0.22      0.33         9
           3       0.50      0.33      0.40         6
           4       0.88    



Optimized parameters for Random Forest: {}
Sample(5) Prediction Actual Test & Predicted:  ['0=Blood Donor' '0=Blood Donor' '0=Blood Donor' '3=Cirrhosis'
 '0=Blood Donor'] [[0.96 0.01 0.02 0.   0.01]
 [0.98 0.   0.   0.   0.02]
 [1.   0.   0.   0.   0.  ]
 [0.06 0.05 0.14 0.1  0.65]
 [1.   0.   0.   0.   0.  ]]
Classification Report for Random Forest:
 Classification Report:
              precision    recall  f1-score   support

           0       0.87      1.00      0.93        96
           1       1.00      0.00      0.00         3
           2       1.00      0.22      0.36         9
           3       0.67      0.33      0.44         6
           4       0.88      0.78      0.82         9

    accuracy                           0.87       123
   macro avg       0.88      0.47      0.51       123
weighted avg       0.88      0.87      0.84       123

Confusion Matrix:
[[96  0  0  0  0]
 [ 3  0  0  0  0]
 [ 5  0  2  1  1]
 [ 4  0  0  2  0]
 [ 2  0  0  0  7]]
Accuracy: 0.8699
--------



Optimized parameters for SVM: {}
Sample(5) Prediction Actual Test & Predicted:  ['0=Blood Donor' '0=Blood Donor' '0=Blood Donor' '3=Cirrhosis'
 '0=Blood Donor'] [[9.81649394e-01 3.89660801e-03 7.01077347e-03 3.51337386e-03
  3.92985056e-03]
 [9.55774308e-01 7.20423544e-03 1.52423360e-02 1.02928768e-02
  1.14862440e-02]
 [9.87158040e-01 2.26387875e-04 9.27111598e-03 3.00922977e-03
  3.35226115e-04]
 [1.39656863e-01 1.82414725e-01 1.09237233e-01 9.59575650e-02
  4.72733614e-01]
 [9.93256329e-01 3.04404525e-04 4.76653587e-03 9.87665607e-04
  6.85064983e-04]]
Classification Report for SVM:
 Classification Report:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95        96
           1       1.00      0.33      0.50         3
           2       1.00      0.44      0.62         9
           3       0.25      0.17      0.20         6
           4       0.50      0.44      0.47         9

    accuracy                           0.86       123
   

In [76]:
disease_prediction(cirrhosis_X_train, cirrhosis_y_train, cirrhosis_X_test, cirrhosis_y_test, multi_class=True)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Optimized parameters for Logistic Regression: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
Sample(5) Prediction Actual Test & Predicted:  ['D' 'C' 'C' 'D' 'D'] [[0.62746169 0.04663825 0.32590006]
 [0.34627392 0.04074958 0.6129765 ]
 [0.46639299 0.05340291 0.48020409]
 [0.19718536 0.03727181 0.76554283]
 [0.10331192 0.02587136 0.87081672]]
Classification Report for Logistic Regression:
 Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.89      0.81        44
           1       1.00      0.00      0.00         4
           2       0.81      0.72      0.76        36

    accuracy                           0.77        84
   macro avg       0.85      0.54      0.53        84
weighted avg       0.79      0.77      0.75        84

Confusion Matrix:
[[39  0  5]
 [ 3  0  1]
 [10  0 26]]
Accuracy: 0.7738
----------------------------------------------------------------------------------------------------
Optimized para

In [80]:
disease_prediction(stroke_X_train, stroke_y_train, stroke_X_test, stroke_y_test, multi_class=False)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Optimized parameters for Logistic Regression: {'C': 0.001, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
Sample(5) Prediction Actual Test & Predicted:  [0 0 0 0 0] [0.03617368 0.04141762 0.0306099  0.05658748 0.05426644]
Classification Report for Logistic Regression:
 Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       960
           1       1.00      0.00      0.00        62

    accuracy                           0.94      1022
   macro avg       0.97      0.50      0.48      1022
weighted avg       0.94      0.94      0.91      1022

Confusion Matrix:
[[960   0]
 [ 62   0]]
Accuracy: 0.9393
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
AUC-ROC: 0.8371
----------------------------------------------------------------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))


Optimized parameters for Gradient Boosting: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}
Sample(5) Prediction Actual Test & Predicted:  [0 0 0 0 0] [0.02922044 0.02922044 0.02922044 0.08731909 0.08323702]
Classification Report for Gradient Boosting:
 Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       960
           1       1.00      0.00      0.00        62

    accuracy                           0.94      1022
   macro avg       0.97      0.50      0.48      1022
weighted avg       0.94      0.94      0.91      1022

Confusion Matrix:
[[960   0]
 [ 62   0]]
Accuracy: 0.9393
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
AUC-ROC: 0.8353
----------------------------------------------------------------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))


Optimized parameters for K-Nearest Neighbors: {}
Sample(5) Prediction Actual Test & Predicted:  [0 0 0 0 0] [0.  0.  0.  0.2 0. ]
Classification Report for K-Nearest Neighbors:
 Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       960
           1       0.00      0.00      0.00        62

    accuracy                           0.94      1022
   macro avg       0.47      0.50      0.48      1022
weighted avg       0.88      0.94      0.91      1022

Confusion Matrix:
[[957   3]
 [ 62   0]]
Accuracy: 0.9364
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
AUC-ROC: 0.6435
----------------------------------------------------------------------------------------------------
Optimized parameters for Random Forest: {}
Sample(5) Prediction Actual Test & Predicted:  [0 0 0 0 0] [0.01 0.01 0.   0.42 0.05]
Classification Report for Random Forest:
 Classification Report:
              precision    recall  f1-score   support

  _warn_prf(average, modifier, msg_start, len(result))


Classification Report for viting_clf Ensemble Model of all above models:
 Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       960
           1       1.00      0.00      0.00        62

    accuracy                           0.94      1022
   macro avg       0.97      0.50      0.48      1022
weighted avg       0.94      0.94      0.91      1022

Confusion Matrix:
[[960   0]
 [ 62   0]]
Accuracy: 0.9393
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
AUC-ROC: 0.8282


  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
disease_prediction(framingham_X_train,framingham_y_train, framingham_X_test, framingham_y_test)

Optimized parameters for Logistic Regression: {'C': 1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
Sample(5) Prediction Actual Test & Predicted:  [1 0 0 0 1] [0.08533844 0.11910695 0.12017267 0.43277495 0.18194329]
Classification Report for Logistic Regression:
 Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.99      0.92       725
           1       0.53      0.07      0.12       123

    accuracy                           0.86       848
   macro avg       0.70      0.53      0.52       848
weighted avg       0.81      0.86      0.80       848

Confusion Matrix:
[[718   7]
 [115   8]]
Accuracy: 0.8561
Precision: 0.5333
Recall: 0.0650
F1 Score: 0.1159
AUC-ROC: 0.7096
----------------------------------------------------------------------------------------------------
Optimized parameters for Gradient Boosting: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Sample(5) Prediction Actual Test & Predicted:  [1 

In [27]:
disease_prediction(diabetes_X_train, diabetes_y_train, diabetes_X_test, diabetes_y_test)

Optimized parameters for Logistic Regression: {'C': 10, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
Sample(5) Prediction Actual Test & Predicted:  [0. 0. 0. 0. 0.] [0.27496436 0.18507733 0.11292185 0.16814395 0.46821364]
Classification Report for Logistic Regression:
 Classification Report:
              precision    recall  f1-score   support

         0.0       0.81      0.80      0.81        99
         1.0       0.65      0.67      0.66        55

    accuracy                           0.75       154
   macro avg       0.73      0.74      0.73       154
weighted avg       0.76      0.75      0.75       154

Confusion Matrix:
[[79 20]
 [18 37]]
Accuracy: 0.7532
Precision: 0.6491
Recall: 0.6727
F1 Score: 0.6607
AUC-ROC: 0.8143
----------------------------------------------------------------------------------------------------
Optimized parameters for Gradient Boosting: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Sample(5) Prediction Actual Test & Predicted:  [

In [81]:
disease_prediction(lung_cancer_X_train,  lung_cancer_y_train, lung_cancer_X_test, lung_cancer_y_test, multi_class=True)

Optimized parameters for Logistic Regression: {'C': 1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
Sample(5) Prediction Actual Test & Predicted:  ['Medium' 'Medium' 'Medium' 'High' 'Medium'] [[5.43948942e-03 7.70787442e-03 9.86852636e-01]
 [6.27263038e-04 1.52602513e-04 9.99220134e-01]
 [4.12870497e-03 3.62164361e-03 9.92249651e-01]
 [9.99060323e-01 2.70826192e-10 9.39676936e-04]
 [4.37615973e-05 2.58211391e-07 9.99955980e-01]]
Classification Report for Logistic Regression:
 Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        82
           1       1.00      1.00      1.00        55
           2       1.00      1.00      1.00        63

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

Confusion Matrix:
[[82  0  0]
 [ 0 55  0]
 [ 0  0 63]]
Accuracy: 1.0000
-----------------------------------