---
*19CSE495 G063 Phase 1 Implementation*
---

---
***Dynamic AI-Powered Healthcare Prediction for Multiple Diseases***
---

---
*Team Members*

---
*Adithi Balaji - CB.EN.U4CSE20303*

*Aksita G - CB.EN.U4CSE20304*

*Dharaneish V C - CB.EN.U4CSE20315*

*Shanjaikumar VM - CB.EN.U4CSE20655*


---

## Pre processing

In [8]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load the datasets
heart_failure_data = pd.read_csv('datasets/heart.csv')
hepatitis_c_data = pd.read_csv('datasets/HepatitisC.csv')
cirrhosis_data = pd.read_csv('datasets/cirrhosis.csv')
stroke_data = pd.read_csv('datasets/stroke.csv')
framingham_data = pd.read_csv('datasets/framingham-heart.csv')
pima_indians_data = pd.read_csv('datasets/diabetes.csv')
lung_cancer_data = pd.read_csv('datasets/lung-cancer.csv')


### Handling Missing Values (KNN Imputer)

In [10]:
# Display the number of missing values in each dataset for all columns
datasets = {
    'Heart Failure': heart_failure_data,
    'Hepatitis C': hepatitis_c_data,
    'Cirrhosis': cirrhosis_data,
    'Stroke': stroke_data,
    'Framingham': framingham_data,
    'Pima Indians': pima_indians_data,
    'Lung Cancer':lung_cancer_data
}

for dataset_name, dataset in datasets.items():

    print(f" \nDataset name: {dataset_name}")

    # Store column name, dtype, no of null rows in Column Info 
    column_info = pd.DataFrame({
        'Column Name': dataset.columns,
        'Data Type': dataset.dtypes,
        'No of Nulls': dataset.isnull().sum()
    })

    # Print the information as a table
    print("\nFeature Information:\n", column_info)


 
Dataset name: Heart Failure

Feature Information:
                    Column Name Data Type  No of Nulls
Age                        Age     int64            0
Sex                        Sex    object            0
ChestPainType    ChestPainType    object            0
RestingBP            RestingBP     int64            0
Cholesterol        Cholesterol     int64            0
FastingBS            FastingBS     int64            0
RestingECG          RestingECG    object            0
MaxHR                    MaxHR     int64            0
ExerciseAngina  ExerciseAngina    object            0
Oldpeak                Oldpeak   float64            0
ST_Slope              ST_Slope    object            0
HeartDisease      HeartDisease     int64            0
 
Dataset name: Hepatitis C

Feature Information:
            Column Name Data Type  No of Nulls
Unnamed: 0  Unnamed: 0     int64            0
Category      Category    object            0
Age                Age     int64            0
Sex       

In [11]:
from sklearn.impute import KNNImputer

# List of datasets having missing values
datasetsMV = [hepatitis_c_data, cirrhosis_data, stroke_data, framingham_data]

for dataset in datasetsMV:
    # Identify numerical and categorical columns
    numerical_columns = dataset.select_dtypes(include=['float64', 'int64']).columns
    categorical_columns = dataset.select_dtypes(include=['object']).columns

    # Check if numerical columns exist
    if not numerical_columns.empty:
        # Create a mask for missing values
        numerical_mask = dataset[numerical_columns].isnull()

        # Initialize KNNImputer for numerical values
        numerical_imputer = KNNImputer(n_neighbors=5, missing_values=np.nan)

        # Fill missing values in numerical columns
        dataset[numerical_columns] = numerical_imputer.fit_transform(dataset[numerical_columns])

        # Apply the mask to keep original non-null values
        dataset[numerical_columns] = dataset[numerical_columns].where(~numerical_mask, dataset[numerical_columns])

    # Check if categorical columns exist
    if not categorical_columns.empty:
        # Convert categorical columns to numeric using label encoding
        label_encoder = {}
        for col in categorical_columns:
            label_encoder[col] = pd.Categorical(dataset[col])
            dataset[col] = label_encoder[col].codes.astype(float)  # Convert to float

        # Create a mask for missing values
        categorical_mask = dataset[categorical_columns].isnull()

        # Initialize KNNImputer for categorical values with 'nan_euclidean' metric
        categorical_imputer = KNNImputer(n_neighbors=5, missing_values=np.nan, metric='nan_euclidean')

        # Fill missing values in categorical columns
        dataset[categorical_columns] = categorical_imputer.fit_transform(dataset[categorical_columns])

        # Convert back categorical columns to their original type
        for col in categorical_columns:
            dataset[col] = label_encoder[col].categories.take(np.round(dataset[col]).astype(int))

        # Apply the mask to keep original non-null values
        dataset[categorical_columns] = dataset[categorical_columns].where(~categorical_mask, dataset[categorical_columns])


In [12]:
datasets = {
    'Heart Failure': heart_failure_data,
    'Hepatitis C': hepatitis_c_data,
    'Cirrhosis': cirrhosis_data,
    'Stroke': stroke_data,
    'Framingham': framingham_data,
    'Pima Indians': pima_indians_data,
    'Lung Cancer':lung_cancer_data
}

for dataset_name, dataset in datasets.items():

    print(f"Dataset name: {dataset_name}")

    # Store column name, dtype, no of null rows in Column Info 
    column_info = pd.DataFrame({
        'Column Name': dataset.columns,
        'Data Type': dataset.dtypes,
        'No of Nulls': dataset.isnull().sum()
    })

    # Print the information as a table
    print("\nFeature Information:\n", column_info)

    # Print general statistics
    print("\nGeneral Statistics:\n", dataset.describe())

    # Sample 5 rows
    print("\nSample:\n", dataset.head())

Dataset name: Heart Failure

Feature Information:
                    Column Name Data Type  No of Nulls
Age                        Age     int64            0
Sex                        Sex    object            0
ChestPainType    ChestPainType    object            0
RestingBP            RestingBP     int64            0
Cholesterol        Cholesterol     int64            0
FastingBS            FastingBS     int64            0
RestingECG          RestingECG    object            0
MaxHR                    MaxHR     int64            0
ExerciseAngina  ExerciseAngina    object            0
Oldpeak                Oldpeak   float64            0
ST_Slope              ST_Slope    object            0
HeartDisease      HeartDisease     int64            0

General Statistics:
               Age   RestingBP  Cholesterol   FastingBS       MaxHR  \
count  918.000000  918.000000   918.000000  918.000000  918.000000   
mean    53.510893  132.396514   198.799564    0.233115  136.809368   
std      9.4326

### Encoding Categorical Columns

In [99]:
# Encoding the categorical data
heart_failure_data = pd.get_dummies(heart_failure_data, columns=["Sex", "ChestPainType", "FastingBS", "RestingECG", "ExerciseAngina", "ST_Slope"])
hepatitis_c_data = pd.get_dummies(hepatitis_c_data, columns=['Category', "Sex"])
cirrhosis_data = pd.get_dummies(cirrhosis_data, columns=['Drug','Drug', 'Sex', 'Ascites','Hepatomegaly','Spiders','Edema'])
stroke_data = pd.get_dummies(stroke_data, columns=['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'])
framingham_data = pd.get_dummies(framingham_data, columns=['male', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose'])
pima_indians_data = pd.get_dummies(pima_indians_data, columns=['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age'])


### Feature Scaling Numerical Columns

In [100]:
# Feature Scaling
scaler = StandardScaler()

num_cols_heart_failure = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
heart_failure_data[num_cols_heart_failure] = scaler.fit_transform(heart_failure_data[num_cols_heart_failure])

num_cols_hepatitis_c = ['ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT']
hepatitis_c_data[num_cols_hepatitis_c] = scaler.fit_transform(hepatitis_c_data[num_cols_hepatitis_c])

num_cols_cirrhosis = ['N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin']
cirrhosis_data[num_cols_cirrhosis] = scaler.fit_transform(cirrhosis_data[num_cols_cirrhosis])

num_cols_stroke = ['age', 'avg_glucose_level', 'bmi']
stroke_data[num_cols_stroke] = scaler.fit_transform(stroke_data[num_cols_stroke])

num_cols_framingham = ['age']
framingham_data[num_cols_framingham] = scaler.fit_transform(framingham_data[num_cols_framingham])

num_cols_pima_indians = ['Pregnancies_0', 'Pregnancies_1', 'Pregnancies_2', 'Pregnancies_3', 'Pregnancies_4', 'Pregnancies_5', 'Pregnancies_6', 'Pregnancies_7', 'Pregnancies_8', 'Pregnancies_9', 'Age_21', 'Age_22', 'Age_23', 'Age_24', 'Age_25', 'Age_26', 'Age_27', 'Age_28', 'Age_29', 'Age_30', 'Age_31', 'Age_32', 'Age_33', 'Age_34', 'Age_35', 'Age_36', 'Age_37', 'Age_38', 'Age_39', 'Age_40', 'Age_41', 'Age_42', 'Age_43', 'Age_44', 'Age_45', 'Age_46', 'Age_47', 'Age_48', 'Age_49', 'Age_50', 'Age_51', 'Age_52', 'Age_53', 'Age_54', 'Age_55', 'Age_56', 'Age_57', 'Age_58', 'Age_59', 'Age_60', 'Age_61', 'Age_62', 'Age_63', 'Age_64', 'Age_65', 'Age_66']
pima_indians_data[num_cols_pima_indians] = scaler.fit_transform(pima_indians_data[num_cols_pima_indians])


### Test, Train Split

In [101]:
# Splitting the dataset
from sklearn.model_selection import train_test_split

heart_failure_X = heart_failure_data.drop('HeartDisease', axis=1)
heart_failure_y = heart_failure_data['HeartDisease']
heart_failure_X_train, heart_failure_X_test, heart_failure_y_train, heart_failure_y_test = train_test_split(heart_failure_X, heart_failure_y, test_size=0.2, random_state=42)

hepatitis_c_X = hepatitis_c_data.drop(['Category_0=Blood Donor', 'Category_0s=suspect Blood Donor', 'Category_1=Hepatitis', 'Category_2=Fibrosis', 'Category_3=Cirrhosis'], axis=1)
hepatitis_c_y = hepatitis_c_data[['Category_0=Blood Donor', 'Category_0s=suspect Blood Donor', 'Category_1=Hepatitis', 'Category_2=Fibrosis', 'Category_3=Cirrhosis']]
hepatitis_c_X_train, hepatitis_c_X_test, hepatitis_c_y_train, hepatitis_c_y_test = train_test_split(hepatitis_c_X, hepatitis_c_y, test_size=0.2, random_state=42)

cirrhosis_X = cirrhosis_data.drop('Status', axis=1)
cirrhosis_y = cirrhosis_data['Status']
cirrhosis_X_train, cirrhosis_X_test, cirrhosis_y_train, cirrhosis_y_test = train_test_split(cirrhosis_X, cirrhosis_y, test_size=0.2, random_state=42)

stroke_X = stroke_data.drop('stroke', axis=1)
stroke_y = stroke_data['stroke'].astype('int64')
stroke_X_train, stroke_X_test, stroke_y_train, stroke_y_test = train_test_split(stroke_X, stroke_y, test_size=0.2, random_state=42)

framingham_X = framingham_data.drop('TenYearCHD', axis=1)
framingham_y = framingham_data['TenYearCHD'].astype('int64')
framingham_X_train, framingham_X_test, framingham_y_train, framingham_y_test = train_test_split(framingham_X, framingham_y, test_size=0.2, random_state=42)

pima_indians_X = pima_indians_data.drop('Outcome', axis=1)
pima_indians_y = pima_indians_data['Outcome']
pima_indians_X_train, pima_indians_X_test, pima_indians_y_train, pima_indians_y_test = train_test_split(pima_indians_X, pima_indians_y, test_size=0.25, random_state=42)

In [102]:
datasets = {
    'Heart Failure': heart_failure_data,
    'Hepatitis C': hepatitis_c_data,
    'Cirrhosis': cirrhosis_data,
    'Stroke': stroke_data,
    'Framingham': framingham_data,
    'Pima Indians': pima_indians_data
}

for dataset_name, dataset in datasets.items():

    print(f"Dataset name: {dataset_name}")

    # Sample 5 rows
    print("\nSample:\n", dataset.head())

Dataset name: Heart Failure

Sample:
         Age  RestingBP  Cholesterol     MaxHR   Oldpeak  HeartDisease  Sex_F  \
0 -1.433140   0.410909     0.825070  1.382928 -0.832432             0      0   
1 -0.478484   1.491752    -0.171961  0.754157  0.105664             1      1   
2 -1.751359  -0.129513     0.770188 -1.525138 -0.832432             0      0   
3 -0.584556   0.302825     0.139040 -1.132156  0.574711             1      1   
4  0.051881   0.951331    -0.034755 -0.581981 -0.832432             0      0   

   Sex_M  ChestPainType_ASY  ChestPainType_ATA  ...  FastingBS_0  FastingBS_1  \
0      1                  0                  1  ...            1            0   
1      0                  0                  0  ...            1            0   
2      1                  0                  1  ...            1            0   
3      0                  1                  0  ...            1            0   
4      1                  0                  0  ...            1            

### Hepatisis - 4 label columns merging for multi-classification

In [103]:
# Combine the last four one-hot encoded columns in hepatitis_c_data
hepatitis_c_data['Combined_Category'] = hepatitis_c_data[['Category_0=Blood Donor', 'Category_0s=suspect Blood Donor', 'Category_1=Hepatitis', 'Category_2=Fibrosis', 'Category_3=Cirrhosis']].idxmax(axis=1)

# Drop the original one-hot encoded columns
hepatitis_c_data.drop(['Category_0=Blood Donor', 'Category_0s=suspect Blood Donor', 'Category_1=Hepatitis', 'Category_2=Fibrosis', 'Category_3=Cirrhosis'], axis=1, inplace=True)

# Check the resulting dataset
print(hepatitis_c_data.head())

# Split the modified dataset
hepatitis_c_X = hepatitis_c_data.drop('Combined_Category', axis=1)
hepatitis_c_y = hepatitis_c_data['Combined_Category']
hepatitis_c_X_train, hepatitis_c_X_test, hepatitis_c_y_train, hepatitis_c_y_test = train_test_split(hepatitis_c_X, hepatitis_c_y, test_size=0.2, random_state=42)



   Unnamed: 0   Age       ALB       ALP       ALT       AST       BIL  \
0         1.0  32.0 -0.541059 -0.588387 -0.816164 -0.383693 -0.198236   
1         2.0  32.0 -0.541059  0.099408 -0.411104 -0.305057 -0.381375   
2         3.0  32.0  0.914350  0.269424  0.304633  0.538767 -0.269457   
3         4.0  32.0  0.273277 -0.607707  0.084406 -0.368571  0.381706   
4         5.0  32.0 -0.419775  0.246240  0.163059 -0.302033 -0.091404   

        CHE      CHOL      CREA       GGT      PROT  Sex_f  Sex_m  \
0 -0.574734 -1.894488  0.497070 -0.502286 -0.561472      0      1   
1  1.349161 -0.502002 -0.146590 -0.438203  0.827081      0      1   
2  0.291926 -0.147228  0.094783 -0.115957  1.345474      0      1   
3 -0.393234 -0.555218 -0.025903 -0.104971  0.678969      0      1   
4  0.432588 -0.927730 -0.106361 -0.176378 -0.617014      0      1   

        Combined_Category  
0  Category_0=Blood Donor  
1  Category_0=Blood Donor  
2  Category_0=Blood Donor  
3  Category_0=Blood Donor  
4  Cat

# ML Models


In [104]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score

In [105]:

# Define functions for training and evaluating models
def train_and_evaluate_binary_model(X_train, X_test, y_train, y_test, model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Print metrics
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred))

    print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=1))

def train_and_evaluate_multiclass_model(X_train, X_test, y_train, y_test, model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Print metrics
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=1))

4 Datasets - Binary Classification
2 Datasets - Multi Classification

### Binary Classification - RandomForest 

In [106]:

# 1. Heart Failure dataset (Binary Classification)
heart_failure_model = RandomForestClassifier(random_state=42)
train_and_evaluate_binary_model(heart_failure_X_train, heart_failure_X_test, heart_failure_y_train, heart_failure_y_test, heart_failure_model)

# 4. Stroke dataset (Binary Classification)
stroke_model = RandomForestClassifier(random_state=42)
train_and_evaluate_binary_model(stroke_X_train, stroke_X_test, stroke_y_train, stroke_y_test, stroke_model)

# 5. Framingham dataset (Binary Classification)
framingham_model = RandomForestClassifier(random_state=42)
train_and_evaluate_binary_model(framingham_X_train, framingham_X_test, framingham_y_train, framingham_y_test, framingham_model)

# 6. Pima Indians dataset (Binary Classification)
pima_indians_model = RandomForestClassifier(random_state=42)
train_and_evaluate_binary_model(pima_indians_X_train, pima_indians_X_test, pima_indians_y_train, pima_indians_y_test, pima_indians_model)


Accuracy: 0.875
Precision: 0.9038461538461539
Recall: 0.8785046728971962
F1 Score: 0.8909952606635071

Confusion Matrix:
 [[67 10]
 [13 94]]

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.87      0.85        77
           1       0.90      0.88      0.89       107

    accuracy                           0.88       184
   macro avg       0.87      0.87      0.87       184
weighted avg       0.88      0.88      0.88       184

Accuracy: 0.9393346379647749
Precision: 0.0
Recall: 0.0
F1 Score: 0.0

Confusion Matrix:
 [[960   0]
 [ 62   0]]

Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97       960
           1       1.00      0.00      0.00        62

    accuracy                           0.94      1022
   macro avg       0.97      0.50      0.48      1022
weighted avg       0.94      0.94      0.91      1022



  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.8549528301886793
Precision: 0.0
Recall: 0.0
F1 Score: 0.0

Confusion Matrix:
 [[725   0]
 [123   0]]

Classification Report:
               precision    recall  f1-score   support

           0       0.85      1.00      0.92       725
           1       1.00      0.00      0.00       123

    accuracy                           0.85       848
   macro avg       0.93      0.50      0.46       848
weighted avg       0.88      0.85      0.79       848



  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.6354166666666666
Precision: 0.4782608695652174
Recall: 0.15942028985507245
F1 Score: 0.2391304347826087

Confusion Matrix:
 [[111  12]
 [ 58  11]]

Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.90      0.76       123
           1       0.48      0.16      0.24        69

    accuracy                           0.64       192
   macro avg       0.57      0.53      0.50       192
weighted avg       0.59      0.64      0.57       192



### Binary Classification - Logistic Regression 

In [107]:
# 1. Heart Failure dataset (Binary Classification)
heart_failure_model = LogisticRegression(max_iter=1000, random_state=42)
train_and_evaluate_binary_model(heart_failure_X_train, heart_failure_X_test, heart_failure_y_train, heart_failure_y_test, heart_failure_model)

# 4. Stroke dataset (Binary Classification)
stroke_model = LogisticRegression(max_iter=1000,random_state=42)
train_and_evaluate_binary_model(stroke_X_train, stroke_X_test, stroke_y_train, stroke_y_test, stroke_model)

# 5. Framingham dataset (Binary Classification)
framingham_model = LogisticRegression(max_iter=1000,random_state=42)
train_and_evaluate_binary_model(framingham_X_train, framingham_X_test, framingham_y_train, framingham_y_test, framingham_model)

# 6. Pima Indians dataset (Binary Classification)
pima_indians_model = LogisticRegression(max_iter=1000,random_state=42)
train_and_evaluate_binary_model(pima_indians_X_train, pima_indians_X_test, pima_indians_y_train, pima_indians_y_test, pima_indians_model)

Accuracy: 0.8532608695652174
Precision: 0.9
Recall: 0.8411214953271028
F1 Score: 0.8695652173913043

Confusion Matrix:
 [[67 10]
 [17 90]]

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.87      0.83        77
           1       0.90      0.84      0.87       107

    accuracy                           0.85       184
   macro avg       0.85      0.86      0.85       184
weighted avg       0.86      0.85      0.85       184

Accuracy: 0.9393346379647749
Precision: 0.0
Recall: 0.0
F1 Score: 0.0

Confusion Matrix:
 [[960   0]
 [ 62   0]]

Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97       960
           1       1.00      0.00      0.00        62

    accuracy                           0.94      1022
   macro avg       0.97      0.50      0.48      1022
weighted avg       0.94      0.94      0.91      1022



  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.8337264150943396
Precision: 0.2857142857142857
Recall: 0.0975609756097561
F1 Score: 0.14545454545454548

Confusion Matrix:
 [[695  30]
 [111  12]]

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.96      0.91       725
           1       0.29      0.10      0.15       123

    accuracy                           0.83       848
   macro avg       0.57      0.53      0.53       848
weighted avg       0.78      0.83      0.80       848

Accuracy: 0.6354166666666666
Precision: 0.4918032786885246
Recall: 0.43478260869565216
F1 Score: 0.4615384615384615

Confusion Matrix:
 [[92 31]
 [39 30]]

Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.75      0.72       123
           1       0.49      0.43      0.46        69

    accuracy                           0.64       192
   macro avg       0.60      0.59      0.59       192
weighted avg       0.63      0.64    

### Binary Classification - Decision Tree 

In [108]:
# 1. Heart Failure dataset (Binary Classification)
heart_failure_model = DecisionTreeClassifier( random_state=42)
train_and_evaluate_binary_model(heart_failure_X_train, heart_failure_X_test, heart_failure_y_train, heart_failure_y_test, heart_failure_model)

# 4. Stroke dataset (Binary Classification)
stroke_model = DecisionTreeClassifier(random_state=42)
train_and_evaluate_binary_model(stroke_X_train, stroke_X_test, stroke_y_train, stroke_y_test, stroke_model)

# 5. Framingham dataset (Binary Classification)
framingham_model = DecisionTreeClassifier(random_state=42)
train_and_evaluate_binary_model(framingham_X_train, framingham_X_test, framingham_y_train, framingham_y_test, framingham_model)

# 6. Pima Indians dataset (Binary Classification)
pima_indians_model = DecisionTreeClassifier(random_state=42)
train_and_evaluate_binary_model(pima_indians_X_train, pima_indians_X_test, pima_indians_y_train, pima_indians_y_test, pima_indians_model)

Accuracy: 0.8043478260869565
Precision: 0.865979381443299
Recall: 0.7850467289719626
F1 Score: 0.8235294117647058

Confusion Matrix:
 [[64 13]
 [23 84]]

Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.83      0.78        77
           1       0.87      0.79      0.82       107

    accuracy                           0.80       184
   macro avg       0.80      0.81      0.80       184
weighted avg       0.81      0.80      0.81       184

Accuracy: 0.9129158512720157
Precision: 0.18604651162790697
Recall: 0.12903225806451613
F1 Score: 0.15238095238095237

Confusion Matrix:
 [[925  35]
 [ 54   8]]

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.96      0.95       960
           1       0.19      0.13      0.15        62

    accuracy                           0.91      1022
   macro avg       0.57      0.55      0.55      1022
weighted avg       0.90      0.91    

### Binary Classification - Gaussian Naive Bayers 

In [109]:
# 1. Heart Failure dataset (Binary Classification)
heart_failure_model = GaussianNB()
train_and_evaluate_binary_model(heart_failure_X_train, heart_failure_X_test, heart_failure_y_train, heart_failure_y_test, heart_failure_model)

# 4. Stroke dataset (Binary Classification)
stroke_model = GaussianNB()
train_and_evaluate_binary_model(stroke_X_train, stroke_X_test, stroke_y_train, stroke_y_test, stroke_model)

# 5. Framingham dataset (Binary Classification)
framingham_model = GaussianNB()
train_and_evaluate_binary_model(framingham_X_train, framingham_X_test, framingham_y_train, framingham_y_test, framingham_model)

# 6. Pima Indians dataset (Binary Classification)
pima_indians_model = GaussianNB()
train_and_evaluate_binary_model(pima_indians_X_train, pima_indians_X_test, pima_indians_y_train, pima_indians_y_test, pima_indians_model)

Accuracy: 0.8641304347826086
Precision: 0.9270833333333334
Recall: 0.8317757009345794
F1 Score: 0.8768472906403941

Confusion Matrix:
 [[70  7]
 [18 89]]

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.91      0.85        77
           1       0.93      0.83      0.88       107

    accuracy                           0.86       184
   macro avg       0.86      0.87      0.86       184
weighted avg       0.87      0.86      0.86       184

Accuracy: 0.9383561643835616
Precision: 0.47058823529411764
Recall: 0.12903225806451613
F1 Score: 0.20253164556962025

Confusion Matrix:
 [[951   9]
 [ 54   8]]

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.99      0.97       960
           1       0.47      0.13      0.20        62

    accuracy                           0.94      1022
   macro avg       0.71      0.56      0.59      1022
weighted avg       0.92      0.94   

Accuracy: 0.6073113207547169
Precision: 0.1590909090909091
Recall: 0.3983739837398374
F1 Score: 0.22737819025522038

Confusion Matrix:
 [[466 259]
 [ 74  49]]

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.64      0.74       725
           1       0.16      0.40      0.23       123

    accuracy                           0.61       848
   macro avg       0.51      0.52      0.48       848
weighted avg       0.76      0.61      0.66       848

Accuracy: 0.5989583333333334
Precision: 0.4574468085106383
Recall: 0.6231884057971014
F1 Score: 0.5276073619631901

Confusion Matrix:
 [[72 51]
 [26 43]]

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.59      0.65       123
           1       0.46      0.62      0.53        69

    accuracy                           0.60       192
   macro avg       0.60      0.60      0.59       192
weighted avg       0.64      0.60     

### Multi-class classification - Random Forest

In [110]:
# Train and evaluate the multi-class classification model

# 2. Hepatitis C dataset (Multi-label classification)
hepatitis_c_multi_class_model = RandomForestClassifier(random_state=42)
train_and_evaluate_multiclass_model(hepatitis_c_X_train, hepatitis_c_X_test, hepatitis_c_y_train, hepatitis_c_y_test, hepatitis_c_multi_class_model)


# 3. Cirrhosis dataset (Multiclass Classification)
cirrhosis_model = RandomForestClassifier(random_state=42)
train_and_evaluate_multiclass_model(cirrhosis_X_train, cirrhosis_X_test, cirrhosis_y_train, cirrhosis_y_test, cirrhosis_model)



Accuracy: 0.943089430894309

Confusion Matrix:
 [[96  0  0  0  0]
 [ 1  1  1  0  0]
 [ 0  0  8  1  0]
 [ 0  0  2  4  0]
 [ 0  0  0  2  7]]

Classification Report:
                                  precision    recall  f1-score   support

         Category_0=Blood Donor       0.99      1.00      0.99        96
Category_0s=suspect Blood Donor       1.00      0.33      0.50         3
           Category_1=Hepatitis       0.73      0.89      0.80         9
            Category_2=Fibrosis       0.57      0.67      0.62         6
           Category_3=Cirrhosis       1.00      0.78      0.88         9

                       accuracy                           0.94       123
                      macro avg       0.86      0.73      0.76       123
                   weighted avg       0.95      0.94      0.94       123

Accuracy: 0.8095238095238095

Confusion Matrix:
 [[40  0  4]
 [ 3  0  1]
 [ 8  0 28]]

Classification Report:
               precision    recall  f1-score   support

          

### Multi-class classification - Logistic Regression

In [111]:
# Train and evaluate the multi-class classification model

# 2. Hepatitis C dataset (Multi-label classification)
hepatitis_c_multi_class_model = LogisticRegression(max_iter=1000, random_state=42)
train_and_evaluate_multiclass_model(hepatitis_c_X_train, hepatitis_c_X_test, hepatitis_c_y_train, hepatitis_c_y_test, hepatitis_c_multi_class_model)


# 3. Cirrhosis dataset (Multiclass Classification)
cirrhosis_model = LogisticRegression( max_iter=1000, random_state=42)
train_and_evaluate_multiclass_model(cirrhosis_X_train, cirrhosis_X_test, cirrhosis_y_train, cirrhosis_y_test, cirrhosis_model)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.9105691056910569

Confusion Matrix:
 [[96  0  0  0  0]
 [ 0  2  0  1  0]
 [ 0  0  6  2  1]
 [ 2  0  0  4  0]
 [ 2  0  0  3  4]]

Classification Report:
                                  precision    recall  f1-score   support

         Category_0=Blood Donor       0.96      1.00      0.98        96
Category_0s=suspect Blood Donor       1.00      0.67      0.80         3
           Category_1=Hepatitis       1.00      0.67      0.80         9
            Category_2=Fibrosis       0.40      0.67      0.50         6
           Category_3=Cirrhosis       0.80      0.44      0.57         9

                       accuracy                           0.91       123
                      macro avg       0.83      0.69      0.73       123
                   weighted avg       0.92      0.91      0.91       123

Accuracy: 0.8690476190476191

Confusion Matrix:
 [[42  0  2]
 [ 3  0  1]
 [ 5  0 31]]

Classification Report:
               precision    recall  f1-score   support

         

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Multi-class classification - Decision Tree

In [112]:
# 2. Hepatitis C dataset (Multi-label classification)
hepatitis_c_multi_class_model = DecisionTreeClassifier(random_state=42)
train_and_evaluate_multiclass_model(hepatitis_c_X_train, hepatitis_c_X_test, hepatitis_c_y_train, hepatitis_c_y_test, hepatitis_c_multi_class_model)


# 3. Cirrhosis dataset (Multiclass Classification)
cirrhosis_model = DecisionTreeClassifier(random_state=42)
train_and_evaluate_multiclass_model(cirrhosis_X_train, cirrhosis_X_test, cirrhosis_y_train, cirrhosis_y_test, cirrhosis_model)



Accuracy: 0.975609756097561

Confusion Matrix:
 [[96  0  0  0  0]
 [ 0  1  2  0  0]
 [ 0  0  9  0  0]
 [ 0  0  0  6  0]
 [ 0  0  0  1  8]]

Classification Report:
                                  precision    recall  f1-score   support

         Category_0=Blood Donor       1.00      1.00      1.00        96
Category_0s=suspect Blood Donor       1.00      0.33      0.50         3
           Category_1=Hepatitis       0.82      1.00      0.90         9
            Category_2=Fibrosis       0.86      1.00      0.92         6
           Category_3=Cirrhosis       1.00      0.89      0.94         9

                       accuracy                           0.98       123
                      macro avg       0.94      0.84      0.85       123
                   weighted avg       0.98      0.98      0.97       123

Accuracy: 0.8214285714285714

Confusion Matrix:
 [[38  0  6]
 [ 3  1  0]
 [ 5  1 30]]

Classification Report:
               precision    recall  f1-score   support

          

### Multi-class classification - Gaussian Navies Bayers

In [113]:
# 2. Hepatitis C dataset (Multi-label classification)
hepatitis_c_multi_class_model = GaussianNB()
train_and_evaluate_multiclass_model(hepatitis_c_X_train, hepatitis_c_X_test, hepatitis_c_y_train, hepatitis_c_y_test, hepatitis_c_multi_class_model)


# 3. Cirrhosis dataset (Multiclass Classification)
cirrhosis_model = GaussianNB()
train_and_evaluate_multiclass_model(cirrhosis_X_train, cirrhosis_X_test, cirrhosis_y_train, cirrhosis_y_test, cirrhosis_model)



Accuracy: 0.9349593495934959

Confusion Matrix:
 [[96  0  0  0  0]
 [ 0  2  1  0  0]
 [ 2  0  5  1  1]
 [ 1  0  0  5  0]
 [ 0  0  0  2  7]]

Classification Report:
                                  precision    recall  f1-score   support

         Category_0=Blood Donor       0.97      1.00      0.98        96
Category_0s=suspect Blood Donor       1.00      0.67      0.80         3
           Category_1=Hepatitis       0.83      0.56      0.67         9
            Category_2=Fibrosis       0.62      0.83      0.71         6
           Category_3=Cirrhosis       0.88      0.78      0.82         9

                       accuracy                           0.93       123
                      macro avg       0.86      0.77      0.80       123
                   weighted avg       0.94      0.93      0.93       123

Accuracy: 0.5833333333333334

Confusion Matrix:
 [[29 10  5]
 [ 4  0  0]
 [ 7  9 20]]

Classification Report:
               precision    recall  f1-score   support

         