---
*19CSE495 G063 Phase 1 Implementation*
---

---
***Dynamic AI-Powered Healthcare Prediction for Multiple Diseases***
---

---
*Team Members*

---
*Adithi Balaji - CB.EN.U4CSE20303*

*Aksita G - CB.EN.U4CSE20304*

*Dharaneish V C - CB.EN.U4CSE20315*

*Shanjaikumar VM - CB.EN.U4CSE20655*


---

## Pre processing

In [85]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load the datasets
heart_failure_data = pd.read_csv('datasets/heart.csv')
hepatitis_c_data = pd.read_csv('datasets/HepatitisC.csv')
cirrhosis_data = pd.read_csv('datasets/cirrhosis.csv')
stroke_data = pd.read_csv('datasets/stroke.csv')
framingham_data = pd.read_csv('datasets/framingham-heart.csv')
diabetes_pimaIndian_data = pd.read_csv('datasets/diabetes.csv')
lung_cancer_data = pd.read_csv('datasets/lung-cancer.csv')


### Handling Missing Values (KNN Imputer)

In [86]:
# Display the number of missing values in each dataset for all columns
datasets = {
    'Heart Failure': heart_failure_data,
    'Hepatitis C': hepatitis_c_data,
    'Cirrhosis': cirrhosis_data,
    'Stroke': stroke_data,
    'Framingham': framingham_data,
    'Diabetes': diabetes_pimaIndian_data,
    'Lung Cancer':lung_cancer_data
}

for dataset_name, dataset in datasets.items():

    print(f" \nDataset name: {dataset_name}")

    # Store column name, dtype, no of null rows in Column Info 
    column_info = pd.DataFrame({
        'Column Name': dataset.columns,
        'Data Type': dataset.dtypes,
        'No of Nulls': dataset.isnull().sum()
    })

    # Print the information as a table
    print("\nFeature Information:\n", column_info)


 
Dataset name: Heart Failure

Feature Information:
                    Column Name Data Type  No of Nulls
Age                        Age     int64            0
Sex                        Sex    object            0
ChestPainType    ChestPainType    object            0
RestingBP            RestingBP     int64            0
Cholesterol        Cholesterol     int64            0
FastingBS            FastingBS     int64            0
RestingECG          RestingECG    object            0
MaxHR                    MaxHR     int64            0
ExerciseAngina  ExerciseAngina    object            0
Oldpeak                Oldpeak   float64            0
ST_Slope              ST_Slope    object            0
HeartDisease      HeartDisease     int64            0
 
Dataset name: Hepatitis C

Feature Information:
          Column Name Data Type  No of Nulls
Category    Category    object            0
Age              Age     int64            0
Sex              Sex    object            0
ALB              A

In [87]:
from sklearn.impute import KNNImputer

# List of datasets having missing values
datasetsMV = [hepatitis_c_data, cirrhosis_data, stroke_data, framingham_data]

for dataset in datasetsMV:
    # Identify numerical and categorical columns
    numerical_columns = dataset.select_dtypes(include=['float64', 'int64']).columns
    categorical_columns = dataset.select_dtypes(include=['object']).columns

    # Check if numerical columns exist
    if not numerical_columns.empty:
        # Create a mask for missing values
        numerical_mask = dataset[numerical_columns].isnull()

        # Initialize KNNImputer for numerical values
        numerical_imputer = KNNImputer(n_neighbors=5, missing_values=np.nan)

        # Fill missing values in numerical columns
        dataset[numerical_columns] = numerical_imputer.fit_transform(dataset[numerical_columns])

        # Apply the mask to keep original non-null values
        dataset[numerical_columns] = dataset[numerical_columns].where(~numerical_mask, dataset[numerical_columns])

    # Check if categorical columns exist
    if not categorical_columns.empty:
        # Convert categorical columns to numeric using label encoding
        label_encoder = {}
        for col in categorical_columns:
            label_encoder[col] = pd.Categorical(dataset[col])
            dataset[col] = label_encoder[col].codes.astype(float)  # Convert to float

        # Create a mask for missing values
        categorical_mask = dataset[categorical_columns].isnull()

        # Initialize KNNImputer for categorical values with 'nan_euclidean' metric
        categorical_imputer = KNNImputer(n_neighbors=5, missing_values=np.nan, metric='nan_euclidean')

        # Fill missing values in categorical columns
        dataset[categorical_columns] = categorical_imputer.fit_transform(dataset[categorical_columns])

        # Convert back categorical columns to their original type
        for col in categorical_columns:
            dataset[col] = label_encoder[col].categories.take(np.round(dataset[col]).astype(int))

        # Apply the mask to keep original non-null values
        dataset[categorical_columns] = dataset[categorical_columns].where(~categorical_mask, dataset[categorical_columns])


In [88]:
datasets = {
    'Heart Failure': heart_failure_data,
    'Hepatitis C': hepatitis_c_data,
    'Cirrhosis': cirrhosis_data,
    'Stroke': stroke_data,
    'Framingham': framingham_data,
    'Diabetes': diabetes_pimaIndian_data,
    'Lung Cancer':lung_cancer_data
}

for dataset_name, dataset in datasets.items():

    print(f"Dataset name: {dataset_name}")

    # Store column name, dtype, no of null rows in Column Info 
    column_info = pd.DataFrame({
        'Column Name': dataset.columns,
        'Data Type': dataset.dtypes,
        'No of Nulls': dataset.isnull().sum()
    })

    # Print the information as a table
    print("\nFeature Information:\n", column_info)

    # Print general statistics
    print("\nGeneral Statistics:\n", dataset.describe())

    # Sample 5 rows
    print("\nSample:\n", dataset.head())

Dataset name: Heart Failure

Feature Information:
                    Column Name Data Type  No of Nulls
Age                        Age     int64            0
Sex                        Sex    object            0
ChestPainType    ChestPainType    object            0
RestingBP            RestingBP     int64            0
Cholesterol        Cholesterol     int64            0
FastingBS            FastingBS     int64            0
RestingECG          RestingECG    object            0
MaxHR                    MaxHR     int64            0
ExerciseAngina  ExerciseAngina    object            0
Oldpeak                Oldpeak   float64            0
ST_Slope              ST_Slope    object            0
HeartDisease      HeartDisease     int64            0

General Statistics:
               Age   RestingBP  Cholesterol   FastingBS       MaxHR  \
count  918.000000  918.000000   918.000000  918.000000  918.000000   
mean    53.510893  132.396514   198.799564    0.233115  136.809368   
std      9.4326

### Encoding Categorical Columns

In [89]:
# Encoding the categorical data
heart_failure_data = pd.get_dummies(heart_failure_data, columns=["Sex", "ChestPainType", "FastingBS", "RestingECG", "ExerciseAngina", "ST_Slope"])
hepatitis_c_data = pd.get_dummies(hepatitis_c_data, columns=["Sex"])
cirrhosis_data = pd.get_dummies(cirrhosis_data, columns=['Drug', 'Sex', 'Ascites','Hepatomegaly','Spiders','Edema'])
stroke_data = pd.get_dummies(stroke_data, columns=['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'])
framingham_data = pd.get_dummies(framingham_data, columns=['male', 'currentSmoker', 'prevalentStroke', 'prevalentHyp', 'diabetes'])
lung_cancer_data = pd.get_dummies(lung_cancer_data, columns=['Gender'])

### Feature Scaling Numerical Columns

In [90]:
'''
ArithmeticError# Feature Scaling
scaler = StandardScaler()

num_cols_heart_failure = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
heart_failure_data[num_cols_heart_failure] = scaler.fit_transform(heart_failure_data[num_cols_heart_failure])

num_cols_hepatitis_c = ['ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT']
hepatitis_c_data[num_cols_hepatitis_c] = scaler.fit_transform(hepatitis_c_data[num_cols_hepatitis_c])

num_cols_cirrhosis = ['N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin']
cirrhosis_data[num_cols_cirrhosis] = scaler.fit_transform(cirrhosis_data[num_cols_cirrhosis])

num_cols_stroke = ['age', 'avg_glucose_level', 'bmi']
stroke_data[num_cols_stroke] = scaler.fit_transform(stroke_data[num_cols_stroke])

num_cols_framingham = ['age']
framingham_data[num_cols_framingham] = scaler.fit_transform(framingham_data[num_cols_framingham])

num_cols_pima_indians = ['Pregnancies', 'Age']
diabetes_pimaIndian_data[num_cols_pima_indians] = scaler.fit_transform(diabetes_pimaIndian_data[num_cols_pima_indians])
'''

"\nArithmeticError# Feature Scaling\nscaler = StandardScaler()\n\nnum_cols_heart_failure = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']\nheart_failure_data[num_cols_heart_failure] = scaler.fit_transform(heart_failure_data[num_cols_heart_failure])\n\nnum_cols_hepatitis_c = ['ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT']\nhepatitis_c_data[num_cols_hepatitis_c] = scaler.fit_transform(hepatitis_c_data[num_cols_hepatitis_c])\n\nnum_cols_cirrhosis = ['N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin']\ncirrhosis_data[num_cols_cirrhosis] = scaler.fit_transform(cirrhosis_data[num_cols_cirrhosis])\n\nnum_cols_stroke = ['age', 'avg_glucose_level', 'bmi']\nstroke_data[num_cols_stroke] = scaler.fit_transform(stroke_data[num_cols_stroke])\n\nnum_cols_framingham = ['age']\nframingham_data[num_cols_framingham] = scaler.fit_transform(framingham_data[num_cols_framingham])\n\nnum_cols_pima

### Test, Train Split

In [91]:
# Splitting the dataset
from sklearn.model_selection import train_test_split

heart_failure_X = heart_failure_data.drop('HeartDisease', axis=1)
heart_failure_y = heart_failure_data['HeartDisease']
heart_failure_X_train, heart_failure_X_test, heart_failure_y_train, heart_failure_y_test = train_test_split(heart_failure_X, heart_failure_y, test_size=0.2, random_state=42)

hepatitis_c_X = hepatitis_c_data.drop('Category', axis=1)
hepatitis_c_y = hepatitis_c_data['Category']
hepatitis_c_X_train, hepatitis_c_X_test, hepatitis_c_y_train, hepatitis_c_y_test = train_test_split(hepatitis_c_X, hepatitis_c_y, test_size=0.2, random_state=42)

cirrhosis_X = cirrhosis_data.drop('Status', axis=1)
cirrhosis_y = cirrhosis_data['Status']
cirrhosis_X_train, cirrhosis_X_test, cirrhosis_y_train, cirrhosis_y_test = train_test_split(cirrhosis_X, cirrhosis_y, test_size=0.2, random_state=42)

stroke_X = stroke_data.drop('stroke', axis=1)
stroke_y = stroke_data['stroke'].astype('int64')
stroke_X_train, stroke_X_test, stroke_y_train, stroke_y_test = train_test_split(stroke_X, stroke_y, test_size=0.2, random_state=42)

framingham_X = framingham_data.drop('TenYearCHD', axis=1)
framingham_y = framingham_data['TenYearCHD'].astype('int64')
framingham_X_train, framingham_X_test, framingham_y_train, framingham_y_test = train_test_split(framingham_X, framingham_y, test_size=0.2, random_state=42)

diabetes_X = diabetes_pimaIndian_data.drop('Outcome', axis=1)
diabetes_y = diabetes_pimaIndian_data['Outcome']
diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test = train_test_split(diabetes_X, diabetes_y, test_size=0.2, random_state=42)

lung_cancer_X = lung_cancer_data.drop('Level', axis=1)
lung_cancer_Y = lung_cancer_data['Level']
lung_cancer_X_train, lung_cancer_X_test, lung_cancer_y_train, lung_cancer_y_test = train_test_split(lung_cancer_X, lung_cancer_Y, test_size=0.2, random_state=42)


In [92]:
datasets = {
    'Heart Failure': heart_failure_data,
    'Hepatitis C': hepatitis_c_data,
    'Cirrhosis': cirrhosis_data,
    'Stroke': stroke_data,
    'Framingham': framingham_data,
    'Diabetes': diabetes_pimaIndian_data,
    'Lung Cancer': lung_cancer_data
}

for dataset_name, dataset in datasets.items():

    print(f"Dataset name: {dataset_name}")

    # Sample 5 rows
    print("\nSample:\n", dataset.head())

Dataset name: Heart Failure

Sample:
    Age  RestingBP  Cholesterol  MaxHR  Oldpeak  HeartDisease  Sex_F  Sex_M  \
0   40        140          289    172      0.0             0      0      1   
1   49        160          180    156      1.0             1      1      0   
2   37        130          283     98      0.0             0      0      1   
3   48        138          214    108      1.5             1      1      0   
4   54        150          195    122      0.0             0      0      1   

   ChestPainType_ASY  ChestPainType_ATA  ...  FastingBS_0  FastingBS_1  \
0                  0                  1  ...            1            0   
1                  0                  0  ...            1            0   
2                  0                  1  ...            1            0   
3                  1                  0  ...            1            0   
4                  0                  0  ...            1            0   

   RestingECG_LVH  RestingECG_Normal  RestingECG

Hepatisis & Lung Cancer are Multinomial classification

# ML Models


In [93]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score

In [94]:

# Define functions for training and evaluating models
def train_and_evaluate_binary_model(X_train, X_test, y_train, y_test, model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Print metrics
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred))

    print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=1))

def train_and_evaluate_multiclass_model(X_train, X_test, y_train, y_test, model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Print metrics
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=1))

4 Datasets - Binary Classification
3 Datasets - Multi Classification

### Binary Classification - RandomForest 

In [95]:

# 1. Heart Failure dataset (Binary Classification)
heart_failure_model = RandomForestClassifier(random_state=42)
train_and_evaluate_binary_model(heart_failure_X_train, heart_failure_X_test, heart_failure_y_train, heart_failure_y_test, heart_failure_model)

# 4. Stroke dataset (Binary Classification)
stroke_model = RandomForestClassifier(random_state=42)
train_and_evaluate_binary_model(stroke_X_train, stroke_X_test, stroke_y_train, stroke_y_test, stroke_model)

# 5. Framingham dataset (Binary Classification)
framingham_model = RandomForestClassifier(random_state=42)
train_and_evaluate_binary_model(framingham_X_train, framingham_X_test, framingham_y_train, framingham_y_test, framingham_model)

# 6. Diabetes Pima Indians dataset (Binary Classification)
diabetes_model = RandomForestClassifier(random_state=42)
train_and_evaluate_binary_model(diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test, diabetes_model)


Accuracy: 0.8804347826086957
Precision: 0.9047619047619048
Recall: 0.8878504672897196
F1 Score: 0.8962264150943396

Confusion Matrix:
 [[67 10]
 [12 95]]

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.87      0.86        77
           1       0.90      0.89      0.90       107

    accuracy                           0.88       184
   macro avg       0.88      0.88      0.88       184
weighted avg       0.88      0.88      0.88       184

Accuracy: 0.9383561643835616
Precision: 0.0
Recall: 0.0
F1 Score: 0.0

Confusion Matrix:
 [[959   1]
 [ 62   0]]

Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97       960
           1       0.00      0.00      0.00        62

    accuracy                           0.94      1022
   macro avg       0.47      0.50      0.48      1022
weighted avg       0.88      0.94      0.91      1022

Accuracy: 0.8525943396226415


### Binary Classification - Logistic Regression 

In [96]:
# 1. Heart Failure dataset (Binary Classification)
heart_failure_model = LogisticRegression(max_iter=1000, random_state=42)
train_and_evaluate_binary_model(heart_failure_X_train, heart_failure_X_test, heart_failure_y_train, heart_failure_y_test, heart_failure_model)

# 4. Stroke dataset (Binary Classification)
stroke_model = LogisticRegression(max_iter=1000,random_state=42)
train_and_evaluate_binary_model(stroke_X_train, stroke_X_test, stroke_y_train, stroke_y_test, stroke_model)

# 5. Framingham dataset (Binary Classification)
framingham_model = LogisticRegression(max_iter=1000,random_state=42)
train_and_evaluate_binary_model(framingham_X_train, framingham_X_test, framingham_y_train, framingham_y_test, framingham_model)

# 6. Pima Indians dataset (Binary Classification)
diabetes_model = LogisticRegression(max_iter=1000,random_state=42)
train_and_evaluate_binary_model(diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test, diabetes_model)

Accuracy: 0.8532608695652174
Precision: 0.9
Recall: 0.8411214953271028
F1 Score: 0.8695652173913043

Confusion Matrix:
 [[67 10]
 [17 90]]

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.87      0.83        77
           1       0.90      0.84      0.87       107

    accuracy                           0.85       184
   macro avg       0.85      0.86      0.85       184
weighted avg       0.86      0.85      0.85       184

Accuracy: 0.9393346379647749
Precision: 0.0
Recall: 0.0
F1 Score: 0.0

Confusion Matrix:
 [[960   0]
 [ 62   0]]

Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97       960
           1       1.00      0.00      0.00        62

    accuracy                           0.94      1022
   macro avg       0.97      0.50      0.48      1022
weighted avg       0.94      0.94      0.91      1022



  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.8573113207547169
Precision: 0.5714285714285714
Recall: 0.06504065040650407
F1 Score: 0.11678832116788321

Confusion Matrix:
 [[719   6]
 [115   8]]

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.99      0.92       725
           1       0.57      0.07      0.12       123

    accuracy                           0.86       848
   macro avg       0.72      0.53      0.52       848
weighted avg       0.82      0.86      0.81       848

Accuracy: 0.7467532467532467
Precision: 0.6379310344827587
Recall: 0.6727272727272727
F1 Score: 0.6548672566371682

Confusion Matrix:
 [[78 21]
 [18 37]]

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.79      0.80        99
           1       0.64      0.67      0.65        55

    accuracy                           0.75       154
   macro avg       0.73      0.73      0.73       154
weighted avg       0.75      0.75    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Binary Classification - Decision Tree 

In [97]:
# 1. Heart Failure dataset (Binary Classification)
heart_failure_model = DecisionTreeClassifier( random_state=42)
train_and_evaluate_binary_model(heart_failure_X_train, heart_failure_X_test, heart_failure_y_train, heart_failure_y_test, heart_failure_model)

# 4. Stroke dataset (Binary Classification)
stroke_model = DecisionTreeClassifier(random_state=42)
train_and_evaluate_binary_model(stroke_X_train, stroke_X_test, stroke_y_train, stroke_y_test, stroke_model)

# 5. Framingham dataset (Binary Classification)
framingham_model = DecisionTreeClassifier(random_state=42)
train_and_evaluate_binary_model(framingham_X_train, framingham_X_test, framingham_y_train, framingham_y_test, framingham_model)

# 6. Diabetes Pima Indians dataset (Binary Classification)
diabetes_model = DecisionTreeClassifier(random_state=42)
train_and_evaluate_binary_model(diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test, diabetes_model)

Accuracy: 0.8043478260869565
Precision: 0.865979381443299
Recall: 0.7850467289719626
F1 Score: 0.8235294117647058

Confusion Matrix:
 [[64 13]
 [23 84]]

Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.83      0.78        77
           1       0.87      0.79      0.82       107

    accuracy                           0.80       184
   macro avg       0.80      0.81      0.80       184
weighted avg       0.81      0.80      0.81       184



Accuracy: 0.913894324853229
Precision: 0.25
Recall: 0.20967741935483872
F1 Score: 0.22807017543859648

Confusion Matrix:
 [[921  39]
 [ 49  13]]

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.96      0.95       960
           1       0.25      0.21      0.23        62

    accuracy                           0.91      1022
   macro avg       0.60      0.58      0.59      1022
weighted avg       0.91      0.91      0.91      1022

Accuracy: 0.7606132075471698
Precision: 0.19696969696969696
Recall: 0.21138211382113822
F1 Score: 0.203921568627451

Confusion Matrix:
 [[619 106]
 [ 97  26]]

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.85      0.86       725
           1       0.20      0.21      0.20       123

    accuracy                           0.76       848
   macro avg       0.53      0.53      0.53       848
weighted avg       0.77      0.76      0.76    

### Binary Classification - Gaussian Naive Bayers 

In [98]:
# 1. Heart Failure dataset (Binary Classification)
heart_failure_model = GaussianNB()
train_and_evaluate_binary_model(heart_failure_X_train, heart_failure_X_test, heart_failure_y_train, heart_failure_y_test, heart_failure_model)

# 4. Stroke dataset (Binary Classification)
stroke_model = GaussianNB()
train_and_evaluate_binary_model(stroke_X_train, stroke_X_test, stroke_y_train, stroke_y_test, stroke_model)

# 5. Framingham dataset (Binary Classification)
framingham_model = GaussianNB()
train_and_evaluate_binary_model(framingham_X_train, framingham_X_test, framingham_y_train, framingham_y_test, framingham_model)

# 6. Diabetes Pima Indians dataset (Binary Classification)
diabetes_model = GaussianNB()
train_and_evaluate_binary_model(diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test, diabetes_model)

Accuracy: 0.8641304347826086
Precision: 0.9270833333333334
Recall: 0.8317757009345794
F1 Score: 0.8768472906403941

Confusion Matrix:
 [[70  7]
 [18 89]]

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.91      0.85        77
           1       0.93      0.83      0.88       107

    accuracy                           0.86       184
   macro avg       0.86      0.87      0.86       184
weighted avg       0.87      0.86      0.86       184

Accuracy: 0.6046966731898239
Precision: 0.12168141592920353
Recall: 0.8870967741935484
F1 Score: 0.2140077821011673

Confusion Matrix:
 [[563 397]
 [  7  55]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.59      0.74       960
           1       0.12      0.89      0.21        62

    accuracy                           0.60      1022
   macro avg       0.55      0.74      0.47      1022
weighted avg       0.94      0.60     

Accuracy: 0.8337264150943396
Precision: 0.3269230769230769
Recall: 0.13821138211382114
F1 Score: 0.19428571428571428

Confusion Matrix:
 [[690  35]
 [106  17]]

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.95      0.91       725
           1       0.33      0.14      0.19       123

    accuracy                           0.83       848
   macro avg       0.60      0.54      0.55       848
weighted avg       0.79      0.83      0.80       848

Accuracy: 0.7662337662337663
Precision: 0.6610169491525424
Recall: 0.7090909090909091
F1 Score: 0.6842105263157895

Confusion Matrix:
 [[79 20]
 [16 39]]

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.80      0.81        99
           1       0.66      0.71      0.68        55

    accuracy                           0.77       154
   macro avg       0.75      0.75      0.75       154
weighted avg       0.77      0.77    

### Multi-class classification - Random Forest

In [99]:
# Train and evaluate the multi-class classification model

# 2. Hepatitis C dataset (Multi-label classification)
hepatitis_c_multi_class_model = RandomForestClassifier(random_state=42)
train_and_evaluate_multiclass_model(hepatitis_c_X_train, hepatitis_c_X_test, hepatitis_c_y_train, hepatitis_c_y_test, hepatitis_c_multi_class_model)

# 3. Cirrhosis dataset (Multiclass Classification)
cirrhosis_model = RandomForestClassifier(random_state=42)
train_and_evaluate_multiclass_model(cirrhosis_X_train, cirrhosis_X_test, cirrhosis_y_train, cirrhosis_y_test, cirrhosis_model)

# 7. Lung Cancer dataset (Multiclass Classification)
lung_cancer_model = RandomForestClassifier(random_state=42)
train_and_evaluate_multiclass_model(lung_cancer_X_train, lung_cancer_X_test, lung_cancer_y_train, lung_cancer_y_test, lung_cancer_model)


Accuracy: 0.8455284552845529

Confusion Matrix:
 [[96  0  0  0  0]
 [ 2  0  0  1  0]
 [ 7  0  1  1  0]
 [ 4  0  0  2  0]
 [ 3  0  0  1  5]]

Classification Report:
                         precision    recall  f1-score   support

         0=Blood Donor       0.86      1.00      0.92        96
0s=suspect Blood Donor       1.00      0.00      0.00         3
           1=Hepatitis       1.00      0.11      0.20         9
            2=Fibrosis       0.40      0.33      0.36         6
           3=Cirrhosis       1.00      0.56      0.71         9

              accuracy                           0.85       123
             macro avg       0.85      0.40      0.44       123
          weighted avg       0.86      0.85      0.81       123

Accuracy: 0.7976190476190477

Confusion Matrix:
 [[39  0  5]
 [ 3  0  1]
 [ 8  0 28]]

Classification Report:
               precision    recall  f1-score   support

           C       0.78      0.89      0.83        44
          CL       1.00      0.00   

### Multi-class classification - Logistic Regression

In [100]:
# Train and evaluate the multi-class classification model

# 2. Hepatitis C dataset (Multi-label classification)
hepatitis_c_multi_class_model = LogisticRegression(max_iter=1000, random_state=42)
train_and_evaluate_multiclass_model(hepatitis_c_X_train, hepatitis_c_X_test, hepatitis_c_y_train, hepatitis_c_y_test, hepatitis_c_multi_class_model)

# 3. Cirrhosis dataset (Multiclass Classification)
cirrhosis_model = LogisticRegression( max_iter=1000, random_state=42)
train_and_evaluate_multiclass_model(cirrhosis_X_train, cirrhosis_X_test, cirrhosis_y_train, cirrhosis_y_test, cirrhosis_model)

# 7. Lung Cancer dataset (Multiclass Classification)
lung_cancer_model = LogisticRegression( max_iter=1000, random_state=42)
train_and_evaluate_multiclass_model(lung_cancer_X_train, lung_cancer_X_test, lung_cancer_y_train, lung_cancer_y_test, lung_cancer_model)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.8699186991869918

Confusion Matrix:
 [[94  2  0  0  0]
 [ 1  2  0  0  0]
 [ 1  1  4  3  0]
 [ 4  0  0  2  0]
 [ 2  2  0  0  5]]

Classification Report:
                         precision    recall  f1-score   support

         0=Blood Donor       0.92      0.98      0.95        96
0s=suspect Blood Donor       0.29      0.67      0.40         3
           1=Hepatitis       1.00      0.44      0.62         9
            2=Fibrosis       0.40      0.33      0.36         6
           3=Cirrhosis       1.00      0.56      0.71         9

              accuracy                           0.87       123
             macro avg       0.72      0.60      0.61       123
          weighted avg       0.89      0.87      0.87       123



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.7738095238095238

Confusion Matrix:
 [[37  0  7]
 [ 3  0  1]
 [ 8  0 28]]

Classification Report:
               precision    recall  f1-score   support

           C       0.77      0.84      0.80        44
          CL       1.00      0.00      0.00         4
           D       0.78      0.78      0.78        36

    accuracy                           0.77        84
   macro avg       0.85      0.54      0.53        84
weighted avg       0.78      0.77      0.75        84

Accuracy: 1.0

Confusion Matrix:
 [[82  0  0]
 [ 0 55  0]
 [ 0  0 63]]

Classification Report:
               precision    recall  f1-score   support

        High       1.00      1.00      1.00        82
         Low       1.00      1.00      1.00        55
      Medium       1.00      1.00      1.00        63

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Multi-class classification - Decision Tree

In [101]:
# 2. Hepatitis C dataset (Multi-label classification)
hepatitis_c_multi_class_model = DecisionTreeClassifier(random_state=42)
train_and_evaluate_multiclass_model(hepatitis_c_X_train, hepatitis_c_X_test, hepatitis_c_y_train, hepatitis_c_y_test, hepatitis_c_multi_class_model)

# 3. Cirrhosis dataset (Multiclass Classification)
cirrhosis_model = DecisionTreeClassifier(random_state=42)
train_and_evaluate_multiclass_model(cirrhosis_X_train, cirrhosis_X_test, cirrhosis_y_train, cirrhosis_y_test, cirrhosis_model)

# 7. Lung Cancer dataset (Multiclass Classification)
lung_cancer_model = DecisionTreeClassifier(random_state=42)
train_and_evaluate_multiclass_model(lung_cancer_X_train, lung_cancer_X_test, lung_cancer_y_train, lung_cancer_y_test, lung_cancer_model)


Accuracy: 0.8536585365853658

Confusion Matrix:
 [[94  0  2  0  0]
 [ 1  2  0  0  0]
 [ 3  0  4  1  1]
 [ 5  0  0  0  1]
 [ 2  1  0  1  5]]

Classification Report:
                         precision    recall  f1-score   support

         0=Blood Donor       0.90      0.98      0.94        96
0s=suspect Blood Donor       0.67      0.67      0.67         3
           1=Hepatitis       0.67      0.44      0.53         9
            2=Fibrosis       0.00      0.00      0.00         6
           3=Cirrhosis       0.71      0.56      0.63         9

              accuracy                           0.85       123
             macro avg       0.59      0.53      0.55       123
          weighted avg       0.82      0.85      0.83       123

Accuracy: 0.5952380952380952

Confusion Matrix:
 [[22  3 19]
 [ 0  1  3]
 [ 7  2 27]]

Classification Report:
               precision    recall  f1-score   support

           C       0.76      0.50      0.60        44
          CL       0.17      0.25   

### Multi-class classification - Gaussian Navies Bayers

In [102]:
# 2. Hepatitis C dataset (Multi-label classification)
hepatitis_c_multi_class_model = GaussianNB()
train_and_evaluate_multiclass_model(hepatitis_c_X_train, hepatitis_c_X_test, hepatitis_c_y_train, hepatitis_c_y_test, hepatitis_c_multi_class_model)

# 3. Cirrhosis dataset (Multiclass Classification)
cirrhosis_model = GaussianNB()
train_and_evaluate_multiclass_model(cirrhosis_X_train, cirrhosis_X_test, cirrhosis_y_train, cirrhosis_y_test, cirrhosis_model)

# 7. Lung Cancer dataset (Multiclass Classification)
lung_cancer_model = GaussianNB()
train_and_evaluate_multiclass_model(lung_cancer_X_train, lung_cancer_X_test, lung_cancer_y_train, lung_cancer_y_test, lung_cancer_model)


Accuracy: 0.8536585365853658

Confusion Matrix:
 [[95  0  1  0  0]
 [ 0  1  0  1  1]
 [ 5  0  1  2  1]
 [ 3  0  1  2  0]
 [ 1  0  0  2  6]]

Classification Report:
                         precision    recall  f1-score   support

         0=Blood Donor       0.91      0.99      0.95        96
0s=suspect Blood Donor       1.00      0.33      0.50         3
           1=Hepatitis       0.33      0.11      0.17         9
            2=Fibrosis       0.29      0.33      0.31         6
           3=Cirrhosis       0.75      0.67      0.71         9

              accuracy                           0.85       123
             macro avg       0.66      0.49      0.53       123
          weighted avg       0.83      0.85      0.83       123

Accuracy: 0.7261904761904762

Confusion Matrix:
 [[37  2  5]
 [ 4  0  0]
 [ 9  3 24]]

Classification Report:
               precision    recall  f1-score   support

           C       0.74      0.84      0.79        44
          CL       0.00      0.00   