<a href="https://colab.research.google.com/github/dc-neo/heart_attack_predict/blob/main/Heart_Disease_Semi-Supervised.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Notebook Setup**

In [1]:
import os
print(os.getcwd())

/content


In [2]:
# Importing packages
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb

import random
import matplotlib.pyplot as plt
from google.colab import drive
from google.colab import files
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.model_selection import GridSearchCV



In [3]:
# Install & Import Pycaret
!pip install pycaret[full]




In [4]:
# import ClassificationExperiment and init the class

from pycaret.classification import setup, compare_models, tune_model, evaluate_model
from pycaret.classification import setup, compare_models, tune_model
import pycaret.classification as pc
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline


In [5]:
# Mounting Google Drive

drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
df_path = 'drive/My Drive/Colab Notebooks/heart.csv'

data = pd.read_csv(df_path, encoding='unicode_escape')

In [7]:
df_original = data.copy()
df = data.copy()

# **Data Exploration**

In [None]:
# installing ydata-profiling package.  This tool will allow for better data visualization & exploration

!pip install  ydata-profiling matplotlib

In [None]:
# Viewing the data through ydata-profiling

from ydata_profiling import ProfileReport
profile = ProfileReport(df, title="Dataset Profile")
profile.to_notebook_iframe()

# **Analyzing Data**

In [None]:
print(f'Shape     : {df.shape}\n'
      f'Size      : {df.size}\n'
      f'Dimension : {df.ndim}')

In [None]:
df.isnull().sum()

In [None]:
df.describe().T

In [None]:
chart_color = sns.color_palette('Accent')

fig, axs = plt.subplots(1,3,figsize = (15, 5))

ax = sns.countplot(data=df,
                   x = df['sex'],
                   ax = axs[0],
                   palette='Accent');

ax.set_xlabel('Sex   F = 0, M = 1', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
axs[0].set_title("Count of Sex", fontsize=16)

plt.xticks(rotation=45)
plt.xticks([0, 1], ["Female", "Male"])


ax2 = df['sex'].value_counts().plot.pie(explode=[0,0.05],
                                             colors=[chart_color[2], chart_color[5]],
                                             ax = axs[1],
                                             shadow = True,
                                             autopct='%.5f%%');
axs[1].set_title("Ratio of Sex", fontsize=15);

ax3 = df['output'].value_counts().plot.pie(explode=[0,0.07],
                                               colors=["#81b7d2", "#ffcce7"],
                                               autopct='%.5f%%',
                                               shadow = True,
                                               figsize=(15, 5))
ax3.set_title('Target Distribution');

In [None]:
bx = sns.boxplot(df)
bx.set_xticklabels(bx.get_xticklabels(),rotation=45)

# **Modelling Data**

In [80]:
# Creating a subset for use when we need to use unlabelled data (unlab)
  # Creating three datasets: 1) Labelled Train Data Set,  2) Labelled Test Dataset, 3) Unlabelled Dataset
    # 1 will be used to train the model, 2 will be used to apply some pseudo labels.  These two will then be combined to make a new dataset which will be used to train a new model
    # 3 will be used by the new model (pseudo_model) to predict the labels

train=df.sample(frac=0.5,random_state=200)
unlab=df.drop(train.index)

print(train.shape)
unlab.shape

(152, 14)


(151, 14)

In [31]:
# Labelled Dataset

  # In this code box, encoding categorical features for efficiency, scaling the continuous features

# Features/Columns, defined by categorical (cat) / continuous (con)
cat_cols = ['sex', 'cp', 'fbs', 'restecg', 'exng', 'slp', 'caa', 'thall']
con_cols = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']

# encoding the categorical columns
ha_encoded_data = pd.get_dummies(train, columns = cat_cols)

# defining the features and target
X = ha_encoded_data

# instantiating the scaler
scaler = StandardScaler()

# scaling the continuous features
X[con_cols] = scaler.fit_transform(X[con_cols])
print("The first 5 rows of X are")
X.head()

The first 5 rows of X are


Unnamed: 0,age,trtbps,chol,thalachh,oldpeak,output,sex_0,sex_1,cp_0,cp_1,...,slp_2,caa_0,caa_1,caa_2,caa_3,caa_4,thall_0,thall_1,thall_2,thall_3
166,1.402246,-0.794666,-0.327522,-0.979569,1.414661,0,0,1,1,0,...,0,0,0,1,0,0,0,0,0,1
246,0.237539,-0.044089,3.340513,-0.092286,0.807481,0,1,0,1,0,...,0,0,0,1,0,0,0,0,0,1
127,1.402246,0.920938,0.65062,0.837248,-0.840579,1,1,0,0,0,...,1,0,1,0,0,0,0,0,1,0
277,0.343421,-0.580216,0.324573,-0.47255,-0.580359,0,0,1,0,1,...,1,1,0,0,0,0,0,0,0,1
121,0.555186,0.170361,0.528353,1.259763,-0.840579,1,0,1,1,0,...,1,1,0,0,0,0,0,0,1,0


In [32]:
# Unlabelled Dataset

  # In this code box, encoding categorical features for efficiency, scaling the continuous features

# Features / Columns, defined by categorical (cat) / continuous (con)
cat_cols = ['sex', 'cp', 'fbs', 'restecg', 'exng', 'slp', 'caa', 'thall']
con_cols = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']

# encoding the categorical columns
ha_encoded_data = pd.get_dummies(unlab, columns = cat_cols)

# defining the features and target
X_u = ha_encoded_data.drop(['output'],axis=1)
y_u = ha_encoded_data[['output']]

# instantiating the scaler
scaler = StandardScaler()

# scaling the continuous features
X_u[con_cols] = scaler.fit_transform(X_u[con_cols])
print("The first 5 rows of X are")
X_u.head()

The first 5 rows of X are


Unnamed: 0,age,trtbps,chol,thalachh,oldpeak,sex_0,sex_1,cp_0,cp_1,cp_2,...,slp_2,caa_0,caa_1,caa_2,caa_3,caa_4,thall_0,thall_1,thall_2,thall_3
0,0.929593,1.061813,-0.26648,0.13376,1.024347,0,1,0,0,0,...,0,1,0,0,0,0,0,1,0,0
1,-2.084101,0.102114,0.046732,1.836074,2.057819,0,1,0,0,1,...,0,1,0,0,0,0,0,0,1,0
2,-1.620455,0.102114,-0.800784,1.145947,0.249243,1,0,0,1,0,...,1,1,0,0,0,0,0,0,1,0
3,0.118214,-0.537686,-0.211208,1.421998,-0.267494,0,1,0,1,0,...,1,1,0,0,0,0,0,0,1,0
6,0.118214,0.741913,0.857398,0.271785,0.16312,1,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0


In [41]:
# Creating a split of data for training, testing

X_train, X_test = train_test_split(X, test_size=0.4, random_state=0)

# Labelled Data: Showing the splits, by X / Y
print("X Train : ", X_train.shape)
print("X Test  : ", X_test.shape)


# Unlabelled Data Shape
print("X Unlabelled Data : ", X_u.shape)
print("y Unlabelled Data : ", y_u.shape)

X Train :  (91, 31)
X Test  :  (61, 31)
X Unlabelled Data :  (151, 30)
y Unlabelled Data :  (151, 1)


In [87]:
X_test = X_test.drop(['output'],axis=1)


KeyError: ignored

In [36]:
# Set up our experiment
from pycaret.classification import *
exp = ClassificationExperiment()
exp = setup(X_train, target='output', session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,output
2,Target type,Binary
3,Original data shape,"(91, 31)"
4,Transformed data shape,"(91, 31)"
5,Transformed train set shape,"(63, 31)"
6,Transformed test set shape,"(28, 31)"
7,Numeric features,30
8,Preprocess,True
9,Imputation type,simple


In [37]:
best_model_1 = exp.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.8881,0.9771,0.875,0.9267,0.8902,0.7725,0.7915,0.06
lightgbm,Light Gradient Boosting Machine,0.8881,0.8986,0.9417,0.8767,0.9052,0.7641,0.7751,6.796
catboost,CatBoost Classifier,0.8881,0.95,0.9333,0.8817,0.899,0.7629,0.7841,1.085
lr,Logistic Regression,0.8738,0.9444,0.9,0.88,0.8822,0.7393,0.7575,0.469
ridge,Ridge Classifier,0.8738,0.0,0.9,0.8633,0.8756,0.7393,0.7535,0.008
lda,Linear Discriminant Analysis,0.8571,0.9542,0.9333,0.835,0.8768,0.6964,0.7208,0.01
rf,Random Forest Classifier,0.8548,0.925,0.85,0.8933,0.8578,0.7125,0.7334,0.07
ada,Ada Boost Classifier,0.8405,0.8722,0.9083,0.83,0.8594,0.6726,0.699,0.032
knn,K Neighbors Classifier,0.8238,0.8687,0.8667,0.835,0.8368,0.6297,0.6622,0.012
svm,SVM - Linear Kernel,0.8071,0.0,0.8167,0.83,0.8127,0.6059,0.6242,0.008


Processing:   0%|          | 0/69 [00:00<?, ?it/s]

In [48]:
from pycaret.classification import evaluate_model
evaluate_model(best_model_1)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [49]:
from pycaret.classification import predict_model
holdout_pred = predict_model(best_model_1)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.75,0.8769,0.8667,0.7222,0.7879,0.4896,0.5017


In [57]:
# Predict the scores of the X_test dataset, this is our data that will be pseudo-labeled

pseudo_label = predict_model(best_model_1, data=X_test)
pseudo_label.head()

Unnamed: 0,age,trtbps,chol,thalachh,oldpeak,sex_0,sex_1,cp_0,cp_1,cp_2,cp_3,fbs_0,fbs_1,restecg_0,restecg_1,restecg_2,exng_0,exng_1,slp_0,slp_1,slp_2,caa_0,caa_1,caa_2,caa_3,caa_4,thall_0,thall_1,thall_2,thall_3,prediction_label,prediction_score
197,1.402246,-0.526603,0.181927,0.456984,-0.667099,0,1,1,0,0,0,0,1,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0.77
34,-0.291873,-0.526603,-0.65357,-1.148575,0.373781,0,1,0,0,0,1,1,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0.71
271,0.766951,-0.044089,-0.225632,-0.303544,1.414662,0,1,0,0,0,1,1,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0.76
12,-0.503638,-0.25854,0.426463,0.794996,-0.320139,0,1,0,1,0,0,1,0,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,1,0.93
229,1.084599,-0.526603,1.302715,-0.895066,0.720741,0,1,0,0,1,0,1,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,1,0,0.63


In [58]:
X_train.head()

Unnamed: 0,age,trtbps,chol,thalachh,oldpeak,output,sex_0,sex_1,cp_0,cp_1,cp_2,cp_3,fbs_0,fbs_1,restecg_0,restecg_1,restecg_2,exng_0,exng_1,slp_0,slp_1,slp_2,caa_0,caa_1,caa_2,caa_3,caa_4,thall_0,thall_1,thall_2,thall_3
80,-1.350698,-1.223567,0.100415,1.133009,-0.840579,1,0,1,0,0,1,0,1,0,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0
233,1.084599,-0.794666,0.018903,-2.37387,1.067701,0,0,1,1,0,0,0,1,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,1,0
5,0.343421,0.277586,-1.081507,-0.176789,-0.493619,1,0,1,1,0,0,0,1,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0
248,0.025774,3.065443,0.772888,1.809033,-0.840579,0,0,1,0,1,0,0,1,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1
60,1.825776,-1.330792,0.406085,-0.937317,-0.840579,1,1,0,0,0,1,0,0,1,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0


In [63]:
# Renaming the pseudo label column to match 'output' and dropping the score column

pseudo_label = pseudo_label.rename(columns={'prediction_label':'output'})
pseudo_label = pseudo_label.drop(['prediction_score'], axis=1)
pseudo_label

Unnamed: 0,age,trtbps,chol,thalachh,oldpeak,sex_0,sex_1,cp_0,cp_1,cp_2,cp_3,fbs_0,fbs_1,restecg_0,restecg_1,restecg_2,exng_0,exng_1,slp_0,slp_1,slp_2,caa_0,caa_1,caa_2,caa_3,caa_4,thall_0,thall_1,thall_2,thall_3,output
197,1.402246,-0.526603,0.181927,0.456984,-0.667099,0,1,1,0,0,0,0,1,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0
34,-0.291873,-0.526603,-0.653570,-1.148575,0.373781,0,1,0,0,0,1,1,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0
271,0.766951,-0.044089,-0.225632,-0.303544,1.414662,0,1,0,0,0,1,1,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,1
12,-0.503638,-0.258540,0.426463,0.794996,-0.320139,0,1,0,1,0,0,1,0,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,1
229,1.084599,-0.526603,1.302715,-0.895066,0.720741,0,1,0,0,1,0,1,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,-0.185991,-0.365765,-0.816594,1.344266,-0.840579,0,1,0,1,0,0,0,1,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,1
205,-0.185991,-0.365765,0.202305,0.372481,-0.840579,0,1,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0
175,-1.456581,-1.330792,-1.590957,-1.613342,0.894221,0,1,1,0,0,0,1,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,1,0
9,0.343421,0.813713,-1.570579,0.921751,0.547261,0,1,0,0,1,0,1,0,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,1


In [64]:
# Creating our dataframe that is a combination of the train data and the pseudo-labeled data.  This is supposed to increase the accuracy of the model

df_pseudo = pd.concat([X_train, pseudo_label])

In [66]:
df_pseudo

Unnamed: 0,age,trtbps,chol,thalachh,oldpeak,output,sex_0,sex_1,cp_0,cp_1,cp_2,cp_3,fbs_0,fbs_1,restecg_0,restecg_1,restecg_2,exng_0,exng_1,slp_0,slp_1,slp_2,caa_0,caa_1,caa_2,caa_3,caa_4,thall_0,thall_1,thall_2,thall_3
80,-1.350698,-1.223567,0.100415,1.133009,-0.840579,1,0,1,0,0,1,0,1,0,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0
233,1.084599,-0.794666,0.018903,-2.373870,1.067701,0,0,1,1,0,0,0,1,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,1,0
5,0.343421,0.277586,-1.081507,-0.176789,-0.493619,1,0,1,1,0,0,0,1,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0
248,0.025774,3.065443,0.772888,1.809033,-0.840579,0,0,1,0,1,0,0,1,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1
60,1.825776,-1.330792,0.406085,-0.937317,-0.840579,1,1,0,0,0,1,0,0,1,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,-0.185991,-0.365765,-0.816594,1.344266,-0.840579,1,0,1,0,1,0,0,0,1,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0
205,-0.185991,-0.365765,0.202305,0.372481,-0.840579,0,0,1,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1
175,-1.456581,-1.330792,-1.590957,-1.613342,0.894221,0,0,1,1,0,0,0,1,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,1
9,0.343421,0.813713,-1.570579,0.921751,0.547261,1,0,1,0,0,1,0,1,0,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0


In [70]:
# Set up our experiment #2 using pseudo dataset as our train and then the X_u as out unlabelled dataset

from pycaret.classification import *
exp = ClassificationExperiment()
exp = setup(df_pseudo, target='output', session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,output
2,Target type,Binary
3,Original data shape,"(152, 31)"
4,Transformed data shape,"(152, 31)"
5,Transformed train set shape,"(106, 31)"
6,Transformed test set shape,"(46, 31)"
7,Numeric features,30
8,Preprocess,True
9,Imputation type,simple


In [71]:
pseudo_model = exp.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.8973,0.9453,0.9167,0.9107,0.9065,0.7936,0.8082,0.061
rf,Random Forest Classifier,0.8873,0.9467,0.8967,0.904,0.8941,0.7742,0.7866,0.064
lda,Linear Discriminant Analysis,0.8873,0.9637,0.88,0.9181,0.8931,0.7749,0.7856,0.01
catboost,CatBoost Classifier,0.8873,0.9438,0.8967,0.904,0.8941,0.7742,0.7866,1.12
lr,Logistic Regression,0.8864,0.9637,0.88,0.924,0.8898,0.7751,0.7933,0.009
ridge,Ridge Classifier,0.8782,0.0,0.88,0.9074,0.8865,0.7556,0.7699,0.008
xgboost,Extreme Gradient Boosting,0.8782,0.943,0.8967,0.8957,0.8884,0.7542,0.7711,0.018
lightgbm,Light Gradient Boosting Machine,0.8782,0.9585,0.8967,0.8898,0.8864,0.7556,0.7694,14.198
knn,K Neighbors Classifier,0.8691,0.957,0.8833,0.8964,0.8809,0.7375,0.7559,0.009
svm,SVM - Linear Kernel,0.8691,0.0,0.83,0.9383,0.869,0.7405,0.7641,0.007


Processing:   0%|          | 0/69 [00:00<?, ?it/s]

In [72]:
from pycaret.classification import evaluate_model
evaluate_model(pseudo_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [73]:
from pycaret.classification import predict_model
holdout_pred = predict_model(pseudo_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.8478,0.9743,0.88,0.8462,0.8627,0.6922,0.6928


In [74]:
holdout_pred.head()

Unnamed: 0,age,trtbps,chol,thalachh,oldpeak,sex_0,sex_1,cp_0,cp_1,cp_2,...,caa_2,caa_3,caa_4,thall_0,thall_1,thall_2,thall_3,output,prediction_label,prediction_score
226,0.872834,-0.794666,0.732132,-2.078109,0.373781,0,1,0,1,0,...,0,0,0,0,0,0,1,0,0,0.64
206,0.555186,-1.330792,-0.123743,-0.430299,0.200301,0,1,1,0,0,...,0,0,0,0,0,0,1,0,0,0.97
72,-2.621288,-0.25854,-0.836972,2.104794,-0.840579,0,1,0,1,0,...,0,0,0,0,0,1,0,1,1,0.93
259,-1.668346,-0.794666,-0.286766,1.259763,2.455542,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0.85
41,-0.609521,-0.25854,-0.001475,1.17526,-0.667099,0,1,0,1,0,...,0,0,0,0,0,1,0,1,1,0.93


In [75]:
tuned_model = tune_model(pseudo_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9091,1.0,1.0,0.8571,0.9231,0.8136,0.8281
1,0.9091,1.0,0.8333,1.0,0.9091,0.8197,0.8333
2,0.9091,0.95,1.0,0.8571,0.9231,0.8136,0.8281
3,0.8182,0.8667,0.8333,0.8333,0.8333,0.6333,0.6333
4,0.8182,0.95,0.8333,0.8333,0.8333,0.6333,0.6333
5,0.9091,0.9667,0.8333,1.0,0.9091,0.8197,0.8333
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,0.9,0.88,1.0,0.8333,0.9091,0.8,0.8165
8,0.9,0.9792,0.8333,1.0,0.9091,0.8,0.8165
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [77]:
evaluate_model(tuned_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [78]:
holdout_pred = predict_model(tuned_model);

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.9348,0.9829,0.92,0.9583,0.9388,0.8691,0.8699


In [81]:
# Now for our final predictions, we take the large dataset we split out at the very beginning of the modelling and we predict the labels of these, using our pseudo_model

unlabeled_predictor = predict_model(pseudo_model, data=X_u)
unlabeled_predictor.head()

Unnamed: 0,age,trtbps,chol,thalachh,oldpeak,sex_0,sex_1,cp_0,cp_1,cp_2,...,caa_1,caa_2,caa_3,caa_4,thall_0,thall_1,thall_2,thall_3,prediction_label,prediction_score
0,0.929593,1.061813,-0.26648,0.13376,1.024347,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0.66
1,-2.0841,0.102114,0.046732,1.836074,2.057819,0,1,0,0,1,...,0,0,0,0,0,0,1,0,1,1.0
2,-1.620455,0.102114,-0.800784,1.145947,0.249243,1,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0.92
3,0.118214,-0.537686,-0.211208,1.421998,-0.267494,0,1,0,1,0,...,0,0,0,0,0,0,1,0,1,0.99
6,0.118214,0.741913,0.857399,0.271785,0.16312,1,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0.95


In [86]:
from pycaret.classification import finalize_model
final_model = finalize_model(pseudo_model)

print(final_model)

Pipeline(memory=Memory(location=None),
         steps=[('numerical_imputer',
                 TransformerWrapper(exclude=None,
                                    include=['age', 'trtbps', 'chol',
                                             'thalachh', 'oldpeak', 'sex_0',
                                             'sex_1', 'cp_0', 'cp_1', 'cp_2',
                                             'cp_3', 'fbs_0', 'fbs_1',
                                             'restecg_0', 'restecg_1',
                                             'restecg_2', 'exng_0', 'exng_1',
                                             'slp_0', 'slp_1', 'slp_2', 'caa_0',
                                             'caa_1', 'caa_2', 'caa_3', 'caa_4',
                                             'thall_0', 'thall_1', 't...
                 ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                                      class_weight=None, criterion='gini',
                                      max_depth=