In [1]:
import pandas as pd
import numpy as np
import joblib
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder

# **MODEL BUILDING**

In [2]:
def load_dataset(file_path: str) -> pd.DataFrame:
    """
    Read a CSV file from disk into a DataFrame and report where it came from.

    Args:
        file_path (str): Location of the CSV file to read.

    Returns:
        pd.DataFrame: The parsed contents of the file.
    """
    dataset = pd.read_csv(file_path)
    # Echo the source path and the (rows, columns) shape so the cell output records the load.
    print(f"Heart disease Dataset is loaded from {file_path} dimension is {dataset.shape}")
    return dataset

In [3]:
# Machine-specific absolute location of the heart-disease CSV; adjust before running elsewhere.
file_path = r"C:\Users\SOHAM\Git_Repositories\DataScience_Projects\dsp-heart-failure-prediction\data\heart.csv"
# Read the CSV into the notebook-level `dataset` frame used by the feature-selection cell.
dataset = load_dataset(file_path)

Heart disease Dataset is loaded from C:\Users\SOHAM\Git_Repositories\DataScience_Projects\dsp-heart-failure-prediction\data\heart.csv dimension is (918, 12)


In [4]:
def select_features(data: pd.DataFrame, selected_features: list, target_feature: str) -> tuple[pd.DataFrame, pd.Series]:
    """
    Split a dataset into its predictor columns and its target column.

    Args:
        data (pd.DataFrame): Frame holding both predictors and target.
        selected_features (list): Names of the predictor columns to keep.
        target_feature (str): Name of the column to predict.

    Returns:
        tuple[pd.DataFrame, pd.Series]: ``(predictors, target)`` taken from ``data``.
    """
    return data[selected_features], data[target_feature]


In [5]:
# Predictor columns passed to the model.
selected_features = ['Age', 'Sex', 'ChestPainType', 'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope']
# A plain column name (not a one-element list) so that `y` is a 1-D Series.
# Indexing with a list selects a single-column DataFrame instead, which makes
# scikit-learn emit a column-vector conversion warning when the model is fitted.
target_feature = 'HeartDisease'

X, y = select_features(dataset, selected_features, target_feature)

In [6]:
def split_dataset(data: pd.DataFrame, target_feature: pd.DataFrame, test_size: float = 0.35, random_state: int = 50) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """
    Split predictors and target into training and testing sets.

    Args:
        data (pd.DataFrame): Predictor columns to split.
        target_feature: Target values aligned row-for-row with ``data``
            (the values themselves, not a column name).
        test_size (float): The proportion of the dataset to include in the test split. Defaults to 0.35.
        random_state (int): Seed used by the random number generator. Defaults to 50.

    Returns:
        tuple: The training features, testing features, training target, and testing target.
    """
    # Split the arguments that were passed in, not the notebook-level X / y,
    # so the function works on any frame it is given.
    X_train, X_test, y_train, y_test = train_test_split(
        data, target_feature, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test


In [7]:
# Hold out a test split using split_dataset's default test_size and random_state.
X_train, X_test, y_train, y_test = split_dataset(X, y)

In [8]:
def encode_categorical_features(df: pd.DataFrame, Nominal_features: list, Ordinal_features: list, slope_order: list) -> tuple[pd.DataFrame, "OneHotEncoder", "OrdinalEncoder"]:
    """
    Fit a one-hot encoder and an ordinal encoder on ``df`` and encode its categorical columns.

    Args:
        df (pd.DataFrame): Frame containing the categorical columns; both encoders are fitted on it.
        Nominal_features (list): Columns to one-hot encode.
        Ordinal_features (list): Columns to ordinal-encode.
        slope_order (list): Passed to ``OrdinalEncoder(categories=...)``.

    Returns:
        tuple: ``(encoded_df, one_hot_encoder, ordinal_encoder)``. ``encoded_df`` holds the
        one-hot columns followed by the ordinal columns and carries a fresh default index
        (the index of ``df`` is not preserved). The two fitted encoders are returned so
        they can be reused on other data.
    """
    one_hot_encoder = OneHotEncoder(sparse_output=False)
    ordinal_encoder = OrdinalEncoder(categories=slope_order)
    # fit_transform learns the categories and encodes the same rows in one call.
    encoded_Nominal_features = one_hot_encoder.fit_transform(df[Nominal_features])
    encoded_Ordinal_features = ordinal_encoder.fit_transform(df[Ordinal_features])
    Nominal_encoded_df = pd.DataFrame(encoded_Nominal_features, columns=one_hot_encoder.get_feature_names_out(Nominal_features))
    Ordinal_encoded_df = pd.DataFrame(encoded_Ordinal_features, columns=Ordinal_features)
    encoded_df = pd.concat([Nominal_encoded_df, Ordinal_encoded_df], axis=1)
    return encoded_df, one_hot_encoder, ordinal_encoder


In [9]:
# Categorical columns that are one-hot encoded.
Nominal_features = ['ChestPainType', 'Sex', 'ExerciseAngina', 'RestingECG']
# Categorical column that is ordinal-encoded into a single numeric column.
Ordinal_features = ['ST_Slope']
# Category order handed to OrdinalEncoder for the ordinal column.
slope_order =[['Down', 'Flat', 'Up']]
# Fit both encoders on the training split and keep them for reuse on other data.
encoded_df, one_hot_encoder, ordinal_encoder = encode_categorical_features(X_train, Nominal_features, Ordinal_features, slope_order)

In [10]:
def scale_continuous_features(df: pd.DataFrame, continuos_features: list) -> tuple[pd.DataFrame, "StandardScaler"]:
    """
    Fit a StandardScaler on the given columns of ``df`` and return the scaled values.

    Args:
        df (pd.DataFrame): Frame containing the numeric columns; the scaler is fitted on it.
        continuos_features (list): Names of the columns to scale.

    Returns:
        tuple: ``(scaled_df, scaler)``. ``scaled_df`` has the same column names as
        ``continuos_features`` and a fresh default index (the index of ``df`` is not
        preserved). The fitted scaler is returned so it can be reused on other data.
    """
    scaler = StandardScaler()
    # fit_transform learns the scaling statistics and applies them in one call.
    scaled_features = scaler.fit_transform(df[continuos_features])
    scaled_df = pd.DataFrame(scaled_features, columns=continuos_features)
    return scaled_df, scaler

In [11]:
# Numeric columns to standardize (the spelling `continuos_features` is kept: later cells use this name).
continuos_features = ['Age', 'MaxHR', 'Oldpeak']
# Fit the scaler on the training split and keep it for reuse on other data.
scaled_df, scaler = scale_continuous_features(X_train, continuos_features)

In [12]:
def concatenate_features(scaled_df: pd.DataFrame, encoded_df: pd.DataFrame) -> pd.DataFrame:
    """
    Place the scaled and the encoded feature frames side by side.

    Args:
        scaled_df (pd.DataFrame): Scaled numeric columns (left-hand block).
        encoded_df (pd.DataFrame): Encoded categorical columns (right-hand block).

    Returns:
        pd.DataFrame: Column-wise concatenation, with rows aligned on the index.
    """
    parts = [scaled_df, encoded_df]
    return pd.concat(parts, axis="columns")

In [13]:
# Combine the scaled numeric columns and the encoded categorical columns into the training matrix.
transformed_df = concatenate_features(scaled_df, encoded_df)
# Bare last expression so the notebook renders the frame.
transformed_df

Unnamed: 0,Age,MaxHR,Oldpeak,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,Sex_F,Sex_M,ExerciseAngina_N,ExerciseAngina_Y,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope
0,0.475397,-0.534464,0.296557,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
1,-0.794222,0.526945,-0.866898,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,2.0
2,-0.053611,-0.180661,-0.866898,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,2.0
3,-1.217428,-0.455841,0.878285,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
4,1.110206,-1.595873,1.266103,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
591,-1.852237,1.706288,-0.866898,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,2.0
592,-1.534832,-1.202758,-0.866898,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,2.0
593,-1.005825,0.291076,1.847831,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
594,0.475397,-1.045513,1.072194,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0


In [14]:
def train_model(
    transformed_df: pd.DataFrame,
    y_train: pd.Series,
    model_path: str,
    scaler_path: str,
    OH_encoder_path: str,
    OR_encoder_path: str,
    fitted_scaler=None,
    fitted_one_hot_encoder=None,
    fitted_ordinal_encoder=None,
) -> "LogisticRegression":
    """
    Fit a logistic-regression model and persist it together with the fitted transformers.

    Args:
        transformed_df (pd.DataFrame): Preprocessed training features.
        y_train: Training target; a Series or a single-column DataFrame.
        model_path (str): Destination file for the trained model.
        scaler_path (str): Destination file for the fitted scaler.
        OH_encoder_path (str): Destination file for the fitted one-hot encoder.
        OR_encoder_path (str): Destination file for the fitted ordinal encoder.
        fitted_scaler: Scaler to save. When omitted, the notebook-level ``scaler`` is used.
        fitted_one_hot_encoder: One-hot encoder to save. When omitted, the notebook-level
            ``one_hot_encoder`` is used.
        fitted_ordinal_encoder: Ordinal encoder to save. When omitted, the notebook-level
            ``ordinal_encoder`` is used.

    Returns:
        The fitted ``LogisticRegression`` model.
    """
    model = LogisticRegression()
    # np.ravel flattens a single-column DataFrame (or a Series) into the 1-D target
    # that scikit-learn expects, so fitting no longer warns about a column vector.
    model.fit(transformed_df, np.ravel(y_train))

    # Prefer transformers passed explicitly; fall back to the notebook-level objects
    # so existing six-argument calls keep working unchanged.
    if fitted_scaler is None:
        fitted_scaler = scaler
    if fitted_one_hot_encoder is None:
        fitted_one_hot_encoder = one_hot_encoder
    if fitted_ordinal_encoder is None:
        fitted_ordinal_encoder = ordinal_encoder

    joblib.dump(model, model_path)
    joblib.dump(fitted_scaler, scaler_path)
    joblib.dump(fitted_one_hot_encoder, OH_encoder_path)
    joblib.dump(fitted_ordinal_encoder, OR_encoder_path)
    return model

In [15]:
# Machine-specific absolute output locations for the persisted artifacts; adjust before running elsewhere.
model_path = r"C:\Users\SOHAM\Git_Repositories\DataScience_Projects\dsp-heart-failure-prediction\models\log_model.joblib"
scaler_path = r"C:\Users\SOHAM\Git_Repositories\DataScience_Projects\dsp-heart-failure-prediction\models\scaler_HD.joblib"
OH_encoder_path = r"C:\Users\SOHAM\Git_Repositories\DataScience_Projects\dsp-heart-failure-prediction\models\one_hot_encoder_HD.joblib"
OR_encoder_path = r"C:\Users\SOHAM\Git_Repositories\DataScience_Projects\dsp-heart-failure-prediction\models\ordinal_encoder_HD.joblib"
# Fit on the preprocessed training matrix and write the model and transformers to the paths above.
model = train_model(transformed_df, y_train, model_path, scaler_path, OH_encoder_path, OR_encoder_path)

  y = column_or_1d(y, warn=True)


In [16]:
def process_testing_set(X_test: pd.DataFrame, Nominal_features: list, Ordinal_features: list, continuos_features: list, OH_encoder, OR_encoder, scaler, trained_model=None) -> tuple[pd.DataFrame, np.ndarray]:
    """
    Preprocess a feature frame with the saved transformers and predict on it.

    Args:
        X_test (pd.DataFrame): Raw features to transform.
        Nominal_features (list): Columns to one-hot encode.
        Ordinal_features (list): Columns to ordinal-encode.
        continuos_features (list): Columns to scale.
        OH_encoder: File path of the saved one-hot encoder (loaded with joblib).
        OR_encoder: File path of the saved ordinal encoder (loaded with joblib).
        scaler: File path of the saved scaler (loaded with joblib).
        trained_model: Model used for prediction. When omitted, the notebook-level
            ``model`` is used, which keeps existing calls working.

    Returns:
        tuple: ``(processed_test_df, y_pred)``. The processed frame has the scaled
        columns first, then the one-hot columns, then the ordinal columns, with a
        fresh default index. ``y_pred`` is the model's prediction on that frame.
    """
    # The three transformer arguments are paths; load the fitted objects into
    # separate names instead of rebinding the parameters.
    fitted_one_hot = joblib.load(OH_encoder)
    fitted_ordinal = joblib.load(OR_encoder)
    fitted_scaler = joblib.load(scaler)
    if trained_model is None:
        # Backward-compatible fallback to the model defined at notebook level.
        trained_model = model

    # Encode categorical features
    encoded_oh_features = fitted_one_hot.transform(X_test[Nominal_features])
    encoded_or_features = fitted_ordinal.transform(X_test[Ordinal_features])
    oh_encoded_df = pd.DataFrame(encoded_oh_features, columns=fitted_one_hot.get_feature_names_out(Nominal_features))
    or_encoded_df = pd.DataFrame(encoded_or_features, columns=Ordinal_features)

    # Scale continuous features
    scaled_features = fitted_scaler.transform(X_test[continuos_features])
    scaled_df = pd.DataFrame(scaled_features, columns=continuos_features)

    # Concatenate in the same column order the training matrix was built with.
    processed_test_df = pd.concat([scaled_df, oh_encoded_df, or_encoded_df], axis=1)
    y_pred = trained_model.predict(processed_test_df)
    return processed_test_df, y_pred


In [17]:
# Transform the held-out split with the transformers saved on disk and predict on it.
testing_set, y_pred = process_testing_set(X_test, Nominal_features, Ordinal_features, continuos_features, OH_encoder_path, OR_encoder_path,
                                          scaler_path)

In [18]:
def evaluate_model(y_test: pd.Series, y_pred: np.ndarray) -> dict[str, str]:
    """
    Report classification accuracy as a one-entry dictionary.

    Args:
        y_test (pd.Series): True labels.
        y_pred (np.ndarray): Predicted labels, in the same row order as ``y_test``.

    Returns:
        dict[str, str]: The accuracy, converted to text, under the key ``'Accuracy score is'``.
    """
    return {'Accuracy score is': str(accuracy_score(y_test, y_pred))}


In [19]:
# Compare the held-out labels with the predictions produced for the held-out split.
evaluate = evaluate_model(y_test, y_pred)
print(evaluate)

{'Accuracy score is': '0.8416149068322981'}


# **MODEL INFERENCE**

In [20]:
def load_model_and_transformers(model_path: str, scaler_path: str, OH_encoder_path: str, OR_encoder_path: str):
    """
    Read the persisted model and its three fitted transformers back from disk.

    Args:
        model_path (str): Path to the saved model.
        scaler_path (str): Path to the saved scaler.
        OH_encoder_path (str): Path to the saved one-hot encoder.
        OR_encoder_path (str): Path to the saved ordinal encoder.

    Returns:
        tuple: ``(model, scaler, one_hot_encoder, ordinal_encoder)``, loaded in that order.
    """
    artifact_paths = (model_path, scaler_path, OH_encoder_path, OR_encoder_path)
    # NOTE(review): joblib files are pickle-based, so only load artifacts from a trusted source.
    loaded_model, loaded_scaler, loaded_one_hot, loaded_ordinal = (
        joblib.load(artifact_path) for artifact_path in artifact_paths
    )
    return loaded_model, loaded_scaler, loaded_one_hot, loaded_ordinal



In [21]:
# Rebind the notebook-level model and transformers to the copies read back from disk.
model, scaler, one_hot_encoder, ordinal_encoder = load_model_and_transformers(model_path, scaler_path, OH_encoder_path, OR_encoder_path)


In [22]:
def preprocess_data_and_predict(Testing_df: pd.DataFrame, scaler, one_hot_encoder, ordinal_encoder, model, continuos_features: list, Nominal_features: list, Ordinal_features: list) -> np.ndarray:
    """
    Preprocess raw input rows with already-fitted transformers and return the model's predictions.

    Args:
        Testing_df (pd.DataFrame): Raw rows to score; must contain every listed column.
        scaler: Fitted scaler applied to ``continuos_features``.
        one_hot_encoder: Fitted encoder applied to ``Nominal_features``.
        ordinal_encoder: Fitted encoder applied to ``Ordinal_features``.
        model: Fitted estimator whose ``predict`` is called on the preprocessed frame.
        continuos_features (list): Names of the columns to scale.
        Nominal_features (list): Names of the columns to one-hot encode.
        Ordinal_features (list): Names of the columns to ordinal-encode.

    Returns:
        The output of ``model.predict`` for the preprocessed rows (one prediction per
        input row for a scikit-learn classifier).
    """
    test_scaled_df = pd.DataFrame(
        scaler.transform(Testing_df[continuos_features]),
        columns=continuos_features,
    )
    test_one_hot_encoded_df = pd.DataFrame(
        one_hot_encoder.transform(Testing_df[Nominal_features]),
        columns=one_hot_encoder.get_feature_names_out(Nominal_features),
    )
    test_ordinal_encoded_df = pd.DataFrame(
        ordinal_encoder.transform(Testing_df[Ordinal_features]),
        columns=Ordinal_features,
    )

    # Column order is scaled, then one-hot, then ordinal. All three frames carry a
    # fresh default index, so the column-wise concat lines rows up by position.
    transformed_test_df = pd.concat(
        [test_scaled_df, test_one_hot_encoded_df, test_ordinal_encoded_df],
        axis=1,
    )
    return model.predict(transformed_test_df)



In [23]:
# NOTE(review): this is the same heart.csv path the training split was drawn from, so the
# predictions below include rows the model was fitted on -- confirm that is intended.
Test_dataset = r"C:\Users\SOHAM\Git_Repositories\DataScience_Projects\dsp-heart-failure-prediction\data\heart.csv"
Testing_df = pd.read_csv(Test_dataset)
# Score every row of the file with the reloaded scaler, encoders and model.
predict_heart_patients = preprocess_data_and_predict(Testing_df, scaler, one_hot_encoder, ordinal_encoder, model ,continuos_features, Nominal_features, Ordinal_features)
print(predict_heart_patients)

[0 0 0 1 0 0 0 0 1 0 0 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 1 1 0 0 1
 0 0 1 0 0 0 0 1 1 0 0 0 1 1 1 0 0 1 0 1 0 1 1 0 0 0 1 0 0 0 0 1 0 1 0 1 0
 1 0 1 0 0 1 0 0 1 0 1 1 1 0 1 1 0 0 0 1 0 1 0 0 0 0 1 1 0 1 1 0 0 0 0 0 0
 1 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 1 0 0 0 1 1 1 1 1 0 0 0 0 0
 0 1 0 0 0 0 0 1 1 0 1 0 1 1 0 0 0 1 1 0 1 0 0 0 0 0 1 1 1 0 0 0 1 0 1 1 0
 1 0 1 1 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 1 0 1 0 0 0 1 1
 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 1 1 1 0 1 1 0 1 0 1 1 1 1 1 1 1 0 1 0 0 0 0
 0 0 0 1 1 1 0 1 0 1 0 0 0 1 0 0 0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 1
 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 0 0 1 0 0 1 1 1 1 1 0 1 1 1 0 1 1 1 0 1 0
 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 0 1 1 1 1 1
 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1
 1 1 1 1 1 1 0 1 1 1 0 1 1 1 0 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1
 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 0 1 0 1 1 1 1 0 1 0 1 1 1 1
 1 1 1 1 1 0 1 0 1 1 1 1 