In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

#File Paths
file_path = r"C:\Users\SOHAM\Git_Repositories\DataScience_Projects\dsp-heart-failure-prediction\data\heart.csv"
model_path = r"C:\Users\SOHAM\Git_Repositories\DataScience_Projects\dsp-heart-failure-prediction\models\model.joblib"
preprocessor_path = r"C:\Users\SOHAM\Git_Repositories\DataScience_Projects\dsp-heart-failure-prediction\models\preprocessors.joblib"

# Load dataset
data = pd.read_csv(file_path)


In [2]:
data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


# Preprocess Data

In [3]:

# Features and Target variable
X = data[['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope']]
y = data['HeartDisease']  # Assuming this is the target variable (HeartDisease as binary target)

# Preprocessing function
def preprocess_data():
    """
    Preprocessing pipeline for categorical and numerical features
    Returns: preprocessor, X_train, X_test, y_train, y_test
    """
    # Define categorical and numerical columns
    categorical_features = ['Sex', 'ChestPainType', 'ExerciseAngina', 'ST_Slope']
    numerical_features = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']

    # Categorical and numerical transformers
    categorical_transformer = OneHotEncoder(sparse=False, handle_unknown='ignore')
    numerical_transformer = StandardScaler()

    # Combine categorical and numerical transformers
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ]
    )

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return preprocessor, X_train, X_test, y_train, y_test

# Build Model

In [4]:
# Build Model


# Build model function using Linear Regression
def build_model(preprocessor, X_train, y_train):
    """
    Train a Logistic Regression model and return the pipeline
    Returns: model pipeline
    """
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(max_iter=1000))
    ])

    # Train the model
    model.fit(X_train, y_train)

    return model



## Save model,encoders and scalars

In [5]:
# Save model, encoders, and scalers separately
def save_components(model, preprocessor, model_path, preprocessor_path):
    """
    Save the model, encoders, and scalers separately
    """
    joblib.dump(model.named_steps['classifier'],model_path)
    joblib.dump(preprocessor, preprocessor_path)

# Prediction

In [6]:

# Prediction function
def predict(model, preprocessor, input_data):
    """
    Predict using the saved model and preprocessor
    input_data: DataFrame with same structure as the training data
    Returns: prediction
    """
    processed_data = preprocessor.transform(input_data)
    prediction = model.predict(processed_data)
    # If there's only a single row in the input, provide a textual output
    if input_data.shape[0] == 1:
        if prediction[0] == 1:
            return "Heart Failure"
        else:
            return "No Heart Failure"
    
    return prediction

# Inference

In [7]:
# Main execution
if __name__ == "__main__":
    # Preprocess data
    preprocessor, X_train, X_test, y_train, y_test = preprocess_data()

    # Train model
    model = build_model(preprocessor, X_train, y_train)

    # Save model and preprocessors for reuse in API
    save_components(model, preprocessor, model_path, preprocessor_path)

    # Load the test data CSV into a DataFrame
    
    test_data_path = file_path = r"C:\Users\SOHAM\Git_Repositories\DataScience_Projects\dsp-heart-failure-prediction\data\test_heart.csv"
    test_data = pd.read_csv(test_data_path)

    # Load preprocessor and model for prediction
    preprocessor = joblib.load(preprocessor_path)
    model = joblib.load(model_path)
    
    """
        For single prediction:
        new_data = pd.DataFrame([{
            'Age': 60,
            'Sex': 'M',
            'ChestPainType': 'ASY',
            'RestingBP': 145,
            'Cholesterol': 233,
            'MaxHR': 150,
            'ExerciseAngina': 'Y',
            'Oldpeak': 1.0,
            'ST_Slope': 'Up'
        }]) 
        add new_data to the function prediction
    """

    # Predict
    prediction = predict(model, preprocessor, test_data)
    
    # Print the result
    if isinstance(prediction, str):
        # Single prediction (textual result)
        print(f"Prediction: {prediction}")
    else:
        # Multiple predictions (array of 0s and 1s)
        print(f"Prediction array: {prediction}")

TypeError: __init__() got an unexpected keyword argument 'sparse_output'