### **Cardio Vascular Risk Assessment Prediction Model**

In [1]:
# Import Data Manipulation Libraries
import numpy as np
import pandas as pd
# Import Data Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns
# Import Machine Learning Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder,LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Importing Machine Learning Models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
# Importing Metrics
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
# Import Logging
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s',
                    filename='model_training1.log', filemode='w',
                    force = True)
from sklearn.impute import SimpleImputer

In [6]:
# Data Ingestion
def data_ingestion():
    df = pd.read_csv(r'C:\CardioVascularRisk_AssessmentModel\data\raw\cardiovascular_risk_dataset.csv')
    return df


# No heavy preprocessing outside pipeline
def data_preprocessing(df):
    return df.copy()


# Model Building
def model_building(data):

    # Separate features & target first
    X = data.drop(columns=['Patient_ID', 'heart_disease_risk_score'])
    y = data['heart_disease_risk_score']

    # Now find column types
    numerical_col = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_col = X.select_dtypes(include=['object']).columns

    # Train test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Pipelines
    numerical_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', MinMaxScaler())
    ])

    categorical_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numerical_pipeline, numerical_col),
        ('cat', categorical_pipeline, categorical_col)
    ])

    return preprocessor,numerical_pipeline,categorical_pipeline,X_train, X_test, y_train, y_test


# Model Training & Evaluation
def model_evaluation(preprocessor,numerical_pipeline,categorical_pipeline,X_train, X_test, y_train, y_test):

    models = {
        "LinearRegression": LinearRegression(),
        "DecisionTreeRegressor": DecisionTreeRegressor(random_state=42),
        "RandomForestRegressor": RandomForestRegressor(random_state=42),
        "GradientBoostingRegressor": GradientBoostingRegressor(random_state=42),
        "AdaBoostRegressor": AdaBoostRegressor(random_state=42),
        "SVR": SVR(),
        "KNeighborsRegressor": KNeighborsRegressor()
    }

    model_performance = {}

    for model_name, model in models.items():
        logging.info(f'Training {model_name}...')

        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ])

        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        model_performance[model_name] = {
            'MAE': mean_absolute_error(y_test, y_pred),
            'MSE': mean_squared_error(y_test, y_pred),
            'R2': r2_score(y_test, y_pred)
        }

        logging.info(f"{model_name} Done.")

    return model_performance


In [7]:
def main():
    # Step1:
    logging.info("Starting data ingestion...")
    df = data_ingestion()
    logging.info("Data ingestion completed. Dataset shape: {}".format(df.shape))
    # Step2:
    logging.info("Starting data preprocessing...")
    data = data_preprocessing(df)
    logging.info("Data preprocessing completed. Dataset shape after preprocessing: {}".format(data.shape))
    # Step3: Model Building using Serialization i.e. Pipeline
    logging.info("Starting model building...")  
    preprocessor,numerical_pipeline,categorical_pipeline,X_train,X_test,y_train,y_test = model_building(data)
    logging.info("Model building completed. Preprocessor and pipelines created successfully.")
    # Step4: Model Training and Evaluation
    logging.info("Starting model training and evaluation...")
    model_performance = model_evaluation(preprocessor,numerical_pipeline,categorical_pipeline,X_train,X_test,y_train,y_test)
    print(model_performance)
if __name__ == "__main__":
    main()

See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  categorical_col = X.select_dtypes(include=['object']).columns


{'LinearRegression': {'MAE': 2.798600401234952, 'MSE': 13.130461880150651, 'R2': 0.9775423744187214}, 'DecisionTreeRegressor': {'MAE': 4.610545454545455, 'MSE': 37.81301818181818, 'R2': 0.9353266768392148}, 'RandomForestRegressor': {'MAE': 3.2442309090909096, 'MSE': 17.224610450909093, 'R2': 0.9705399660864438}, 'GradientBoostingRegressor': {'MAE': 3.1207048087062184, 'MSE': 15.278368780312643, 'R2': 0.9738687116498432}, 'AdaBoostRegressor': {'MAE': 5.3407839332269, 'MSE': 39.37555818262917, 'R2': 0.9326541936764533}, 'SVR': {'MAE': 2.963703177916421, 'MSE': 15.072986002063784, 'R2': 0.9742199871477546}, 'KNeighborsRegressor': {'MAE': 3.5318727272727277, 'MSE': 20.720454909090915, 'R2': 0.9645608644639093}}
