In [1]:
# Importing Data Manipulation Libraries
import pandas as pd
import numpy as np
# Import Data Visualization Libraries
import seaborn as sns
import matplotlib.pyplot as plt
# Import Filter Warning Libraries
import warnings
warnings.filterwarnings('ignore')
# Import Logging
import logging
logging.basicConfig(level = logging.INFO,
                    format = '%(asctime)s - %(levelname)s - %(message)s',
                    filemode = 'w',
                    filename = 'model.log',force = True)
# Import Scikit Learn Libraries for Machine Learning Model Building
from sklearn.preprocessing import MinMaxScaler,LabelEncoder
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV,learning_curve,KFold
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
import xgboost
from xgboost import XGBRegressor

# Multicollinearity test and treatment libraries
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.decomposition import PCA
from collections import OrderedDict

In [2]:
import logging
def data_ingestion(data_source: str) -> pd.DataFrame:
    """Read a CSV dataset into a DataFrame, logging progress and failures.

    Args:
        data_source: Location of the CSV, passed unchanged to
            ``pandas.read_csv``.

    Returns:
        The parsed dataset.

    Raises:
        Exception: Any error raised by ``pandas.read_csv`` is written to the
            log with its traceback and then re-raised unchanged.
    """
    logging.info("Data Ingestion Started...")
    try:
        df = pd.read_csv(data_source)
    except Exception:
        # Without this the log would end at "Started..." with no hint of why
        # the pipeline stopped; the exception itself still propagates.
        logging.exception("Data Ingestion Failed for source: %s", data_source)
        raise
    logging.info("Data Ingestion Completed Successfully")
    return df

def data_exploration(df: pd.DataFrame) -> pd.DataFrame:
    """Build a descriptive-statistics report for every numeric column.

    Only columns with a numeric dtype are profiled. Category, datetime and
    boolean columns are skipped because quantiles, skewness and kurtosis are
    not defined for them.

    Args:
        df: Dataset to profile.

    Returns:
        One row per numeric column with its range, central tendency, spread,
        shape statistics and an IQR-based outlier flag. The column layout is
        the same even when ``df`` has no numeric columns.
    """
    report_columns = [
        "Feature", "Minimum", "Maximum", "Mean", "Median", "Mode",
        "25%", "75%", "IQR", "Standard Deviation", "Skewness", "Kurtosis",
        "Outlier Comment",
    ]

    numerical_cols = df.select_dtypes(include="number").columns

    stats = []
    for col in numerical_cols:
        series = df[col]

        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        # Tukey fences: anything beyond 1.5 * IQR from the quartiles is flagged.
        lower_whisker = q1 - 1.5 * iqr
        upper_whisker = q3 + 1.5 * iqr
        has_outliers = ((series < lower_whisker) | (series > upper_whisker)).any()

        # mode() is empty for an all-missing column; compute it only once.
        modes = series.mode()

        stats.append({
            "Feature": col,
            "Minimum": series.min(),
            "Maximum": series.max(),
            "Mean": series.mean(),
            "Median": series.median(),
            "Mode": modes.iloc[0] if not modes.empty else np.nan,
            "25%": q1,
            "75%": q3,
            "IQR": iqr,
            "Standard Deviation": series.std(),
            "Skewness": series.skew(),
            "Kurtosis": series.kurt(),
            "Outlier Comment": "Has Outliers" if has_outliers else "No Outliers",
        })

    return pd.DataFrame(stats, columns=report_columns)

def categorical_summary(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise every categorical column of ``df``.

    Columns with ``object`` or ``category`` dtype are treated as categorical.

    Args:
        df: Dataset to profile.

    Returns:
        One row per categorical column with its number of distinct values,
        most frequent value (``None`` when the column has no non-missing
        values) and count of missing values. The column layout is the same
        even when ``df`` has no categorical columns.
    """
    report_columns = ["Feature", "Unique Values", "Most Frequent", "Missing Values"]

    cat_cols = df.select_dtypes(include=["object", "category"]).columns

    summary = []
    for col in cat_cols:
        series = df[col]
        # mode() is empty for an all-missing column; compute it only once.
        modes = series.mode()
        summary.append({
            "Feature": col,
            "Unique Values": series.nunique(),
            "Most Frequent": modes.iloc[0] if not modes.empty else None,
            "Missing Values": series.isna().sum(),
        })

    return pd.DataFrame(summary, columns=report_columns)

def split_data(data, target_col, test_size=0.3, random_state=42):
    """Separate features from the target and split both into train/test parts.

    Args:
        data: Full dataset containing the target column.
        target_col: Name of the column to predict.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        The value returned by ``train_test_split`` for the features and the
        target, unpacked by callers as ``X_train, X_test, y_train, y_test``.
    """
    features = data.drop(columns=[target_col])
    target = data[target_col]

    split_options = {"test_size": test_size, "random_state": random_state}
    return train_test_split(features, target, **split_options)


def encode_categorical(X_train, X_test):
    """Label-encode the object-dtype columns using mappings learned on train.

    Each encoder is fitted on the training split only. Test values are then
    translated with that same mapping, and any label that was not seen during
    fitting becomes ``-1``.

    Args:
        X_train: Training features; not modified (a copy is encoded).
        X_test: Test features; not modified (a copy is encoded).

    Returns:
        A tuple ``(X_train_encoded, X_test_encoded, encoders)`` where
        ``encoders`` maps each encoded column name to its fitted
        ``LabelEncoder``.
    """
    X_train = X_train.copy()
    X_test = X_test.copy()

    cat_cols = X_train.select_dtypes(include="object").columns

    encoders = {}

    for col in cat_cols:
        le = LabelEncoder()

        # Fit ONLY on train
        X_train[col] = le.fit_transform(X_train[col])

        # A label's code is its position in classes_. Build the lookup once
        # per column instead of calling le.transform() for every test row.
        code_by_label = {label: code for code, label in enumerate(le.classes_)}

        # Labels absent from the training split map to NaN and become -1.
        # NOTE(review): how missing values (NaN/None) are encoded depends on
        # LabelEncoder's handling of them - confirm if columns can contain NaN.
        X_test[col] = X_test[col].map(code_by_label).fillna(-1).astype("int64")

        encoders[col] = le

    return X_train, X_test, encoders

def train_evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training target.
        y_test: Test target.

    Returns:
        A tuple ``(rmse, r2)`` computed from the test-split predictions.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    squared_error = mean_squared_error(y_test, predictions)
    root_mean_squared_error = np.sqrt(squared_error)
    determination = r2_score(y_test, predictions)

    return root_mean_squared_error, determination
def _build_baseline_models(random_state=42):
    """Return fresh, unfitted baseline regressors keyed by display name.

    Estimators that accept a ``random_state`` are seeded so repeated runs give
    the same scores.
    """
    return {
        "Linear Regression": LinearRegression(),
        "Lasso": Lasso(),
        "Ridge": Ridge(),
        "Decision Tree": DecisionTreeRegressor(random_state=random_state),
        "SVR": SVR(),
        "KNN": KNeighborsRegressor(),
        "Random Forest": RandomForestRegressor(random_state=random_state),
        "Gradient Boost": GradientBoostingRegressor(random_state=random_state),
        "Ada Boost": AdaBoostRegressor(random_state=random_state),
        "XG Boost": XGBRegressor(random_state=random_state),
    }


def compare_models(X_train, X_test, y_train, y_test, random_state=42):
    """Fit every baseline regressor and rank them by test-split R2.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training target.
        y_test: Test target.
        random_state: Seed given to the stochastic estimators.

    Returns:
        DataFrame with columns ``Model Name``, ``RMSE`` and ``R2 Score``,
        sorted by ``R2 Score`` in descending order.
    """
    results = []

    for name, model in _build_baseline_models(random_state).items():
        rmse, r2 = train_evaluate_model(
            model, X_train, X_test, y_train, y_test
        )
        results.append([name, rmse, r2])

    return pd.DataFrame(
        results, columns=["Model Name", "RMSE", "R2 Score"]
    ).sort_values("R2 Score", ascending=False)


def k_fold_cv(X_train, y_train, folds=10, random_state=42):
    """Cross-validate every baseline regressor on the training split.

    Args:
        X_train: Training features.
        y_train: Training target.
        folds: Passed to ``cross_val_score`` as ``cv``.
        random_state: Seed given to the stochastic estimators.

    Returns:
        DataFrame with columns ``Model Name``, ``CV Mean R2`` and ``CV STD``,
        sorted by ``CV Mean R2`` in descending order.
    """
    results = []

    for name, model in _build_baseline_models(random_state).items():
        scores = cross_val_score(
            model, X_train, y_train, cv=folds, scoring="r2"
        )
        results.append([name, scores.mean(), scores.std()])

    return pd.DataFrame(
        results, columns=["Model Name", "CV Mean R2", "CV STD"]
    ).sort_values("CV Mean R2", ascending=False)

def hyperparameter_tuning(X_train, y_train, folds=5, random_state=42):
    """Grid-search XGBoost and Random Forest on the training split.

    Each candidate is scored by cross-validated R2. The grids below contain
    54 XGBoost combinations and 12 Random Forest combinations.

    Args:
        X_train: Training features.
        y_train: Training target.
        folds: Passed to ``GridSearchCV`` as ``cv``.
        random_state: Seed given to both estimators so the search gives the
            same result on every run.

    Returns:
        Dict mapping ``"XGBoost"`` and ``"Random Forest"`` to the
        ``best_estimator_`` found by the corresponding grid search.
    """
    tuning_config = {
        "XGBoost": {
            "model": XGBRegressor(random_state=random_state),
            "params": {
                "eta": [0.1, 0.2, 0.3],
                "max_depth": [3, 5, 7],
                "gamma": [0, 10, 20],
                "reg_lambda": [0, 1]
            }
        },
        "Random Forest": {
            "model": RandomForestRegressor(random_state=random_state),
            "params": {
                "max_depth": [5, 10, 15],
                "max_features": ["sqrt", "log2", 3, 4]
            }
        }
    }

    best_models = {}

    for name, cfg in tuning_config.items():
        grid = GridSearchCV(
            cfg["model"],
            cfg["params"],
            cv=folds,
            scoring="r2",
            n_jobs=-1
        )
        grid.fit(X_train, y_train)

        best_models[name] = grid.best_estimator_

    return best_models

def post_tuning_cv(best_models, X_train, y_train, folds=5):
    """Cross-validate the tuned estimators on the training split.

    Args:
        best_models: Mapping of display name to estimator.
        X_train: Training features.
        y_train: Training target.
        folds: Passed to ``cross_val_score`` as ``cv``.

    Returns:
        DataFrame with columns ``Model Name``, ``CV Mean R2`` and ``CV STD``,
        sorted by ``CV Mean R2`` in descending order.
    """
    def _summarise(estimator):
        fold_scores = cross_val_score(
            estimator, X_train, y_train, cv=folds, scoring="r2"
        )
        return fold_scores.mean(), fold_scores.std()

    rows = [
        [label, *_summarise(estimator)]
        for label, estimator in best_models.items()
    ]

    table = pd.DataFrame(rows, columns=["Model Name", "CV Mean R2", "CV STD"])
    return table.sort_values("CV Mean R2", ascending=False)

def final_test_evaluation(best_model, X_train, X_test, y_train, y_test):
    """Refit the selected model on the training split and score it on test.

    Args:
        best_model: Estimator chosen after tuning.
        X_train: Training features.
        X_test: Test features.
        y_train: Training target.
        y_test: Test target.

    Returns:
        A tuple ``(rmse, r2)`` computed from the test-split predictions.
    """
    # Same fit -> predict -> RMSE/R2 sequence as the baseline evaluation, so
    # reuse it instead of keeping a second copy of the metric code.
    return train_evaluate_model(best_model, X_train, X_test, y_train, y_test)


In [3]:
import logging

DATA_URL = "https://raw.githubusercontent.com/chandanc5525/CardioVascularRisk_AssessmentModel/refs/heads/main/data/raw/cardiovascular_risk_dataset.csv"
TARGET_COL = "heart_disease_risk_score"
# Row identifiers carry no predictive signal and must not be used as features.
ID_COLS = ["Patient_ID"]
TEST_SIZE = 0.3
RANDOM_STATE = 42

def main():
    """Run the full pipeline: ingest, explore, split, encode, compare, tune, test.

    Model selection relies on cross-validation over the training split. The
    test split is used for the baseline comparison report and for the final
    score of the selected model.

    Returns:
        The selected estimator, fitted on the training split.
    """
    logging.info("ML Pipeline Started")

    # --------------------------------
    # Step 1: Data Ingestion
    # --------------------------------
    df = data_ingestion(DATA_URL)

    # --------------------------------
    # Step 2: Data Exploration (EDA)
    # --------------------------------
    numerical_report = data_exploration(df)
    categorical_report = categorical_summary(df)

    print("\nNumerical EDA Report:")
    print(numerical_report)

    print("\nCategorical Summary:")
    print(categorical_report)

    # --------------------------------
    # Step 3: Train–Test Split (ONCE)
    # --------------------------------
    # Drop identifier columns first; errors="ignore" keeps this a no-op for
    # datasets that do not carry them.
    model_df = df.drop(columns=ID_COLS, errors="ignore")

    X_train, X_test, y_train, y_test = split_data(
        data=model_df,
        target_col=TARGET_COL,
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE
    )

    logging.info("Train–Test split completed")

    # --------------------------------
    # Step 4a: Categorical Encoding
    # (encoders fitted on TRAIN ONLY)
    # --------------------------------
    X_train, X_test, encoders = encode_categorical(X_train, X_test)

    # --------------------------------
    # Step 4b: Baseline Model Comparison
    # (reported on the test split; NOT used for model selection)
    # --------------------------------
    baseline_results = compare_models(
        X_train, X_test, y_train, y_test
    )

    print("\nBaseline Model Comparison:")
    print(baseline_results)

    # --------------------------------
    # Step 5: Cross Validation (TRAIN ONLY)
    # --------------------------------
    cv_results = k_fold_cv(
        X_train, y_train, folds=10
    )

    print("\nCross Validation Results (Before Tuning):")
    print(cv_results)

    # --------------------------------
    # Step 6: Hyperparameter Tuning
    # (TRAIN ONLY)
    # --------------------------------
    best_models = hyperparameter_tuning(
        X_train, y_train, folds=5
    )

    logging.info("Hyperparameter tuning completed")

    # --------------------------------
    # Step 7: Post-Tuning Cross Validation
    # --------------------------------
    post_cv_results = post_tuning_cv(
        best_models, X_train, y_train, folds=5
    )

    print("\nCross Validation Results (After Tuning):")
    print(post_cv_results)

    # --------------------------------
    # Step 8: Final Test Evaluation
    # (winner chosen from the CV ranking above, then scored on TEST)
    # --------------------------------
    # post_cv_results is sorted best-first, so row 0 is the winner.
    best_model_name = post_cv_results.iloc[0]["Model Name"]
    best_model = best_models[best_model_name]

    final_rmse, final_r2 = final_test_evaluation(
        best_model, X_train, X_test, y_train, y_test
    )

    print("\nFinal Test Performance:")
    print(f"Best Model : {best_model_name}")
    print(f"RMSE       : {final_rmse}")
    print(f"R2 Score   : {final_r2}")

    logging.info(
        "Final test performance - model=%s RMSE=%s R2=%s",
        best_model_name, final_rmse, final_r2
    )
    logging.info("ML Pipeline Completed Successfully")

    return best_model

if __name__ == "__main__":
    main()



Numerical EDA Report:
                             Feature  Minimum  Maximum         Mean  Median  \
0                         Patient_ID      1.0   5500.0  2750.500000  2750.5   
1                                age     18.0     90.0    53.872000    54.0   
2                                bmi     15.0     40.9    28.170818    28.4   
3                        systolic_bp    108.0    192.0   147.248182   147.0   
4                       diastolic_bp     64.0    120.0    95.756727    96.0   
5                  cholesterol_mg_dl    147.0    331.0   239.684182   240.0   
6                 resting_heart_rate     48.0     92.0    74.075091    74.0   
7                        daily_steps    500.0  16793.0  5902.929455  5460.0   
8                       stress_level      1.0     10.0     4.907091     5.0   
9   physical_activity_hours_per_week      0.0     12.9     3.299364     2.6   
10                       sleep_hours      4.0     10.0     6.869364     6.9   
11                diet_qualit