In [None]:
import os
import sys

In [None]:
# Global variables
# Branch of the project repository to check out; it is interpolated into the
# `git clone -b {gitBranch}` command in the clone cell.
gitBranch = 'main'
# Path handed to load_dataset(). It is relative, so it is presumably resolved
# against the working directory (the repository root after the clone cell's
# `%cd`) — confirm in src/setup.py.
datasetFilePath = 'data/HeartDiseaseTrain-Test.csv'


In [None]:
# Clone the repository manually
!git clone -b {gitBranch} https://github.com/chinnuba/prediction-of-heart-disease.git
%cd prediction-of-heart-disease

# Append the src directory to sys.path
sys.path.append('src')

In [None]:
from src.setup import load_dataset
from src.data_analysis import (
    summarize_dataset,
    visualize_target_distribution,
    visualize_feature_distribution,
    correlation_analysis,
    feature_interaction_analysis,
    outlier_detection,
    distribution_analysis_by_target,
)
from src.data_preprocessing import preprocess_data
from src.eda import feature_importance_analysis, perform_pca
from src.utils import train_and_evaluate_models, compare_results, tune_and_evaluate_models
from src.models import get_models
from src.model_optimization import apply_smote

In [None]:
# Load dataset
# load_dataset is imported from src.setup. The result is bound to `df`, which
# every later analysis and preprocessing cell reads.
# NOTE(review): datasetFilePath is relative — presumably resolved against the
# current working directory (the cloned repository root); confirm in src/setup.py.
df = load_dataset(datasetFilePath)

In [None]:
# Column roles used by the analysis and preprocessing cells.
# NOTE(review): the names are passed verbatim to the helper functions, so the
# unusual spellings ('cholestoral', 'flourosopy', 'Max_heart_rate') presumably
# mirror the CSV headers — verify against the dataset before "correcting" them.

# Label column
target_column = 'target'

# Features treated as categorical
categorical_columns = [
    'sex',
    'chest_pain_type',
    'fasting_blood_sugar',
    'rest_ecg',
    'exercise_induced_angina',
    'slope',
    'vessels_colored_by_flourosopy',
    'thalassemia',
]

# Features treated as numerical
numerical_columns = [
    'age',
    'resting_blood_pressure',
    'cholestoral',
    'Max_heart_rate',
    'oldpeak',
]

In [None]:
# Run data analysis
# All helpers below are imported from src.data_analysis and are called on the
# raw `df` (before preprocessing). Their return values are not captured, so
# they are used for their side effects only — presumably printed summaries and
# plots; confirm in src/data_analysis.py.

# Whole-frame helpers: take only the DataFrame.
summarize_dataset(df)
visualize_target_distribution(df, target_column)
visualize_feature_distribution(df)
correlation_analysis(df)
# Column-scoped helpers: restricted to the categorical or numerical column lists.
feature_interaction_analysis(df, categorical_columns, target_column)
outlier_detection(df, numerical_columns)
distribution_analysis_by_target(df, numerical_columns, target_column)

In [None]:
# Preprocess the data
# preprocess_data (from src.data_preprocessing) returns four objects, unpacked
# here as train/test features and train/test targets.
# NOTE(review): split ratio, encoding and scaling are decided inside the
# helper — see src/data_preprocessing.py.
X_train, X_test, y_train, y_test = preprocess_data(
    df,
    categorical_columns,
    numerical_columns,
    target_column,
)

In [None]:
# Perform feature importance analysis
# Only the training split is used here; the test split is not touched.
# NOTE(review): `.columns` assumes X_train is a pandas DataFrame (or similar)
# rather than a bare array — confirm against preprocess_data's return type.
feature_names = X_train.columns
feature_importance_analysis(X_train, y_train, feature_names)

# Perform PCA
# Requests two components, again on the training split only (presumably so the
# projection can be drawn as a 2-D scatter — confirm in src/eda.py).
perform_pca(X_train, y_train, n_components=2)

In [None]:
# Get the models
# get_models is imported from src.models. Its result is bound to `models`,
# which later cells pass as the first argument to train_and_evaluate_models.
# NOTE(review): presumably a collection of untrained estimators — confirm in
# src/models.py.
models = get_models()

In [None]:
# Train and Evaluate Models
# `models` is the collection built by the preceding "Get the models" cell; it
# is reused here instead of calling get_models() a second time, so there is a
# single place where the model set is created.
# The returned frame is kept as the baseline that later comparison cells
# (SMOTE and tuning) are measured against.
original_results_df = train_and_evaluate_models(models, X_train, X_test, y_train, y_test, data_label="Original Data")

In [None]:
# Apply SMOTE and Re-train Models
# Only the training split is resampled; X_test / y_test are passed through
# unchanged, so both runs are evaluated on the same untouched test data.
X_resampled, y_resampled = apply_smote(X_train, y_train)

# The same `models` object used for the baseline run is passed again here.
smote_results_df = train_and_evaluate_models(
    models,
    X_resampled,
    X_test,
    y_resampled,
    y_test,
    data_label="After SMOTE",
)

In [None]:
# Compare Results Before and After SMOTE
# compare_results (from src.utils) receives the baseline results first and the
# SMOTE results second; the combined output is kept in `comparison_df`.
# NOTE(review): whether it also prints or plots is not visible here — confirm
# in src/utils.py.
comparison_df = compare_results(original_results_df, smote_results_df)

In [None]:
# Hyperparameter Tuning
# tune_and_evaluate_models receives only the data splits — `models` is not
# passed — so it presumably defines its own estimators and search grids;
# confirm in src/utils.py. Tuning uses the original (non-SMOTE) training split.
tuned_results_df = tune_and_evaluate_models(X_train, X_test, y_train, y_test)

# Compare Tuned Results with Original Results
# The baseline here is the untuned run on the original data, not the SMOTE run.
tuned_comparison_df = compare_results(original_results_df, tuned_results_df)