In [1]:

import pandas as pd
from main import analyze_dataframe
# Location of the heart-disease CSV on the local disk. The literal is split
# over several lines for readability; adjacent string literals are joined by
# Python into exactly the same single path string.
file_path = (
    '/Users/dougstrouth/Documents/datasets/kaggle_data_sets/data/'
    'pratyushpuri/heart-disease-dataset-3k-rows-python-code-2025/'
    'heart_disease_dataset.csv'
)

try:
    # Read the CSV into a DataFrame, then pass it to the project helper
    # together with the name shown in the report header.
    df = pd.read_csv(file_path)
    analyze_dataframe(df, "heart_disease_dataset.csv")
except FileNotFoundError:
    # Missing file: report the path that was tried rather than a traceback.
    print(f"Error: The file was not found at {file_path}")
except Exception as err:
    # Any other failure while loading or analyzing is reported by message only.
    print(f"An error occurred: {err}")



--- Analyzing DataFrame: heart_disease_dataset.csv ---

  Dimensions: 3069 rows, 17 columns

  Column Details:
    Column Names:
      - age
      - sex
      - cp
      - trestbps
      - chol
      - fbs
      - restecg
      - thalach
      - exang
      - oldpeak
      - slope
      - ca
      - thal
      - smoking
      - diabetes
      - bmi
      - heart_disease

    Data Types:
      - age: int64
      - sex: int64
      - cp: int64
      - trestbps: int64
      - chol: int64
      - fbs: int64
      - restecg: int64
      - thalach: int64
      - exang: int64
      - oldpeak: float64
      - slope: int64
      - ca: int64
      - thal: int64
      - smoking: int64
      - diabetes: int64
      - bmi: float64
      - heart_disease: int64

    Null/Blank Value Percentages (includes NaNs and empty strings for text columns):
      - age: 0.00% missing/blank
      - sex: 0.00% missing/blank
      - cp: 0.00% missing/blank
      - trestbps: 0.00% missing/blank
      - chol: 0.00% 

In [2]:

import pandas as pd

# Location of the heart-disease CSV on the local disk. The literal is split
# over several lines for readability; adjacent string literals are joined by
# Python into exactly the same single path string.
file_path = (
    '/Users/dougstrouth/Documents/datasets/kaggle_data_sets/data/'
    'pratyushpuri/heart-disease-dataset-3k-rows-python-code-2025/'
    'heart_disease_dataset.csv'
)

try:
    # Re-read the dataset so this cell does not rely on state from other cells.
    df = pd.read_csv(file_path)

    # Pairwise correlation between all columns (DataFrame.corr defaults to
    # Pearson correlation).
    correlation_matrix = df.corr()

    # Keep only the column describing correlation with the target, ordered
    # from the most positive to the most negative coefficient.
    target_correlations = correlation_matrix['heart_disease']
    heart_disease_correlation = target_correlations.sort_values(ascending=False)

    # Show the ranked coefficients under a heading.
    print("Correlation of columns with heart_disease:")
    print(heart_disease_correlation)

except FileNotFoundError:
    # Missing file: report the path that was tried rather than a traceback.
    print(f"Error: The file was not found at {file_path}")
except Exception as err:
    # Any other failure is reported by message only.
    print(f"An error occurred: {err}")


Correlation of columns with heart_disease:
heart_disease    1.000000
fbs              0.019842
bmi              0.015153
ca               0.014646
chol             0.014417
sex              0.007573
exang            0.002772
thalach          0.002237
restecg          0.000615
cp              -0.001665
diabetes        -0.007073
trestbps        -0.011171
smoking         -0.011673
age             -0.013564
thal            -0.023369
slope           -0.024752
oldpeak         -0.033246
Name: heart_disease, dtype: float64


In [3]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Location of the heart-disease CSV on the local disk. The literal is split
# over several lines for readability; adjacent string literals are joined by
# Python into exactly the same single path string.
file_path = (
    '/Users/dougstrouth/Documents/datasets/kaggle_data_sets/data/'
    'pratyushpuri/heart-disease-dataset-3k-rows-python-code-2025/'
    'heart_disease_dataset.csv'
)

try:
    # Re-read the dataset so this cell does not rely on state from other cells.
    df = pd.read_csv(file_path)

    # Features are every column except the target; the target is 'heart_disease'.
    X = df.drop(columns='heart_disease')
    y = df['heart_disease']

    # Hold out 20% of the rows for evaluation; the fixed seed makes the split
    # repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fit a random forest with library-default hyperparameters and a fixed seed.
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Score the fitted model on the held-out rows.
    # NOTE(review): the saved output of this cell reports ~0.54 accuracy; if
    # heart_disease is a roughly balanced binary label that is close to chance,
    # so the importances below may not be meaningful -- confirm class balance.
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model Accuracy: {accuracy}")

    # Pair each importance with its feature name and rank from highest to lowest.
    feature_importances = pd.Series(
        model.feature_importances_, index=X.columns
    ).sort_values(ascending=False)

    # Show the ranked importances under a heading.
    print("\nFeature Importances:")
    print(feature_importances)

except FileNotFoundError:
    # Missing file: report the path that was tried rather than a traceback.
    print(f"Error: The file was not found at {file_path}")
except Exception as err:
    # Any other failure (loading, training, scoring) is reported by message only.
    print(f"An error occurred: {err}")


Model Accuracy: 0.5439739413680782

Feature Importances:
chol        0.134566
bmi         0.129180
trestbps    0.125375
thalach     0.123318
oldpeak     0.113733
age         0.112405
cp          0.043055
ca          0.036118
slope       0.034837
restecg     0.033428
thal        0.029276
sex         0.020107
smoking     0.018464
exang       0.015847
diabetes    0.015736
fbs         0.014556
dtype: float64


In [4]:

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Location of the heart-disease CSV on the local disk. The literal is split
# over several lines for readability; adjacent string literals are joined by
# Python into exactly the same single path string.
file_path = (
    '/Users/dougstrouth/Documents/datasets/kaggle_data_sets/data/'
    'pratyushpuri/heart-disease-dataset-3k-rows-python-code-2025/'
    'heart_disease_dataset.csv'
)

try:
    # Re-read the dataset so this cell does not rely on state from other cells.
    df = pd.read_csv(file_path)

    # Features are every column except the target; the target is 'heart_disease'.
    X = df.drop(columns='heart_disease')
    y = df['heart_disease']

    # Hold out 20% of the rows for evaluation; the fixed seed makes the split
    # repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Hyperparameter values to try: 2 x 2 x 2 x 2 = 16 candidate settings.
    param_grid = dict(
        n_estimators=[100, 200],
        learning_rate=[0.05, 0.1],
        max_depth=[3, 4],
        subsample=[0.7, 0.8],
    )

    # Base estimator whose hyperparameters are being searched (fixed seed).
    gb_model = GradientBoostingClassifier(random_state=42)

    # Exhaustive search over the grid with 3-fold cross-validation, run in
    # parallel (n_jobs=-1) and with per-fit progress messages (verbose=2).
    grid_search = GridSearchCV(
        estimator=gb_model,
        param_grid=param_grid,
        cv=3,
        n_jobs=-1,
        verbose=2,
    )

    # Run the search using only the training split.
    grid_search.fit(X_train, y_train)

    # Estimator built from the best-scoring parameter combination.
    best_gb_model = grid_search.best_estimator_

    # Score that estimator on the held-out rows.
    y_pred = best_gb_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Report the winning settings and the held-out accuracy.
    print(f"Best Hyperparameters: {grid_search.best_params_}")
    print(f"New Model Accuracy: {accuracy}")

    # Pair each importance with its feature name and rank from highest to lowest.
    feature_importances = pd.Series(
        best_gb_model.feature_importances_, index=X.columns
    ).sort_values(ascending=False)

    # Show the ranked importances under a heading.
    print("\nFeature Importances:")
    print(feature_importances)

except FileNotFoundError:
    # Missing file: report the path that was tried rather than a traceback.
    print(f"Error: The file was not found at {file_path}")
except Exception as err:
    # Any other failure (loading, searching, scoring) is reported by message only.
    print(f"An error occurred: {err}")


Fitting 3 folds for each of 16 candidates, totalling 48 fits


Best Hyperparameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
New Model Accuracy: 0.5716612377850163

Feature Importances:
chol        0.197026
thalach     0.154860
bmi         0.150782
trestbps    0.118887
oldpeak     0.112124
age         0.109218
ca          0.038931
slope       0.022326
restecg     0.020883
cp          0.017167
thal        0.016728
diabetes    0.014830
smoking     0.008696
sex         0.007200
fbs         0.007086
exang       0.003256
dtype: float64
