In [36]:
from ucimlrepo import fetch_ucirepo 
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
  
# fetch dataset 
heart_disease = fetch_ucirepo(id=45) 
  
# data (as pandas dataframes) 
X = heart_disease.data.features 
y = heart_disease.data.targets 
  
# metadata 
print(heart_disease.metadata) 
  
# variable information 
print(heart_disease.variables) 


{'uci_id': 45, 'name': 'Heart Disease', 'repository_url': 'https://archive.ics.uci.edu/dataset/45/heart+disease', 'data_url': 'https://archive.ics.uci.edu/static/public/45/data.csv', 'abstract': '4 databases: Cleveland, Hungary, Switzerland, and the VA Long Beach', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 303, 'num_features': 13, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': ['Age', 'Sex'], 'target_col': ['num'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1989, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C52P4X', 'creators': ['Andras Janosi', 'William Steinbrunn', 'Matthias Pfisterer', 'Robert Detrano'], 'intro_paper': {'title': 'International application of a new probability algorithm for the diagnosis of coronary artery disease.', 'authors': 'R. Detrano, A. Jánosi, W. Steinbrunn, M. Pfisterer, J. Schmid, S. Sa

In [37]:
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0


In [38]:
y.head()

Unnamed: 0,num
0,0
1,2
2,1
3,0
4,0


In [39]:
 #Check for NaNs in the entire dataset
nan_count = X.isnull().sum()

# Display columns with NaN values and their count
print("NaN values in each column:\n", nan_count)

NaN values in each column:
 age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
dtype: int64


In [40]:
# Remove rows with NaN in 'thal' or 'ca' from X
X_cleaned = X.dropna(subset=['thal', 'ca'])

# Reset index for X_cleaned to maintain alignment with y
X_cleaned = X_cleaned.reset_index(drop=True)

# Now remove the corresponding rows from y using the index of X_cleaned
y_cleaned = y.loc[X_cleaned.index].reset_index(drop=True)

#y_cleaned['num'] = y_cleaned['num'].apply(lambda x: 0 if x == 0 else 1)
import numpy as np
unique, counts = np.unique(y_cleaned, return_counts=True)
print("Class distribution:", dict(zip(unique, counts)))

from imblearn.over_sampling import SMOTE

#add synthetic data to balance set
smote = SMOTE(random_state=42)
X_cleaned, y_cleaned = smote.fit_resample(X_cleaned, y_cleaned)

y_cleaned = y_cleaned.to_numpy().ravel()

Class distribution: {np.int64(0): np.int64(163), np.int64(1): np.int64(52), np.int64(2): np.int64(35), np.int64(3): np.int64(34), np.int64(4): np.int64(13)}


In [41]:
# Feature scaling is recommended before using chi2
scaler = MinMaxScaler()
#scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_cleaned)

In [42]:
# Initialize the base model for RFE (LogisticRegression or another estimator)
estimator = LogisticRegression(max_iter=1000)

# Initialize RFE with the number of features to select (e.g., 10)
# You can adjust the n_features_to_select parameter
# selector = RFE(estimator=estimator, n_features_to_select=13)
# 
# # Fit RFE to the data to select important features
# X_new = selector.fit_transform(X_scaled, y_cleaned)
# 
# # Get the selected feature names
# selected_rfe_features = X_cleaned.columns[selector.support_]
# print("Selected Features via RFE:\n", selected_rfe_features)

# Use SelectKBest to select the top k features
for features in range(10,14):
    k = features  # Set k to the number of features you want to select
    selector = SelectKBest(score_func=chi2, k=k)
    X_new = selector.fit_transform(X_scaled, y_cleaned)
    
    # Get the boolean mask of selected features
    selected_mask = selector.get_support()
    
    # Get the names of the selected features
    selected_features = X_cleaned.columns[selected_mask]
    
    # Display the selected features
    print("Selected Features:\n", selected_features)
    
    # Perform the train-test split on the selected features (X_new) and target (y_cleaned)
    X_train, X_test, y_train, y_test = train_test_split(X_new, y_cleaned, test_size=0.2, random_state=42)
    
    print(f"Training set shape: {X_train.shape}")
    print(f"Test set shape: {X_test.shape}")
    
    #figure out best weights for our model later
    rf = RandomForestClassifier(random_state=42)

    # Define the hyperparameter grid
    param_grid = {
        'n_estimators': [50, 100, 200],  # Number of trees
        'max_depth': [5, 7, 10, None],   # Max depth of trees
        'min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
        'class_weight': ['balanced',  {0: 1, 1: 2, 2: 1, 3: 1, 4: 5}]  # Only predefined class weights
    }
    
    # Initialize GridSearchCV
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
    
    #Fit the grid search to the data
    grid_search.fit(X_train, y_train)

    # Get the best parameters from the grid search
    best_params = grid_search.best_params_
    print("Best Parameters from GridSearchCV:", best_params)

    # Train the RandomForest model with the best parameters
    best_rf = grid_search.best_estimator_

    # Evaluate the performance on the test set
    y_pred = best_rf.predict(X_test)
    print("Test Accuracy with Best Parameters:", accuracy_score(y_test, y_pred))
    
    
    # Initialize the model
    #original model 
    #model = RandomForestClassifier(random_state=42, n_estimators=100, class_weight='balanced')
    
    #model with some custom hyperparameters to avoid overfitting
    #model = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=9, min_samples_split=10, class_weight='balanced')
    
    #model with winners from grid search
    custom_weights = {0: 1, 1: 2, 2: 1, 3: 1, 4: 5}
    model = RandomForestClassifier(class_weight=custom_weights, min_samples_split=2, n_estimators=100)
    
    model2 = DecisionTreeClassifier(random_state=42, class_weight='balanced')
    model3 = LogisticRegression(max_iter=1000, class_weight='balanced')

    # Train the model
    model.fit(X_train, y_train)
    model2.fit(X_train, y_train)
    model3.fit(X_train, y_train)
    
    # Predict using the model
    y_pred = model.predict(X_test)
    y_pred2 = model2.predict(X_test)
    y_pred3 = model3.predict(X_test)
    
    
    
    # Predict on the training set
    y_train_pred = model.predict(X_train)
    
    # Predict on the test set
    y_test_pred = model.predict(X_test)
    
    # Calculate accuracy for both the training and test sets
    from sklearn.metrics import accuracy_score
    
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    print(f"Training Accuracy: {train_accuracy}")
    print(f"Test Accuracy: {test_accuracy}")
    
    # Evaluate with cross-validation
    cv_scores = cross_val_score(model, X_new, y_cleaned, cv=5, scoring='accuracy')
    print(f"Cross-Validation Accuracy Scores: {cv_scores}")
    print(f"Mean Cross-Validation Accuracy: {cv_scores.mean()}")
    
    
    # Print classification report
    print(classification_report(y_test, y_pred, zero_division=1))
    # print(classification_report(y_test, y_pred2, zero_division=1))
    # print(classification_report(y_test, y_pred3, zero_division=1))
    
    # Check accuracy
    from sklearn.metrics import accuracy_score
    accuracy = accuracy_score(y_test, y_pred)
    # accuracy2 = accuracy_score(y_test, y_pred2)
    # accuracy3 = accuracy_score(y_test, y_pred3)
    print("number of features: ", features)
    print(f"Forest Model Accuracy: {accuracy}")
    # print(f"Tree Model Accuracy: {accuracy2}")
    # print(f"Logistic Regression Model Accuracy: {accuracy3}")
    
    print("----------------------------------------------------")
    
    # print("Predicted classes by RandomForest:", y_pred)
    # print("Predicted classes by DecisionTree:", y_pred2)
    # print("Predicted classes by LogisticRegression:", y_pred3)

Selected Features:
 Index(['sex', 'cp', 'trestbps', 'fbs', 'restecg', 'exang', 'oldpeak', 'slope',
       'ca', 'thal'],
      dtype='object')
Training set shape: (652, 10)
Test set shape: (163, 10)
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Parameters from GridSearchCV: {'class_weight': {0: 1, 1: 2, 2: 1, 3: 1, 4: 5}, 'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Test Accuracy with Best Parameters: 0.6809815950920245
Training Accuracy: 0.9938650306748467
Test Accuracy: 0.6748466257668712
Cross-Validation Accuracy Scores: [0.55214724 0.65644172 0.74233129 0.74846626 0.71165644]
Mean Cross-Validation Accuracy: 0.6822085889570553
              precision    recall  f1-score   support

           0       0.61      0.66      0.63        29
           1       0.61      0.65      0.63        34
           2       0.70      0.54      0.61        35
           3       0.63      0.69      0.66        35
           4       0.84      0.87      0.85        