In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn.metrics import accuracy_score

#Train a random-forest heart-disease classifier on four selected features,
#report its hold-out accuracy and pickle the fitted model.

#Input/output locations (absolute paths specific to the author's machine;
#named once here so they are easy to repoint)
DATA_PATH = '/Users/darwinjuan/Downloads/ANA680FinalProject/heart disease.csv'
MODEL_PATH = '/Users/darwinjuan/Downloads/ANA680FinalProject/model.pkl'

#Load dataset
df = pd.read_csv(DATA_PATH)

#Encode categorical variables to numeric (one-hot, first level of each dropped)
df_encoded = pd.get_dummies(df, drop_first=True)

#Correlation of every encoded column with the target, ordered by |r| descending
target_col = 'HeartDisease'
correlation = df_encoded.corr()[target_col].drop(target_col)
top_corr = correlation.reindex(correlation.abs().sort_values(ascending=False).index)

print("Correlation with HeartDisease by absolute value:")
print(top_corr)

#Top 4 feature selection
features = ['ST_Slope', 'ExerciseAngina', 'Oldpeak', 'ChestPainType']
target = target_col  #same label column as the correlation scan

#Work on a copy so the encoding below never touches df
X = df[features].copy()
y = df[target]

#Data encoding: one fixed integer code per category label
slope_mapping = {'Up': 0, 'Flat': 1, 'Down': 2}
angina_mapping = {'N': 0, 'Y': 1}
cp_mapping = {'ASY': 0, 'ATA': 1, 'NAP': 2, 'TA': 3}

category_mappings = {
    'ST_Slope': slope_mapping,
    'ExerciseAngina': angina_mapping,
    'ChestPainType': cp_mapping,
}

for column, mapping in category_mappings.items():
    #.map() turns any label absent from the mapping into NaN, and the
    #fillna(0) below would then silently recode it as 0 -- which is a real
    #category ('Up' / 'N' / 'ASY'). Fail loudly on unknown labels instead;
    #genuinely missing values are excluded from the check and handled below.
    unknown_labels = set(X[column].dropna().unique()) - set(mapping)
    if unknown_labels:
        raise ValueError(
            f"Unmapped values in column {column!r}: "
            f"{sorted(map(str, unknown_labels))}"
        )
    X[column] = X[column].map(mapping)

#Fill in any missing values
X = X.fillna(0)

#Split data (80/20 hold-out, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Train RF model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

#Evaluate on the held-out rows
predictions = model.predict(X_test)
print(f"Model Accuracy: {accuracy_score(y_test, predictions)}")

#Save model as pickle
with open(MODEL_PATH, 'wb') as f:
    pickle.dump(model, f)


Correlation with HeartDisease by absolute value:
ST_Slope_Up         -0.622164
ST_Slope_Flat        0.554134
ExerciseAngina_Y     0.494282
Oldpeak              0.403951
ChestPainType_ATA   -0.401924
MaxHR               -0.400421
Sex_M                0.305445
Age                  0.282039
FastingBS            0.267291
Cholesterol         -0.232741
ChestPainType_NAP   -0.212964
RestingBP            0.107589
RestingECG_ST        0.102527
RestingECG_Normal   -0.091580
ChestPainType_TA    -0.054790
Name: HeartDisease, dtype: float64
Model Accuracy: 0.7934782608695652
