In [1]:
import pandas as pd
import numpy as np
from scipy.stats import zscore
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA

# 1. Load heart disease dataset
url = 'https://raw.githubusercontent.com/codebasics/py/master/ML/18_PCA/Exercise/heart.csv'
df = pd.read_csv(url)

# 2. Remove outliers using Z score
z_scores = np.abs(zscore(df.select_dtypes(include=[np.number])))
df = df[(z_scores < 3).all(axis=1)]

# 3. Convert text columns to numbers
# Label Encoding for binary categorical variables
label_enc = LabelEncoder()
binary_cols = ['Sex', 'ExerciseAngina']  # Example of binary columns
for col in binary_cols:
    df[col] = label_enc.fit_transform(df[col])

# One Hot Encoding for other categorical variables
df = pd.get_dummies(df, columns=['ChestPainType', 'RestingECG', 'ST_Slope'])

# 4. Apply scaling
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df.drop('HeartDisease', axis=1))
X = pd.DataFrame(scaled_features, columns=df.drop('HeartDisease', axis=1).columns)
y = df['HeartDisease']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Build classification models
models = {
    'SVM': SVC(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier()
}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy of {name}: {accuracy:.4f}')

# 6. Apply PCA and retrain models
pca = PCA(n_components=0.95)  # Keep 95% of the variance
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

for name, model in models.items():
    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy of {name} after PCA: {accuracy:.4f}')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = label_enc.fit_transform(df[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = label_enc.fit_transform(df[col])


Accuracy of SVM: 0.8778
Accuracy of Logistic Regression: 0.8833
Accuracy of Random Forest: 0.8833
Accuracy of SVM after PCA: 0.8778
Accuracy of Logistic Regression after PCA: 0.8889
Accuracy of Random Forest after PCA: 0.8667
