In [6]:
# Import necessary libraries
import pandas as pd
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import zscore
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder



In [7]:
# Load the dataset; the relative path is resolved against the kernel's working directory
df = pd.read_csv(filepath_or_buffer='heart.csv')

In [8]:
# Numerical columns that get a z-score
numeric_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# Apply scipy's zscore to each selected column independently (axis=0 -> per column)
z_scores = df.loc[:, numeric_cols].apply(zscore, axis=0)

# Preview: z-scores for the first five rows
print(z_scores.head())


        age  trestbps      chol   thalach   oldpeak
0 -0.268437 -0.377636 -0.659332  0.821321 -0.060888
1 -0.158157  0.479107 -0.833861  0.255968  1.727137
2  1.716595  0.764688 -1.396233 -1.048692  1.301417
3  0.724079  0.936037 -0.833861  0.516900 -0.912329
4  0.834359  0.364875  0.930822 -1.874977  0.705408


In [None]:
# Columns treated as categorical by both encodings below
categorical_columns = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']  # specify relevant columns

# Label encoding: `assign` returns a new frame, so `df` itself is left untouched.
# A single encoder instance is re-fit for every column in turn.
label_encoder = LabelEncoder()
label_encoded_data = df.assign(
    **{
        column: label_encoder.fit_transform(df[column])
        for column in categorical_columns
    }
)

# One-hot encoding: `get_dummies` also returns a new frame in which each
# categorical column is replaced by one indicator column per distinct value.
one_hot_encoded_data = pd.get_dummies(df, columns=categorical_columns)

# Preview the first rows of each encoded dataset
print("Label Encoded Data:")
print(label_encoded_data.head())

print("\nOne-Hot Encoded Data:")
print(one_hot_encoded_data.head())


Label Encoded Data:
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0      155      1      3.1      0   
2   70    1   0       145   174    0        1      125      1      2.6      0   
3   61    1   0       148   203    0        1      161      0      0.0      2   
4   62    0   0       138   294    1        1      106      0      1.9      1   

   ca  thal  target  
0   2     3       0  
1   0     3       0  
2   0     3       0  
3   1     3       0  
4   3     2       0  

One-Hot Encoded Data:
   age  trestbps  chol  thalach  oldpeak  target  sex_0  sex_1  cp_0   cp_1  \
0   52       125   212      168      1.0       0  False   True  True  False   
1   53       140   203      155      3.1       0  False   True  True  False   
2   70       145   174      125      2.6       0  False   True  True  False   
3   61       148   203

In [12]:
# Step 4: Standardise the numerical columns
numeric_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
scaler = StandardScaler()

# NOTE(review): the scaler is fit on every row of `df`, and the train/test
# split happens in a later cell, so test rows contribute to the scaling
# statistics. Confirm this is acceptable, or fit on the training rows only.
scaler.fit(df[numeric_cols])

# Overwrite the numeric columns of `df` with their standardised values
df[numeric_cols] = scaler.transform(df[numeric_cols])

# Preview the scaled values for the first five rows
print(df[numeric_cols].head())


        age  trestbps      chol   thalach   oldpeak
0 -0.268437 -0.377636 -0.659332  0.821321 -0.060888
1 -0.158157  0.479107 -0.833861  0.255968  1.727137
2  1.716595  0.764688 -1.396233 -1.048692  1.301417
3  0.724079  0.936037 -0.833861  0.516900 -0.912329
4  0.834359  0.364875  0.930822 -1.874977  0.705408


In [20]:
# Step 5: Build Classification Models (SVM, Logistic Regression, Random Forest)
# Features are every column of `df` except the label column `target`.
# NOTE(review): X is built from `df`, not from `label_encoded_data` or
# `one_hot_encoded_data` — confirm that training on the raw categorical
# codes is intended.
X = df.drop(columns='target')
y = df['target']

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models.
# Random Forest is seeded: without `random_state` its bootstrap samples and
# feature sub-sampling differ on every run, so the reported accuracy would
# not be reproducible after a kernel restart.
models = {
    'SVM': SVC(),
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=42),
}

# Train each model on the training split and record its test-set accuracy
accuracy_results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy_results[name] = accuracy_score(y_test, y_pred)

# Print accuracy results for each model
print("Accuracy without PCA:", accuracy_results)

Accuracy without PCA: {'SVM': 0.8341463414634146, 'Logistic Regression': 0.7951219512195122, 'Random Forest': 0.9853658536585366}


In [23]:
# Step 6: Apply PCA and Retrain the Best Model (Random Forest)
# Project the features onto 2 principal components. PCA is fit on the
# training split only, and the same projection is then applied to the test split.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Retrain Random Forest on the PCA-transformed data.
# Seeded so the accuracy below is reproducible across runs; an unseeded
# forest gives a slightly different model each time the cell is executed.
best_model = RandomForestClassifier(random_state=42)
best_model.fit(X_train_pca, y_train)
y_pred_pca = best_model.predict(X_test_pca)
pca_accuracy = accuracy_score(y_test, y_pred_pca)

# Print accuracy after PCA
print("Accuracy with PCA (Random Forest):", pca_accuracy)

Accuracy with PCA (Random Forest): 0.9853658536585366
