In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import pickle


In [3]:
# openpyxl is the engine pandas uses to read .xlsx files.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel,
# and the version is pinned so a re-run resolves the same package.
%pip install -q openpyxl==3.1.5


Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [openpyxl]1/2[0m [openpyxl]
[1A[2KSuccessfully installed et-xmlfile-2.0.0 openpyxl-3.1.5


In [4]:
# Read the heart-disease workbook into a DataFrame.
df = pd.read_excel(io="heart_cleveland_upload.xlsx")

# Sanity check: report the dimensions and column names, then preview the first rows.
print("Shape:", df.shape)
print("Columns:", list(df.columns))
df.head(n=5)


Shape: (297, 14)
Columns: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'condition']


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0


In [5]:
# Make sure the .xlsx reader used by pandas is present in this kernel's environment.
# `%pip` targets the running kernel (unlike `!pip`), and the pinned version keeps
# re-runs reproducible; the command is a no-op when the package is already installed.
%pip install -q openpyxl==3.1.5




# Heart Disease Prediction – Training on AWS SageMaker

## Problem Definition
The goal of this project is to predict whether a patient has heart disease 
based on clinical features. We are retraining the model directly in AWS 
SageMaker to ensure compatibility with the environment’s libraries 
(scikit-learn 1.7.1). This avoids version mismatch warnings that occurred 
when loading pre-trained models.


In [6]:
import pandas as pd

# Read the Excel workbook that holds the heart-disease records.
df = pd.read_excel(io="heart_cleveland_upload.xlsx")

# Report the frame's dimensions and column names, then show the first five rows.
print("Shape:", df.shape)
print("Columns:", list(df.columns))
df.head(n=5)


Shape: (297, 14)
Columns: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'condition']


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0


## Data Preparation
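We separate the features (X) from the target variable (y, the `condition` column), 
then split into training and testing sets (80/20), stratified on the target so both sets keep the same class balance. 
Next, we standardize the features; the scaler is fit on the training set only and then applied to the test set, so no test-set information leaks into preprocessing.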


In [9]:
from sklearn.model_selection import train_test_split

# The `condition` column is the prediction target; every other column is a model input.
y = df["condition"]
X = df.drop(columns=["condition"])

# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio the
# same in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [10]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training rows only...
scaler = StandardScaler()
scaler.fit(X_train)

# ...then apply that same transformation to both partitions.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


## Model Training
We use Logistic Regression as the primary model. 
This is a strong baseline for binary classification problems like heart disease.


In [11]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the iteration cap raised to 1000 and a fixed seed,
# fitted on the standardized training features.
model = LogisticRegression(random_state=42, max_iter=1000).fit(
    X_train_scaled, y_train
)

# Show the fitted estimator as the cell output.
model


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [12]:
# Score the fitted classifier on the held-out, standardized test partition
# and report the result rounded to four decimal places.
accuracy = model.score(X=X_test_scaled, y=y_test)
print("Test Accuracy:", round(accuracy, ndigits=4))


Test Accuracy: 0.9167


## Save Model and Scaler
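We persist both the trained model and the scaler using pickle (`model.pkl` and `scaler.pkl`), 
so they can be reused in the Flask app, Docker container, and Kubernetes deployments. 
The two files belong together: the model expects inputs transformed by this exact scaler, 
and both should be loaded under the same scikit-learn version used here to avoid the version mismatch warnings described above.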


In [13]:
import pickle

# Serialize each fitted object to its own pickle file in the working directory.
for filename, artifact in (("model.pkl", model), ("scaler.pkl", scaler)):
    with open(filename, "wb") as fh:
        pickle.dump(artifact, fh)

print("✅ Model and scaler saved successfully.")


✅ Model and scaler saved successfully.


## Test Prediction
We run a sample patient input through the scaler and model 
to confirm predictions work inside SageMaker.


In [14]:
# One hand-written patient record: 13 values, listed in the same order as X's columns.
sample_values = [63, 1, 3, 145, 233, 1, 0, 150, 0, 2.3, 0, 0, 1]
sample = pd.DataFrame([sample_values], columns=X.columns)

# Apply the fitted scaler first, then ask the model for the class of the single row.
sample_scaled = scaler.transform(sample)
prediction = model.predict(sample_scaled)[0]

# Class 1 is reported as disease; anything else as no disease.
label = "Heart Disease" if prediction == 1 else "No Heart Disease"
print("Prediction:", label)


Prediction: No Heart Disease
