#Import necessary libraries



In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import joblib
from sklearn.metrics import r2_score, mean_squared_error
import statsmodels.api as sm
#  (high-level, simple to use)
import plotly.express as px
# (low-level, highly customizable)
import plotly.graph_objects as go
from sklearn.metrics import roc_curve, auc

#  Load the dataset

In [2]:
# Download the dataset
data_path = "https://storage.googleapis.com/edulabs-public-datasets/heart_disease_uci.csv"

# Load the dataset into a Pandas DataFrame
df = pd.read_csv(data_path)


In [3]:
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


**Column description**

- id: unique id
- age: age in years
- sex: gender
- dataset: location of data collection
- cp: chest pain type
- trestbps: resting blood pressure
- chol: cholesterol measure
- fbs: fasting blood sugar
- restecg: ecg observation at resting condition
- thalch: maximum heart rate achieved
- exang: exercise induced angina
- oldpeak: ST depression induced by exercise relative to rest
- slope: the slope of the peak exercise ST segment
- ca: number of major vessels (0-3) colored by flourosopy
- thal: thal
- num: target [0=no heart disease; 1,2,3,4 = stages of heart disease ]

# Data preprocessing

Perform quick data preprocessing:

- Remove redundant columns
- Fill / Drop missing values
- Convert column types if needed

In [4]:
df.shape

(920, 16)

In [5]:
df.drop(columns=['id'], inplace=True)
df.drop(columns=['dataset'], inplace=True)

In [6]:
df.isnull().sum()

Unnamed: 0,0
age,0
sex,0
cp,0
trestbps,59
chol,30
fbs,90
restecg,2
thalch,55
exang,55
oldpeak,62


In [7]:
df.dropna(subset=['trestbps','chol','fbs','thalch','exang','oldpeak','restecg'], inplace=True)
df.isnull().sum()

Unnamed: 0,0
age,0
sex,0
cp,0
trestbps,0
chol,0
fbs,0
restecg,0
thalch,0
exang,0
oldpeak,0


In [8]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
def plot_feature_target_scatter(df, features, target_variable):
    """
    Displays a figure with multiple scatter plots showing the correlation
    between each feature and the target variable.

    Args:
        df (pd.DataFrame): DataFrame containing features and target variable.
        features (list): List of column names to be considered as features.
        target_variable (str): Name of the target variable column.
    """

    num_features = len(features)
    if num_features == 0:
        print("No features provided to plot.")
        return

    # Determine subplot grid layout (adjust as needed for better layout)
    if num_features <= 2:
        rows = 1
        cols = num_features
    elif num_features <= 4:
        rows = 2
        cols = 2
    else:
        rows = (num_features + 1) // 3  # Adjust columns for more features
        cols = 3

    fig = make_subplots(rows=rows, cols=cols,
                        subplot_titles=[f'Feature vs. Target: {feature}' for feature in features])

    for i, feature in enumerate(features):
        row_index = (i // cols) + 1
        col_index = (i % cols) + 1

        scatter_trace = go.Scatter(
            x=df[feature],
            y=df[target_variable],
            mode='markers',
            marker=dict(color='blue', size=5),
            name=feature
        )
        fig.add_trace(scatter_trace, row=row_index, col=col_index)

        fig.update_xaxes(title_text=feature, row=row_index, col=col_index)
        fig.update_yaxes(title_text=target_variable, row=row_index, col=col_index)

    fig.update_layout(title_text="Feature vs Target Variable Scatter Plots", showlegend=False)
    fig.show()

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 740 entries, 0 to 919
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       740 non-null    int64  
 1   sex       740 non-null    object 
 2   cp        740 non-null    object 
 3   trestbps  740 non-null    float64
 4   chol      740 non-null    float64
 5   fbs       740 non-null    object 
 6   restecg   740 non-null    object 
 7   thalch    740 non-null    float64
 8   exang     740 non-null    object 
 9   oldpeak   740 non-null    float64
 10  slope     531 non-null    object 
 11  ca        303 non-null    float64
 12  thal      400 non-null    object 
 13  num       740 non-null    int64  
dtypes: float64(5), int64(2), object(7)
memory usage: 86.7+ KB


In [10]:
plot_feature_target_scatter(df, df.select_dtypes(['number','bool']).columns.drop('num'), 'num')

In [11]:
df = df[df['trestbps'] != 0]
df = df[df['chol'] != 0]

In [12]:
plot_feature_target_scatter(df, df.select_dtypes(['number','bool']).columns.drop('num'), 'num')

In [13]:
df['is_male'] = df['sex'].map({'Male': True, 'Female': False})
df.drop(columns=['sex'], inplace=True)

In [14]:
df['exang'] = df['exang'].astype('bool')
df['fbs'] = df['fbs'].astype('bool')
df['age'] = df['age'].astype('int32')

In [15]:
df['is_heart_disease'] = df['num'].apply(lambda x: 1 if x > 0 else 0)
df['is_heart_disease'] = df['is_heart_disease'].astype('bool')
df.drop(columns=['num'], inplace=True)

In [16]:
df['slope'] = df['slope'].fillna('no_data')
df['thal'] = df['thal'].fillna('no_data')
df['ca'] = df['ca'].fillna(0)

In [17]:
fig = px.histogram(
    df,
    x='is_heart_disease',
    # histnorm='percent',
    width=600
)
fig.show()

# Correlations

In [18]:
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()

In [19]:
corr_matrix['is_heart_disease'].abs().sort_values(ascending=False)

Unnamed: 0,is_heart_disease
is_heart_disease,1.0
exang,0.504639
oldpeak,0.474658
thalch,0.371759
is_male,0.285236
age,0.281379
ca,0.261443
trestbps,0.178637
fbs,0.135589
chol,0.116613


# Data split

Split the data into training and validation sets. Don't forget to use stratify!

In [20]:
df.columns

Index(['age', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalch', 'exang',
       'oldpeak', 'slope', 'ca', 'thal', 'is_male', 'is_heart_disease'],
      dtype='object')

In [21]:
X = df.drop(columns=['is_heart_disease'])
y = df['is_heart_disease']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=101, stratify=y)

# Create model / pipeline

Implement code that trains LogisticRegression model

Don't forget to normalize data!



In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 661 entries, 0 to 919
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               661 non-null    int32  
 1   cp                661 non-null    object 
 2   trestbps          661 non-null    float64
 3   chol              661 non-null    float64
 4   fbs               661 non-null    bool   
 5   restecg           661 non-null    object 
 6   thalch            661 non-null    float64
 7   exang             661 non-null    bool   
 8   oldpeak           661 non-null    float64
 9   slope             661 non-null    object 
 10  ca                661 non-null    float64
 11  thal              661 non-null    object 
 12  is_male           661 non-null    bool   
 13  is_heart_disease  661 non-null    bool   
dtypes: bool(4), float64(5), int32(1), object(4)
memory usage: 56.8+ KB


In [23]:
df

Unnamed: 0,age,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,is_male,is_heart_disease
0,63,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,True,False
1,67,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,True,True
2,67,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,True,True
3,37,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,True,False
4,41,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,62,asymptomatic,158.0,170.0,False,st-t abnormality,138.0,True,0.0,no_data,0.0,no_data,True,True
914,46,asymptomatic,134.0,310.0,False,normal,126.0,False,0.0,no_data,0.0,normal,True,True
915,54,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,no_data,0.0,no_data,False,True
917,55,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,no_data,0.0,fixed defect,True,True


In [24]:
X_train.head(10)

Unnamed: 0,age,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,is_male
533,45,asymptomatic,130.0,219.0,False,st-t abnormality,130.0,True,1.0,flat,0.0,no_data,True
798,60,asymptomatic,142.0,216.0,False,normal,110.0,True,2.5,flat,0.0,no_data,True
127,54,asymptomatic,110.0,239.0,False,normal,126.0,True,2.8,flat,1.0,reversable defect,True
356,42,atypical angina,120.0,196.0,False,normal,150.0,False,0.0,no_data,0.0,no_data,True
358,42,atypical angina,150.0,268.0,False,normal,136.0,False,0.0,no_data,0.0,no_data,True
381,46,asymptomatic,130.0,238.0,False,normal,90.0,False,0.0,no_data,0.0,no_data,False
172,59,asymptomatic,174.0,249.0,False,normal,143.0,True,0.0,flat,0.0,normal,False
390,47,typical angina,110.0,249.0,False,normal,150.0,False,0.0,no_data,0.0,no_data,True
569,53,asymptomatic,180.0,285.0,False,st-t abnormality,120.0,True,1.5,flat,0.0,no_data,True
736,63,asymptomatic,170.0,177.0,False,normal,84.0,True,2.5,downsloping,0.0,no_data,True


In [25]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 561 entries, 533 to 230
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       561 non-null    int32  
 1   cp        561 non-null    object 
 2   trestbps  561 non-null    float64
 3   chol      561 non-null    float64
 4   fbs       561 non-null    bool   
 5   restecg   561 non-null    object 
 6   thalch    561 non-null    float64
 7   exang     561 non-null    bool   
 8   oldpeak   561 non-null    float64
 9   slope     561 non-null    object 
 10  ca        561 non-null    float64
 11  thal      561 non-null    object 
 12  is_male   561 non-null    bool   
dtypes: bool(3), float64(5), int32(1), object(4)
memory usage: 47.7+ KB


In [26]:
# הגדרת תכונות קטגוריות ומספריות
categorical_features = ['cp', 'restecg', 'slope', 'thal']
numerical_features = df.select_dtypes(include=['number']).columns.tolist()

# טרנספורמציה עבור תכונות מספריות
num_transformers = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # טיפול ב-NaN
    ('scaler', StandardScaler())
])

# טרנספורמציה עבור תכונות קטגוריות
cat_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

# שילוב הכל ב-ColumnTransformer
preprocessor = ColumnTransformer([
    ('cat', cat_transformer, categorical_features),
    ('num', num_transformers, numerical_features)
])

# בניית פייפליין מלא
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_regression, k=17)),  # להפעיל רק אם רוצים בחירת מאפיינים
    ('regressor', LogisticRegression())
])

In [27]:
model = pipeline.fit(X_train, y_train)

# Evaluate the model

- display various evaluation metrics
- find the best threshold
- save the best model in file

In [28]:
model.predict(X_val)

array([False,  True,  True,  True, False, False,  True, False, False,
       False, False, False, False, False, False, False,  True,  True,
        True, False, False,  True,  True, False,  True, False, False,
       False, False,  True,  True, False, False, False,  True,  True,
        True, False, False,  True, False, False,  True, False, False,
       False, False, False, False, False,  True,  True,  True,  True,
        True, False,  True, False,  True,  True, False,  True,  True,
       False, False, False, False,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True, False,
        True, False,  True,  True, False, False,  True,  True, False,
       False,  True, False, False,  True, False, False, False,  True,
        True])

In [29]:
model.predict_proba(X_val)

array([[0.8548674 , 0.1451326 ],
       [0.04211208, 0.95788792],
       [0.23086488, 0.76913512],
       [0.27551055, 0.72448945],
       [0.92712619, 0.07287381],
       [0.87076301, 0.12923699],
       [0.1784793 , 0.8215207 ],
       [0.52081806, 0.47918194],
       [0.90558225, 0.09441775],
       [0.73485968, 0.26514032],
       [0.73066468, 0.26933532],
       [0.66808918, 0.33191082],
       [0.90037579, 0.09962421],
       [0.92417131, 0.07582869],
       [0.69947261, 0.30052739],
       [0.86692329, 0.13307671],
       [0.22452777, 0.77547223],
       [0.04808412, 0.95191588],
       [0.21408018, 0.78591982],
       [0.76029429, 0.23970571],
       [0.65475575, 0.34524425],
       [0.36962983, 0.63037017],
       [0.07005239, 0.92994761],
       [0.81409093, 0.18590907],
       [0.13213554, 0.86786446],
       [0.90859763, 0.09140237],
       [0.57907475, 0.42092525],
       [0.65007262, 0.34992738],
       [0.80294988, 0.19705012],
       [0.0989988 , 0.9010012 ],
       [0.

In [30]:
metrics.confusion_matrix(y_val, model.predict(X_val))

array([[44,  8],
       [10, 38]])

In [31]:
pd.DataFrame(metrics.confusion_matrix(y_val, model.predict(X_val),normalize='true'),
             columns=['Predicted Heart Disease (0)', 'Predicted Heart Disease (1)'],
             index=['Actual Heart Disease (0)', 'Actual Heart Disease (1)'])

Unnamed: 0,Predicted Heart Disease (0),Predicted Heart Disease (1)
Actual Heart Disease (0),0.846154,0.153846
Actual Heart Disease (1),0.208333,0.791667


In [32]:
print(metrics.classification_report(y_val, model.predict(X_val)))

              precision    recall  f1-score   support

       False       0.81      0.85      0.83        52
        True       0.83      0.79      0.81        48

    accuracy                           0.82       100
   macro avg       0.82      0.82      0.82       100
weighted avg       0.82      0.82      0.82       100



In [33]:
preds = model.predict_proba(X_val)[:,1] >= 0.5
metrics.confusion_matrix(y_val, preds)

array([[44,  8],
       [10, 38]])

In [34]:
probabilities = model.predict_proba(X_val)[:,1]
fpr, tpr, thresholds = roc_curve(y_val, probabilities)
roc_auc = auc(fpr, tpr)

In [35]:
# prompt: display roc curve using plotly with threshold values as tooltips

fig = go.Figure()

# Add the ROC curve
fig.add_trace(go.Scatter(
    x=fpr,
    y=tpr,
    mode='lines',
    name=f'ROC Curve (AUC = {roc_auc:.2f})',
    hovertemplate = 'False Positive Rate: %{x:.3f}<br>True Positive Rate: %{y:.3f}<br>Threshold: %{text:.3f}<extra></extra>',
    text = thresholds
))

# Add a diagonal line for random guessing
fig.add_trace(go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    line=dict(dash='dash'),
    name='Random Guessing'
))


fig.update_layout(
    title='Receiver Operating Characteristic (ROC) Curve',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    width=800,
    height=600
)

fig.show()


In [36]:
import joblib
joblib.dump(pipeline, 'model_heart_disease.pkl')

['model_heart_disease.pkl']

# Predict

Implement function ```predict_heart_dicease()```

The function receives the following parameters: ```age, sex, chest_pain_type,...``` and others.

The funciton returns ```True``` if heart disease is predicted, ```False``` otherwise.

Note, your function should perform all the needed transformations to the data, and return the final answer.

Your function should load the stored model from the file.

In [37]:
def predict_heart_disease(age, cp, trestbps, chol, fbs, restecg, thalch, exang, oldpeak, slope, ca, thal, is_male):
    """
    טוען את המודל ומשתמש בנתונים כדי לבצע חיזוי של מחלת לב.

    פרמטרים:
    age (int): גיל
    cp (str): סוג כאב חזה ('asymptomatic', 'typical angina' וכו')
    trestbps (float): לחץ דם במנוחה
    chol (float): רמת כולסטרול
    fbs (bool): האם רמת הסוכר בצום גבוהה מ-120? (True/False)
    restecg (str): תוצאות ECG במנוחה ('normal', 'st-t abnormality' וכו')
    thalch (float): דופק מרבי
    exang (bool): האם מופיעה אנגינה בזמן מאמץ? (True/False)
    oldpeak (float): דיכוי ST ביחס למנוחה
    slope (str): שיפוע מקטע ST ('upsloping', 'flat', 'downsloping')
    ca (float): מספר כלי דם מוצרים (0-3, יכול להיות NaN)
    thal (str): סוג תליום ('normal', 'fixed defect', 'reversable defect')
    is_male (bool): האם המטופל גבר? (True/False)

    מחזיר:
    int: 1 (חולה במחלת לב) או 0 (לא חולה)
    """

    # טעינת המודל
    loaded_model = joblib.load('model_heart_disease.pkl')

    # יצירת DataFrame עם הנתון
    data = pd.DataFrame([{
        'age': age,
        'cp': cp,
        'trestbps': trestbps,
        'chol': chol,
        'fbs': fbs,
        'restecg': restecg,
        'thalch': thalch,
        'exang': exang,
        'oldpeak': oldpeak,
        'slope': slope,
        'ca': ca,
        'thal': thal,
        'is_male': is_male
    }])

    # ביצוע חיזוי
    prediction = loaded_model.predict(data)

    return prediction[0]  # מחזיר את התוצאה (0 = לא חולה, 1 = חולה)



In [38]:
X_train

Unnamed: 0,age,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,is_male
533,45,asymptomatic,130.0,219.0,False,st-t abnormality,130.0,True,1.0,flat,0.0,no_data,True
798,60,asymptomatic,142.0,216.0,False,normal,110.0,True,2.5,flat,0.0,no_data,True
127,54,asymptomatic,110.0,239.0,False,normal,126.0,True,2.8,flat,1.0,reversable defect,True
356,42,atypical angina,120.0,196.0,False,normal,150.0,False,0.0,no_data,0.0,no_data,True
358,42,atypical angina,150.0,268.0,False,normal,136.0,False,0.0,no_data,0.0,no_data,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
826,50,asymptomatic,144.0,349.0,False,lv hypertrophy,120.0,True,1.0,upsloping,0.0,reversable defect,True
41,40,typical angina,140.0,199.0,False,normal,178.0,True,1.4,upsloping,0.0,reversable defect,True
563,51,non-anginal,135.0,160.0,False,normal,150.0,False,2.0,flat,0.0,no_data,True
463,55,atypical angina,140.0,196.0,False,normal,150.0,False,0.0,no_data,0.0,reversable defect,True


In [39]:
df

Unnamed: 0,age,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,is_male,is_heart_disease
0,63,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,True,False
1,67,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,True,True
2,67,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,True,True
3,37,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,True,False
4,41,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,62,asymptomatic,158.0,170.0,False,st-t abnormality,138.0,True,0.0,no_data,0.0,no_data,True,True
914,46,asymptomatic,134.0,310.0,False,normal,126.0,False,0.0,no_data,0.0,normal,True,True
915,54,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,no_data,0.0,no_data,False,True
917,55,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,no_data,0.0,fixed defect,True,True


In [40]:
result = predict_heart_disease(
    age=54,
    cp='asymptomatic',
    trestbps=130.0,
    chol=110.0,
    fbs=False,
    restecg='normal',
    thalch=130.0,
    exang=False,
    oldpeak=1.0,
    slope='flat',
    ca=0,
    thal='normal',
    is_male=True
)

print("Prediction:", "Heart Disease" if result == 1 else "No Heart Disease")

Prediction: No Heart Disease


In [41]:
result = predict_heart_disease(
    age=54,
    cp='asymptomatic',
    trestbps=130.0,
    chol=219.0,
    fbs=False,
    restecg='st-t abnormality',
    thalch=130.0,
    exang=True,
    oldpeak=1.0,
    slope='flat',
    ca=2,
    thal='reversable defect',
    is_male=True
)

print("Prediction:", "Heart Disease" if result == 1 else "No Heart Disease")


Prediction: Heart Disease


# Model interpretation

In [60]:
!pip freeze | grep scikit-learn

scikit-learn==1.6.1


In [59]:
!pip install --upgrade scikit-learn



In [86]:
pipeline

In [85]:
pipeline[:1]

In [77]:
columns = pipeline[:1].get_feature_names_out()

In [46]:
pd.log_reg.coef_

array([[-1.73904462, -1.338794  , -1.02182214, -0.50083966, -0.15725292,
         0.44115732, -0.25146176, -0.49031974, -0.29925007, -1.66297755,
         0.17131015,  0.1183599 ,  0.22761763,  0.15366961, -0.210177  ,
         0.62613859,  0.63760131]])

In [79]:
coefs = pd.DataFrame(pipeline[-1].coef_, columns=columns).transpose().rename(columns={0:'log-odds'})

In [None]:
coefs

In [81]:
coefs['odds'] = np.exp(coefs['log-odds'])

In [None]:
coefs