# Import necessary libraries



In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import joblib
from sklearn.metrics import r2_score, mean_squared_error
import statsmodels.api as sm
# Plotly Express (high-level, simple to use)
import plotly.express as px
# Plotly Graph Objects (low-level, highly customizable)
import plotly.graph_objects as go
from sklearn.metrics import roc_curve, auc

#  Load the dataset

In [None]:
# Public URL of the heart-disease CSV; pandas fetches it when read_csv is called
data_path = "https://storage.googleapis.com/edulabs-public-datasets/heart_disease_uci.csv"

# Load the dataset into a Pandas DataFrame
df = pd.read_csv(data_path)


In [None]:
df.head()

**Column description**

- id: unique id
- age: age in years
- sex: gender
- dataset: location of data collection
- cp: chest pain type
- trestbps: resting blood pressure
- chol: cholesterol measure
- fbs: fasting blood sugar
- restecg: ecg observation at resting condition
- thalch: maximum heart rate achieved
- exang: exercise induced angina
- oldpeak: ST depression induced by exercise relative to rest
- slope: the slope of the peak exercise ST segment
- ca: number of major vessels (0-3) colored by fluoroscopy
- thal: thallium stress test result (normal, fixed defect, reversable defect)
- num: target [0 = no heart disease; 1, 2, 3, 4 = stages of heart disease]

# Data preprocessing

Perform quick data preprocessing:

- Remove redundant columns
- Fill / Drop missing values
- Convert column types if needed

In [None]:
df.shape

In [None]:
# The row identifier and the collection-site label are not used as predictors.
redundant_columns = ['id', 'dataset']
df.drop(columns=redundant_columns, inplace=True)

In [None]:
df.isnull().sum()

In [None]:
# Rows lacking any of these measurements are discarded rather than imputed.
required_columns = ['trestbps', 'chol', 'fbs', 'thalch', 'exang', 'oldpeak', 'restecg']
df.dropna(subset=required_columns, inplace=True)

# Remaining missing-value count per column (displayed as the cell output).
df.isna().sum()

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
def _subplot_grid(num_features):
    """Return the (rows, cols) grid used to lay out ``num_features`` subplots."""
    if num_features <= 2:
        return 1, num_features
    if num_features <= 4:
        return 2, 2
    cols = 3
    # Ceiling division: a partially filled last row still needs a row of its
    # own (e.g. 7 features -> 3 rows, not 2).
    rows = -(-num_features // cols)
    return rows, cols


def plot_feature_target_scatter(df, features, target_variable):
    """
    Displays a figure with one scatter plot per feature, each showing that
    feature (x axis) against the target variable (y axis).

    Up to two features are placed on a single row, three or four on a 2x2
    grid, and anything larger on a three-column grid with as many rows as
    needed.

    Args:
        df (pd.DataFrame): DataFrame containing features and target variable.
        features (iterable of str): Column names to be considered as features
            (a list, a pandas Index, or any other iterable).
        target_variable (str): Name of the target variable column.

    Returns:
        None. The figure is rendered with ``fig.show()``; when no features are
        given a message is printed and nothing is drawn.
    """
    # Materialise once: callers pass a pandas Index, whose truth value is
    # ambiguous, and an arbitrary iterable could otherwise be consumed twice.
    features = list(features)
    num_features = len(features)
    if num_features == 0:
        print("No features provided to plot.")
        return

    rows, cols = _subplot_grid(num_features)

    fig = make_subplots(rows=rows, cols=cols,
                        subplot_titles=[f'Feature vs. Target: {feature}' for feature in features])

    for i, feature in enumerate(features):
        # Fill the grid row by row; plotly subplot coordinates are 1-based.
        row_index = (i // cols) + 1
        col_index = (i % cols) + 1

        scatter_trace = go.Scatter(
            x=df[feature],
            y=df[target_variable],
            mode='markers',
            marker=dict(color='blue', size=5),
            name=feature
        )
        fig.add_trace(scatter_trace, row=row_index, col=col_index)

        fig.update_xaxes(title_text=feature, row=row_index, col=col_index)
        fig.update_yaxes(title_text=target_variable, row=row_index, col=col_index)

    fig.update_layout(title_text="Feature vs Target Variable Scatter Plots", showlegend=False)
    fig.show()

In [None]:
df.info()

In [None]:
plot_feature_target_scatter(df, df.select_dtypes(['number','bool']).columns.drop('num'), 'num')

In [None]:
# Rows with a resting blood pressure or cholesterol of 0 are removed
# (presumably 0 stands in for a missing measurement - confirm against the data source).
valid_readings = (df['trestbps'] != 0) & (df['chol'] != 0)

# .copy() yields an independent frame, so the column assignments made in the
# following cells do not trigger pandas' SettingWithCopyWarning.
df = df[valid_readings].copy()

In [None]:
plot_feature_target_scatter(df, df.select_dtypes(['number','bool']).columns.drop('num'), 'num')

In [None]:
# Replace the textual 'sex' column with a boolean flag; pop() removes the
# source column in the same step that reads it.
sex_to_flag = {'Male': True, 'Female': False}
df['is_male'] = df.pop('sex').map(sex_to_flag)

In [None]:
# Cast the yes/no indicators to bool and the age to a compact integer type.
column_types = {'exang': 'bool', 'fbs': 'bool', 'age': 'int32'}
df = df.astype(column_types)

In [None]:
# Collapse the multi-stage 'num' score into a binary target: any value above 0
# counts as heart disease. The comparison is vectorised and already produces a
# boolean column, so no per-row lambda or extra cast is needed.
df['is_heart_disease'] = df['num'] > 0
df.drop(columns=['num'], inplace=True)

In [None]:
# Missing categorical values get their own 'no_data' label; a missing vessel
# count is recorded as 0.
fill_values = {'slope': 'no_data', 'thal': 'no_data', 'ca': 0}
for column, filler in fill_values.items():
    df[column] = df[column].fillna(filler)

In [None]:
# Class balance of the binary target.
fig = px.histogram(df, x='is_heart_disease', width=600)
fig.show()

# Correlations

In [None]:
# Pairwise correlations over the numeric and boolean columns (target included).
numeric_and_bool = df.select_dtypes(include=['number', 'bool'])
corr_matrix = numeric_and_bool.corr()

In [None]:
corr_matrix['is_heart_disease'].abs().sort_values(ascending=False)

# Data split

Split the data into training and validation sets. Don't forget to use stratify!

In [None]:
df.columns

In [None]:
# Separate the target from the predictors, then hold out 15% for validation.
# Stratifying on y keeps the class proportions equal in both splits.
target_column = 'is_heart_disease'
y = df[target_column]
X = df.drop(columns=[target_column])
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=101,
    stratify=y,
)

# Create model / pipeline

Implement code that trains a LogisticRegression model

Don't forget to normalize data!



In [None]:
df.info()

In [None]:
df

In [None]:
X_train.head(10)

In [None]:
X_train.info()

In [None]:
from sklearn.feature_selection import f_classif

# Column groups. The numeric columns are taken from the training features
# (not the full frame), so the target can never end up among the inputs.
categorical_features = ['cp', 'restecg', 'slope', 'thal']
numerical_features = X_train.select_dtypes(include=['number']).columns.tolist()
# select_dtypes('number') skips boolean columns, and ColumnTransformer drops
# every column that is not routed to a transformer, so the boolean flags must
# be listed explicitly or the model never sees them.
boolean_features = ['fbs', 'exang', 'is_male']

# Numeric features: impute remaining NaNs with the column mean, then standardise.
num_transformers = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical features: one-hot encode, dropping the first level of each column;
# categories unseen during fit are ignored instead of raising.
cat_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

# Combine everything in a ColumnTransformer. The boolean flags are already 0/1,
# so they are passed through unchanged.
preprocessor = ColumnTransformer([
    ('cat', cat_transformer, categorical_features),
    ('num', num_transformers, numerical_features),
    ('bool', 'passthrough', boolean_features)
])

# Full pipeline: preprocessing -> feature selection -> classifier.
# The selection step is optional (use it only if feature selection is wanted);
# f_classif is the scorer meant for a classification target.
# NOTE(review): k must not exceed the number of encoded columns, otherwise
# SelectKBest raises at fit time.
# The final step keeps its original name 'regressor' so existing
# named_steps / set_params references continue to work.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_classif, k=17)),
    ('regressor', LogisticRegression())
])

In [None]:
# Fit preprocessing, feature selection and the classifier on the training split only.
model = pipeline.fit(X_train, y_train)

# Evaluate the model

- display various evaluation metrics
- find the best threshold
- save the best model in file

In [None]:
model.predict(X_val)

In [None]:
model.predict_proba(X_val)

In [None]:
metrics.confusion_matrix(y_val, model.predict(X_val))

In [None]:
# Confusion matrix normalised over the true labels, wrapped in a labelled
# DataFrame for readability (rows = actual class, columns = predicted class).
normalized_cm = metrics.confusion_matrix(y_val, model.predict(X_val), normalize='true')
pd.DataFrame(
    normalized_cm,
    index=['Actual Heart Disease (0)', 'Actual Heart Disease (1)'],
    columns=['Predicted Heart Disease (0)', 'Predicted Heart Disease (1)'],
)

In [None]:
print(metrics.classification_report(y_val, model.predict(X_val)))

In [None]:
# Classify with an explicit 0.5 cut-off on the positive-class probability,
# so the threshold can be varied by editing a single number.
positive_proba = model.predict_proba(X_val)[:, 1]
preds = positive_proba >= 0.5
metrics.confusion_matrix(y_val, preds)

In [None]:
# Positive-class probabilities on the validation set, the ROC points computed
# from them, and the area under that curve.
probabilities = model.predict_proba(X_val)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_val, probabilities)
roc_auc = metrics.auc(fpr, tpr)

In [None]:
# Interactive ROC curve: hovering over a point shows the threshold that produced it.
roc_trace = go.Scatter(
    x=fpr,
    y=tpr,
    mode='lines',
    name=f'ROC Curve (AUC = {roc_auc:.2f})',
    text=thresholds,
    hovertemplate='False Positive Rate: %{x:.3f}<br>True Positive Rate: %{y:.3f}<br>Threshold: %{text:.3f}<extra></extra>',
)

# Dashed diagonal as the random-guessing reference line.
chance_trace = go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    line=dict(dash='dash'),
    name='Random Guessing',
)

fig = go.Figure(data=[roc_trace, chance_trace])
fig.update_layout(
    title='Receiver Operating Characteristic (ROC) Curve',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    width=800,
    height=600,
)
fig.show()


In [None]:
import joblib
# Persist the whole pipeline (preprocessing + feature selection + classifier)
# as one artifact, so it can be reloaded and applied to raw feature rows.
# `pipeline` is the object fitted earlier via `model = pipeline.fit(...)`;
# presumably `model` is this same object, since scikit-learn's fit() returns self.
joblib.dump(pipeline, 'model_heart_disease.pkl')

# Predict

Implement function ```predict_heart_disease()```

The function receives the following parameters: ```age, sex, chest_pain_type,...``` and others.

The function returns ```True``` if heart disease is predicted, ```False``` otherwise.

Note, your function should perform all the needed transformations to the data, and return the final answer.

Your function should load the stored model from the file.

In [None]:
def predict_heart_disease(age, cp, trestbps, chol, fbs, restecg, thalch, exang, oldpeak, slope, ca, thal, is_male,
                          model_path='model_heart_disease.pkl'):
    """
    Load the saved model and predict heart disease for a single patient.

    Args:
        age (int): Age in years.
        cp (str): Chest pain type ('asymptomatic', 'typical angina', etc.).
        trestbps (float): Resting blood pressure.
        chol (float): Cholesterol level.
        fbs (bool): Whether fasting blood sugar is above 120.
        restecg (str): Resting ECG result ('normal', 'st-t abnormality', etc.).
        thalch (float): Maximum heart rate achieved.
        exang (bool): Whether exercise induces angina.
        oldpeak (float): ST depression induced by exercise relative to rest.
        slope (str): Slope of the peak exercise ST segment
            ('upsloping', 'flat', 'downsloping'); may be None/NaN.
        ca (float): Number of major vessels (0-3); may be None/NaN.
        thal (str): Thallium test result
            ('normal', 'fixed defect', 'reversable defect'); may be None/NaN.
        is_male (bool): Whether the patient is male.
        model_path (str): File holding the model saved with joblib. Defaults
            to the file written by this notebook.

    Returns:
        bool: True if heart disease is predicted, False otherwise. The value
        still compares equal to 1 / 0, so ``result == 1`` checks keep working.
    """

    # Load the stored model.
    # NOTE: joblib files are pickles - only load model files you trust.
    loaded_model = joblib.load(model_path)

    # Mirror the cleaning applied to the training data: a missing slope/thal
    # was labelled 'no_data' and a missing vessel count was recorded as 0.
    slope = 'no_data' if pd.isna(slope) else slope
    thal = 'no_data' if pd.isna(thal) else thal
    ca = 0 if pd.isna(ca) else ca

    # Single-row DataFrame using the column names the model was trained on.
    data = pd.DataFrame([{
        'age': age,
        'cp': cp,
        'trestbps': trestbps,
        'chol': chol,
        'fbs': fbs,
        'restecg': restecg,
        'thalch': thalch,
        'exang': exang,
        'oldpeak': oldpeak,
        'slope': slope,
        'ca': ca,
        'thal': thal,
        'is_male': is_male
    }])

    # Run the prediction.
    prediction = loaded_model.predict(data)

    # Convert the NumPy scalar into a plain Python bool.
    return bool(prediction[0])



In [None]:
X_train

In [None]:
df

In [None]:
# Example patient: no exercise-induced angina, normal resting ECG and thal result.
patient = {
    'age': 54,
    'cp': 'asymptomatic',
    'trestbps': 130.0,
    'chol': 110.0,
    'fbs': False,
    'restecg': 'normal',
    'thalch': 130.0,
    'exang': False,
    'oldpeak': 1.0,
    'slope': 'flat',
    'ca': 0,
    'thal': 'normal',
    'is_male': True,
}
result = predict_heart_disease(**patient)

print("Prediction:", "Heart Disease" if result == 1 else "No Heart Disease")

In [None]:
# Example patient: exercise-induced angina, abnormal resting ECG, two affected
# vessels and a reversable defect.
patient = {
    'age': 54,
    'cp': 'asymptomatic',
    'trestbps': 130.0,
    'chol': 219.0,
    'fbs': False,
    'restecg': 'st-t abnormality',
    'thalch': 130.0,
    'exang': True,
    'oldpeak': 1.0,
    'slope': 'flat',
    'ca': 2,
    'thal': 'reversable defect',
    'is_male': True,
}
result = predict_heart_disease(**patient)

print("Prediction:", "Heart Disease" if result == 1 else "No Heart Disease")
