In [86]:
# importing the necessary libraries
import pandas as pd 
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline 
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from  sklearn.metrics import f1_score



##### loading the data 

In [88]:
df=pd.read_csv("data.xls")
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


#### Exploratory data analysis

In [90]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64  
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    int64  
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    int64  
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    int64  
 10  smoking                   299 non-null    int64  
 11  time                      299 non-null    int64  
 12  DEATH_EVENT               299 non-null    int64  
dtypes: float64(3), int64(10)
memory usage: 30.5 KB


In [91]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,299.0,60.833893,11.894809,40.0,51.0,60.0,70.0,95.0
anaemia,299.0,0.431438,0.496107,0.0,0.0,0.0,1.0,1.0
creatinine_phosphokinase,299.0,581.839465,970.287881,23.0,116.5,250.0,582.0,7861.0
diabetes,299.0,0.41806,0.494067,0.0,0.0,0.0,1.0,1.0
ejection_fraction,299.0,38.083612,11.834841,14.0,30.0,38.0,45.0,80.0
high_blood_pressure,299.0,0.351171,0.478136,0.0,0.0,0.0,1.0,1.0
platelets,299.0,263358.029264,97804.236869,25100.0,212500.0,262000.0,303500.0,850000.0
serum_creatinine,299.0,1.39388,1.03451,0.5,0.9,1.1,1.4,9.4
serum_sodium,299.0,136.625418,4.412477,113.0,134.0,137.0,140.0,148.0
sex,299.0,0.648829,0.478136,0.0,0.0,1.0,1.0,1.0


In [92]:
df["DEATH_EVENT"].value_counts(),print("the target column is not balanced")

the target column is not balanced


(DEATH_EVENT
 0    203
 1     96
 Name: count, dtype: int64,
 None)

#### feature engineering 

In [94]:
df["creatinine_ratio"] = df["serum_creatinine"] / df["serum_sodium"]
df["age_group"] = df["age"] // 10
df["risk_score"] = df["ejection_fraction"] * (df["serum_sodium"] / df["creatinine_phosphokinase"])

df['age_x_creatinine'] = df['age'] * df['serum_creatinine']
df['platelets_x_age'] = df['platelets'] * df['age']
df['bp_x_diabetes'] = df['high_blood_pressure'] * df['diabetes']
df['sex_x_smoking'] = df['sex'] * df['smoking']

df['creatinine_per_platelet'] = df['serum_creatinine'] / (df['platelets'] + 1e-5)
df['sodium_creatinine_ratio'] = df['serum_sodium'] / (df['serum_creatinine'] + 1e-5)

df['creatinine_per_time'] = df['serum_creatinine'] / (df['time'] + 1)
df['platelets_per_time'] = df['platelets'] / (df['time'] + 1)

df['anaemia_diabetes'] = ((df['anaemia'] == 1) & (df['diabetes'] == 1)).astype(int)
df['elderly_smoker'] = ((df['age'] > 65) & (df['smoking'] == 1)).astype(int)
df['high_risk_group'] = ((df['ejection_fraction'] < 30) & (df['serum_creatinine'] > 1.5)).astype(int)


#### preparing data for modeling 

In [96]:
# spliting the into features and target
X=df.drop("DEATH_EVENT", axis=1) 
y=df["DEATH_EVENT"]

# spliting data into train and test. This is done before any transformation to avoid data leakage 
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,stratify=y,random_state=42)

print(f"Value count of the target column after spliting: \n {y_train.value_counts()}")

# Apply SMOTE. this is just to balance our target column by increasing the minoriy sample
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

print(f" Value counts after applying SMOTE: \n{ y_resampled.value_counts()}")


#scaling the data,this so because we range of values 
scaler = StandardScaler()
scaler.fit(X_train)

# Transform your data
X_resampled_test = scaler.transform(X_resampled)
X_test = scaler.transform(X_test)  


Value count of the target column after spliting: 
 DEATH_EVENT
0    162
1     77
Name: count, dtype: int64
 Value counts after applying SMOTE: 
DEATH_EVENT
0    162
1    162
Name: count, dtype: int64


#### Modeling 

In [98]:
pipe = Pipeline([("scaler", StandardScaler()), ("Lgr", LogisticRegression())])

param_grid = {'Lgr__C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],"Lgr__penalty":["l2"], 
              "Lgr__solver":["liblinear"],"Lgr__max_iter":[1000],"Lgr__random_state":[42]}     
grid  = GridSearchCV(pipe,                     
        param_grid, scoring="f1", cv=5)            

grid.fit(X_resampled,y_resampled)
print("Best C:", grid.best_params_)
print("Best F1-score:", grid.best_score_)


# y_predict=model.predict(X_test)
# score=f1_score(y_predict, y_test)



Best C: {'Lgr__C': 1, 'Lgr__max_iter': 1000, 'Lgr__penalty': 'l2', 'Lgr__random_state': 42, 'Lgr__solver': 'liblinear'}
Best F1-score: 0.8448886628409771


In [114]:
model = LogisticRegression(penalty= "l2",C=1,solver="liblinear",max_iter=1000,random_state=42)
model.fit(X_resampled_test,y_resampled)
y_predict=model.predict(X_test)
score=f1_score(y_predict, y_test)                   
score                                        

0.6486486486486487

### 📌 Why `grid.best_score_` and Final `f1_score` Are Different

The difference between the F1-score obtained from `GridSearchCV` (`grid.best_score_`) and the F1-score calculated on the test set is **expected** and explained by the following:

#### 🔁 1. Cross-Validation Uses SMOTE-Resampled Data
- `GridSearchCV` performs k-fold cross-validation **only on the training data**.
- This training data was **resampled using SMOTE**, which creates synthetic examples to balance the classes.
- During cross-validation, both training and validation folds contain **synthetic, balanced data**.
- This often makes the classification problem easier, leading to a **higher F1-score**.

#### 🧪 2. Final Model Evaluation Uses the Real Test Set
- The final model is evaluated on a **held-out test set** that was **not resampled**.
- This test set retains the **original class imbalance**, making it more challenging.
- The model, although trained on balanced data, now has to perform on **imbalanced, real-world data**, where metrics like F1-score usually **drop**.

#### 🎯 3. Summary
> The F1-score from cross-validation reflects performance on synthetic, balanced data.
> The test F1-score reflects performance on real, imbalanced data.
> A drop in the final score is **normal** and shows how well the model generalizes.



In [100]:
# using the entire data to train the model 

# Apply SMOTE. this is just to balance our target column by increasing the minoriy sample
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

#scaling the data. this need to be done because we have high range of values 
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)  

Model = LogisticRegression(penalty= "l2",C=0.01,solver="liblinear",max_iter=1000,random_state=42)
Model.fit(X,y)
joblib.dump(model, 'model.pkl')

['model.pkl']

In [101]:
0.8448886628409771

0.8114649202362193

0.8114649202362193