## Import libraries


In [None]:
import pandas as pd     # for data manipulation and analysis
import numpy as np      # for numerical computations
import matplotlib.pyplot as plt   # for basic data visualization
import seaborn as sns   # for statistical data visualization
from ydata_profiling import ProfileReport


## EDA


In [173]:
hospital = pd.read_csv("hospital_readmissions.csv")
profile = ProfileReport(hospital, title="Data Profiling Report", explorative=True)

profile.to_file("report.html")

100%|██████████| 17/17 [00:01<00:00, 15.54it/s]1<00:00, 14.06it/s, Describe variable: readmitted]      
Summarize dataset: 100%|██████████| 75/75 [00:19<00:00,  3.91it/s, Completed]                                 
Generate report structure: 100%|██████████| 1/1 [00:09<00:00,  9.09s/it]
Render HTML: 100%|██████████| 1/1 [00:04<00:00,  4.84s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 37.30it/s]


In [174]:
print(hospital["change"].unique())
print(hospital["diabetes_med"].unique())
print(hospital["readmitted"].unique())


['no' 'yes']
['yes' 'no']
['no' 'yes']


## Data Cleaning


In [175]:

hospital['age_num'] = hospital['age'].str.extract(r'\[(\d+)-(\d+)\)').astype(int).mean(axis=1)


In [176]:
hospital['n_emergency_log'] = np.log1p(hospital['n_emergency'])


In [177]:
hospital.columns

Index(['age', 'time_in_hospital', 'n_lab_procedures', 'n_procedures',
       'n_medications', 'n_outpatient', 'n_inpatient', 'n_emergency',
       'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'glucose_test',
       'A1Ctest', 'change', 'diabetes_med', 'readmitted', 'age_num',
       'n_emergency_log'],
      dtype='object')

In [178]:
hospital["age"] = hospital["age"].replace({
    "[0-10)": 5, "[10-20)": 15, "[20-30)": 25, "[30-40)": 35,
    "[40-50)": 45, "[50-60)": 55, "[60-70)": 65, "[70-80)": 75,
    "[80-90)": 85, "[90-100)": 95
})


  hospital["age"] = hospital["age"].replace({


In [179]:
# تحويل الأعمدة للبوليان بطريقة آمنة
for col in ["change", "diabetes_med", "readmitted"]:
    if col in hospital.columns:
        # تحويل العمود لنصوص أولاً
        hospital[col] = hospital[col].astype(str).str.strip().str.lower()
        # استبدال yes/no بـ 1/0
        hospital[col] = hospital[col].map({"no": 0, "yes": 1})

# التأكد من الأعمدة بعد التحويل
print(hospital[["change", "diabetes_med", "readmitted"]].head(10))


   change  diabetes_med  readmitted
0       0             1           0
1       0             1           0
2       1             1           1
3       1             1           1
4       0             1           0
5       0             0           1
6       1             1           0
7       0             1           1
8       0             0           1
9       1             1           0


In [180]:
categorical_cols = ["medical_specialty", "diag_1", "diag_2", "diag_3", "glucose_test", "A1Ctest"]
categorical_cols = [col for col in categorical_cols if col in hospital.columns]

hospital = pd.get_dummies(hospital, columns=categorical_cols, drop_first=True)




In [181]:
hospital.to_csv("cleaned_hospital_readmissions.csv", index=False)   
# Save the cleaned dataframe to a CSV file without the index column


# preprocessing


In [182]:
hospital.columns

Index(['age', 'time_in_hospital', 'n_lab_procedures', 'n_procedures',
       'n_medications', 'n_outpatient', 'n_inpatient', 'n_emergency', 'change',
       'diabetes_med', 'readmitted', 'age_num', 'n_emergency_log',
       'medical_specialty_Emergency/Trauma',
       'medical_specialty_Family/GeneralPractice',
       'medical_specialty_InternalMedicine', 'medical_specialty_Missing',
       'medical_specialty_Other', 'medical_specialty_Surgery',
       'diag_1_Diabetes', 'diag_1_Digestive', 'diag_1_Injury',
       'diag_1_Missing', 'diag_1_Musculoskeletal', 'diag_1_Other',
       'diag_1_Respiratory', 'diag_2_Diabetes', 'diag_2_Digestive',
       'diag_2_Injury', 'diag_2_Missing', 'diag_2_Musculoskeletal',
       'diag_2_Other', 'diag_2_Respiratory', 'diag_3_Diabetes',
       'diag_3_Digestive', 'diag_3_Injury', 'diag_3_Missing',
       'diag_3_Musculoskeletal', 'diag_3_Other', 'diag_3_Respiratory',
       'glucose_test_no', 'glucose_test_normal', 'A1Ctest_no',
       'A1Ctest_normal']

In [183]:
hospital.dtypes

age                                           int64
time_in_hospital                              int64
n_lab_procedures                              int64
n_procedures                                  int64
n_medications                                 int64
n_outpatient                                  int64
n_inpatient                                   int64
n_emergency                                   int64
change                                        int64
diabetes_med                                  int64
readmitted                                    int64
age_num                                     float64
n_emergency_log                             float64
medical_specialty_Emergency/Trauma             bool
medical_specialty_Family/GeneralPractice       bool
medical_specialty_InternalMedicine             bool
medical_specialty_Missing                      bool
medical_specialty_Other                        bool
medical_specialty_Surgery                      bool
diag_1_Diabe

In [184]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [185]:

X = hospital.drop("readmitted", axis=1)
y = hospital["readmitted"]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [186]:
model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')


# Model Building


In [187]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# ======== 4. تدريب نموذج RandomForest ========
rf_model = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    class_weight='balanced',
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=2
)
rf_model.fit(X_train, y_train)

# ======== 5. التقييم ========
y_pred = rf_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

print(classification_report(y_test, y_pred))


Accuracy: 0.617
Confusion Matrix:
 [[1807  851]
 [1064 1278]]
Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.68      0.65      2658
           1       0.60      0.55      0.57      2342

    accuracy                           0.62      5000
   macro avg       0.61      0.61      0.61      5000
weighted avg       0.62      0.62      0.62      5000

              precision    recall  f1-score   support

           0       0.63      0.68      0.65      2658
           1       0.60      0.55      0.57      2342

    accuracy                           0.62      5000
   macro avg       0.61      0.61      0.61      5000
weighted avg       0.62      0.62      0.62      5000



In [190]:
!C:\Users\nerom\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install xgboost





[notice] A new release of pip is available: 23.0.1 -> 25.2
[notice] To update, run: C:\Users\nerom\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [188]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import xgboost as xgb

model = xgb.XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=len(y[y==0]) / len(y[y==1]),  # التعامل مع imbalance
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

print("Accuracy:", round(acc, 3))
print("Confusion Matrix:\n", cm)
print("Classification Report:\n", cr)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.61
Confusion Matrix:
 [[1699  959]
 [ 989 1353]]
Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.64      0.64      2658
           1       0.59      0.58      0.58      2342

    accuracy                           0.61      5000
   macro avg       0.61      0.61      0.61      5000
weighted avg       0.61      0.61      0.61      5000



In [191]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

model = xgb.XGBClassifier(
    n_estimators=1000,
    learning_rate=0.03,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='logloss'
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.6164
Confusion Matrix:
 [[1865  793]
 [1125 1217]]
Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.70      0.66      2658
           1       0.61      0.52      0.56      2342

    accuracy                           0.62      5000
   macro avg       0.61      0.61      0.61      5000
weighted avg       0.62      0.62      0.61      5000



In [193]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE  


smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False, random_state=42)

param_dist = {
    'n_estimators': [300, 500, 700, 1000],
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1],
    'gamma': [0, 0.1, 0.2, 0.3],
    'reg_alpha': [0, 0.01, 0.1],
    'reg_lambda': [1, 1.5, 2]
}

random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=50, 
    scoring='accuracy',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train_res, y_train_res)

print("Best Parameters:", random_search.best_params_)

y_pred = random_search.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Fitting 3 folds for each of 50 candidates, totalling 150 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best Parameters: {'subsample': 0.7, 'reg_lambda': 1, 'reg_alpha': 0.1, 'n_estimators': 1000, 'max_depth': 7, 'learning_rate': 0.01, 'gamma': 0.3, 'colsample_bytree': 0.9}
Accuracy: 0.6192
Confusion Matrix:
 [[1808  850]
 [1054 1288]]
Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.68      0.66      2658
           1       0.60      0.55      0.57      2342

    accuracy                           0.62      5000
   macro avg       0.62      0.62      0.62      5000
weighted avg       0.62      0.62      0.62      5000



In [194]:
import joblib

joblib.dump(random_search.best_estimator_, "xgb_model.pkl")

model_loaded = joblib.load("xgb_model.pkl")
