In [26]:
# Importing required libraries
import numpy as np
import pandas as pd

# Data Preprocessing and Scaling
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Logistic Regression and Classifier
from sklearn.linear_model import LogisticRegression

# Resampling Techniques for Imbalanced Data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

# Model Evaluation
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, precision_recall_curve

# For Ensemble Methods (optional, if needed)
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier

# For Visualizations
import matplotlib.pyplot as plt
import seaborn as sns


In [27]:
train = pd.read_csv('data/train_data.csv')
test = pd.read_csv('data/test_data.csv')

In [28]:
# Define the target variable (e.g., 'target') and features (all other columns)
X_train = train.drop('readmitted', axis=1)  # Drop the target column for features
y_train = train['readmitted']  # Select the target column

X_test = test.drop('readmitted', axis=1)  # Same for test set
y_test = test['readmitted']

#### Feature scaling and balancing



In [29]:

# Identify the binary and continuous columns

# List of continuous columns that you want to scale
continuous_columns = ['age', 'time_in_hospital','num_lab_procedures','num_procedures', 'num_medications', 'number_outpatient_log',
       'number_emergency_log', 'number_inpatient_log']

# Separate the continuous features
X_train_continuous = X_train[continuous_columns]
X_test_continuous = X_test[continuous_columns]

# Apply StandardScaler to the continuous features
scaler = StandardScaler()
X_train_continuous_scaled = scaler.fit_transform(X_train_continuous)
X_test_continuous_scaled = scaler.fit_transform(X_test_continuous)


# Convert the scaled features back to a DataFrame
X_train_continuous_scaled = pd.DataFrame(X_train_continuous_scaled, columns=continuous_columns)
X_test_continuous_scaled = pd.DataFrame(X_test_continuous_scaled, columns=continuous_columns)

# Concatenate the scaled continuous columns with the rest of the data
X_train_scaled = pd.concat([X_train.drop(continuous_columns, axis=1), X_train_continuous_scaled], axis=1)
X_test_scaled = pd.concat([X_test.drop(continuous_columns, axis=1), X_test_continuous_scaled], axis=1)




In [30]:


from imblearn.over_sampling import SMOTE

# Apply SMOTE to balance the classes in the training set
smote = SMOTE(sampling_strategy='minority',random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)


In [31]:


# Split the training data into a new training set and validation set (80% training, 20% validation)
X_train, X_train_val, y_train, y_train_val = train_test_split(X_train_resampled, y_train_resampled, test_size=0.2, random_state=42)
X_test,y_test = X_test_scaled,y_test
# Now X_train_split and y_train_split are used for training, and X_val and y_val are used for validation
# Print shapes of the datasets
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}")


print(f"Shape of X_train_val: {X_train_val.shape}")
print(f"Shape of y_train_val: {y_train_val.shape}")

print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_test: {y_test.shape}")



Shape of X_train: (113848, 113)
Shape of y_train: (113848,)
Shape of X_train_val: (28462, 113)
Shape of y_train_val: (28462,)
Shape of X_test: (20049, 113)
Shape of y_test: (20049,)


In [22]:
from sklearn.linear_model import LogisticRegression

# Initialize the Logistic Regression model
model = LogisticRegression(random_state=42)

# Train the model on the training data
model.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
from sklearn.metrics import classification_report, roc_auc_score

# Make predictions on the validation set
y_train_val_pred = model.predict(X_train_val)
y_val_pred_proba = model.predict_proba(X_train_val)[:, 1]

# Evaluate the model
print("Validation Classification Report:")
print(classification_report(y_train_val, y_train_val_pred))

print(f"Validation ROC-AUC Score: {roc_auc_score(y_train_val, y_val_pred_proba):.4f}")


Validation Classification Report:
              precision    recall  f1-score   support

           0       0.85      1.00      0.92     14186
           1       1.00      0.82      0.90     14276

    accuracy                           0.91     28462
   macro avg       0.92      0.91      0.91     28462
weighted avg       0.92      0.91      0.91     28462

Validation ROC-AUC Score: 0.9431


In [24]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for Logistic Regression
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'saga','newton-cg'],
    'max_iter': [100, 200, 300],
}

# Grid search with cross-validation (using the validation set)
grid_search = GridSearchCV(LogisticRegression(random_state=42),
                           param_grid, cv=5, scoring='roc_auc', n_jobs=-1)

grid_search.fit(X_train, y_train)

# Best parameters found
print(f"Best Parameters: {grid_search.best_params_}")

# Use the best model
best_model = grid_search.best_estimator_

# Predict with the best model on the validation set
y_val_pred_best = best_model.predict(X_train_val)
y_val_pred_proba_best = best_model.predict_proba(X_train_val)[:, 1]

# Evaluate the best model on the validation set
print("Best Model Validation Classification Report:")
print(classification_report(y_train_val, y_val_pred_best))

print(f"Best Model Validation ROC-AUC Score: {roc_auc_score(y_train_val, y_val_pred_proba_best):.4f}")




Best Parameters: {'C': 10, 'max_iter': 100, 'solver': 'liblinear'}
Best Model Validation Classification Report:
              precision    recall  f1-score   support

           0       0.85      1.00      0.92     14186
           1       1.00      0.82      0.90     14276

    accuracy                           0.91     28462
   macro avg       0.92      0.91      0.91     28462
weighted avg       0.92      0.91      0.91     28462

Best Model Validation ROC-AUC Score: 0.9435


In [32]:
# Make final predictions on the test set

y_test_pred = best_model.predict(X_test)
y_test_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Evaluate on the test set
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

print(f"Test ROC-AUC Score: {roc_auc_score(y_test, y_test_pred_proba):.4f}")


Test Classification Report:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     17836
           1       0.38      0.01      0.03      2213

    accuracy                           0.89     20049
   macro avg       0.64      0.51      0.48     20049
weighted avg       0.83      0.89      0.84     20049

Test ROC-AUC Score: 0.6339
