In [None]:
import pandas as pd
from datetime import datetime, timedelta
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, roc_auc_score

# --- Load CSVs ---
patients_df = pd.read_csv('patients.csv')
medications_df = pd.read_csv('medications.csv')
encounters_df = pd.read_csv('encounters.csv')

# --- Preprocess patients.csv ---
patients_df['BIRTHDATE'] = pd.to_datetime(patients_df['BIRTHDATE'])
patients_df['DEATHDATE'] = pd.to_datetime(patients_df['DEATHDATE'], errors='coerce')
patients_df['age'] = patients_df.apply(lambda row: (row['DEATHDATE'] if pd.notnull(row['DEATHDATE']) else datetime.today()) - row['BIRTHDATE'], axis=1).dt.days // 365
patients_df['GENDER'] = patients_df['GENDER'].map({'M': 0, 'F': 1})
patients_df = pd.get_dummies(patients_df, columns=['RACE'], drop_first=True)

# --- Preprocess medications.csv ---
medications_df['START'] = pd.to_datetime(medications_df['START'])
medications_summary = medications_df.groupby('PATIENT')['START'].agg(['count']).reset_index()
medications_summary.columns = ['PATIENT', 'med_count']
patients_df = patients_df.merge(medications_summary, left_on='Id', right_on='PATIENT', how='left')

# --- Preprocess encounters.csv ---
encounters_df['ENCOUNTERCLASS'] = encounters_df['ENCOUNTERCLASS'].str.lower()
encounters_df['START'] = pd.to_datetime(encounters_df['START'])
encounters_df.sort_values(by=['PATIENT', 'START'], inplace=True)

# --- Label readmissions ---
def label_readmission(df, days):
    df['readmit_{}d'.format(days)] = 0
    for patient_id, group in df.groupby('PATIENT'):
        visit_dates = list(group['START'])
        for i in range(len(visit_dates) - 1):
            diff = (visit_dates[i + 1] - visit_dates[i]).days
            if 0 < diff <= days:
                df.loc[group.index[i], 'readmit_{}d'.format(days)] = 1
    return df

encounters_df = label_readmission(encounters_df, 30)
encounters_df = label_readmission(encounters_df, 60)

# --- Collapse encounter-level data to patient-level ---
readmit_labels = encounters_df.groupby('PATIENT')[['readmit_30d', 'readmit_60d']].max().reset_index()
patients_df = patients_df.merge(readmit_labels, left_on='Id', right_on='PATIENT', how='left')
patients_df.fillna(0, inplace=True)

# --- Feature matrix and model training ---
feature_cols = ['age', 'GENDER', 'med_count'] + [col for col in patients_df.columns if col.startswith("RACE_")]
X = patients_df[feature_cols].fillna(0)

# Train model for 30-day readmission
y_30 = patients_df['readmit_30d']
X_train_30, X_test_30, y_train_30, y_test_30 = train_test_split(X, y_30, test_size=0.2, random_state=42)

rf_30 = RandomForestClassifier(n_estimators=100, random_state=42)
rf_30.fit(X_train_30, y_train_30)
y_pred_30 = rf_30.predict(X_test_30)
print("30-Day Readmission:\n", classification_report(y_test_30, y_pred_30))

# Train model for 60-day readmission
y_60 = patients_df['readmit_60d']
X_train_60, X_test_60, y_train_60, y_test_60 = train_test_split(X, y_60, test_size=0.2, random_state=42)

rf_60 = RandomForestClassifier(n_estimators=100, random_state=42)
rf_60.fit(X_train_60, y_train_60)
y_pred_60 = rf_60.predict(X_test_60)
print("60-Day Readmission:\n", classification_report(y_test_60, y_pred_60))

# --- Evaluate 30-day readmission ---
print("=== 30-Day Readmission ===")
print(classification_report(y_test_30, y_pred_30))
print("Confusion Matrix:\n", confusion_matrix(y_test_30, y_pred_30))
probs_30 = rf_30.predict_proba(X_test_30)[:, 1]
print("ROC AUC Score:", roc_auc_score(y_test_30, probs_30))

# --- Evaluate 60-day readmission ---
print("\n=== 60-Day Readmission ===")
print(classification_report(y_test_60, y_pred_60))
print("Confusion Matrix:\n", confusion_matrix(y_test_60, y_pred_60))
probs_60 = rf_60.predict_proba(X_test_60)[:, 1]
print("ROC AUC Score:", roc_auc_score(y_test_60, probs_60))



30-Day Readmission:
               precision    recall  f1-score   support

           0       0.50      0.30      0.38        20
           1       0.93      0.97      0.95       186

    accuracy                           0.90       206
   macro avg       0.71      0.63      0.66       206
weighted avg       0.89      0.90      0.89       206

60-Day Readmission:
               precision    recall  f1-score   support

           0       0.50      0.29      0.37        17
           1       0.94      0.97      0.96       189

    accuracy                           0.92       206
   macro avg       0.72      0.63      0.66       206
weighted avg       0.90      0.92      0.91       206



  patients_df.fillna(0, inplace=True)


In [6]:
import pandas as pd
from datetime import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler

# --- Load CSVs ---
patients_df = pd.read_csv('patients.csv')
medications_df = pd.read_csv('medications.csv')
encounters_df = pd.read_csv('encounters.csv')

# --- Preprocess patients.csv ---
patients_df['BIRTHDATE'] = pd.to_datetime(patients_df['BIRTHDATE'])
patients_df['DEATHDATE'] = pd.to_datetime(patients_df['DEATHDATE'], errors='coerce')
patients_df['age'] = patients_df.apply(
    lambda row: (row['DEATHDATE'] if pd.notnull(row['DEATHDATE']) else datetime.today()) - row['BIRTHDATE'],
    axis=1
).dt.days // 365
patients_df['GENDER'] = patients_df['GENDER'].map({'M': 0, 'F': 1})
patients_df = pd.get_dummies(patients_df, columns=['RACE'], drop_first=True)

# --- Preprocess medications.csv ---
medications_df['START'] = pd.to_datetime(medications_df['START'])
medications_summary = medications_df.groupby('PATIENT')['START'].agg(['count']).reset_index()
medications_summary.columns = ['PATIENT', 'med_count']
patients_df = patients_df.merge(medications_summary, left_on='Id', right_on='PATIENT', how='left')

# --- Preprocess encounters.csv ---
encounters_df['ENCOUNTERCLASS'] = encounters_df['ENCOUNTERCLASS'].str.lower()
encounters_df['START'] = pd.to_datetime(encounters_df['START'])
encounters_df.sort_values(by=['PATIENT', 'START'], inplace=True)

# --- Label readmissions ---
def label_readmission(df, days):
    df[f'readmit_{days}d'] = 0
    for patient_id, group in df.groupby('PATIENT'):
        visit_dates = list(group['START'])
        for i in range(len(visit_dates) - 1):
            diff = (visit_dates[i + 1] - visit_dates[i]).days
            if 0 < diff <= days:
                df.loc[group.index[i], f'readmit_{days}d'] = 1
    return df

encounters_df = label_readmission(encounters_df, 30)
encounters_df = label_readmission(encounters_df, 60)

# --- Collapse encounter-level data to patient-level ---
readmit_labels = encounters_df.groupby('PATIENT')[['readmit_30d', 'readmit_60d']].max().reset_index()
patients_df = patients_df.merge(readmit_labels, left_on='Id', right_on='PATIENT', how='left')
patients_df.fillna(0, inplace=True)

# --- Feature matrix ---
feature_cols = ['age', 'GENDER', 'med_count'] + [col for col in patients_df.columns if col.startswith("RACE_")]
X = patients_df[feature_cols].fillna(0)

# --- Scale features ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- 30-day readmission model ---
y_30 = patients_df['readmit_30d']
X_train_30, X_test_30, y_train_30, y_test_30 = train_test_split(X_scaled, y_30, test_size=0.2, random_state=42)

lr_30 = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
lr_30.fit(X_train_30, y_train_30)
y_pred_30 = lr_30.predict(X_test_30)

print("=== 30-Day Readmission ===")
print(classification_report(y_test_30, y_pred_30))
print("Confusion Matrix:\n", confusion_matrix(y_test_30, y_pred_30))
print("ROC AUC Score:", roc_auc_score(y_test_30, lr_30.predict_proba(X_test_30)[:, 1]))

# --- 60-day readmission model ---
y_60 = patients_df['readmit_60d']
X_train_60, X_test_60, y_train_60, y_test_60 = train_test_split(X_scaled, y_60, test_size=0.2, random_state=42)

lr_60 = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
lr_60.fit(X_train_60, y_train_60)
y_pred_60 = lr_60.predict(X_test_60)

print("\n=== 60-Day Readmission ===")
print(classification_report(y_test_60, y_pred_60))
print("Confusion Matrix:\n", confusion_matrix(y_test_60, y_pred_60))
print("ROC AUC Score:", roc_auc_score(y_test_60, lr_60.predict_proba(X_test_60)[:, 1]))


=== 30-Day Readmission ===
              precision    recall  f1-score   support

           0       0.20      0.80      0.32        20
           1       0.97      0.65      0.78       186

    accuracy                           0.67       206
   macro avg       0.58      0.73      0.55       206
weighted avg       0.89      0.67      0.73       206

Confusion Matrix:
 [[ 16   4]
 [ 65 121]]
ROC AUC Score: 0.8079301075268818

=== 60-Day Readmission ===
              precision    recall  f1-score   support

           0       0.18      0.88      0.30        17
           1       0.98      0.64      0.78       189

    accuracy                           0.66       206
   macro avg       0.58      0.76      0.54       206
weighted avg       0.92      0.66      0.74       206

Confusion Matrix:
 [[ 15   2]
 [ 68 121]]
ROC AUC Score: 0.8227513227513227


  patients_df.fillna(0, inplace=True)
