In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, precision_score, recall_score, f1_score


In [2]:
file_path = r'C:\Users\gusgl\OneDrive\Desktop\Project 2\diabetic_data.csv'
data = pd.read_csv(file_path)

In [3]:
# List of columns to encode
categorical_columns = ['age', 'race', 'gender'] 

# Initializing LabelEncoder
label_encoder = LabelEncoder()

# Applying LabelEncoder to all categorical columns
for col in categorical_columns:
    data[col] = label_encoder.fit_transform(data[col])

# Checking to ensure all columns are numeric
print(data.dtypes)

encounter_id                 int64
patient_nbr                  int64
race                         int32
gender                       int32
age                          int32
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride         

In [5]:
# Categorical columns to encode
categorical_columns = ['race', 'gender', 'change', 'diabetesMed', 'readmitted']

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Applying LabelEncoder
for col in categorical_columns:
    data[col] = label_encoder.fit_transform(data[col])

In [6]:
# Converting columns that are supposed to be numeric
numeric_columns = ['weight', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult']

for col in numeric_columns:
    data[col] = pd.to_numeric(data[col], errors='coerce')

In [7]:
# One-hot encode remaining object columns
data = pd.get_dummies(data, drop_first=True)

In [8]:
# Checking the data types of all columns
print(data.dtypes)

# Checking if there are any remaining non-numeric columns
non_numeric_columns = data.select_dtypes(include=['object']).columns
print("Non-numeric columns:", non_numeric_columns)

encounter_id                       int64
patient_nbr                        int64
race                               int64
gender                             int64
age                                int32
                                   ...  
glyburide-metformin_Up              bool
glipizide-metformin_Steady          bool
glimepiride-pioglitazone_Steady     bool
metformin-rosiglitazone_Steady      bool
metformin-pioglitazone_Steady       bool
Length: 162, dtype: object
Non-numeric columns: Index([], dtype='object')


In [10]:
# Initializing the scaler
scaler = StandardScaler()

# Fit and transform the data (excluding the target variable if applicable)
X = data.drop('readmitted', axis=1)
X_scaled = scaler.fit_transform(X)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [11]:
# 'readmitted' is the target variable
X = data.drop('readmitted', axis=1)
y = data['readmitted']

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initializing and training Random Forest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.5946742655006387


In [12]:
# Checking the importance of columns
importances = model.feature_importances_
feature_names = X.columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
print(feature_importance_df.sort_values(by='Importance', ascending=False))

                           Feature    Importance
1                      patient_nbr  8.009996e-02
0                     encounter_id  7.998352e-02
10              num_lab_procedures  6.374845e-02
17                          diag_2  6.351851e-02
18                          diag_3  6.345532e-02
..                             ...           ...
87    medical_specialty_Proctology  1.379107e-07
83  medical_specialty_Perinatology  0.000000e+00
21                       A1Cresult  0.000000e+00
20                   max_glu_serum  0.000000e+00
5                           weight  0.000000e+00

[161 rows x 2 columns]


In [13]:
# Drop Patient number and encounter id
X = X.drop(columns=['patient_nbr', 'encounter_id'])

In [14]:
# Drop low-importance features
low_importance_features = feature_importance_df[feature_importance_df['Importance'] < 1e-5]['Feature']
X = X.drop(columns=low_importance_features)

In [17]:
print(y.unique())

[2 1 0]


In [18]:
# Remapping all readmissions to 1
y = y.map({0: 0, 1: 1, 2: 1}) 

# Verifying the remapping
print(y.unique())  # Should show only [0, 1]

[1 0]


In [25]:
# Splitting the data again
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Re-training the Random Forest model
rf_model = RandomForestClassifier(class_weight='balanced',random_state=42)
rf_model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)

print(f'Accuracy: {accuracy:.2f}')
print(f'ROC-AUC Score: {roc_auc:.2f}')

Accuracy: 0.89
ROC-AUC Score: 0.64


In [26]:
# Calculating metrics
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1]))

Precision: 0.8879823052347015
Recall: 0.9998339697825004
F1 Score: 0.9405945748945697
ROC-AUC Score: 0.640967538367694


In [40]:
# Adjusting the decision threshold for the positive class
precision, recall, thresholds = precision_recall_curve(y_test, y_prob)

# Identifying thresholds where recall is within a desired range
desired_recall_min = 0.5
desired_recall_max = 0.9

# Filtering for thresholds that fall within the desired recall range
valid_indices = (recall[:-1] >= desired_recall_min) & (recall[:-1] <= desired_recall_max)
optimal_thresholds = thresholds[valid_indices]

# Selecting the average threshold in the desired range
if len(optimal_thresholds) > 0:
    optimal_threshold = optimal_thresholds.mean()
else:
    optimal_threshold = 0.5 

# Applying the optimal threshold to the predicted probabilities
y_pred_optimal = (y_prob >= optimal_threshold).astype(int)

# Evaluating the new threshold
new_precision = precision_score(y_test, y_pred_optimal)
new_recall = recall_score(y_test, y_pred_optimal)
new_f1 = f1_score(y_test, y_pred_optimal)

print("Accuracy Score:", accuracy)
print("New Precision:", new_precision)
print("New Recall:", new_recall)
print("New F1 Score:", new_f1)

Accuracy Score: 0.8878844453178736
New Precision: 0.9155515496082646
New Recall: 0.7308096740273397
New F1 Score: 0.8128154622676351


In [31]:
roc_auc_new = roc_auc_score(y_test, y_prob)
print(f"New ROC-AUC Score: {roc_auc_new:.2f}")

New ROC-AUC Score: 0.64
