In [1]:
import pandas as pd
import os
import numpy as np

# 현재 파이썬 코드의 파일 경로
current_path = os.getcwd()  # 현재 작업 디렉토리를 가져옵니다.

# CSV 파일 경로
file_path = os.path.join(current_path, 'filtered_data.csv')  # User uploaded fioytle to this path

surgery_data = pd.read_csv(file_path)

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.base import clone

# Preprocessing steps based on the previous discussion
df = surgery_data.drop(columns=['note id', 'person id', 'surgeon estimated op time', 'final op name'])
binary_cols = ['condition source value', 'surgeon id', 'ward', 'surgery room', 'op code']
for col in binary_cols:
    df[col] = df[col].astype('category').cat.codes

one_hot_cols = ['surgical department', 'op timing', 'month', 'anesthesia type',
                'day of the week', 'asa class', 'week of the month', 
                'division', 'previous surgery', 'emergency status', 'gender source value', 
                'admission department']
df_encoded = pd.get_dummies(df, columns=one_hot_cols)

X_all = df_encoded.drop("surgery duration", axis=1)
y_all = df_encoded["surgery duration"]
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Initialize the Random Forest Regressor
rf_regressor = RandomForestRegressor(random_state=42)


In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

departments = df['surgical department'].unique()

# Dictionary to store department-specific Random Forest models
rf_department_models = {}

# Training Random Forest models for each department
for dept in departments:
    # Adjusting column name for one-hot encoded data
    dept_col_name = 'surgical department_' + dept

    # Filtering data for the department
    dept_data = df_encoded[df_encoded[dept_col_name] == 1]
    X_dept = dept_data.drop('surgery duration', axis=1)
    y_dept = dept_data['surgery duration']

    # Splitting the data for the department
    X_train_dept, X_test_dept, y_train_dept, y_test_dept = train_test_split(X_dept, y_dept, test_size=0.2, random_state=42)

    # Training the Random Forest model
    rf_model = RandomForestRegressor(random_state=42)
    rf_model.fit(X_train_dept, y_train_dept)
    rf_department_models[dept] = rf_model

# Preparing for ensemble predictions
final_rf_predictions = np.zeros(len(X_test_all))
test_indices_rf = X_test_all.index

# Applying each department-specific Random Forest model to the entire test dataset
for dept, rf_model in rf_department_models.items():
    # Adjusting column name
    dept_col_name = 'surgical department_' + dept

    # Indices of test data belonging to the department
    dept_indices_rf = X_test_all[X_test_all[dept_col_name] == 1].index

    # Calculating predictions for the department
    dept_rf_predictions = rf_model.predict(X_test_all.loc[dept_indices_rf])

    # Updating the final prediction array
    final_rf_predictions[np.isin(test_indices_rf, dept_indices_rf)] = dept_rf_predictions

# Evaluating performance of Random Forest ensemble
mae_rf = mean_absolute_error(y_test_all, final_rf_predictions)
rmse_rf = np.sqrt(mean_squared_error(y_test_all, final_rf_predictions))
r2_rf = r2_score(y_test_all, final_rf_predictions)

mae_rf, rmse_rf, r2_rf


(16.62513925071331, 31.609588222762078, 0.916107368799772)

In [4]:
# Dictionary to store the MAE for each department
mae_per_department = {}

# Evaluating MAE for each department
for dept, rf_model in rf_department_models.items():
    # Adjusting column name
    dept_col_name = 'surgical department_' + dept

    # Indices of test data belonging to the department
    dept_indices_rf = X_test_all[X_test_all[dept_col_name] == 1].index

    # Calculating predictions for the department
    dept_rf_predictions = rf_model.predict(X_test_all.loc[dept_indices_rf])

    # Evaluating MAE for the department
    dept_mae_rf = mean_absolute_error(y_test_all.loc[dept_indices_rf], dept_rf_predictions)
    mae_per_department[dept] = dept_mae_rf

mae_per_department


{'General Surgery': 17.97449758787043,
 'Otolaryngology': 21.29770025839793,
 'Orthopedics': 17.735042092603727,
 'Ophthalmology': 6.711396085204375,
 'Obstetrics & Gynecology': 17.513872782345302,
 'Urology': 11.764183187946074,
 'Plastic Surgery': 20.812400000000004,
 'Neurosurgery': 24.830393220338983,
 'Cardiovascular Thoracic Surgery': 24.109567053854278,
 'Pediatric Otolaryngology': 11.368510078878177,
 'Pediatric Orthopedics': 23.24108723135272,
 'Pediatric Thoracic Surgery': 25.550911458333335,
 'Pediatric Urology': 15.806160558464224,
 'Pediatric Surgery': 15.678263971462545,
 'Pediatric Ophthalmology': 5.260770202020201,
 'Pediatric Plastic Surgery': 16.823589164785552,
 'Pediatric Neurosurgery': 31.814077253218887}

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
import numpy as np

departments = df['surgical department'].unique()

# Dictionary to store department-specific Random Forest models
rf_department_models = {}

# Splitting the dataset into training, validation, and testing sets
X_train_all, X_temp, y_train_all, y_temp = train_test_split(df_encoded.drop('surgery duration', axis=1), 
                                                            df_encoded['surgery duration'], 
                                                            test_size=0.4, random_state=42)
X_valid_all, X_test_all, y_valid_all, y_test_all = train_test_split(X_temp, y_temp, 
                                                                    test_size=0.5, random_state=42)

# Training Random Forest models for each department and validating them
for dept in departments:
    # Adjusting column name for one-hot encoded data
    dept_col_name = 'surgical department_' + dept

    # Filtering training data for the department
    dept_train_data = X_train_all[X_train_all[dept_col_name] == 1]
    y_train_dept = y_train_all[dept_train_data.index]

    # Filtering validation data for the department
    dept_valid_data = X_valid_all[X_valid_all[dept_col_name] == 1]
    y_valid_dept = y_valid_all[dept_valid_data.index]

    # Training the Random Forest model
    rf_model = RandomForestRegressor(random_state=42)
    rf_model.fit(dept_train_data, y_train_dept)

    # Validating the model
    valid_predictions = rf_model.predict(dept_valid_data)
    valid_mae = mean_absolute_error(y_valid_dept, valid_predictions)
    print(f"Validation MAE for department {dept}: {valid_mae}")

    # Storing the model
    rf_department_models[dept] = rf_model

# Preparing for ensemble predictions on the test set
final_rf_predictions = np.zeros(len(X_test_all))
test_indices_rf = X_test_all.index

# Applying each department-specific Random Forest model to the entire test dataset
for dept, rf_model in rf_department_models.items():
    # Adjusting column name
    dept_col_name = 'surgical department_' + dept

    # Indices of test data belonging to the department
    dept_indices_rf = X_test_all[X_test_all[dept_col_name] == 1].index

    # Calculating predictions for the department
    dept_rf_predictions = rf_model.predict(X_test_all.loc[dept_indices_rf])

    # Updating the final prediction array
    final_rf_predictions[np.isin(test_indices_rf, dept_indices_rf)] = dept_rf_predictions

# Evaluating performance of Random Forest ensemble on the test set
mae_rf = mean_absolute_error(y_test_all, final_rf_predictions)
rmse_rf = np.sqrt(mean_squared_error(y_test_all, final_rf_predictions))
r2_rf = r2_score(y_test_all, final_rf_predictions)

mae_rf, rmse_rf, r2_rf


Validation MAE for department General Surgery: 36.78339861563518
Validation MAE for department Otolaryngology: 44.63705757832346
Validation MAE for department Orthopedics: 35.731790998766954
Validation MAE for department Ophthalmology: 13.822612005856513
Validation MAE for department Obstetrics & Gynecology: 34.60307592190889
Validation MAE for department Urology: 23.49177483974359
Validation MAE for department Plastic Surgery: 40.980011709601875
Validation MAE for department Neurosurgery: 51.364898511502034
Validation MAE for department Cardiovascular Thoracic Surgery: 50.8784306569343
Validation MAE for department Pediatric Otolaryngology: 24.875228645383952
Validation MAE for department Pediatric Orthopedics: 51.80442970822281
Validation MAE for department Pediatric Thoracic Surgery: 51.08018666666666
Validation MAE for department Pediatric Urology: 28.549652317880792
Validation MAE for department Pediatric Surgery: 32.4589896373057
Validation MAE for department Pediatric Ophthalmol

(34.0326708095563, 57.082166549751484, 0.7212544011745616)

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
import numpy as np

departments = df['surgical department'].unique()

# Dictionary to store department-specific Random Forest models
rf_department_models = {}

# Splitting the dataset by department into training, validation, and testing sets
for dept in departments:
    # Adjusting column name for one-hot encoded data
    dept_col_name = 'surgical department_' + dept

    # Filtering data for the department
    dept_data = df_encoded[df_encoded[dept_col_name] == 1]

    # Splitting department data into training, validation, and testing sets
    X_dept = dept_data.drop('surgery duration', axis=1)
    y_dept = dept_data['surgery duration']
    X_train_dept, X_temp_dept, y_train_dept, y_temp_dept = train_test_split(X_dept, y_dept, test_size=0.4, random_state=42)
    X_valid_dept, X_test_dept, y_valid_dept, y_test_dept = train_test_split(X_temp_dept, y_temp_dept, test_size=0.5, random_state=42)

    # Training the Random Forest model for the department
    rf_model = RandomForestRegressor(random_state=42)
    rf_model.fit(X_train_dept, y_train_dept)

    # Validating the model
    valid_predictions = rf_model.predict(X_valid_dept)
    valid_mae = mean_absolute_error(y_valid_dept, valid_predictions)
    print(f"Validation MAE for department {dept}: {valid_mae}")

    # Storing the trained model and test sets for final evaluation
    rf_department_models[dept] = (rf_model, X_test_dept, y_test_dept)

# Preparing for ensemble predictions on the test set
final_rf_predictions = []
y_test_all = []

# Applying each department-specific Random Forest model to its test dataset
for dept, (rf_model, X_test_dept, y_test_dept) in rf_department_models.items():
    dept_rf_predictions = rf_model.predict(X_test_dept)
    final_rf_predictions.extend(dept_rf_predictions)
    y_test_all.extend(y_test_dept)

# Converting lists to numpy arrays for evaluation
final_rf_predictions = np.array(final_rf_predictions)
y_test_all = np.array(y_test_all)

# Evaluating performance of Random Forest ensemble on the test set
mae_rf = mean_absolute_error(y_test_all, final_rf_predictions)
rmse_rf = np.sqrt(mean_squared_error(y_test_all, final_rf_predictions))
r2_rf = r2_score(y_test_all, final_rf_predictions)

mae_rf, rmse_rf, r2_rf


Validation MAE for department General Surgery: 37.266765391471274
Validation MAE for department Otolaryngology: 45.63090909090909
Validation MAE for department Orthopedics: 35.65923863978333
Validation MAE for department Ophthalmology: 14.082256954402066
Validation MAE for department Obstetrics & Gynecology: 35.053088618592525
Validation MAE for department Urology: 25.24129513343799
Validation MAE for department Plastic Surgery: 42.609917257683215
Validation MAE for department Neurosurgery: 52.717425191370914
Validation MAE for department Cardiovascular Thoracic Surgery: 48.17118843683083
Validation MAE for department Pediatric Otolaryngology: 23.663521248915874
Validation MAE for department Pediatric Orthopedics: 49.32612035851472
Validation MAE for department Pediatric Thoracic Surgery: 55.6808
Validation MAE for department Pediatric Urology: 33.9387731092437
Validation MAE for department Pediatric Surgery: 30.868805970149253
Validation MAE for department Pediatric Ophthalmology: 10.

(33.95792198933399, 57.48062134461031, 0.7253025559079569)

In [7]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, StackingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
import numpy as np

departments = df['surgical department'].unique()

# Dictionary to store department-specific ensemble models
ensemble_department_models = {}


for dept in departments:
    # Filter data for the current department
    dept_col_name = 'surgical department_' + dept
    dept_data = df_encoded[df_encoded[dept_col_name] == 1]
    
    # Splitting data into training, validation, and testing sets for the department
    X_dept = dept_data.drop('surgery duration', axis=1)
    y_dept = dept_data['surgery duration']
    X_train_dept, X_test_dept, y_train_dept, y_test_dept = train_test_split(X_dept, y_dept, test_size=0.2, random_state=42)

    # Define base learners for the ensemble
    base_learners_dept = [
        ('rf', RandomForestRegressor(random_state=42)),
        ('xgb', xgb.XGBRegressor(random_state=42)),
        ('ada', AdaBoostRegressor(random_state=42))
    ]

    # Define the meta-learner
    meta_learner_dept = RandomForestRegressor(random_state=42)

    # Create the stacking ensemble
    stacked_ensemble_dept = StackingRegressor(estimators=base_learners_dept, final_estimator=meta_learner_dept)

    # Train the ensemble model for the department
    stacked_ensemble_dept.fit(X_train_dept, y_train_dept)

    # Store the trained ensemble model
    ensemble_department_models[dept] = stacked_ensemble_dept

# Evaluate the models
final_predictions = []
y_test_all = []

y_test_all_df = pd.DataFrame(y_test_all, index=X_test_all.index, columns=['surgery_duration'])

for dept in departments:
    # Extract the model for the department
    model = ensemble_department_models[dept]
    
    # Filter the test data for the department
    dept_col_name = 'surgical department_' + dept
    X_test_all_dept = X_test_all[X_test_all[dept_col_name] == 1]
    y_test_all_dept = y_test_all_df[X_test_all[dept_col_name] == 1]

    # Make predictions and store them
    dept_predictions = model.predict(X_test_all_dept)
    final_predictions.extend(dept_predictions)
    y_test_all.extend(y_test_all_dept['surgery_duration'].tolist())  # Assuming y_test_all is meant to be a list

# Calculate the overall performance
final_mae = mean_absolute_error(y_test_all, final_predictions)
final_mae


ValueError: Input contains NaN.

In [None]:
y_test_all

[nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan