In [8]:
import pandas as pd
import os

# 현재 파이썬 코드의 파일 경로
current_path = os.getcwd()  # 현재 작업 디렉토리를 가져옵니다.

# CSV 파일 경로
file_path = os.path.join(current_path, 'filtered_data.csv')  # User uploaded fioytle to this path

df = pd.read_csv(file_path)
df

Unnamed: 0,note_id,person_id,age,gender_source_value,BMI,admission_department,division,ward,asa_class,surgeon_id,...,condition_source_value,surgery_room,previous_surgery,emergency_status,op_timing,day_of_the_week,week_of_the_month,month,surgeon_estimated_op_time,surgery_duration
0,101058,29,81,F,25.247087,General Surgery,Admission,NUGW2,2,9885,...,D00002196,203,N,N,TF2,Thursday,4,October,130,66
1,57801,64,60,F,24.376249,Otolaryngology,Admission,102,2,6194,...,D00003798,504,N,N,8A,Friday,2,January,300,130
2,71288,64,60,F,24.376249,Otolaryngology,Admission,102,3,6194,...,D00003798,504,Y,N,TF4,Monday,4,April,100,85
3,135104,64,60,F,24.376249,Otolaryngology,Admission,102,3,6194,...,D00003798,504,Y,N,TF2,Monday,3,August,100,83
4,221210,71,94,M,27.963140,Orthopedics,Admission,41,2,29473,...,D00018711,108,N,N,TF4,Monday,5,March,100,63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161214,297111,4055249,1,M,23.700428,Pediatric Surgery,Admission,5A,1,100613,...,D00011688,5,N,Y,etc,Tuesday,2,September,200,123
161215,297455,4055328,1,M,20.612160,Pediatric Urology,Day,PDSC,1,6259,...,D00016707,7,N,N,8A,Monday,4,September,130,45
161216,297761,4055407,1,M,12.502703,Pediatric Surgery,Admission,5A,2,105057,...,D00011524,5,N,N,8A,Wednesday,3,September,130,43
161217,297753,4055558,4,F,14.365794,Pediatric Surgery,Admission,5A,2,105057,...,D00004831,5,N,N,TF6,Wednesday,3,September,130,82


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Remove unnecessary columns
df.drop(columns=['note_id', 'person_id', 'surgeon_estimated_op_time', 'final_op_name'], inplace=True)

# Preserve 'surgical_department' before preprocessing
surgical_department = df['surgical_department']

# Encoding binary columns
binary_cols = ['condition_source_value', 'op_code', 'surgeon_id', 'ward', 'admission_department', 'surgery_room']
for col in binary_cols:
    df[col] = df[col].astype('category').cat.codes

# One-hot encoding (excluding 'surgical_department')
one_hot_cols = ['op_timing', 'month', 'anesthesia_type', 
                'day_of_the_week', 'asa_class', 'week_of_the_month', 
                'division', 'previous_surgery', 'emergency_status', 'gender_source_value']
df_encoded = pd.get_dummies(df, columns=one_hot_cols)

# Add 'surgical_department' back to df_encoded
df_encoded['surgical_department'] = surgical_department

# Function to evaluate a model
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, predictions)
    return mae, rmse, r2

# Splitting the data based on 'surgical_department'
department_subsets = df_encoded.groupby('surgical_department')

# Placeholder for department-specific model results
department_results = {}

for department, data in department_subsets:
    X = data.drop(["surgery_duration", "surgical_department"], axis=1)
    y = data["surgery_duration"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train and evaluate models for each department
    models = {
        "Random Forest": RandomForestRegressor(random_state=42),
        "XGBoost": XGBRegressor(random_state=42),
        "Linear Regression": LinearRegression(),
        "LightGBM": LGBMRegressor(random_state=42),
        "Decision Tree": DecisionTreeRegressor(random_state=42)
    }

    results = {}
    for name, model in models.items():
        mae, rmse, r2 = evaluate_model(model, X_train, y_train, X_test, y_test)
        results[name] = {"MAE": mae, "RMSE": rmse, "R2": r2}
    
    department_results[department] = results

# department_results will contain the evaluation results for each model in each department


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000489 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 846
[LightGBM] [Info] Number of data points in the train set: 7472, number of used features: 54
[LightGBM] [Info] Start training from score 233.097698
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001974 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1089
[LightGBM] [Info] Number of data points in the train set: 29172, number of used features: 66
[LightGBM] [Info] Start training from score 156.203071
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000698 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not eno

In [10]:
results

{'Random Forest': {'MAE': 24.46112593173794,
  'RMSE': 39.71973812136697,
  'R2': 0.7431361022343103},
 'XGBoost': {'MAE': 25.061224685645,
  'RMSE': 40.06344369208152,
  'R2': 0.7386714438009898},
 'Linear Regression': {'MAE': 45.49329042853504,
  'RMSE': 66.10873075995613,
  'R2': 0.28844585361957964},
 'LightGBM': {'MAE': 24.328005166441436,
  'RMSE': 39.464001598173475,
  'R2': 0.7464331032503502},
 'Decision Tree': {'MAE': 33.67791290702236,
  'RMSE': 53.485808019270905,
  'R2': 0.5342343698803368}}

In [11]:
department_results

{'Cardiovascular Thoracic Surgery': {'Random Forest': {'MAE': 49.354296415195286,
   'RMSE': 70.16801682327724,
   'R2': 0.7712387992968056},
  'XGBoost': {'MAE': 50.095564855650316,
   'RMSE': 71.30222919088254,
   'R2': 0.7637835280268057},
  'Linear Regression': {'MAE': 68.46217353864104,
   'RMSE': 94.09932364637639,
   'R2': 0.588587988726204},
  'LightGBM': {'MAE': 48.33348709848463,
   'RMSE': 69.14461450983437,
   'R2': 0.7778631124186025},
  'Decision Tree': {'MAE': 66.38630283574103,
   'RMSE': 95.10093761897703,
   'R2': 0.5797830554495579}},
 'General Surgery': {'Random Forest': {'MAE': 36.08831505346861,
   'RMSE': 55.234495841714725,
   'R2': 0.7109464746676326},
  'XGBoost': {'MAE': 36.764975279888326,
   'RMSE': 55.20245546773779,
   'R2': 0.7112817251933325},
  'Linear Regression': {'MAE': 62.780799934456915,
   'RMSE': 85.20540846529403,
   'R2': 0.312153063536898},
  'LightGBM': {'MAE': 36.98022511053607,
   'RMSE': 55.30674735178211,
   'R2': 0.7101897659195691},
  

In [12]:
# Extracting Random Forest results from department_results
random_forest_results = {dept: metrics["Random Forest"]["MAE"] for dept, metrics in department_results.items()}

# Calculating the average MAE across all departments for all models
average_mae_all_models = {}
for model_name in models.keys():
    total_mae = 0
    count = 0
    for dept, metrics in department_results.items():
        total_mae += metrics[model_name]["MAE"]
        count += 1
    average_mae_all_models[model_name] = total_mae / count if count else None

random_forest_results, average_mae_all_models


({'Cardiovascular Thoracic Surgery': 49.354296415195286,
  'General Surgery': 36.08831505346861,
  'Neurosurgery': 50.81699582753824,
  'Obstetrics & Gynecology': 34.67819287576021,
  'Ophthalmology': 13.204320527522935,
  'Orthopedics': 35.06332430806257,
  'Otolaryngology': 46.02273036093418,
  'Pediatric Neurosurgery': 67.92961538461539,
  'Pediatric Ophthalmology': 10.505884223918574,
  'Pediatric Orthopedics': 49.44277848911651,
  'Pediatric Otolaryngology': 23.239757155247183,
  'Pediatric Plastic Surgery': 33.94943529411765,
  'Pediatric Surgery': 32.56139130434783,
  'Pediatric Thoracic Surgery': 53.40164893617021,
  'Pediatric Urology': 37.41781512605042,
  'Plastic Surgery': 41.55431442080378,
  'Urology': 24.46112593173794},
 {'Random Forest': 37.6289377432122,
  'XGBoost': 38.8661890261742,
  'Linear Regression': 54.28266078623499,
  'LightGBM': 37.66210130359777,
  'Decision Tree': 50.30102829282433})