In [1]:
import pandas as pd
import os
import numpy as np

# Working directory of the running kernel. Note this is the process's current
# directory, not necessarily the folder that contains this notebook file.
current_path = os.getcwd()

# The dataset is looked up by name directly inside the working directory.
csv_name = 'filtered_data.csv'
file_path = os.path.join(current_path, csv_name)

# Load the CSV; leaving `df` as the last expression renders the frame.
df = pd.read_csv(file_path)
df

Unnamed: 0,note id,person id,age,gender source value,BMI,admission department,division,ward,asa class,surgeon id,...,condition source value,surgery room,previous surgery,emergency status,op timing,day of the week,week of the month,month,surgeon estimated op time,surgery duration
0,101058,29,81,F,25.247087,General Surgery,Admission,NUGW2,2,9885,...,D00002196,203,N,N,TF2,Thursday,4,October,130,66
1,57801,64,60,F,24.376249,Otolaryngology,Admission,102,2,6194,...,D00003798,504,N,N,8A,Friday,2,January,300,130
2,71288,64,60,F,24.376249,Otolaryngology,Admission,102,3,6194,...,D00003798,504,Y,N,TF4,Monday,4,April,100,85
3,135104,64,60,F,24.376249,Otolaryngology,Admission,102,3,6194,...,D00003798,504,Y,N,TF2,Monday,3,August,100,83
4,221210,71,94,M,27.963140,Orthopedics,Admission,41,2,29473,...,D00018711,108,N,N,TF4,Monday,5,March,100,63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161214,297111,4055249,1,M,23.700428,Pediatric Surgery,Admission,5A,1,100613,...,D00011688,5,N,Y,etc,Tuesday,2,September,200,123
161215,297455,4055328,1,M,20.612160,Pediatric Urology,Day,PDSC,1,6259,...,D00016707,7,N,N,8A,Monday,4,September,130,45
161216,297761,4055407,1,M,12.502703,Pediatric Surgery,Admission,5A,2,105057,...,D00011524,5,N,N,8A,Wednesday,3,September,130,43
161217,297753,4055558,4,F,14.365794,Pediatric Surgery,Admission,5A,2,105057,...,D00004831,5,N,N,TF6,Wednesday,3,September,130,82


In [2]:
from sklearn.model_selection import train_test_split

# --- Feature preparation and train/validation/test split ---------------------
#
# Remove columns that are not used for modelling. `errors='ignore'` lets this
# cell be re-run on the same kernel: the original in-place drop raised a
# KeyError on the second run because the columns were already gone.
# NOTE(review): the flip side is that a misspelled column name is silently
# skipped, so double-check the names below against `df.columns`.
df = df.drop(
    columns=['note id', 'person id', 'surgeon estimated op time', 'final op name'],
    errors='ignore',
)

# Integer-encode these columns: each distinct value is replaced by its pandas
# category code. (The list keeps its original name, but the encoding is an
# integer code per category, not a binary 0/1 flag.)
binary_cols = ['condition source value', 'op code']
for col in binary_cols:
    df[col] = df[col].astype('category').cat.codes

# One-hot encode the remaining categorical columns. The result is a new frame;
# `df` itself keeps the original (un-expanded) categorical columns.
one_hot_cols = ['surgical department', 'op timing', 'month', 'anesthesia type',
                'day of the week', 'asa class', 'week of the month',
                'division', 'previous surgery', 'emergency status', 'gender source value', 'surgeon id', 'ward',
                'admission department', 'surgery room']
df_encoded = pd.get_dummies(df, columns=one_hot_cols)

# Build the feature matrix and target.
# 'Unnamed: 0' (visible in the loaded frame) is excluded from the features:
# it looks like a row index written into the CSV, and since the rows appear
# to be ordered by person id it can act as a stand-in for the identifier
# dropped above. NOTE(review): confirm it carries no real information.
# `errors='ignore'` keeps this working if the column is absent.
X = df_encoded.drop(columns=['surgery duration', 'Unnamed: 0'], errors='ignore')
y = df_encoded["surgery duration"]

# 60% train / 20% validation / 20% test: hold out 40% first, then split that
# hold-out in half. Fixed seeds keep the partition reproducible.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)



In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Train one RandomForestRegressor per surgical department, then stitch the
# per-department test predictions together and score them on the full test set.

# Missing department values are dropped: they cannot be concatenated into a
# one-hot column name and get_dummies creates no column for them by default.
departments = df['surgical department'].dropna().unique()
department_models = {}

for dept in departments:
    # Name of the one-hot indicator column created by pd.get_dummies
    dept_col_name = 'surgical department_' + dept

    # Rows of the training split that belong to this department
    train_mask = X_train[dept_col_name] == 1
    if not train_mask.any():
        # A department can end up with no rows in the training split;
        # fitting on zero samples would raise, so no model is built for it.
        continue

    department_model = RandomForestRegressor(random_state=42)
    department_model.fit(X_train[train_mask], y_train[train_mask])

    # Store the trained model
    department_models[dept] = department_model

# Ensemble predictions: each test row is predicted by its department's model.
# NaN (instead of 0) marks rows that no model has predicted yet, so a gap in
# coverage cannot be silently scored as a prediction of 0 minutes.
final_predictions = np.full(len(X_test), np.nan)
for dept, department_model in department_models.items():
    dept_col_name = 'surgical department_' + dept

    # Positional boolean mask over X_test rows for this department
    test_mask = (X_test[dept_col_name] == 1).to_numpy()
    if not test_mask.any():
        # No test rows for this department; predict() rejects empty input.
        continue

    final_predictions[test_mask] = department_model.predict(X_test[test_mask])

# Every test row must have received a prediction before metrics are computed.
unpredicted = np.isnan(final_predictions)
if unpredicted.any():
    raise ValueError(
        f"{int(unpredicted.sum())} test rows received no prediction "
        "(missing department or department without training data)"
    )

# Calculate performance metrics on the full test set
mae_dt = mean_absolute_error(y_test, final_predictions)
rmse_dt = np.sqrt(mean_squared_error(y_test, final_predictions))
r2_dt = r2_score(y_test, final_predictions)

mae_dt, rmse_dt, r2_dt


(33.985135043212175, 57.29333083478465, 0.7191882571851279)

In [5]:
# Train one RandomForestRegressor per surgical department and record its
# (MAE, RMSE, R^2) on that department's slice of the validation split.

# Missing department values are dropped: they cannot be concatenated into a
# one-hot column name and get_dummies creates no column for them by default.
departments = df['surgical department'].dropna().unique()
department_models = {}
validation_scores = {}

for dept in departments:
    # Name of the one-hot indicator column created by pd.get_dummies
    dept_col_name = 'surgical department_' + dept

    # Row selectors for this department in the training and validation splits
    train_mask = X_train[dept_col_name] == 1
    val_mask = X_val[dept_col_name] == 1

    if not train_mask.any():
        # Fitting on zero samples would raise; skip departments that have no
        # rows in the training split (no model and no score are stored).
        continue

    # Training the model
    department_model = RandomForestRegressor(random_state=42)
    department_model.fit(X_train[train_mask], y_train[train_mask])

    # Validation
    if val_mask.any():
        y_dept_val = y_val[val_mask]
        val_predictions = department_model.predict(X_val[val_mask])
        mae_val = mean_absolute_error(y_dept_val, val_predictions)
        rmse_val = np.sqrt(mean_squared_error(y_dept_val, val_predictions))
        r2_val = r2_score(y_dept_val, val_predictions)
    else:
        # No validation rows for this department: predict() and the metric
        # functions reject empty input, so record the scores as undefined.
        mae_val = rmse_val = r2_val = np.nan

    # Storing validation scores as (MAE, RMSE, R^2)
    validation_scores[dept] = (mae_val, rmse_val, r2_val)

    # Store the trained model
    department_models[dept] = department_model

# Ensemble predictions and test-set metrics are not recomputed in this cell.

validation_scores

{'General Surgery': (36.65305803086707, 58.44871483018508, 0.6882433727438997),
 'Otolaryngology': (44.46826418289585, 74.47568124814372, 0.5179509470125027),
 'Orthopedics': (35.47254623921085, 61.71462010654518, 0.5405787716093389),
 'Ophthalmology': (13.763830161054173, 23.17064715550011, 0.5805141131054061),
 'Obstetrics & Gynecology': (34.58726247288503,
  53.05859519755865,
  0.5933261786545387),
 'Urology': (23.669771634615383, 36.869223457259665, 0.7520626383510476),
 'Plastic Surgery': (40.48772833723654, 66.09776986870227, 0.7009112631159609),
 'Neurosurgery': (51.00187415426252, 74.56766110188575, 0.6298125937855461),
 'Cardiovascular Thoracic Surgery': (50.80613138686131,
  74.33101929270717,
  0.7377736808953229),
 'Pediatric Otolaryngology': (24.4520880069025,
  47.499997752145624,
  0.578540559896906),
 'Pediatric Orthopedics': (51.02367374005305,
  76.7293781910567,
  0.40460723364807005),
 'Pediatric Thoracic Surgery': (51.439306666666674,
  77.38800603711147,
  0.8156