In [1]:
import pandas as pd
import os

# 현재 파이썬 코드의 파일 경로
current_path = os.getcwd()  # 현재 작업 디렉토리를 가져옵니다.

# CSV 파일 경로
file_path = os.path.join(current_path, 'filtered_data.csv')  # User uploaded fioytle to this path

df = pd.read_csv(file_path)
df

Unnamed: 0,note id,person id,age,gender source value,BMI,admission department,division,ward,asa class,surgeon id,...,condition source value,surgery room,previous surgery,emergency status,op timing,day of the week,week of the month,month,surgeon estimated op time,surgery duration
0,101058,29,81,F,25.247087,General Surgery,Admission,NUGW2,2,9885,...,D00002196,203,N,N,TF2,Thursday,4th,October,130,66
1,57801,64,60,F,24.376249,Otolaryngology,Admission,102,2,6194,...,D00003798,504,N,N,8A,Friday,2nd,January,300,130
2,71288,64,60,F,24.376249,Otolaryngology,Admission,102,3,6194,...,D00003798,504,Y,N,TF4,Monday,4th,April,100,85
3,135104,64,60,F,24.376249,Otolaryngology,Admission,102,3,6194,...,D00003798,504,Y,N,TF2,Monday,3rd,August,100,83
4,221210,71,94,M,27.963140,Orthopedics,Admission,41,2,29473,...,D00018711,108,N,N,TF4,Monday,5th,March,100,63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161214,297111,4055249,1,M,23.700428,Pediatric Surgery,Admission,5A,1,100613,...,D00011688,5,N,Y,etc,Tuesday,2nd,September,200,123
161215,297455,4055328,1,M,20.612160,Pediatric Urology,Day,PDSC,1,6259,...,D00016707,7,N,N,8A,Monday,4th,September,130,45
161216,297761,4055407,1,M,12.502703,Pediatric Surgery,Admission,5A,2,105057,...,D00011524,5,N,N,8A,Wednesday,3rd,September,130,43
161217,297753,4055558,4,F,14.365794,Pediatric Surgery,Admission,5A,2,105057,...,D00004831,5,N,N,TF6,Wednesday,3rd,September,130,82


In [2]:
from sklearn.model_selection import train_test_split

# Removing unnecessary columns
df.drop(columns=['note id', 'person id', 'surgeon estimated op time', 'final op name'], inplace=True)

# Encoding binary columns
binary_cols = ['condition source value', 'op code', 'surgeon id', 'ward', 'admission department', 'surgery room']
for col in binary_cols:
    df[col] = df[col].astype('category').cat.codes

# One-hot encoding for other categorical columns
one_hot_cols = ['surgical department', 'op timing', 'month', 'anesthesia type', 
                'day of the week', 'asa class', 'week of the month', 
                'division', 'previous surgery', 'emergency status', 'gender source value']
df_encoded = pd.get_dummies(df, columns=one_hot_cols)

# Splitting the data
X_all = df_encoded.drop("surgery duration", axis=1)
y_all = df_encoded["surgery duration"]
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Displaying the first few rows of the resulting dataframe
X_train_all
y_train_all


61344     122
137241     48
139478     76
113549     36
149411    127
         ... 
119879     76
103694     57
131932    311
146867     82
121958     57
Name: surgery duration, Length: 128975, dtype: int64

In [3]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# 분과별 데이터셋 준비
departments = df['surgical department'].unique()
models = {}
predictions = {}

# 각 분과별 모델 훈련
for dept in departments:
    # 열 이름 조정
    dept_col_name = 'surgical department_' + dept

    # 분과별 데이터 필터링
    dept_data = df_encoded[df_encoded[dept_col_name] == 1]
    X_dept = dept_data.drop('surgery duration', axis=1)
    y_dept = dept_data['surgery duration']

    # 데이터 분할
    X_train_dept, X_test_dept, y_train_dept, y_test_dept = train_test_split(X_dept, y_dept, test_size=0.2, random_state=42)

    # 모델 훈련
    model = CatBoostRegressor(iterations=100, random_state=42)
    model.fit(X_train_dept, y_train_dept, verbose=False)
    models[dept] = model

    # 테스트 데이터셋에 대한 예측 수행
    predictions[dept] = model.predict(X_test_dept)

# 앙상블을 위한 준비
final_predictions = np.zeros(len(X_test_all))
test_indices = X_test_all.index

# 각 분과별 모델을 전체 테스트 데이터셋에 적용
for dept, model in models.items():
    # 열 이름 조정
    dept_col_name = 'surgical department_' + dept

    # 해당 분과에 해당하는 테스트 데이터 인덱스
    dept_indices = X_test_all[X_test_all[dept_col_name] == 1].index

    # 해당 분과의 예측값 계산
    dept_predictions = model.predict(X_test_all.loc[dept_indices])

    # 최종 예측 배열에 해당 부분 업데이트
    final_predictions[np.isin(test_indices, dept_indices)] = dept_predictions

# 성능 평가
mae_cb = mean_absolute_error(y_test_all, final_predictions)
rmse_cb = np.sqrt(mean_squared_error(y_test_all, final_predictions))
r2_cb = r2_score(y_test_all, final_predictions)

print(f"Ensemble MAE: {mae_cb}, RMSE: {rmse_cb}, R²: {r2_cb}")


Ensemble MAE: 29.59938614649006, RMSE: 46.86902026895335, R²: 0.8155588209914891


In [4]:
# Training a model on the entire dataset
full_model = CatBoostRegressor(iterations=100, random_state=42)
full_model.fit(X_train_all, y_train_all)

# Predicting on the test set
full_model_predictions = full_model.predict(X_test_all)

# Evaluating the full model's performance
full_model_mae = mean_absolute_error(y_test_all, full_model_predictions)
full_model_rmse = np.sqrt(mean_squared_error(y_test_all, full_model_predictions))
full_model_r2 = r2_score(y_test_all, full_model_predictions)

full_model_mae, full_model_rmse, full_model_r2


Learning rate set to 0.5
0:	learn: 91.7602875	total: 17.1ms	remaining: 1.69s
1:	learn: 84.5750810	total: 27.1ms	remaining: 1.33s
2:	learn: 81.3295372	total: 35.6ms	remaining: 1.15s
3:	learn: 79.5957721	total: 42.3ms	remaining: 1.01s
4:	learn: 78.3425860	total: 48.2ms	remaining: 917ms
5:	learn: 77.0335198	total: 54.7ms	remaining: 856ms
6:	learn: 76.4869700	total: 60.6ms	remaining: 806ms
7:	learn: 75.2808840	total: 66.4ms	remaining: 763ms
8:	learn: 74.3139224	total: 73ms	remaining: 738ms
9:	learn: 74.0466624	total: 79.3ms	remaining: 713ms
10:	learn: 73.5176541	total: 84.4ms	remaining: 683ms
11:	learn: 73.0535794	total: 90.2ms	remaining: 661ms
12:	learn: 72.5645112	total: 95.9ms	remaining: 642ms
13:	learn: 71.7214923	total: 101ms	remaining: 623ms
14:	learn: 71.2680946	total: 108ms	remaining: 614ms
15:	learn: 70.3048047	total: 114ms	remaining: 596ms
16:	learn: 70.1467237	total: 119ms	remaining: 582ms
17:	learn: 69.7995100	total: 125ms	remaining: 569ms
18:	learn: 69.3604244	total: 130ms	rem

(37.65593124629112, 60.75771181898664, 0.6900519143082409)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# 분과별 데이터셋 준비
departments = df['surgical department'].unique()
models = {}
predictions = {}

# 각 분과별 모델 훈련
for dept in departments:
    # 열 이름 조정
    dept_col_name = 'surgical department_' + dept

    # 분과별 데이터 필터링
    dept_data = df_encoded[df_encoded[dept_col_name] == 1]
    X_dept = dept_data.drop('surgery duration', axis=1)
    y_dept = dept_data['surgery duration']

    # 데이터 분할
    X_train_dept, X_test_dept, y_train_dept, y_test_dept = train_test_split(X_dept, y_dept, test_size=0.2, random_state=42)

    # 모델 훈련
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train_dept, y_train_dept)
    models[dept] = model

    # 테스트 데이터셋에 대한 예측 수행
    predictions[dept] = model.predict(X_test_dept)

# 앙상블을 위한 준비
final_predictions = np.zeros(len(X_test_all))
test_indices = X_test_all.index

# 각 분과별 모델을 전체 테스트 데이터셋에 적용
for dept, model in models.items():
    # 열 이름 조정
    dept_col_name = 'surgical department_' + dept

    # 해당 분과에 해당하는 테스트 데이터 인덱스
    dept_indices = X_test_all[X_test_all[dept_col_name] == 1].index

    # 해당 분과의 예측값 계산
    dept_predictions = model.predict(X_test_all.loc[dept_indices])

    # 최종 예측 배열에 해당 부분 업데이트
    final_predictions[np.isin(test_indices, dept_indices)] = dept_predictions

# 성능 평가
mae_rf = mean_absolute_error(y_test_all, final_predictions)
rmse_rf = np.sqrt(mean_squared_error(y_test_all, final_predictions))
r2_rf = r2_score(y_test_all, final_predictions)

print(f"Ensemble MAE: {mae_rf}, RMSE: {rmse_rf}, R²: {r2_rf}")

Ensemble MAE: 16.67870301451433, RMSE: 31.662393537060968, R²: 0.9158268414503081


In [None]:
from sklearn.linear_model import LinearRegression

# 분과별 데이터셋 준비
departments = df['surgical department'].unique()
models = {}
predictions = {}

# 각 분과별 모델 훈련
for dept in departments:
    # 열 이름 조정
    dept_col_name = 'surgical department_' + dept

    # 분과별 데이터 필터링
    dept_data = df_encoded[df_encoded[dept_col_name] == 1]
    X_dept = dept_data.drop('surgery duration', axis=1)
    y_dept = dept_data['surgery duration']

    # 데이터 분할
    X_train_dept, X_test_dept, y_train_dept, y_test_dept = train_test_split(X_dept, y_dept, test_size=0.2, random_state=42)

    # 모델 훈련
    model = LinearRegression()
    model.fit(X_train_dept, y_train_dept)
    models[dept] = model

    # 테스트 데이터셋에 대한 예측 수행
    predictions[dept] = model.predict(X_test_dept)

# 앙상블을 위한 준비
final_predictions = np.zeros(len(X_test_all))
test_indices = X_test_all.index

# 각 분과별 모델을 전체 테스트 데이터셋에 적용
for dept, model in models.items():
    # 열 이름 조정
    dept_col_name = 'surgical department_' + dept

    # 해당 분과에 해당하는 테스트 데이터 인덱스
    dept_indices = X_test_all[X_test_all[dept_col_name] == 1].index

    # 해당 분과의 예측값 계산
    dept_predictions = model.predict(X_test_all.loc[dept_indices])

    # 최종 예측 배열에 해당 부분 업데이트
    final_predictions[np.isin(test_indices, dept_indices)] = dept_predictions

# 성능 평가
mae_lr = mean_absolute_error(y_test_all, final_predictions)
rmse_lr = np.sqrt(mean_squared_error(y_test_all, final_predictions))
r2_lr = r2_score(y_test_all, final_predictions)

print(f"Ensemble MAE: {mae_lr}, RMSE: {rmse_lr}, R²: {r2_lr}")

Ensemble MAE: 50.2211215236737, RMSE: 76.9795817900624, R²: 0.5024494234514032


In [None]:
from xgboost import XGBRegressor

# 분과별 데이터셋 준비
departments = df['surgical department'].unique()
models = {}
predictions = {}

# 각 분과별 모델 훈련
for dept in departments:
    # 열 이름 조정
    dept_col_name = 'surgical department_' + dept

    # 분과별 데이터 필터링
    dept_data = df_encoded[df_encoded[dept_col_name] == 1]
    X_dept = dept_data.drop('surgery duration', axis=1)
    y_dept = dept_data['surgery duration']

    # 데이터 분할
    X_train_dept, X_test_dept, y_train_dept, y_test_dept = train_test_split(X_dept, y_dept, test_size=0.2, random_state=42)

    # 모델 훈련
    model = XGBRegressor(random_state=42)
    model.fit(X_train_dept, y_train_dept)
    models[dept] = model

    # 테스트 데이터셋에 대한 예측 수행
    predictions[dept] = model.predict(X_test_dept)

# 앙상블을 위한 준비
final_predictions = np.zeros(len(X_test_all))
test_indices = X_test_all.index

# 각 분과별 모델을 전체 테스트 데이터셋에 적용
for dept, model in models.items():
    # 열 이름 조정
    dept_col_name = 'surgical department_' + dept

    # 해당 분과에 해당하는 테스트 데이터 인덱스
    dept_indices = X_test_all[X_test_all[dept_col_name] == 1].index

    # 해당 분과의 예측값 계산
    dept_predictions = model.predict(X_test_all.loc[dept_indices])

    # 최종 예측 배열에 해당 부분 업데이트
    final_predictions[np.isin(test_indices, dept_indices)] = dept_predictions

# 성능 평가
mae_xgb = mean_absolute_error(y_test_all, final_predictions)
rmse_xgb = np.sqrt(mean_squared_error(y_test_all, final_predictions))
r2_xgb = r2_score(y_test_all, final_predictions)

print(f"Ensemble MAE: {mae_xgb}, RMSE: {rmse_xgb}, R²: {r2_xgb}")

Ensemble MAE: 22.601232791322957, RMSE: 37.310003900594154, R²: 0.883120935189753


In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# 분과별 데이터셋 준비
departments = df['surgical department'].unique()
models = {}
predictions = {}

# 각 분과별 모델 훈련
for dept in departments:
    # 열 이름 조정
    dept_col_name = 'surgical department_' + dept

    # 분과별 데이터 필터링
    dept_data = df_encoded[df_encoded[dept_col_name] == 1]
    X_dept = dept_data.drop('surgery duration', axis=1)
    y_dept = dept_data['surgery duration']

    # 데이터 분할
    X_train_dept, X_test_dept, y_train_dept, y_test_dept = train_test_split(X_dept, y_dept, test_size=0.2, random_state=42)

    # 모델 훈련
    model = DecisionTreeRegressor(random_state=42)
    model.fit(X_train_dept, y_train_dept)
    models[dept] = model

    # 테스트 데이터셋에 대한 예측 수행
    predictions[dept] = model.predict(X_test_dept)

# 앙상블을 위한 준비
final_predictions = np.zeros(len(X_test_all))
test_indices = X_test_all.index

# 각 분과별 모델을 전체 테스트 데이터셋에 적용
for dept, model in models.items():
    # 열 이름 조정
    dept_col_name = 'surgical department_' + dept

    # 해당 분과에 해당하는 테스트 데이터 인덱스
    dept_indices = X_test_all[X_test_all[dept_col_name] == 1].index

    # 해당 분과의 예측값 계산
    dept_predictions = model.predict(X_test_all.loc[dept_indices])

    # 최종 예측 배열에 해당 부분 업데이트
    final_predictions[np.isin(test_indices, dept_indices)] = dept_predictions

# 성능 평가
mae_dt = mean_absolute_error(y_test_all, final_predictions)
rmse_dt = np.sqrt(mean_squared_error(y_test_all, final_predictions))
r2_dt = r2_score(y_test_all, final_predictions)

print(f"Ensemble MAE: {mae_dt}, RMSE: {rmse_dt}, R²: {r2_dt}")

Ensemble MAE: 8.789387172807343, RMSE: 33.36653615556264, R²: 0.9065222205584245


In [None]:
# Adjusting the code to calculate and display the MAE for each department

from sklearn.metrics import mean_absolute_error

# Dictionary to store MAE for each department
department_mae = {}

for dept, model in models.items():
    dept_col_name = 'surgical department_' + dept
    dept_indices = X_test_all[X_test_all[dept_col_name] == 1].index

    # Check if the department has test data
    if len(dept_indices) > 0:
        # Calculate predictions for this department
        dept_predictions = model.predict(X_test_all.loc[dept_indices])

        # Extract the actual values for this department from the whole test dataset
        actual_values = y_test_all.loc[dept_indices]

        # Calculate MAE for this department
        mae = mean_absolute_error(actual_values, dept_predictions)
        department_mae[dept] = mae

department_mae



{'General Surgery': 9.49593383873191,
 'Otolaryngology': 10.50947459086994,
 'Orthopedics': 10.144918821407096,
 'Ophthalmology': 3.6727115716753023,
 'Obstetrics & Gynecology': 10.022933794893985,
 'Urology': 6.10071371927042,
 'Plastic Surgery': 11.034029850746268,
 'Neurosurgery': 12.349152542372881,
 'Cardiovascular Thoracic Surgery': 12.488384371700105,
 'Pediatric Otolaryngology': 6.259421560035057,
 'Pediatric Orthopedics': 13.420986093552465,
 'Pediatric Thoracic Surgery': 13.419270833333334,
 'Pediatric Urology': 8.743455497382199,
 'Pediatric Surgery': 5.699167657550535,
 'Pediatric Ophthalmology': 3.1666666666666665,
 'Pediatric Plastic Surgery': 8.358916478555305,
 'Pediatric Neurosurgery': 14.060085836909872}

In [None]:
from lightgbm import LGBMRegressor

# 분과별 데이터셋 준비
departments = df['surgical department'].unique()
models = {}
predictions = {}

# 각 분과별 모델 훈련
for dept in departments:
    # 열 이름 조정
    dept_col_name = 'surgical department_' + dept

    # 분과별 데이터 필터링
    dept_data = df_encoded[df_encoded[dept_col_name] == 1]
    X_dept = dept_data.drop('surgery duration', axis=1)
    y_dept = dept_data['surgery duration']

    # 데이터 분할
    X_train_dept, X_test_dept, y_train_dept, y_test_dept = train_test_split(X_dept, y_dept, test_size=0.2, random_state=42)

    # 모델 훈련
    model = LGBMRegressor(random_state=42)
    model.fit(X_train_dept, y_train_dept)
    models[dept] = model

    # 테스트 데이터셋에 대한 예측 수행
    predictions[dept] = model.predict(X_test_dept)

# 앙상블을 위한 준비
final_predictions = np.zeros(len(X_test_all))
test_indices = X_test_all.index

# 각 분과별 모델을 전체 테스트 데이터셋에 적용
for dept, model in models.items():
    # 열 이름 조정
    dept_col_name = 'surgical department_' + dept

    # 해당 분과에 해당하는 테스트 데이터 인덱스
    dept_indices = X_test_all[X_test_all[dept_col_name] == 1].index

    # 해당 분과의 예측값 계산
    dept_predictions = model.predict(X_test_all.loc[dept_indices])

    # 최종 예측 배열에 해당 부분 업데이트
    final_predictions[np.isin(test_indices, dept_indices)] = dept_predictions

# 성능 평가
mae_lgbm = mean_absolute_error(y_test_all, final_predictions)
rmse_lgbm = np.sqrt(mean_squared_error(y_test_all, final_predictions))
r2_lgbm = r2_score(y_test_all, final_predictions)

print(f"Ensemble MAE: {mae_lgbm}, RMSE: {rmse_lgbm}, R²: {r2_lgbm}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004882 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1089
[LightGBM] [Info] Number of data points in the train set: 29172, number of used features: 66
[LightGBM] [Info] Start training from score 156.203071
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000427 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 935
[LightGBM] [Info] Number of data points in the train set: 9417, number of used features: 58
[LightGBM] [Info] Start training from score 138.766380
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000715 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total 

In [None]:
# 결과 값을 딕셔너리 형태로 정리
ensemble_results = {
    "Random Forest": {"MAE": mae_rf, "RMSE": rmse_rf, "R2": r2_rf},
    "Linear Regression": {"MAE": mae_lr, "RMSE": rmse_lr, "R2": r2_lr},
    "XGBoost": {"MAE": mae_xgb, "RMSE": rmse_xgb, "R2": r2_xgb},
    "Decision Tree": {"MAE": mae_dt, "RMSE": rmse_dt, "R2": r2_dt},
    "LightGBM": {"MAE": mae_lgbm, "RMSE": rmse_lgbm, "R2": r2_lgbm}
}

# 결과를 DataFrame으로 변환 및 소수점 둘째 자리 반올림
results_df = pd.DataFrame(ensemble_results).T.round(2)

results_df



Unnamed: 0,MAE,RMSE,R2
Random Forest,16.68,31.66,0.92
Linear Regression,50.22,76.98,0.5
XGBoost,22.6,37.31,0.88
Decision Tree,8.79,33.37,0.91
LightGBM,28.91,46.73,0.82
