In [1]:
import pandas as pd
import os
import numpy as np

# Directory the kernel is currently running in (the process working directory).
current_path = os.getcwd()

# The dataset is expected to sit directly inside that directory.
file_path = os.path.join(current_path, 'filtered_data.csv')

# Read the CSV with pandas' default parsing options and preview the frame.
df = pd.read_csv(filepath_or_buffer=file_path)
df

Unnamed: 0,note id,person id,age,gender source value,BMI,admission department,division,ward,asa class,surgeon id,...,condition source value,surgery room,previous surgery,emergency status,op timing,day of the week,week of the month,month,surgeon estimated op time,surgery duration
0,101058,29,81,F,25.247087,General Surgery,Admission,NUGW2,2,9885,...,D00002196,203,N,N,TF2,Thursday,4,October,130,66
1,57801,64,60,F,24.376249,Otolaryngology,Admission,102,2,6194,...,D00003798,504,N,N,8A,Friday,2,January,300,130
2,71288,64,60,F,24.376249,Otolaryngology,Admission,102,3,6194,...,D00003798,504,Y,N,TF4,Monday,4,April,100,85
3,135104,64,60,F,24.376249,Otolaryngology,Admission,102,3,6194,...,D00003798,504,Y,N,TF2,Monday,3,August,100,83
4,221210,71,94,M,27.963140,Orthopedics,Admission,41,2,29473,...,D00018711,108,N,N,TF4,Monday,5,March,100,63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161214,297111,4055249,1,M,23.700428,Pediatric Surgery,Admission,5A,1,100613,...,D00011688,5,N,Y,etc,Tuesday,2,September,200,123
161215,297455,4055328,1,M,20.612160,Pediatric Urology,Day,PDSC,1,6259,...,D00016707,7,N,N,8A,Monday,4,September,130,45
161216,297761,4055407,1,M,12.502703,Pediatric Surgery,Admission,5A,2,105057,...,D00011524,5,N,N,8A,Wednesday,3,September,130,43
161217,297753,4055558,4,F,14.365794,Pediatric Surgery,Admission,5A,2,105057,...,D00004831,5,N,N,TF6,Wednesday,3,September,130,82


In [2]:
from sklearn.model_selection import train_test_split

# Columns that are left out of the feature matrix.
cols_to_drop = ['note id', 'person id', 'surgeon estimated op time', 'final op name']

# Columns replaced by integer category codes.
# NOTE: despite the variable name these are not binary columns; each distinct
# value gets its own integer code (label encoding) and missing values become -1.
binary_cols = ['condition source value', 'op code']

# Build the modelling frame as a NEW object instead of mutating `df` in place.
# Re-running this cell therefore no longer raises KeyError (columns already
# dropped), and `df` keeps the raw values that were previewed when it was loaded.
df_features = df.drop(columns=cols_to_drop).assign(
    **{col: df[col].astype('category').cat.codes for col in binary_cols}
)

# Categorical columns expanded into one indicator column per distinct value.
one_hot_cols = ['surgical department', 'op timing', 'month', 'anesthesia type',
                'day of the week', 'asa class', 'week of the month',
                'division', 'previous surgery', 'emergency status', 'gender source value',
                'surgeon id', 'ward', 'admission department', 'surgery room'
                ]
df_encoded = pd.get_dummies(df_features, columns=one_hot_cols)

# Separate the features from the regression target.
X_all = df_encoded.drop("surgery duration", axis=1)
y_all = df_encoded["surgery duration"]


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# 60% / 20% / 20% split: hold out 40% of the rows first, then cut that
# hold-out in half to obtain the validation and the test set.
X_train, X_temp, y_train, y_temp = train_test_split(
    X_all, y_all, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Fit a single random forest on the full training split.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict both held-out sets up front ...
val_predictions = model.predict(X_val)
test_predictions = model.predict(X_test)

# ... then score them: MAE, RMSE (square root of the MSE) and R².
mae_val = mean_absolute_error(y_val, val_predictions)
mae_test = mean_absolute_error(y_test, test_predictions)

rmse_val = np.sqrt(mean_squared_error(y_val, val_predictions))
rmse_test = np.sqrt(mean_squared_error(y_test, test_predictions))

r2_val = r2_score(y_val, val_predictions)
r2_test = r2_score(y_test, test_predictions)

print(f"Validation Set - MAE: {mae_val}, RMSE: {rmse_val}, R²: {r2_val}")
print(f"Test Set - MAE: {mae_test}, RMSE: {rmse_test}, R²: {r2_test}")


Validation Set - MAE: 34.43315843774552, RMSE: 58.865324108147185, R²: 0.7121835688515422
Test Set - MAE: 33.94738587023942, RMSE: 57.474655533360384, R²: 0.7174079913874565


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def _regression_scores(y_true, y_pred):
    """Return the tuple ``(MAE, RMSE, R²)`` for one set of predictions."""
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    return (mae, rmse, r2)


# Distinct department labels. Missing labels are dropped: get_dummies creates
# no indicator column for them, and a NaN cannot be concatenated into a
# column name below.
departments = df['surgical department'].dropna().unique()
department_models = {}   # department label -> fitted RandomForestRegressor
validation_scores = {}   # department label -> (MAE, RMSE, R²) on its validation split
test_scores = {}         # department label -> (MAE, RMSE, R²) on its test split

for dept in departments:
    # Name of the one-hot indicator column for this department.
    dept_col_name = 'surgical department_' + dept

    # Keep only the rows that belong to this department.
    dept_data = df_encoded[df_encoded[dept_col_name] == 1]
    X_dept = dept_data.drop('surgery duration', axis=1)
    y_dept = dept_data['surgery duration']

    # Split this department's rows 60% / 20% / 20% into train / validation / test.
    X_train_dept, X_temp_dept, y_train_dept, y_temp_dept = train_test_split(X_dept, y_dept, test_size=0.4, random_state=42)
    X_val_dept, X_test_dept, y_val_dept, y_test_dept = train_test_split(X_temp_dept, y_temp_dept, test_size=0.5, random_state=42)

    # Train and keep one model per department.
    department_model = RandomForestRegressor(n_estimators=100, random_state=42)
    department_model.fit(X_train_dept, y_train_dept)
    department_models[dept] = department_model

    # Score the held-out splits. The results go straight into the dictionaries
    # so the loop does not overwrite notebook-level names such as
    # `val_predictions` or `mae_val` with the last department's values.
    validation_scores[dept] = _regression_scores(y_val_dept, department_model.predict(X_val_dept))
    test_scores[dept] = _regression_scores(y_test_dept, department_model.predict(X_test_dept))

# Report the per-department results.
print("Validation Scores:", validation_scores)
print("Test Scores:", test_scores)


Validation Scores: {'General Surgery': (36.94285890580009, 56.795058095437945, 0.703895973467267), 'Otolaryngology': (45.58908666100255, 76.86334512296436, 0.5238602895316882), 'Orthopedics': (35.66033102618116, 62.17539509937082, 0.5490021555573142), 'Ophthalmology': (14.123252079151133, 25.092650590947553, 0.5489381119547678), 'Obstetrics & Gynecology': (34.75223284100782, 55.757806337708416, 0.58001812335228), 'Urology': (25.25846546310832, 40.12990677901734, 0.752620859667), 'Plastic Surgery': (42.19206855791961, 73.43901199459533, 0.6367535723172886), 'Neurosurgery': (52.255219206680586, 75.08610131288496, 0.6477560928784855), 'Cardiovascular Thoracic Surgery': (48.70296573875804, 70.37522280611768, 0.7622637057457655), 'Pediatric Otolaryngology': (23.847779705117084, 41.07663022722212, 0.6380268216214506), 'Pediatric Orthopedics': (48.43170294494239, 73.67064210434562, 0.3481068660642823), 'Pediatric Thoracic Surgery': (56.27232, 83.9775818719893, 0.7895667023185566), 'Pediatric 

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Split all rows 60% / 20% / 20% into train / validation / test.
X_train, X_temp, y_train, y_temp = train_test_split(X_all, y_all, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# One model per department, trained only on that department's training rows.
department_models = {}   # department label -> fitted RandomForestRegressor

# Original row labels of the validation and test sets.
val_indices = X_val.index
test_indices = X_test.index

# Combined predictions, aligned positionally with X_val / X_test.
# Rows that match no department in `departments` keep the initial value 0.
val_predictions = np.zeros(len(X_val))
test_predictions = np.zeros(len(X_test))

for dept in departments:
    dept_col_name = 'surgical department_' + dept

    # Positional boolean masks of the rows that belong to this department.
    # They select the same rows, in the same order, as a lookup by index label
    # but do not depend on the index being unique.
    train_mask = (X_train[dept_col_name] == 1).to_numpy()
    val_mask = (X_val[dept_col_name] == 1).to_numpy()
    test_mask = (X_test[dept_col_name] == 1).to_numpy()

    # Department-specific training data.
    X_train_dept = X_train.loc[train_mask]
    y_train_dept = y_train.loc[train_mask]

    # Train the department model and keep it (the dictionary was previously
    # left empty, discarding every fitted model).
    department_model = RandomForestRegressor(n_estimators=100, random_state=42)
    department_model.fit(X_train_dept, y_train_dept)
    department_models[dept] = department_model

    # Fill in this department's slots. A department can be absent from a
    # held-out split, and predicting on zero rows raises in scikit-learn.
    if val_mask.any():
        val_predictions[val_mask] = department_model.predict(X_val.loc[val_mask])
    if test_mask.any():
        test_predictions[test_mask] = department_model.predict(X_test.loc[test_mask])

# Score the combined per-department predictions on the full held-out sets.
mae_val = mean_absolute_error(y_val, val_predictions)
rmse_val = np.sqrt(mean_squared_error(y_val, val_predictions))
r2_val = r2_score(y_val, val_predictions)

mae_test = mean_absolute_error(y_test, test_predictions)
rmse_test = np.sqrt(mean_squared_error(y_test, test_predictions))
r2_test = r2_score(y_test, test_predictions)

print(f"Validation Set - MAE: {mae_val}, RMSE: {rmse_val}, R²: {r2_val}")
print(f"Test Set - MAE: {mae_test}, RMSE: {rmse_test}, R²: {r2_test}")


Validation Set - MAE: 33.97246314264448, RMSE: 57.476512580525636, R²: 0.7256042862971949
Test Set - MAE: 33.985135043212175, RMSE: 57.29333083478465, R²: 0.7191882571851279
