In [1]:
import pandas as pd
import os
import numpy as np

# 현재 파이썬 코드의 파일 경로
current_path = os.getcwd()  # 현재 작업 디렉토리를 가져옵니다.

# CSV 파일 경로
file_path = os.path.join(current_path, 'filtered_data.csv')  # User uploaded fioytle to this path

df = pd.read_csv(file_path)
df

Unnamed: 0,note id,person id,age,gender source value,BMI,admission department,division,ward,asa class,surgeon id,...,condition source value,surgery room,previous surgery,emergency status,op timing,day of the week,week of the month,month,surgeon estimated op time,surgery duration
0,101058,29,81,F,25.247087,General Surgery,Admission,NUGW2,2,9885,...,D00002196,203,N,N,TF2,Thursday,4,October,130,66
1,57801,64,60,F,24.376249,Otolaryngology,Admission,102,2,6194,...,D00003798,504,N,N,8A,Friday,2,January,300,130
2,71288,64,60,F,24.376249,Otolaryngology,Admission,102,3,6194,...,D00003798,504,Y,N,TF4,Monday,4,April,100,85
3,135104,64,60,F,24.376249,Otolaryngology,Admission,102,3,6194,...,D00003798,504,Y,N,TF2,Monday,3,August,100,83
4,221210,71,94,M,27.963140,Orthopedics,Admission,41,2,29473,...,D00018711,108,N,N,TF4,Monday,5,March,100,63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161214,297111,4055249,1,M,23.700428,Pediatric Surgery,Admission,5A,1,100613,...,D00011688,5,N,Y,etc,Tuesday,2,September,200,123
161215,297455,4055328,1,M,20.612160,Pediatric Urology,Day,PDSC,1,6259,...,D00016707,7,N,N,8A,Monday,4,September,130,45
161216,297761,4055407,1,M,12.502703,Pediatric Surgery,Admission,5A,2,105057,...,D00011524,5,N,N,8A,Wednesday,3,September,130,43
161217,297753,4055558,4,F,14.365794,Pediatric Surgery,Admission,5A,2,105057,...,D00004831,5,N,N,TF6,Wednesday,3,September,130,82


In [2]:
from sklearn.model_selection import train_test_split

# Removing unnecessary columns
df.drop(columns=['note id', 'person id', 'surgeon estimated op time', 'final op name'], inplace=True)

# Encoding binary columns
binary_cols = ['condition source value', 'op code']
for col in binary_cols:
    df[col] = df[col].astype('category').cat.codes

# One-hot encoding for other categorical columns
one_hot_cols = ['surgical department', 'op timing', 'month', 'anesthesia type',
                'day of the week', 'asa class', 'week of the month', 
                'division', 'previous surgery', 'emergency status', 'gender source value', 'surgeon id', 'ward', 
                'admission department', 'surgery room']
df_encoded = pd.get_dummies(df, columns=one_hot_cols)

# Splitting the data
X_all = df_encoded.drop("surgery duration", axis=1)
y_all = df_encoded["surgery duration"]
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Displaying the first few rows of the resulting dataframe
X_train_all
y_train_all


61344     122
137241     48
139478     76
113549     36
149411    127
         ... 
119879     76
103694     57
131932    311
146867     82
121958     57
Name: surgery duration, Length: 128975, dtype: int64

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

departments = df['surgical department'].unique()

# Dictionary to store department-specific Random Forest models
rf_department_models = {}

# Training Random Forest models for each department
for dept in departments:
    # Adjusting column name for one-hot encoded data
    dept_col_name = 'surgical department_' + dept

    # Filtering data for the department
    dept_data = df_encoded[df_encoded[dept_col_name] == 1]
    X_dept = dept_data.drop('surgery duration', axis=1)
    y_dept = dept_data['surgery duration']

    # Splitting the data for the department
    X_train_dept, X_test_dept, y_train_dept, y_test_dept = train_test_split(X_dept, y_dept, test_size=0.2, random_state=42)

    # Training the Random Forest model
    rf_model = RandomForestRegressor(random_state=42)
    rf_model.fit(X_train_dept, y_train_dept)
    rf_department_models[dept] = rf_model

# Preparing for ensemble predictions
final_rf_predictions = np.zeros(len(X_test_all))
test_indices_rf = X_test_all.index

# Applying each department-specific Random Forest model to the entire test dataset
for dept, rf_model in rf_department_models.items():
    # Adjusting column name
    dept_col_name = 'surgical department_' + dept

    # Indices of test data belonging to the department
    dept_indices_rf = X_test_all[X_test_all[dept_col_name] == 1].index

    # Calculating predictions for the department
    dept_rf_predictions = rf_model.predict(X_test_all.loc[dept_indices_rf])

    # Updating the final prediction array
    final_rf_predictions[np.isin(test_indices_rf, dept_indices_rf)] = dept_rf_predictions

# Evaluating performance of Random Forest ensemble
mae_rf = mean_absolute_error(y_test_all, final_rf_predictions)
rmse_rf = np.sqrt(mean_squared_error(y_test_all, final_rf_predictions))
r2_rf = r2_score(y_test_all, final_rf_predictions)

mae_rf, rmse_rf, r2_rf


(16.665345490633918, 32.015591691698546, 0.9139384419413664)

In [4]:
from sklearn.model_selection import KFold

# Setting up K-Fold cross-validation
n_splits_cv = 5
kf_cv = KFold(n_splits=n_splits_cv, shuffle=True, random_state=42)

# Preparing to store performance metrics for each fold
mae_scores_rf = []
rmse_scores_rf = []
r2_scores_rf = []

# Cross-validation process for Random Forest
for train_index_cv, test_index_cv in kf_cv.split(X_all):
    # Splitting data into training and testing sets for this fold
    X_train_cv_rf, X_test_cv_rf = X_all.iloc[train_index_cv], X_all.iloc[test_index_cv]
    y_train_cv_rf, y_test_cv_rf = y_all.iloc[train_index_cv], y_all.iloc[test_index_cv]

    # Resetting final predictions for this fold
    final_predictions_cv_rf = np.zeros(len(X_test_cv_rf))
    test_indices_cv_rf = X_test_cv_rf.index

    # Training and predicting for each department
    for dept in departments:
        # Adjusting column name for one-hot encoded data
        dept_col_name_rf = 'surgical department_' + dept

        # Filtering data for the department
        dept_data_train_rf = X_train_cv_rf[X_train_cv_rf[dept_col_name_rf] == 1]
        dept_data_test_rf = X_test_cv_rf[X_test_cv_rf[dept_col_name_rf] == 1]
        y_dept_train_rf = y_train_cv_rf[dept_data_train_rf.index]
        y_dept_test_rf = y_test_cv_rf[dept_data_test_rf.index]

        # Training the Random Forest model for the department
        rf_model_cv = RandomForestRegressor(random_state=42)
        rf_model_cv.fit(dept_data_train_rf, y_dept_train_rf)

        # Calculating predictions for the department
        dept_predictions_cv_rf = rf_model_cv.predict(dept_data_test_rf)

        # Updating the final prediction array for this fold
        final_predictions_cv_rf[np.isin(test_indices_cv_rf, dept_data_test_rf.index)] = dept_predictions_cv_rf

    # Evaluating performance for this fold
    mae_cv_rf = mean_absolute_error(y_test_cv_rf, final_predictions_cv_rf)
    rmse_cv_rf = np.sqrt(mean_squared_error(y_test_cv_rf, final_predictions_cv_rf))
    r2_cv_rf = r2_score(y_test_cv_rf, final_predictions_cv_rf)

    # Storing performance metrics
    mae_scores_rf.append(mae_cv_rf)
    rmse_scores_rf.append(rmse_cv_rf)
    r2_scores_rf.append(r2_cv_rf)

# Averaging performance metrics across all folds
average_mae_rf = np.mean(mae_scores_rf)
average_rmse_rf = np.mean(rmse_scores_rf)
average_r2_rf = np.mean(r2_scores_rf)

average_mae_rf, average_rmse_rf, average_r2_rf


(33.5016867893357, 56.82845838343731, 0.7311137859785544)

In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score

# Dictionary to store the cross-validation results for each department using Random Forest
rf_cv_results = {}
n_folds = 5

for dept in departments:
    dept_col_name = 'surgical department_' + dept

    # Filtering data for the department
    dept_data = df_encoded[df_encoded[dept_col_name] == 1]
    X_dept = dept_data.drop('surgery duration', axis=1)
    y_dept = dept_data['surgery duration']

    # Creating the Random Forest model
    rf_model = RandomForestRegressor(random_state=42)

    # Performing cross-validation
    rf_mae_scores = -cross_val_score(rf_model, X_dept, y_dept, cv=n_folds, scoring='neg_mean_absolute_error')
    rf_rmse_scores = np.sqrt(-cross_val_score(rf_model, X_dept, y_dept, cv=n_folds, scoring='neg_mean_squared_error'))
    rf_r2_scores = cross_val_score(rf_model, X_dept, y_dept, cv=n_folds, scoring='r2')

    # Storing results
    rf_cv_results[dept] = {
        'MAE': rf_mae_scores.mean(),
        'RMSE': rf_rmse_scores.mean(),
        'R2': rf_r2_scores.mean()
    }

# Displaying the average of the cross-validation results across all departments for Random Forest
avg_rf_mae = np.mean([rf_cv_results[dept]['MAE'] for dept in rf_cv_results])
avg_rf_rmse = np.mean([rf_cv_results[dept]['RMSE'] for dept in rf_cv_results])
avg_rf_r2 = np.mean([rf_cv_results[dept]['R2'] for dept in rf_cv_results])

avg_rf_mae, avg_rf_rmse, avg_rf_r2


KeyboardInterrupt: 