In [2]:
import os

import numpy as np
import pandas as pd

# Base directory used to locate the input data. Note that this is the kernel's
# current working directory, which is not necessarily the notebook's own folder.
current_path = os.getcwd()  # Get the current working directory.

# Path to the CSV file
file_path = os.path.join(current_path, 'filtered_data.csv')  # The uploaded file is expected at this path

In [4]:
# Re-importing necessary libraries and reloading the dataset as the code execution state was reset
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.base import clone

# Read the raw records from the CSV located at `file_path`.
surgery_data = pd.read_csv(file_path)

# Remove the columns that are not used as model inputs.
df = surgery_data.drop(
    columns=['note id', 'person id', 'surgeon estimated op time', 'final op name']
)

# Despite the name, these columns are label-encoded: each one is converted to
# a pandas categorical and replaced by its integer category code.
binary_cols = ['condition source value', 'surgeon id', 'ward', 'surgery room', 'op code']
df = df.assign(
    **{name: df[name].astype('category').cat.codes for name in binary_cols}
)

# Columns expanded into one indicator column per distinct value.
one_hot_cols = [
    'surgical department',
    'op timing',
    'month',
    'anesthesia type',
    'day of the week',
    'asa class',
    'week of the month',
    'division',
    'previous surgery',
    'emergency status',
    'gender source value',
    'admission department',
]
df_encoded = pd.get_dummies(df, columns=one_hot_cols)

# Features are everything except the target column "surgery duration".
y_all = df_encoded["surgery duration"]
X_all = df_encoded.drop(columns="surgery duration")

# Fixed-seed 80/20 hold-out split shared by the later cells.
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(
    X_all,
    y_all,
    test_size=0.2,
    random_state=42,
)

# Template estimator: every department receives its own unfitted copy of this
# configuration via clone(), so fitted state is never shared between models.
rf_regressor = RandomForestRegressor(random_state=42)

# Distinct department labels from the pre-encoding frame. Missing values are
# dropped because get_dummies (default dummy_na=False) creates no indicator
# column for NaN, so looking up 'surgical department_nan' would raise KeyError.
surgical_departments = df['surgical department'].dropna().unique()

# Maps department label -> RandomForestRegressor fitted on that department's
# rows of the training split.
department_models = {}

for department in surgical_departments:
    # Select the training rows flagged as belonging to this department.
    department_mask = X_train_all[f'surgical department_{department}'] == 1
    department_train_data = X_train_all[department_mask]
    y_department_train = y_train_all[department_mask]

    # A department whose rows all landed in the test split has nothing to fit;
    # it simply gets no entry in `department_models`.
    if department_train_data.empty:
        continue

    model = clone(rf_regressor)
    model.fit(department_train_data, y_department_train)
    department_models[department] = model

def ensemble_predict(X):
    """Predict each row of ``X`` with the model of the department it belongs to.

    A row is routed to the first department in ``surgical_departments`` whose
    indicator column ``'surgical department_<name>'`` equals 1 and for which a
    fitted model exists in ``department_models``. All rows of one department
    are predicted in a single batched ``predict`` call, and the rows are passed
    as a DataFrame so the column names seen during fitting are preserved.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing the department indicator columns.

    Returns
    -------
    list of float
        One prediction per row of ``X``, in row order. Rows that cannot be
        routed to any department model are reported as NaN instead of being
        dropped, so the result always lines up with the corresponding targets.
    """
    predictions = np.full(len(X), np.nan, dtype=float)
    # Tracks rows not yet claimed, so a row is only ever predicted by the
    # first matching department.
    unassigned = np.ones(len(X), dtype=bool)

    for department in surgical_departments:
        if department not in department_models:
            continue

        belongs = (X[f'surgical department_{department}'] == 1).to_numpy()
        selected = belongs & unassigned
        if not selected.any():
            # Nothing to predict; also avoids calling predict on zero rows.
            continue

        predictions[selected] = department_models[department].predict(X.loc[selected])
        unassigned &= ~selected

    return predictions.tolist()

# Predict the held-out test rows, routing each row to its department's model.
ensemble_predictions = ensemble_predict(X_test_all)

# Mean absolute error of the routed predictions against the held-out targets.
# The bare name on the last line makes the notebook display the value.
ensemble_mae = mean_absolute_error(y_test_all, ensemble_predictions)
ensemble_mae




33.598816017450275

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Department-wise Random Forest ensemble evaluated on the shared hold-out split.
#
# Each model is fitted only on rows of X_train_all. Fitting on a separate
# per-department split of df_encoded would put rows of X_test_all into the
# training data and make the metrics below optimistic.

# Distinct department labels; NaN is dropped because get_dummies creates no
# indicator column for missing values by default.
departments = df['surgical department'].dropna().unique()

# Maps department label -> fitted RandomForestRegressor.
rf_department_models = {}

for dept in departments:
    # Name of the one-hot indicator column for this department.
    dept_col_name = f'surgical department_{dept}'

    # Training rows of this department, taken from the global training split.
    train_mask = X_train_all[dept_col_name] == 1
    X_train_dept = X_train_all[train_mask]
    y_train_dept = y_train_all[train_mask]

    # No training rows means no model for this department.
    if X_train_dept.empty:
        continue

    rf_model = RandomForestRegressor(random_state=42)
    rf_model.fit(X_train_dept, y_train_dept)
    rf_department_models[dept] = rf_model

# One slot per test row, in the row order of X_test_all. NaN marks rows that
# no model has predicted yet, so gaps cannot be mistaken for a prediction of 0.
final_rf_predictions = np.full(len(X_test_all), np.nan)
test_indices_rf = X_test_all.index

for dept, rf_model in rf_department_models.items():
    dept_col_name = f'surgical department_{dept}'

    # Positional mask of the test rows belonging to this department.
    dept_mask = (X_test_all[dept_col_name] == 1).to_numpy()
    if not dept_mask.any():
        # predict() rejects an empty frame; nothing to fill in anyway.
        continue

    dept_indices_rf = test_indices_rf[dept_mask]
    dept_rf_predictions = rf_model.predict(X_test_all.loc[dept_mask])
    final_rf_predictions[dept_mask] = dept_rf_predictions

# Fail with a clear message rather than scoring rows that were never predicted.
n_unpredicted = int(np.isnan(final_rf_predictions).sum())
if n_unpredicted:
    raise ValueError(
        f"{n_unpredicted} test rows have no department model; "
        "cannot evaluate the ensemble on the full test set."
    )

# Error metrics of the ensemble on the held-out test set.
mae_rf = mean_absolute_error(y_test_all, final_rf_predictions)
rmse_rf = np.sqrt(mean_squared_error(y_test_all, final_rf_predictions))
r2_rf = r2_score(y_test_all, final_rf_predictions)

mae_rf, rmse_rf, r2_rf


KeyboardInterrupt: 