In [None]:
#Random Forest model
#10Fold

import joblib
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV

# Load the training table.
df = pd.read_csv("CSV_PATH")

# Target labels.
y = df["label"]

# Predictor columns -- replace this list with your own factors.
X = df[['age', 'sex', 'T', 'DOI', 'RL', 'ly', 'v01', 'pn01', 'CLAM_score']]

# Display names used when reporting feature importance. They are matched to
# the columns of X by position, so the two lists must stay in the same order.
path_data = sklearn.utils.Bunch(
    features=['age', 'sex', 'pT', 'DOI', 'side', 'ly', 'v', 'pn', 'WSI_score']
)

# Stratified 10-fold splitter with a fixed seed.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# Base random forest that the grid search tunes.
rf_model = RandomForestClassifier(n_estimators=100, random_state=0)

# Hyper-parameter candidates explored by the grid search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
)

# The search is scored by ROC AUC and uses the same splitter object as `cv`.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
)

# Per-fold collectors: feature importances, fitted models, and the
# AUC / F1 / accuracy scores.
feature_importances_list, fold_models = [], []
auc_scores, f1_scores, accuracy_scores = [], [], []

# Outer cross-validation loop. In every fold the hyper-parameters are tuned by
# grid search on the training rows only, and the tuned model is then scored on
# the held-out rows of that fold.
for fold, (train_idx, test_idx) in enumerate(cv.split(X, y), start=1):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # The search only ever sees this fold's training rows.
    grid_search.fit(X_train, y_train)

    # Model that GridSearchCV refit with the best parameter combination.
    rf_model = grid_search.best_estimator_

    # Column 1 of predict_proba is used as the positive-class score
    # (assumes a binary label whose second class is the positive one -- TODO confirm).
    y_pred_prob = rf_model.predict_proba(X_test)[:, 1]

    auc = roc_auc_score(y_test, y_pred_prob)
    auc_scores.append(auc)

    y_pred = rf_model.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    f1_scores.append(f1)

    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)

    feature_importances_list.append(rf_model.feature_importances_)
    fold_models.append(rf_model)

    # Persist this fold's model; the test cell reloads it by this file name.
    joblib.dump(rf_model, f'fold_{fold}_model.pkl')


print("Mean AUC:", np.mean(auc_scores))
print("Mean F1 Score:", np.mean(f1_scores))
print("Mean Accuracy:", np.mean(accuracy_scores))

# Final model: trained on all rows with the hyper-parameters selected in the
# last fold. At this point `rf_model` is the very same object as
# fold_models[-1]; fitting it directly would silently overwrite that fold's
# model in memory, so an unfitted copy with identical parameters is fitted
# instead.
rf_model = sklearn.base.clone(rf_model)
rf_model.fit(X, y)

# Collapse the per-fold importances into one mean value per feature.
average_feature_importances = np.mean(feature_importances_list, axis=0)

# Rank the features from most to least important.
sorted_indices = np.flip(np.argsort(average_feature_importances))
sorted_feature_importances = average_feature_importances[sorted_indices]
sorted_feature_names = np.array(path_data["features"])[sorted_indices]

# Text report, one line per feature in ranked order.
for feature_name, importance in zip(sorted_feature_names, sorted_feature_importances):
    print(f"{feature_name}: {importance}")

# Horizontal bar chart of the ranked importances.
fig, ax = plt.subplots(figsize=(10, 6))
bar_positions = range(X.shape[1])
ax.barh(bar_positions, sorted_feature_importances)

# Label each bar with its feature name on the y axis.
ax.set_yticks(bar_positions)
ax.set_yticklabels(sorted_feature_names)
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Features')
ax.set_title('Feature Importance from Random Forest')
fig.tight_layout()

plt.show()


In [None]:

# Same ranked-importance bar chart, redrawn with 22 pt text.
# Relies on X, sorted_feature_importances and sorted_feature_names
# already being defined by the training cell.
LABEL_SIZE = 22

fig, ax = plt.subplots(figsize=(10, 6))
bar_positions = range(X.shape[1])
ax.barh(bar_positions, sorted_feature_importances)

ax.set_yticks(bar_positions)
ax.set_yticklabels(sorted_feature_names, fontsize=LABEL_SIZE)
ax.set_xlabel('Feature Importance', fontsize=LABEL_SIZE)
ax.set_ylabel('Features', fontsize=LABEL_SIZE)
ax.set_title('Feature Importance from Random Forest', fontsize=LABEL_SIZE)
fig.tight_layout()

plt.show()

In [None]:
import csv
import joblib

# One CSV row per fold: fold number (starting at 1), AUC, accuracy.
# Reads whatever auc_scores / accuracy_scores currently hold.
fold_rows = [
    [fold_number, fold_auc, fold_accuracy]
    for fold_number, (fold_auc, fold_accuracy)
    in enumerate(zip(auc_scores, accuracy_scores), start=1)
]

with open('auc_accuracy_results.csv', 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['Fold', 'AUC', 'Accuracy'])
    csvwriter.writerows(fold_rows)

# Save the trained model.
# This stores the single model currently bound to rf_model (the one refit on
# all rows at the end of the training cell), not one model per fold.
joblib.dump(rf_model, 'final_model.pkl')

In [None]:
#TEST
# Evaluate every saved fold model on a separate test data set and export the
# per-fold AUC and accuracy.

import joblib
import pandas as pd
from sklearn.metrics import roc_auc_score, accuracy_score

# Number of fold models to evaluate. Must match the number of
# fold_<k>_model.pkl files written by the training cell.
N_FOLDS = 10

# Load the new data set.
new_df = pd.read_csv("TEST_DATA_CSV_PATH")

# Features -- same columns, in the same order, as used for training.
X_new = new_df.loc[:, [  'age', 'sex', 'T', 'DOI', 'RL', 'ly', 'v01', 'pn01', 'CLAM_score']]

# Label
y_true = new_df['label']

# NOTE(review): these two names are also used for the cross-validation scores
# in the training cell. After this cell runs they hold test-set scores, so
# re-running the CSV export cell afterwards would write test-set values into
# auc_accuracy_results.csv. Names are kept for compatibility -- consider
# renaming once all readers are known.
auc_scores = []
accuracy_scores = []

results = []

# For each Fold
for fold in range(1, N_FOLDS + 1):
    # NOTE(review): joblib.load unpickles the file, which can execute
    # arbitrary code -- only load model files you created yourself.
    loaded_model = joblib.load(f'fold_{fold}_model.pkl')

    # Column 1 of predict_proba is used as the positive-class score
    # (assumes a binary label whose second class is the positive one -- TODO confirm).
    y_pred_prob = loaded_model.predict_proba(X_new)[:, 1]

    # AUC
    auc = roc_auc_score(y_true, y_pred_prob)
    auc_scores.append(auc)

    # Hard label from a fixed 0.5 threshold
    # (assumes labels are encoded as 0/1 -- TODO confirm).
    y_pred = (y_pred_prob >= 0.5).astype(int)

    # Accuracy
    accuracy = accuracy_score(y_true, y_pred)
    accuracy_scores.append(accuracy)

    fold_result = {
        'Fold': fold,
        'AUC': auc,
        'Accuracy': accuracy
    }
    results.append(fold_result)


# Fold AUC and Accuracy. Driven by the number of collected scores so it can
# never disagree with the loading loop above.
for fold in range(len(auc_scores)):
    print(f"Fold {fold + 1} - AUC: {auc_scores[fold]}, Accuracy: {accuracy_scores[fold]}")

# mean
print("Mean AUC:", sum(auc_scores) / len(auc_scores))
print("Mean Accuracy:", sum(accuracy_scores) / len(accuracy_scores))

# To dataframe
results_df = pd.DataFrame(results)

# CSV(Result)
results_df.to_csv('Test_results.csv', index=False)