In [8]:
## Build decision tree training model
# Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# --- Input and output locations ---
file_path = ' '
output_path = ' '

# --- Load the training table ---
data = pd.read_csv(file_path)

# --- Choose predictors and label, then hold out 20% of the rows for testing ---
target = 'Cluster'
features = [
    'Gender',
    'Subluxation_percent',
    'Femoral_neck_angle',
    'Lateral_center_edge_angle',
    'extrusion_index',
]
X, y = data[features], data[target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Fit the decision tree (seeded so reruns give the same tree) ---
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# --- Score the held-out rows ---
y_pred = model.predict(X_test)
# Column 1 of predict_proba is used as the score handed to roc_auc_score.
# NOTE(review): the tree has no depth/leaf-size limit, so leaf probabilities
# are likely to be exactly 0 or 1; the AUC below may then reflect a single
# operating point rather than a ranking -- confirm this is intended.
y_prob = model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
misclassification_rate = 1 - accuracy
# precision/recall/F1 use scikit-learn's default averaging and positive label.
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_prob)

# --- Report the metrics, one per line ---
for _label, _value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
    ("Misclassification Rate:", misclassification_rate),
    ("AUC of ROC:", auc_roc),
):
    print(_label, _value)

# --- Export test-set features with actual/predicted labels and patient IDs ---
# PatientID is looked up in the full table by the test rows' index labels.
test_results = X_test.assign(
    Actual_Cluster=y_test,
    Predicted_Cluster=y_pred,
    PatientID=data.loc[X_test.index, 'PatientID'],
)
test_results.to_csv(output_path, index=False)


Accuracy: 0.6164383561643836
Precision: 0.7608695652173914
Recall: 0.6730769230769231
F1 Score: 0.7142857142857143
Misclassification Rate: 0.3835616438356164
AUC of ROC: 0.5746336996336996


In [10]:
# External validation
# This cell reuses `model`, `pd` and the metric functions from the training
# cell, so that cell must have been run first in the same kernel.

# --- Input and output locations ---
external_file_path = ''
external_output_path = ''

# --- Load the external cohort ---
external_data = pd.read_csv(external_file_path)

# --- Select the same predictors used for training, plus the reference label ---
features = [
    'Gender',
    'Subluxation_percent',
    'Femoral_neck_angle',
    'Lateral_center_edge_angle',
    'extrusion_index',
]
X_external = external_data[features]
y_external = external_data['Groundtruth_Cluster']

# --- Apply the already-fitted tree ---
y_external_pred = model.predict(X_external)
# Column 1 of predict_proba is used as the score handed to roc_auc_score.
external_y_prob = model.predict_proba(X_external)[:, 1]

# --- Compute the same metric set as for the internal test split ---
external_accuracy = accuracy_score(y_external, y_external_pred)
external_misclassification_rate = 1 - external_accuracy
external_precision = precision_score(y_external, y_external_pred)
external_recall = recall_score(y_external, y_external_pred)
external_f1 = f1_score(y_external, y_external_pred)
external_auc_roc = roc_auc_score(y_external, external_y_prob)

# --- Report the metrics under a heading, one per line ---
print("External Validation Metrics:")
for _label, _value in (
    ("Accuracy:", external_accuracy),
    ("Precision:", external_precision),
    ("Recall:", external_recall),
    ("F1 Score:", external_f1),
    ("Misclassification Rate:", external_misclassification_rate),
    ("AUC of ROC:", external_auc_roc),
):
    print(_label, _value)

# --- Export the external table with the predicted cluster appended ---
external_results = external_data.assign(Predicted_Cluster=y_external_pred)
external_results.to_csv(external_output_path, index=False)

External Validation Metrics:
Accuracy: 0.7142857142857143
Precision: 0.8431372549019608
Recall: 0.7288135593220338
F1 Score: 0.7818181818181819
Misclassification Rate: 0.2857142857142857
AUC of ROC: 0.7044067796610167


In [1]:
# Internal ROC data preparation

import pandas as pd
from sklearn.metrics import roc_curve

# Locations of the prediction table (input) and the ROC table (output)
file_path = ' '
output_file_path = ''

# Load the table that holds the reference and predicted cluster per row
data = pd.read_csv(file_path)

# The column names below are assumptions about the input file:
# 'ground_truth_cluster' = reference labels, 'predicted_cluster' = model output
ground_truth, predicted_cluster = (
    data['ground_truth_cluster'],
    data['predicted_cluster'],
)

# Compute false/true positive rates; the thresholds array is discarded.
# NOTE(review): if 'predicted_cluster' holds hard class labels rather than
# continuous scores, the curve has only one interior operating point between
# (0, 0) and (1, 1) -- confirm this is the intended input.
fpr, tpr, _ = roc_curve(ground_truth, predicted_cluster)

# Tabulate the curve and write it out without the row index
roc_data = pd.DataFrame(dict(FPR=fpr, TPR=tpr))
roc_data.to_csv(output_file_path, index=False)

# Echo the destination path as the cell's displayed result
output_file_path

'C:/Work/AI_Sports_Medicine/Hip/DDH/Cluster/Prediction_decisiontree/internal_roc_data.csv'

In [2]:
# External ROC data preparation

import pandas as pd
from sklearn.metrics import roc_curve

# Locations of the prediction table (input) and the ROC table (output)
file_path = ''
output_file_path = ''

# Load the table that holds the reference and predicted cluster per row
data = pd.read_csv(file_path)

# The column names below are assumptions about the input file:
# 'ground_truth_cluster' = reference labels, 'predicted_cluster' = model output
ground_truth, predicted_cluster = (
    data['ground_truth_cluster'],
    data['predicted_cluster'],
)

# Compute false/true positive rates; the thresholds array is discarded.
# NOTE(review): if 'predicted_cluster' holds hard class labels rather than
# continuous scores, the curve has only one interior operating point between
# (0, 0) and (1, 1) -- confirm this is the intended input.
fpr, tpr, _ = roc_curve(ground_truth, predicted_cluster)

# Tabulate the curve and write it out without the row index
roc_data = pd.DataFrame(dict(FPR=fpr, TPR=tpr))
roc_data.to_csv(output_file_path, index=False)

# Echo the destination path as the cell's displayed result
output_file_path

'C:/Work/AI_Sports_Medicine/Hip/DDH/Cluster/Prediction_decisiontree/external_roc_data.csv'