In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the data
file_path = ' '#File path here
data = pd.read_csv(file_path)

# Select features and target variable.
features = ['Gender', 'Subluxation_percent', 'Femoral_neck_angle', 'Lateral_center_edge_angle', 'extrusion_index']
# Take an explicit copy: the Gender encoding below assigns a column on X, and
# assigning into a frame that pandas tracks as derived from `data` raises
# SettingWithCopyWarning.
X = data[features].copy()
y = data['Cluster']

# Convert Gender to integer category codes if it is stored as strings.
# The codes follow the category order pandas derives from this column.
if X['Gender'].dtype == 'object':
    X['Gender'] = X['Gender'].astype('category').cat.codes

# Split the data into training (80%) and testing (20%) sets; the fixed seed
# keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features. The scaler is fitted on the training split only and then
# applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)

# Predict the classes once, plus the probability in column 1 of predict_proba,
# i.e. the second entry of model.classes_ (label 1 per the recorded report,
# which lists classes 0 and 1).
y_pred = model.predict(X_test_scaled)
y_prob = model.predict_proba(X_test_scaled)[:, 1]

# Evaluate the model on the held-out test split
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))


Accuracy: 0.6712328767123288
              precision    recall  f1-score   support

           0       0.43      0.43      0.43        21
           1       0.77      0.77      0.77        52

    accuracy                           0.67        73
   macro avg       0.60      0.60      0.60        73
weighted avg       0.67      0.67      0.67        73



In [2]:
# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Test-split classification metrics. Precision, recall and F1 are averaged
# across classes with average='weighted'.
weighted_avg = {'average': 'weighted'}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted_avg)
recall = recall_score(y_test, y_pred, **weighted_avg)
f1 = f1_score(y_test, y_pred, **weighted_avg)

# The misclassification rate is the complement of accuracy
misclassification_rate = 1 - accuracy

# ROC AUC is computed from the scores in y_prob rather than the hard labels
auc_roc = roc_auc_score(y_test, y_prob)

# Print every metric as "<label> <value>", one per line
metric_report = [
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-Score:", f1),
    ("Misclassification Rate:", misclassification_rate),
    ("AUC of ROC:", auc_roc),
]
for metric_label, metric_value in metric_report:
    print(metric_label, metric_value)

Accuracy: 0.6712328767123288
Precision: 0.6712328767123288
Recall: 0.6712328767123288
F1-Score: 0.6712328767123288
Misclassification Rate: 0.3287671232876712
AUC of ROC: 0.673992673992674


In [3]:
## Saving the csv file
# Test-split features with the model's prediction attached. y_pred was produced
# from X_test in row order, so positional assignment lines up.
X_test_with_predictions = X_test.copy()
X_test_with_predictions['Predicted_Cluster'] = y_pred

# Start from the original data so every input column appears exactly once.
# (Merging `data` with a frame that still carries the feature columns would
# duplicate each of them with `_x` / `_y` suffixes in the output file.)
final_data_with_predictions = data.copy()

# Attach the predictions by index label. Only test rows were predicted, so
# rows from the training split get NaN in Predicted_Cluster.
final_data_with_predictions['Predicted_Cluster'] = X_test_with_predictions['Predicted_Cluster']

# Adding the ground truth cluster labels
final_data_with_predictions['ground_truth_cluster'] = data['Cluster']

# Saving to a new CSV file (the row index is not written)
output_file_path = ' '#File path here
final_data_with_predictions.to_csv(output_file_path, index=False)

print(f"File saved to {output_file_path}")

File saved to C:/Work/AI_Sports_Medicine/Hip/DDH/Cluster/Prediction_Random_forest_model/internal_data_with_predicted_clusters.csv


In [4]:
# Read the external validation file
external_file_path = ' '#File path here
external_data = pd.read_csv(external_file_path)

# Selecting features and target variable. Copy the feature frame so the Gender
# encoding below does not assign into a frame derived from external_data.
external_X = external_data[features].copy()
external_y = external_data['Groundtruth_Cluster']

# Encode Gender the same way the training data was encoded. The training cell
# turns a string-typed Gender into category codes; the external data must use
# the SAME category order, otherwise identical values could map to different
# codes. The order is re-derived from the training frame's Gender column.
if external_X['Gender'].dtype == 'object':
    gender_categories = data['Gender'].astype('category').cat.categories
    unseen_genders = set(external_X['Gender'].dropna().unique()) - set(gender_categories)
    if unseen_genders:
        # Fail loudly instead of silently mapping unknown values to code -1.
        raise ValueError(
            f"Gender values not present in the training data: {sorted(map(str, unseen_genders))}"
        )
    external_X['Gender'] = pd.Categorical(external_X['Gender'], categories=gender_categories).codes

# Preprocessing the external data: reuse the scaler fitted on the training
# split (transform only, no re-fitting).
external_X_scaled = scaler.transform(external_X)

# Predicting the clusters and the probability in column 1 of predict_proba
# (the second entry of model.classes_).
external_y_pred = model.predict(external_X_scaled)
external_y_prob = model.predict_proba(external_X_scaled)[:, 1]

# Calculating accuracy, precision, recall, and F1-score for external validation
# (precision / recall / F1 use average='weighted').
external_accuracy = accuracy_score(external_y, external_y_pred)
external_precision = precision_score(external_y, external_y_pred, average='weighted')
external_recall = recall_score(external_y, external_y_pred, average='weighted')
external_f1 = f1_score(external_y, external_y_pred, average='weighted')

# Calculating misclassification rate for external validation
external_misclassification_rate = 1 - external_accuracy

# Calculating AUC of ROC for external validation from the predicted scores
external_auc_roc = roc_auc_score(external_y, external_y_prob)

# Printing the results
print("External Validation:")
print("Accuracy:", external_accuracy)
print("Precision:", external_precision)
print("Recall:", external_recall)
print("F1-Score:", external_f1)
print("Misclassification Rate:", external_misclassification_rate)
print("AUC of ROC:", external_auc_roc)

# Creating a new CSV file with predicted clusters for external data.
# external_y_pred is in external_data row order, so positional assignment lines up.
external_data_with_predictions = external_data.copy()
external_data_with_predictions['Predicted_Cluster'] = external_y_pred
output_file_path_external = 'C:/Work/AI_Sports_Medicine/Hip/DDH/Cluster/Prediction_Random_forest_model/external_data_with_predicted_clusters.csv'
external_data_with_predictions.to_csv(output_file_path_external, index=False)

print(f"File saved to {output_file_path_external}")


External Validation:
Accuracy: 0.7380952380952381
Precision: 0.7445001392369813
Recall: 0.7380952380952381
F1-Score: 0.7408740684602754
Misclassification Rate: 0.26190476190476186
AUC of ROC: 0.7515254237288136
File saved to C:/Work/AI_Sports_Medicine/Hip/DDH/Cluster/Prediction_Random_forest_model/external_data_with_predicted_clusters.csv
