In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Location of the clustered dataset (CSV) used to train and evaluate the model.
file_path = 'C:/Work/AI_Sports_Medicine/Hip/DDH/Cluster/All/data_with_clusters.csv'

# Load the CSV into a DataFrame; later cells read `data` for features and labels.
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as a quick sanity check of the columns.
data.head(n=5)


Unnamed: 0,PatientID,Gender,Subluxation_percent,Femoral_neck_angle,Lateral_center_edge_angle,extrusion_index,Femoral_anteversion,Acetabular_anteversion,Anterior_sector_angle,Posterior_sector_angle,Cluster
0,1,0,0.439726,138.7,17.3,0.456075,48.5,12.0,36.0,69.4,0
1,2,1,0.0,136.2,11.5,0.349057,19.2,15.9,55.5,90.0,1
2,3,1,0.0,139.9,11.5,0.295238,33.95,19.9,51.8,94.2,0
3,4,0,0.484536,136.6,-18.1,0.773196,36.1,10.5,42.3,74.1,0
4,5,0,0.225806,153.7,-5.3,0.512799,29.3,11.8,53.7,76.7,0


In [30]:

# Select features and target variable.
features = ['Gender', 'Subluxation_percent', 'Femoral_neck_angle', 'Lateral_center_edge_angle', 'extrusion_index']
# Take an explicit copy: `data[features]` is a slice of `data`, and assigning a
# column into a slice triggers pandas' SettingWithCopyWarning and leaves it
# ambiguous whether `data` itself is modified.
X = data[features].copy()
y = data['Cluster']

# Convert gender to integer codes if it is stored as strings.
# Codes are derived from the categories present in this (training) file.
if X['Gender'].dtype == 'object':
    X['Gender'] = X['Gender'].astype('category').cat.codes

# Split the data into training and testing sets (80% training, 20% testing).
# random_state is fixed so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features. The scaler is fitted on the training split only and then
# applied to the test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create the K-NN classifier.
knn_classifier = KNeighborsClassifier(n_neighbors=5) # You can change the number of neighbors

# Train the model on the scaled training data.
knn_classifier.fit(X_train_scaled, y_train)

# Make predictions on the (scaled) test set.
y_pred = knn_classifier.predict(X_test_scaled)

# Compute and report the hold-out accuracy.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of the model: {accuracy:.2f}')


Accuracy of the model: 0.66


In [31]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# --- Compute every hold-out metric first, then report them together. ---

# Label-based metrics, all derived from the hard predictions in `y_pred`.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Misclassification rate is simply the complement of accuracy.
misclassification_rate = 1 - accuracy

# ROC AUC needs scores rather than labels: take the second probability column
# (assuming binary classification with labels 0 and 1, so column 1 is class 1).
y_prob = knn_classifier.predict_proba(X_test_scaled)[:, 1]
auc_roc = roc_auc_score(y_test, y_prob)

# --- Report, in the same order as the metrics were introduced. ---
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'Misclassification Rate: {misclassification_rate:.2f}')
print(f'AUC of ROC: {auc_roc:.2f}')


Accuracy: 0.66
Precision: 0.78
Recall: 0.73
F1 Score: 0.75
Misclassification Rate: 0.34
AUC of ROC: 0.64


In [32]:
# Build the export table in one step: the unscaled test features plus the
# model's prediction and the true label for each row. `assign` returns a new
# frame (X_test is left untouched), and the `data['Cluster']` Series is matched
# to the test rows by index, so each row receives its own ground-truth label.
predicted_data = X_test.assign(
    predicted_cluster=y_pred,
    ground_truth_cluster=data['Cluster'],
)

# Write the table to disk without the DataFrame index column.
output_file_path = 'C:/Work/AI_Sports_Medicine/Hip/DDH/Cluster/Prediction K_nearest_neighbors/internal_predicted_clusters.csv'
predicted_data.to_csv(output_file_path, index=False)

print(f'Predicted clusters have been saved to {output_file_path}')


Predicted clusters have been saved to C:/Work/AI_Sports_Medicine/Hip/DDH/Cluster/Prediction K_nearest_neighbors/internal_predicted_clusters.csv


In [33]:
##External validation
# Evaluate the already-trained K-NN model on an independent dataset, reusing the
# scaler fitted on the training split, then export the per-row predictions.

# Read the external validation data
external_file_path = 'C:/Work/AI_Sports_Medicine/Hip/DDH/Cluster/External validation/External_validation_data_withgroundtruth_Cluster.csv'
external_data = pd.read_csv(external_file_path)

# Select the same features as before. Take an explicit copy so the Gender
# assignment below writes to an independent frame instead of a slice of
# `external_data` (which would raise pandas' SettingWithCopyWarning).
external_X = external_data[features].copy()

# The ground-truth labels are stored in the 'Groundtruth_Cluster' column
external_y = external_data['Groundtruth_Cluster']

# Convert gender to numerical values if it's categorical
if external_X['Gender'].dtype == 'object':
    if data['Gender'].dtype == 'object':
        # Encode with the categories seen in the training file so that every
        # gender label maps to the same integer code the model was trained on,
        # even if the external file contains only a subset of the labels.
        # Labels not present in the training data are encoded as -1.
        train_gender_categories = data['Gender'].astype('category').cat.categories
        external_X['Gender'] = pd.Categorical(
            external_X['Gender'], categories=train_gender_categories
        ).codes
    else:
        # Training gender was already numeric, so there is no training mapping
        # to reuse; fall back to codes derived from this file alone.
        # NOTE(review): confirm these codes match the numeric training encoding.
        external_X['Gender'] = external_X['Gender'].astype('category').cat.codes

# Scale the features using the same scaler as before (no re-fitting)
external_X_scaled = scaler.transform(external_X)

# Make predictions using the trained K-NN model
external_y_pred = knn_classifier.predict(external_X_scaled)

# Calculate the metrics
external_accuracy = accuracy_score(external_y, external_y_pred)
external_precision = precision_score(external_y, external_y_pred)
external_recall = recall_score(external_y, external_y_pred)
external_f1 = f1_score(external_y, external_y_pred)
external_misclassification_rate = 1 - external_accuracy
# Second probability column is used as the positive-class score
# (assumes binary labels 0 and 1, as in the internal evaluation).
external_y_prob = knn_classifier.predict_proba(external_X_scaled)[:, 1]
external_auc_roc = roc_auc_score(external_y, external_y_prob)

# Print the results
print(f'External Validation Metrics:')
print(f'Accuracy: {external_accuracy:.2f}')
print(f'Precision: {external_precision:.2f}')
print(f'Recall: {external_recall:.2f}')
print(f'F1 Score: {external_f1:.2f}')
print(f'Misclassification Rate: {external_misclassification_rate:.2f}')
print(f'AUC of ROC: {external_auc_roc:.2f}')

# Create a new DataFrame with the external data and the predicted clusters
external_predicted_data = external_X.copy()
external_predicted_data['predicted_cluster'] = external_y_pred

# Adding the ground truth cluster labels to the external predicted data DataFrame
external_predicted_data['ground_truth_cluster'] = external_data['Groundtruth_Cluster']

# Save the DataFrame to a new CSV file
external_output_file_path = 'C:/Work/AI_Sports_Medicine/Hip/DDH/Cluster/Prediction K_nearest_neighbors/external_predicted_clusters.csv'
external_predicted_data.to_csv(external_output_file_path, index=False)

print(f'Predicted clusters for the external data have been saved to {external_output_file_path}')


External Validation Metrics:
Accuracy: 0.68
Precision: 0.78
Recall: 0.76
F1 Score: 0.77
Misclassification Rate: 0.32
AUC of ROC: 0.67
Predicted clusters for the external data have been saved to C:/Work/AI_Sports_Medicine/Hip/DDH/Cluster/Prediction K_nearest_neighbors/external_predicted_clusters.csv
