In [1]:
# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, roc_auc_score

# Read the internal cohort from CSV (the path is a placeholder to fill in).
file_path = ""  # File path here
data = pd.read_csv(file_path)

# Predictor columns handed to the classifier; the label is the 'Cluster' column.
features = [
    'Gender',
    'Subluxation_percent',
    'Femoral_neck_angle',
    'Lateral_center_edge_angle',
    'extrusion_index',
]
X = data[features]
y = data['Cluster']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build the logistic-regression classifier and fit it on the training split.
# fit() hands back the estimator itself, so construction and training are chained.
model = LogisticRegression(random_state=42).fit(X_train, y_train)

# Score the held-out rows: hard labels, plus the second predict_proba column,
# which this notebook treats as the probability of the positive class.
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Label-based metrics, the complement of accuracy, and the probability-based AUC.
accuracy = accuracy_score(y_test, y_pred)
misclassification_rate = 1 - accuracy
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_prob)

# Report every metric, one "label value" line each.
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("F1 Score:", f1),
    ("Recall:", recall),
    ("Misclassification Rate:", misclassification_rate),
    ("AUC of ROC:", auc_roc),
):
    print(metric_label, metric_value)


Accuracy: 0.7123287671232876
Precision: 0.7818181818181819
F1 Score: 0.8037383177570092
Recall: 0.8269230769230769
Misclassification Rate: 0.28767123287671237
AUC of ROC: 0.739010989010989


In [2]:
# Build the export table in one step: a copy of the test features with the
# true label and the model's predicted label appended as two extra columns.
test_data_with_predictions = X_test.assign(
    Actual_Cluster=y_test,
    Predicted_Cluster=y_pred,
)

# Destination of the CSV (placeholder to fill in).
output_file_path = ""  # File path here

# Write the table without the row index, then confirm where it went.
test_data_with_predictions.to_csv(output_file_path, index=False)
print("CSV file with predicted clusters has been created at:", output_file_path)


CSV file with predicted clusters has been created at: C:/Work/AI_Sports_Medicine/Hip/DDH/Cluster/Prediction_logistic_regression/internal_predicted_clusters.csv


In [5]:
# Read the external cohort from CSV (the path is a placeholder to fill in).
external_file_path = " "  # File path here
external_data = pd.read_csv(external_file_path)

# Same predictor columns as used for training; the reference labels live in
# the 'Groundtruth_Cluster' column.
X_external = external_data[features]
y_external_groundtruth = external_data['Groundtruth_Cluster']

# Apply the already-fitted model: hard labels, plus the second predict_proba
# column, which this notebook treats as the probability of the positive class.
y_external_pred = model.predict(X_external)
y_external_prob = model.predict_proba(X_external)[:, 1]

# Label-based metrics, the complement of accuracy, and the probability-based AUC.
accuracy_external = accuracy_score(y_external_groundtruth, y_external_pred)
misclassification_rate_external = 1 - accuracy_external
precision_external = precision_score(y_external_groundtruth, y_external_pred)
f1_external = f1_score(y_external_groundtruth, y_external_pred)
recall_external = recall_score(y_external_groundtruth, y_external_pred)
auc_roc_external = roc_auc_score(y_external_groundtruth, y_external_prob)

# Report the metrics under a heading, one "label value" line each.
print("External Validation Results:")
for metric_label, metric_value in (
    ("Accuracy:", accuracy_external),
    ("Precision:", precision_external),
    ("F1 Score:", f1_external),
    ("Recall:", recall_external),
    ("Misclassification Rate:", misclassification_rate_external),
    ("AUC of ROC:", auc_roc_external),
):
    print(metric_label, metric_value)

# Full external table (all original columns) with the predicted label appended.
external_data_with_predictions = external_data.assign(
    Predicted_Cluster=y_external_pred
)

# Destination of the CSV (placeholder to fill in).
external_output_file_path = " "  # File path here

# Write the table without the row index, then confirm where it went.
external_data_with_predictions.to_csv(external_output_file_path, index=False)
print("CSV file with predicted clusters for external data has been created at:", external_output_file_path)


External Validation Results:
Accuracy: 0.7380952380952381
Precision: 0.8032786885245902
F1 Score: 0.8166666666666667
Recall: 0.8305084745762712
Misclassification Rate: 0.26190476190476186
AUC of ROC: 0.7505084745762712
CSV file with predicted clusters for external data has been created at: C:/Work/AI_Sports_Medicine/Hip/DDH/Cluster/Prediction_logistic_regression/external_predicted_clusters_external.csv


In [6]:
##ROC curve
# Importing the necessary libraries
from sklearn.metrics import roc_curve
import pandas as pd

# ROC operating points for the external cohort: roc_curve returns the false
# positive rates, true positive rates and the score thresholds they belong to.
fpr, tpr, thresholds = roc_curve(y_external_groundtruth, y_external_prob)

# Tabulate the curve, one row per threshold.
roc_columns = {
    "Threshold": thresholds,
    "False_Positive_Rate": fpr,
    "True_Positive_Rate": tpr,
}
roc_data = pd.DataFrame(roc_columns)

# Destination of the ROC table.
roc_data_file_path = "C:/Work/AI_Sports_Medicine/Hip/DDH/Cluster/Prediction_logistic_regression/logistic_roc_data.csv"

# Write the table without the row index, then confirm where it went.
roc_data.to_csv(roc_data_file_path, index=False)
print("CSV file with ROC data has been created at:", roc_data_file_path)


CSV file with ROC data has been created at: C:/Work/AI_Sports_Medicine/Hip/DDH/Cluster/Prediction_logistic_regression/logistic_roc_data.csv
