In [None]:
### Python Code (Core ML Pipeline)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, roc_curve, classification_report


# Load DEG matrix (after DESeq2 output)
data = pd.read_csv("degs_expression_matrix.csv")
# A CSV saved together with its index comes back with an extra 'Unnamed: 0'
# column. It is a row counter, not a feature; if the rows happen to be ordered
# by class it can separate the labels on its own, so drop it before modelling.
if "Unnamed: 0" in data.columns:
    data = data.drop("Unnamed: 0", axis=1)

X = data.drop("label", axis=1) # features
y = data["label"] # 0 = Control, 1 = PCOS

# Single seed shared by the split and every stochastic estimator below so that
# re-running the cell reproduces the same numbers.
SEED = 42


# Train-test split (stratified so both classes keep their proportions in the
# 30% hold-out set)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED, stratify=y
)
# NOTE(review): the features are presumably DEGs selected by DESeq2 using all
# samples, including the ones held out here. If so, the AUCs below are
# optimistic estimates -- confirm how the DEG list was produced.


# LASSO Logistic Regression
# 'saga' shuffles the data internally, hence the explicit random_state.
lasso = LogisticRegression(penalty='l1', solver='saga', max_iter=5000, random_state=SEED)
lasso.fit(X_train, y_train)
# Hard 0/1 class predictions (unlike y_pred_svm / y_pred_xgb, which hold
# positive-class probabilities).
y_pred_lasso = lasso.predict(X_test)
# AUC needs a continuous score, so use the probability of the positive class (index 1).
print("LASSO AUC:", roc_auc_score(y_test, lasso.predict_proba(X_test)[:,1]))


# Support Vector Machine
# probability=True enables predict_proba; its calibration step is randomised,
# so it is seeded as well.
svm = SVC(kernel='linear', probability=True, random_state=SEED)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict_proba(X_test)[:,1] # predict_proba returns probabilities, we need the probability of the positive class (index 1)
print("SVM AUC:", roc_auc_score(y_test, y_pred_svm))


# XGBoost
# `use_label_encoder` is omitted: XGBoost reports it as an unused parameter.
xgb = XGBClassifier(eval_metric='logloss', random_state=SEED)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict_proba(X_test)[:,1] # predict_proba returns probabilities, we need the probability of the positive class (index 1)
print("XGBoost AUC:", roc_auc_score(y_test, y_pred_xgb))

LASSO AUC: 1.0
SVM AUC: 1.0


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost AUC: 1.0


In [None]:
### Python Code (cross-validation)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, roc_curve, classification_report

# Load DEG matrix (after DESeq2 output)
data = pd.read_csv("degs_expression_matrix.csv")
# Drop the 'Unnamed: 0' column if it exists: it is an index artifact from
# saving/loading the CSV, not a feature.
if 'Unnamed: 0' in data.columns:
    data = data.drop('Unnamed: 0', axis=1)

X = data.drop("label", axis=1)  # features
y = data["label"]  # 0 = Control, 1 = PCOS

# Single seed shared by the split, the CV folds and every stochastic estimator
# so that re-running the cell reproduces the same scores.
SEED = 42

# Train-test split (still useful for a final evaluation, but cross-validation is better for model selection)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED, stratify=y  # stratify keeps class proportions in both splits
)

# Initialize models (unfitted; cross_val_score fits its own clones of them)
lasso = LogisticRegression(penalty='l1', solver='saga', max_iter=5000, random_state=SEED)
svm = SVC(kernel='linear', probability=True, random_state=SEED)
# `use_label_encoder` is omitted: XGBoost reports it as an unused parameter
# and emits that warning once per fold.
xgb = XGBClassifier(eval_metric='logloss', random_state=SEED)

# Perform cross-validation
# Using StratifiedKFold to ensure folds have representative proportions of classes
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED) # Using 5 splits for cross-validation

print("Cross-validation AUC scores:")

# LASSO Logistic Regression Cross-validation
lasso_cv_scores = cross_val_score(lasso, X, y, cv=cv, scoring='roc_auc')
print(f"LASSO (CV) AUC: Mean={np.mean(lasso_cv_scores):.4f}, Std={np.std(lasso_cv_scores):.4f}")

# Support Vector Machine Cross-validation
svm_cv_scores = cross_val_score(svm, X, y, cv=cv, scoring='roc_auc')
print(f"SVM (CV) AUC: Mean={np.mean(svm_cv_scores):.4f}, Std={np.std(svm_cv_scores):.4f}")

# XGBoost Cross-validation
xgb_cv_scores = cross_val_score(xgb, X, y, cv=cv, scoring='roc_auc')
print(f"XGBoost (CV) AUC: Mean={np.mean(xgb_cv_scores):.4f}, Std={np.std(xgb_cv_scores):.4f}")

# Optional: You can still evaluate on the test set for comparison.
# cross_val_score leaves `lasso`, `svm` and `xgb` unfitted, so each model must
# be fitted on the training split before predict_proba can be called.
# Caveat: the CV above runs on the full X/y, which includes the test rows, so
# these test scores are not independent of any choice made from the CV results.
# print("\nTest set AUC scores:")
# print("LASSO AUC:", roc_auc_score(y_test, lasso.fit(X_train, y_train).predict_proba(X_test)[:,1]))
# print("SVM AUC:", roc_auc_score(y_test, svm.fit(X_train, y_train).predict_proba(X_test)[:,1]))
# print("XGBoost AUC:", roc_auc_score(y_test, xgb.fit(X_train, y_train).predict_proba(X_test)[:,1]))

# Further steps to address overfitting and model interpretation can follow,
# such as feature selection based on cross-validation, or model interpretation
# techniques (e.g., examining LASSO coefficients or XGBoost feature importances).

Cross-validation AUC scores:
LASSO (CV) AUC: Mean=1.0000, Std=0.0000
SVM (CV) AUC: Mean=1.0000, Std=0.0000


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost (CV) AUC: Mean=0.9500, Std=0.1000
