
Commit 436ab62

fix merge issues of ml
1 parent: 3bfd5f4

File tree

5 files changed: +67 -43 lines changed


.DS_Store (4 KB): binary file not shown.

api/.DS_Store (4 KB): binary file not shown.

api/code/batch_effect_correction.R (34 additions, 32 deletions)
@@ -1,84 +1,86 @@
-
 batch_effect_correction <- function(input_file, output_dir, user_id) {
   library(jsonlite)
   library(sva) # For batch effect correction

   tryCatch(
     {
       # Read and preprocess data - preserve exact feature names
       merged_df_data <- read.csv(input_file, header = TRUE, row.names = 1, check.names = FALSE)
       merged_df_data <- na.omit(merged_df_data)

       # Ensure unique column names
       colnames(merged_df_data) <- make.unique(colnames(merged_df_data))

       # Extract condition and expression matrix
       condition_info <- merged_df_data$condition
       data_t <- t(merged_df_data[, !(colnames(merged_df_data) %in% c("condition", "batch"))])

       # Save original feature names
       feature_names <- rownames(data_t)
       sample_names <- colnames(data_t)

       # Batch effect correction with ComBat
       batch_info <- merged_df_data$batch
       data_combat <- ComBat(dat = as.matrix(data_t), batch = batch_info, par.prior = TRUE, prior.plots = FALSE)

       # Restore original feature names
       rownames(data_combat) <- feature_names

       # Save corrected data
       output_file <- file.path(output_dir, paste0("batch_", basename(input_file)))
       data_corrected <- t(data_combat)
       data_corrected_with_condition <- cbind(condition = condition_info, data_corrected)

       # Write CSV with proper quoting to preserve commas/spaces in feature names
       write.csv(
         data_corrected_with_condition,
         output_file,
         row.names = TRUE,
         quote = TRUE,
         na = "",
-        fileEncoding = "UTF-8")
-
+        fileEncoding = "UTF-8"
+      )
+
       # Create boxplots in PDF and PNG formats only
       plot_formats <- c("pdf", "png")
       for (fmt in plot_formats) {
         file_name <- file.path(output_dir, paste0("batch_correction_boxplots.", fmt))

         # Set up the plotting device
         if (fmt == "png") {
-          png(file_name, width = 1200, height = 600, res = 300)
+          png(file_name, width = 2400, height = 1200, res = 300)
         } else {
           pdf(file_name, width = 12, height = 6)
         }

         # Create the plots
         par(mfrow = c(1, 2), mar = c(10, 5, 4, 2))

         # Pre-correction plot
         boxplot(data_t,
-                main = "Before Batch Correction",
-                las = 2,
-                col = "lightblue",
-                outline = FALSE,
-                ylab = "Expression Levels",
-                cex.axis = 0.7,
-                names = sample_names)
-
+           main = "Before Batch Correction",
+           las = 2,
+           col = "lightblue",
+           outline = FALSE,
+           ylab = "Expression Levels",
+           cex.axis = 0.7,
+           names = sample_names
+         )
+
         # Post-correction plot
         boxplot(data_combat,
-                main = "After Batch Correction",
-                las = 2,
-                col = "lightgreen",
-                outline = FALSE,
-                ylab = "Expression Levels",
-                cex.axis = 0.7,
-                names = sample_names)
-
+           main = "After Batch Correction",
+           las = 2,
+           col = "lightgreen",
+           outline = FALSE,
+           ylab = "Expression Levels",
+           cex.axis = 0.7,
+           names = sample_names
+         )
+
         dev.off()
       }

       # Output completion message
       cat("Batch effect correction completed. Corrected data saved to:", output_file, "\n")
       cat("Boxplots saved in PDF and PNG formats.\n")

api/code/code.py (18 additions, 6 deletions)
@@ -856,12 +856,16 @@ def set_perplexity(n_samples):
 )
 from sklearn.base import clone

-def rank_features(top10_df, selected_model, param_grids, classifiers, output_dir, user_info):
+def rank_features(top10_df_path, selected_model, param_grids, classifiers, output_dir, user_info):
     """
     Rank top features based on single-feature model performance (AUPRC, AUROC, etc.).
     Saves CSV and plots ROC/PR curves for each.
     """
+    top10_df = pd.read_csv(top10_df_path)
+
+    print('top10_df:', top10_df.head())
+
     try:
         # --- Validate inputs ---
         if selected_model not in param_grids:
@@ -932,6 +936,9 @@ def rank_features(top10_df, selected_model, param_grids, classifiers, output_dir, user_info):
         csv_path = os.path.join(output_dir, 'single_feature_metrics_ranking.csv')
         metrics_df.to_csv(csv_path, index=False)

+
+        print("okay till plotting")
+
         # --- Plotting ---
         fig, axes = plt.subplots(1, 2, figsize=(15, 6))

@@ -981,19 +988,20 @@ def rank_features(top10_df, selected_model, param_grids, classifiers, output_dir, user_info):

         # Return URLs
         base_url = f"{BASE_URL}/files/{user_info['user_id']}"
-        return json.dumps({
+        return {
             "message": "Feature ranking and plotting completed successfully.",
             "ranking_file": f"{base_url}/single_feature_metrics_ranking.csv",
             "plot_png": f"{base_url}/single_feature_model_performance_landscape.png",
             "plot_pdf": f"{base_url}/single_feature_model_performance_landscape.pdf",
             "metrics": metrics_df.to_dict(orient="records")
-        })
+        }

     except Exception as e:
-        return json.dumps({
+        print(e)
+        return {
             "message": "Error during feature ranking and plotting.",
             "error": str(e)
-        })
+        }

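A likely motivation for returning a plain dict instead of json.dumps(...) (inferred from the router code below, not stated in the commit message): the callers test `if "error" in result`, which on a string is a substring search rather than a key lookup. A quick sketch of the difference:

    import json

    payload = {"message": "Feature ranking completed.", "metrics": []}
    as_text = json.dumps(payload)

    # On a string, `in` is a substring test; it would fire for any payload
    # whose serialized text happens to contain "error".
    print("error" in as_text)  # False here, but fragile

    # On a dict, `in` tests keys, which is what the router's check intends.
    print("error" in payload)  # False: no "error" key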
@@ -1009,7 +1017,7 @@ def rank_features(top10_df, selected_model, param_grids, classifiers, output_dir, user_info):
     matthews_corrcoef, log_loss
 )

-def evaluate_model_with_features(top10_df, top10_df_array, selected_model, param_grids, classifiers, output_dir, user_info):
+def evaluate_model_with_features(top10_df_path, selected_model, param_grids, classifiers, output_dir, user_info):
     """
     Evaluate the performance of models using top-N features (10 to 1), save plots and metrics, and select the best feature subset.
     """
@@ -1018,6 +1026,10 @@ def evaluate_model_with_features(top10_df_path, selected_model, param_grids, classifiers, output_dir, user_info):
     outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
     inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

+    top10_df = pd.read_csv(top10_df_path)
+
+    top10_df_array = top10_df.drop(columns='condition').columns.to_numpy()
+
     # Storage
     roc_curves = []
     pr_curves = []
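The rewritten evaluate_model_with_features derives the feature-name array from the CSV rather than receiving it as a parameter. Assuming the top-10 CSV holds a `condition` label column plus one column per feature (which the drop call implies), the derivation behaves like this sketch:

    import pandas as pd

    # Assumed layout: a "condition" label column plus one column per feature.
    df = pd.DataFrame({
        "condition": ["case", "control"],
        "GeneA": [1.2, 0.4],
        "GeneB": [3.1, 2.7],
    })

    features = df.drop(columns="condition").columns.to_numpy()
    print(features)  # ['GeneA' 'GeneB'], the array the old top10_df_array parameter carried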

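For readers unfamiliar with the outer_cv/inner_cv pair above: it is a standard nested cross-validation setup, where the inner folds tune hyperparameters and the outer folds estimate generalization. A generic sketch of the pattern (the model and grid are placeholders, not this repo's actual configuration):

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score

    X, y = make_classification(n_samples=120, n_features=10, random_state=0)

    outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
    inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    # Inner loop tunes C; outer loop scores the tuned estimator on held-out folds.
    search = GridSearchCV(LogisticRegression(max_iter=1000), {"C": [0.1, 1.0, 10.0]}, cv=inner_cv)
    scores = cross_val_score(search, X, y, cv=outer_cv)
    print(scores.mean())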
api/routers/operation_router.py (15 additions, 5 deletions)
@@ -455,6 +455,7 @@ async def benchmark_models_api(user_info: dict = Depends(verify_token)):
 from code.code import get_model_and_importance_with_top10, best_models
 from fastapi import Form
 global_model_name = "Extra Trees"
+global_basef_name = "top10_features_extra_trees.csv"

 @router.post('/top10-features')
 async def top10_features(model_name: str = Form(...), user_info: dict = Depends(verify_token)):
@@ -491,6 +492,8 @@ async def top10_features(model_name: str = Form(...), user_info: dict = Depends(
         user_info=user_info
     )

+    global_basef_name = result['top10_features_path']
+
     return {
         "message": "Top 10 features extracted successfully.",
         "top10_features": result["top10_features"],
@@ -516,7 +519,7 @@ async def visualize_dimensions_api(
     try:
         # Define file paths
         user_id = str(user_info['user_id'])
-        input_file = os.path.join("code", user_id, "files", "top10_features_extra_trees.csv")
+        input_file = os.path.join("code", user_id, "files", global_basef_name)
         output_dir = os.path.join("code", user_id, "files")

         # Ensure the input file exists
@@ -546,7 +549,7 @@
 from code.code import rank_features, param_grids, classifiers

 @router.get('/evaluate-single-features')
-async def rank_features_api(
+async def evaluate_single_features(
     user_info: dict = Depends(verify_token)
 ):
     """
@@ -555,7 +558,7 @@ async def rank_features_api(
     try:
         # Define file paths
         user_id = str(user_info['user_id'])
-        input_file = os.path.join("code", user_id, "files", "top10_features_extra_trees.csv")
+        input_file = os.path.join("code", user_id, "files", global_basef_name)
         output_dir = os.path.join("code", user_id, "files")

         # Ensure the input file exists
@@ -568,6 +571,8 @@
         # Call the feature ranking function
         result = rank_features(input_file, global_model_name, param_grids, classifiers, output_dir, user_info)

+        print('result: ', result)
+
         # Check for errors in the result
         if "error" in result:
             return {"message": "Feature ranking failed.", "error": result["error"]}
@@ -609,10 +614,15 @@ async def evaluate_model_features_api(
         # Call the function
         result = evaluate_model_with_features(input_file, global_model_name, param_grids, classifiers, output_dir, user_info)

+        print('result: ', result)
+
         # Handle errors
         if "error" in result:
             return {"message": "Evaluation failed.", "error": result["error"]}

+
+
+
         return {
             "message": result["message"],
             "metrics_file": result["metrics_file"],
@@ -637,7 +647,7 @@ async def visualize_dimensions_api(
     try:
         # Define file paths
         user_id = str(user_info['user_id'])
-        input_file = os.path.join("code", user_id, "files", "final_selected_features_auprc.csv")
+        input_file = os.path.join("code", user_id, "files", "final_selected_biomarker_algorithms_df.csv")
         output_dir = os.path.join("code", user_id, "files")

         # Ensure the input file exists
@@ -677,7 +687,7 @@ async def evaluate_final_model_api(
     try:
         # Define file paths
         user_id = str(user_info['user_id'])
-        final_df_path = os.path.join("code", user_id, "files", "final_selected_features_auprc.csv")
+        final_df_path = os.path.join("code", user_id, "files", "final_selected_biomarker_algorithms_df.csv")
         output_dir = os.path.join("code", user_id, "files")

         # Ensure the input file exists
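One caveat worth flagging on the global_basef_name hunks (my reading of the diff; it may be handled outside the lines shown): assigning to the name inside top10_features without a `global` declaration rebinds a function-local variable, so the module-level default would still be what the other endpoints read. A minimal reproduction:

    global_basef_name = "top10_features_extra_trees.csv"

    def update_without_global():
        # Rebinds a local name; the module-level variable is untouched.
        global_basef_name = "some_other_model.csv"  # hypothetical filename

    def update_with_global():
        global global_basef_name  # declare intent to rebind the module-level name
        global_basef_name = "some_other_model.csv"  # hypothetical filename

    update_without_global()
    print(global_basef_name)  # still "top10_features_extra_trees.csv"

    update_with_global()
    print(global_basef_name)  # now "some_other_model.csv"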
