# GEMPD: Machine Learning Pipeline for Transcriptomic data
## Author: Felipe Villena, PhD
### Date:  08/07/2024

# 1 - Loading libraries

In [1]:
# Import the pipeline modules and the standard-library helpers used below

# S1: Script to Load the config file
from S1_LoadConfigFile.load_config import *

# S2: Scripts to consolidate salmon data into tabular format

from S2_ProcessSalmonData.ProcessSalmon import *

# S3: Run multiple ML models in R (the script in S3_MLeval is launched via subprocess)

import subprocess
import os
import sys

# 2 - Run pipeline

## 2.1 - Load config file and create an output folder

In [2]:
# Read the pipeline settings; the relative path is resolved against the
# current working directory of the kernel.
config = load_config("config.yaml")

# Create the results directory named in the config. exist_ok=True makes this
# cell safe to re-run when the directory is already there.
output_folder = config["output_folder"]
os.makedirs(output_folder, exist_ok=True)

Configuration loaded successfully from config.yaml


## 2.2 - Process salmon data

In [3]:
# Build the filtered expression table from the Salmon output, driven entirely by `config`.
# NOTE(review): the return value is presumably the path of the written file (the run log
# below reports "<output_folder>/filtered_data.tsv") — confirm in S2_ProcessSalmonData.
filtered_file = process_and_filter_salmon_data(config)

Number of patient IDs: 1795
Number of top genes: 151
Using value column: TPM
Number of .sf files found: 4756


Processing files: 100%|██████████| 4756/4756 [00:36<00:00, 129.86it/s]


Filtered data saved to /home/jovyan/mounted_data/GEML_pipe/Outputs/filtered_data.tsv
Processed 1768 patients out of 1795 total patients
Number of missing patients: 27
Missing patients: 5104-SL-0115, 5104-SL-0117, 5104-SL-0119, 5104-SL-0457, 5104-SL-1524, 5104-SL-2055, 5104-SL-2228, 5104-SL-2310, 5104-SL-2324, 5104-SL-2492, 5104-SL-2526, 5104-SL-2527, 5104-SL-2529, 5104-SL-2530, 5104-SL-2531, 5104-SL-2532, 5104-SL-2533, 5104-SL-2535, 5104-SL-2537, 5104-SL-2628, 5104-SL-3324, 5104-SL-3337, 5104-SL-3486, 5104-SL-4442, 5104-SL-4685, 5104-SL-4733, 5104-SL-4857

Number of genes included in the output: 150
Number of genes removed due to no data: 1
Removed genes: ENSG00000207389.1

Number of 'NA' values in the final data: 0





## 2.3 - Evaluating multiple ML models

In [4]:
# Resolve every path handed to R to an absolute one, so the R script does not
# depend on the directory it happens to be launched from.
r_script_path = os.path.abspath(os.path.join("S3_MLeval", "ml_model_testing.R"))
filtered_data_path = os.path.abspath(
    os.path.join(config["output_folder"], config["filtered_data_filename"])
)
metadata_path = os.path.abspath(config["metadata_file"])
patients_path = os.path.abspath(config["patients_file"])

print("Python script: Preparing to run R script")
print(f"R script path: {r_script_path}")
print(f"Filtered data path: {filtered_data_path}")
print(f"Metadata path: {metadata_path}")
print(f"Patients path: {patients_path}")

# Command line for the R step: the script followed by its three positional
# arguments (filtered data, metadata, selected patients).
rscript_command = [
    "Rscript",
    r_script_path,
    filtered_data_path,
    metadata_path,
    patients_path,
]

try:
    print("Python script: Attempting to run R script")
    # check=True turns a non-zero exit status into CalledProcessError;
    # stdout/stderr are captured as text and echoed only after R finishes.
    result = subprocess.run(rscript_command, check=True, capture_output=True, text=True)

    print("Python script: R script execution completed")
    print("R script standard output:", result.stdout, sep="\n")

    # Only show the stderr section when R actually wrote something to it.
    if result.stderr:
        print("R script standard error:", result.stderr, sep="\n")

    print("Python script: ML evaluation executed successfully!")

except subprocess.CalledProcessError as run_error:
    # R ran but reported failure: surface its exit status and both streams.
    print("Python script: R script execution failed")
    print(f"Return code: {run_error.returncode}")
    print("Standard output:", run_error.stdout, sep="\n")
    print("Standard error:", run_error.stderr, sep="\n")
    sys.exit(1)

except Exception as unexpected:
    # Anything else raised inside the try block, e.g. the process could not be started.
    print(f"Python script: An unexpected error occurred: {str(unexpected)}")
    sys.exit(1)

print("Python script: Execution completed")

Python script: Preparing to run R script
R script path: /home/jovyan/mounted_data/GEML_pipe/S3_MLeval/ml_model_testing.R
Filtered data path: /home/jovyan/mounted_data/GEML_pipe/Outputs/filtered_data.tsv
Metadata path: /home/jovyan/mounted_data/meta_data.11192021.csv
Patients path: /home/jovyan/mounted_data/PatientsSelected.tsv
Python script: Attempting to run R script
Python script: R script execution completed
R script standard output:
R script started
R version: R version 4.3.1 (2023-06-16) 
Entering main function
Number of arguments received: 3 
Arguments received:
  Argument 1 : /home/jovyan/mounted_data/GEML_pipe/Outputs/filtered_data.tsv 
    File exists
  Argument 2 : /home/jovyan/mounted_data/meta_data.11192021.csv 
    File exists
  Argument 3 : /home/jovyan/mounted_data/PatientsSelected.tsv 
    File exists

Parsed arguments:
  filtered_data_path: /home/jovyan/mounted_data/GEML_pipe/Outputs/filtered_data.tsv 
  metadata_path: /home/jovyan/mounted_data/meta_data.11192021.csv 
