In [1]:
import io
import subprocess
import os
import glob
import sys
import pwd

import pandas as pd
import numpy as np
import scipy
import scipy.stats as stats
from sklearn.decomposition import PCA
import pickle
import re
import math
import plotnine as pn
import seaborn as sns
import glob
import subprocess
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from numpy import log
from collections import defaultdict,Counter
import re
import statsmodels
import warnings
import json

from pathlib import Path
from midas import m
from datetime import datetime, date
import altair as alt

# gh specific
sys.path.append('/ghsfa/projects/pharma/shared_scripts/')
import bibp.functions as functions
import hgvslib.pHGVS as pHGVS  ## do a pull on the hgvslib git repo

# !pip install statannotations==0.4.0
# #!pip uninstall statannotations --yes
# from statannotations.Annotator import Annotator

warnings.filterwarnings("ignore")
pd.options.display.max_rows = None
pd.options.display.max_columns=None
pd.options.display.max_colwidth=None
work_dir="/ghsfa/projects/pharma/projects/sirius_pharma/hazhang_projects/Treatement_Effect_RUOMRD_CSO_call_05052024"

# Data preocessing

## Pull all RUO/IST data from pharma data warehouse

In [12]:
# File name list to handle corner case of data readin
header1_list = ["20230627_JNJ_SQ01_17Samples_RevealReport.csv"]
header0_list = [
 '20230928_AstraZeneca_SOW164_46Samples_InfinityReport.csv',
 '20231106_AstraZeneca_SOW164_46Samples_InfinityReport.csv',
 '20231130_BoundlessBio_SOW02_10Samples_InfinityReport.csv',
 '20231207_AstraZeneca_SOW170_188Samples_InfinityReport.csv',
 '20231225_BristolMyersSquibb_SOW29_14Samples_InfinityReport.csv',
 '20231227_CL003_IST01_170Samples_InfinityReport.csv',
 '20240111_AstraZeneca_SOW174_35Samples_InfinityReport.csv',
 '20240116_AstraZeneca_SOW182_40Samples_InfinityReport.csv',
 '20240126_NCC_IST14_35Samples_InfinityReport.csv',
 '20240401_Cyclacel_SOW01_12Samples_InfinityReport.csv',
 '20240422_AstraZeneca_SOW176Monitoring_998Samples_InfinityReport.csv',
 '20221024_SMC_IST_01_198samples_InfinityReport.csv'
]

# Define the folder paths
folders = [
    "/ghsfa/projects/pharma/datawarehouse/Infinity",
    "/ghsfa/projects/pharma/datawarehouse/infinity",
    "/ghsfa/projects/pharma/datawarehouse/data/infinity_ist_report_warehouse",
    "/ghsfa/projects/pharma/datawarehouse/data/infinity_mrd_report_warehouse",
    "/ghsfa/projects/pharma/datawarehouse/data/infinity_report_warehouse"
]

# List to store the dataframes for final merge
dfs = []

# Iterate over each folder
for folder in folders:
    # List all csv files in the folder
    for file in os.listdir(folder):
        file_path = os.path.join(folder, file)
        # Check if the file is a CSV
        if file.endswith('.csv'):
            try:
                # Determine the header row based on the presence of "MRD" in the filename
                # Some corner cases
                if file in header0_list:
                    df = pd.read_csv(file_path, header=0)
                elif file in header1_list:
                    df = pd.read_csv(file_path, header=1)
                elif "MRD" in file.upper():
                    df = pd.read_csv(file_path, header=0)
                elif "Reveal" in file:
                    df = pd.read_csv(file_path, header=0)
                else:
                    df = pd.read_csv(file_path, header=1)

                # Select specified columns, handling cases where some columns might not exist
                columns = ['GHRequestID', 'GHSampleID', 'Patient_ID', 'Visit_name', 
                           'Sample_status', 'Sample_comment', 'cfDNA_ng', 'Plasma_ml_input', 'Cancertype']
                df_sub = df.loc[:, df.columns.isin(columns)].copy()

                # Drop duplicates based on 'GHSampleID'
                df_sub_dedup = df_sub.drop_duplicates(subset=["GHSampleID"])

                # Add a column for the CSV file name
                df_sub_dedup['csv_file_name'] = file

                # Get the UID of the file owner and convert it to a username
                file_stats = os.stat(file_path)
                file_uid = file_stats.st_uid
                file_creator = pwd.getpwuid(file_uid).pw_name
                df_sub_dedup['file_creator'] = file_creator

                # Append the processed dataframe to the list
                dfs.append(df_sub_dedup)
            except pd.errors.ParserError:
                print(f"Error reading file: {file_path} - possibly incorrect header.")
            except Exception as e:
                print(f"An unexpected error occurred with file: {file_path} - {str(e)}")

# Merge all dataframes into a final DataFrame
df_merged = pd.concat(dfs, ignore_index=True)
df_merged.head()


Error reading file: /ghsfa/projects/pharma/datawarehouse/infinity/20231102_RepareTherapeutics_SOW01_4_22_infinityReport.csv - possibly incorrect header.


Unnamed: 0,GHRequestID,GHSampleID,Patient_ID,Visit_name,Sample_status,Sample_comment,cfDNA_ng,Plasma_ml_input,Cancertype,csv_file_name,file_creator
0,A0800783,B00459142,10010348,Screening,SUCCESS,,85.28,4.0,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser
1,A0800784,B00459144,10040388,Cycle 1 Day 1,SUCCESS,,421.72,2.5,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser
2,A0800785,B00459143,10040457,Cycle 1 Day 1,SUCCESS,Methylation Suppression Warning; Low Diversity Warning,0.31,1.5,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser
3,A0800786,B00459146,10040474,Cycle 1 Day 1,SUCCESS,,23.61,3.0,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser
4,A0800787,B00459145,10050383,Cycle 1 Day 1,SUCCESS,,208.0,3.5,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser


In [13]:
# Filter out rows where GHSampleID is NaN
nan_sample_ids = df_merged[df_merged['GHSampleID'].isna()]
print("Total # of this RUO summary data missing GHSampleID is " + str(nan_sample_ids.shape[0]) + "\n") 

# Group by csv_file_name and take the first row from each group
unique_csv_with_nan = nan_sample_ids.groupby('csv_file_name').first().reset_index()
unique_csv_with_nan

Total # of this RUO summary data missing GHSampleID is 4



Unnamed: 0,csv_file_name,GHRequestID,GHSampleID,Patient_ID,Visit_name,Sample_status,Sample_comment,cfDNA_ng,Plasma_ml_input,Cancertype,file_creator
0,20230607_AstraZenaca_SOW01_2Samples_infinityReport.csv,A0730247,,3005.0,SCREEN,Released,,,,,psreedhar
1,20231026_RepareTherapeutics_SOW01_4_23_infinityReport.csv,A0800783,,10010348.0,Screening,SUCCESS,,,,,produser
2,20231106_GHIResearch_SOW10_2_3_infinityReport.csv,A0776833,,S3_CGP_E2E_074,,SUCCESS,,,,Colorectal adenocarcinoma,psreedhar
3,20231211_GHIResearch_E2E01_Batch1_1Samples_InfinityReport.csv,A0542330,,,,SUCCESS,,60.32,,,psreedhar


In [14]:
# Readin samples 
df_merged_1 = df_merged.copy()

# Filter for "Sample_status" == "SUCCESS"
df_merged_1 = df_merged_1[df_merged_1['Sample_status'] == "SUCCESS"]
print("Total size of this RUO summary data is " + str(df_merged_1.shape[0]) + "\n") 
print("Total unique sample size of this RUO/IST summary data is " + str(df_merged_1.GHSampleID.nunique()) + "\n")
print("Total NaN in cancer type of this RUO/IST  summary data is " + str(df_merged_1.Cancertype.isnull().sum()) + "\n")

# Drop duplicates 
df_merged_dedup = df_merged_1.drop_duplicates(subset=["GHSampleID"])
print("Total size of this RUO/IST summary data is " + str(df_merged_dedup.shape[0]) + "\n") 

# Regular expression to extract 'Account' and 'SOW_num'
# pattern = r'_(?P<Account>[^_]+)_SOW(?P<SOW_num>\d+)'
# pattern = r'_(?P<Account>[^_]+?)(?:_.+?)?_SOW(?P<SOW_num>\d+)'

# Regular expression to extract 'Account', 'SOW_num', and 'report_date'
pattern = r'(?P<report_date>^\d+)_?(?P<Account>[^_]+)_([^_]+)'

# Using str.extract() to create new columns in the DataFrame
df_merged_dedup[['report_date', 'Account', 'SOW_num']] = df_merged_dedup['csv_file_name'].str.extract(pattern)

df_merged_dedup.head()

Total size of this RUO summary data is 44496

Total unique sample size of this RUO/IST summary data is 22216

Total NaN in cancer type of this RUO/IST  summary data is 28215

Total size of this RUO/IST summary data is 22217



Unnamed: 0,GHRequestID,GHSampleID,Patient_ID,Visit_name,Sample_status,Sample_comment,cfDNA_ng,Plasma_ml_input,Cancertype,csv_file_name,file_creator,report_date,Account,SOW_num
0,A0800783,B00459142,10010348,Screening,SUCCESS,,85.28,4.0,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser,20240412,RepareTherapeutics,SOW01
1,A0800784,B00459144,10040388,Cycle 1 Day 1,SUCCESS,,421.72,2.5,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser,20240412,RepareTherapeutics,SOW01
2,A0800785,B00459143,10040457,Cycle 1 Day 1,SUCCESS,Methylation Suppression Warning; Low Diversity Warning,0.31,1.5,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser,20240412,RepareTherapeutics,SOW01
3,A0800786,B00459146,10040474,Cycle 1 Day 1,SUCCESS,,23.61,3.0,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser,20240412,RepareTherapeutics,SOW01
4,A0800787,B00459145,10050383,Cycle 1 Day 1,SUCCESS,,208.0,3.5,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser,20240412,RepareTherapeutics,SOW01


In [15]:
# Check the edge case the Account name or SOW name is not extractable
df_unextractable = df_merged_dedup[df_merged_dedup.Account.isna()]
print("Total unique unextractable csv files of this RUO/IST summary data is " + str(df_unextractable.csv_file_name.nunique()) + "\n")
df_unextractable.csv_file_name.unique()

Total unique unextractable csv files of this RUO/IST summary data is 3



array(['YCC02_IST01_10242022_tb_changed_RUO_report_MRD.csv',
       'NOC01_IST01_batch2_7sample_12142022_RUO_report_MRD.csv',
       'NOC01_IST01_12112022_tb_changed_RUO_report_MRD.csv'], dtype=object)

In [16]:
# Merge cancer type information from /ghsfa/projects/pharma/datawarehouse/pdw/raw/latest/fact_accessions.tsv
df_cancer_type_fact = pd.read_table("/ghsfa/projects/pharma/datawarehouse/pdw/raw/latest/fact_accessions.tsv")
df_cancer_type_fact_1 = df_cancer_type_fact.copy()

#merge the "runid" information from the fact_accessions.tsv
df_cancer_type_fact_1.rename(columns={'accn_id': 'GHRequestID'}, inplace=True)
df_merged_1 = df_merged_dedup.merge(df_cancer_type_fact_1[["GHRequestID", "cancer_category", "cancerother", "cancerstage", "cancertype"]], on = "GHRequestID", how = "left")
print("Total sample size of the RUO/IST summary is " + str(df_merged_1.shape[0]) + "\n") 
print('Total NaN in cancer type of the column "cancerother" in this RUO/IST summary data is ' + str(df_merged_1.cancerother.isnull().sum()) + "\n")
df_merged_1.head()


# Save the dataframe
df_merged_1.to_csv(f"{work_dir}/df_merged_warehouse.csv", index = False)

Total sample size of the RUO/IST summary is 22217

Total NaN in cancer type of the column "cancerother" in this RUO/IST summary data is 12149



## Pull data from gh_sample database

In [34]:
# # Define the IDs you want to query for
# ids = ('B00459143', 'B00459152', 'B00651690')

# # Generate the SQL query string
# query = "SELECT * FROM gh_sample WHERE run_sample_id IN {}".format(ids)
# # query = "SELECT * FROM gh_sample WHERE run_sample_id IN {}".format(tuple(mrd_all_cols_df['run_sample_id'].unique()))

# # Run the query using your existing function
# sample_df = functions.run_sql_query(query)
# sample_df.head()

Unnamed: 0,runid,run_sample_id,sample_index,gender,sample_type,timestamp,panel,control_lot,bip_version,git_version,run_mode,product,project,cancer_type,patient_age,parameter_set,lunar_caller,comment,provided_tumor_type,request_id,use_case
0,230810_A01902_0186_AH5VV7DSX7,B00459143,ATTACCATGT,Unknown,Research,2023-08-13 00:37:40+00:00,Sirius_v1.0,,Sirius-1.1.0-RLS,,,Guardant 360,REP_SOW01,,,Sirius_LDT_Soft_v2.0,,,,A0800785,cgp
1,230810_A01902_0185_BH5WTKDSX7,B00459152,CTGCAATTCT,Unknown,Research,2023-08-13 00:31:14+00:00,Sirius_v1.0,,Sirius-1.1.0-RLS,,,Guardant 360,REP_SOW01,,,Sirius_LDT_Soft_v2.0,,,,A0800804,cgp
2,240427_A01613_0374_AH3FTYDSXC,B00651690,TCCTTACCGG,Unknown,Research,2024-04-29 20:19:29+00:00,Sirius_v1.0,,Sirius-1.1.4-RLS,,,Guardant 360,REV_12,,,Sirius_LDT_Soft_v2.0,,,,A1003612,cgp


In [17]:
# SQL query
query = """
SELECT *
FROM gh_sample
WHERE panel LIKE '%Sirius%' AND
      sample_type = 'Research' AND
      product = 'Guardant 360'
"""

# Run the query using your existing function
sample_df = functions.run_sql_query(query)

print("Total size of this RUO summary data is " + str(sample_df.shape[0]) + "\n") 
print("Total unique sample size of this RUO summary data is " + str(sample_df.run_sample_id.nunique()) + "\n")
print("Total NaN in cancer type of this RUO summary data is " + str(sample_df.cancer_type.isnull().sum()) + "\n")

# Drop duplicates 
df_gh_sample_dedup = sample_df.drop_duplicates(subset=["run_sample_id"])
print("Total size of this RUO summary data that pulled from gh_sample database is " + str(df_gh_sample_dedup.shape[0]) + "\n") 
df_gh_sample_dedup.head()

Total size of this RUO summary data is 28428

Total unique sample size of this RUO summary data is 26939

Total NaN in cancer type of this RUO summary data is 27945

Total size of this RUO summary data that pulled from gh_sample database is 26939



Unnamed: 0,runid,run_sample_id,sample_index,gender,sample_type,timestamp,panel,control_lot,bip_version,git_version,run_mode,product,project,cancer_type,patient_age,parameter_set,lunar_caller,comment,provided_tumor_type,request_id,use_case
0,221109_A00770_0586_BHNLGCDSX3,B00226007,ACCTTACTGT,Male,Research,2022-11-11 23:38:12-08:00,Sirius_v1.0,,Sirius-1.0.0-RLS,,,Guardant 360,VUM_01_V3,,68.0,Sirius_v1.0,,,,A0600597,
1,221109_A00770_0587_AHNLKJDSX3,B00225963,TGACTTCTGG,Male,Research,2022-11-11 23:34:55-08:00,Sirius_v1.0,,Sirius-1.0.0-RLS,,,Guardant 360,VUM_01_V3,,72.0,Sirius_v1.0,,,,A0600576,
2,221109_A00770_0587_AHNLKJDSX3,B00226023,TGTCGCCCTT,Female,Research,2022-11-11 23:34:55-08:00,Sirius_v1.0,,Sirius-1.0.0-RLS,,,Guardant 360,VUM_01_V3,,78.0,Sirius_v1.0,,,,A0600584,
3,221109_A00770_0587_AHNLKJDSX3,B00225965,ATCAAGGCCG,Male,Research,2022-11-11 23:34:55-08:00,Sirius_v1.0,,Sirius-1.0.0-RLS,,,Guardant 360,VUM_01_V3,,60.0,Sirius_v1.0,,,,A0600577,
4,221109_A00770_0586_BHNLGCDSX3,B00226004,ACGTTGATGC,Male,Research,2022-11-11 23:38:12-08:00,Sirius_v1.0,,Sirius-1.0.0-RLS,,,Guardant 360,VUM_01_V3,,67.0,Sirius_v1.0,,,,A0600596,


In [18]:
# Merge the runid, fc_dir and other information from gh_sample database

#rename the columns to indicate the caller name
df_gh_sample_dedup = df_gh_sample_dedup.rename(columns={"run_sample_id": "GHSampleID"})

df_complete_2 = df_merged_1.merge(df_gh_sample_dedup, on="GHSampleID", how="left")
print(df_complete_2.shape)
df_complete_2.head()

(22217, 38)


Unnamed: 0,GHRequestID,GHSampleID,Patient_ID,Visit_name,Sample_status,Sample_comment,cfDNA_ng,Plasma_ml_input,Cancertype,csv_file_name,file_creator,report_date,Account,SOW_num,cancer_category,cancerother,cancerstage,cancertype,runid,sample_index,gender,sample_type,timestamp,panel,control_lot,bip_version,git_version,run_mode,product,project,cancer_type,patient_age,parameter_set,lunar_caller,comment,provided_tumor_type,request_id,use_case
0,A0800783,B00459142,10010348,Screening,SUCCESS,,85.28,4.0,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser,20240412,RepareTherapeutics,SOW01,Other,,,,230810_A01902_0186_AH5VV7DSX7,CAGGCGTTGC,Unknown,Research,2023-08-12 17:37:40-07:00,Sirius_v1.0,,Sirius-1.1.0-RLS,,,Guardant 360,REP_SOW01,,,Sirius_LDT_Soft_v2.0,,,,A0800783,cgp
1,A0800784,B00459144,10040388,Cycle 1 Day 1,SUCCESS,,421.72,2.5,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser,20240412,RepareTherapeutics,SOW01,Other,,,,230810_A01902_0186_AH5VV7DSX7,GCAAAGAGCT,Unknown,Research,2023-08-12 17:37:40-07:00,Sirius_v1.0,,Sirius-1.1.0-RLS,,,Guardant 360,REP_SOW01,,,Sirius_LDT_Soft_v2.0,,,,A0800784,cgp
2,A0800785,B00459143,10040457,Cycle 1 Day 1,SUCCESS,Methylation Suppression Warning; Low Diversity Warning,0.31,1.5,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser,20240412,RepareTherapeutics,SOW01,Other,,,,230810_A01902_0186_AH5VV7DSX7,ATTACCATGT,Unknown,Research,2023-08-12 17:37:40-07:00,Sirius_v1.0,,Sirius-1.1.0-RLS,,,Guardant 360,REP_SOW01,,,Sirius_LDT_Soft_v2.0,,,,A0800785,cgp
3,A0800786,B00459146,10040474,Cycle 1 Day 1,SUCCESS,,23.61,3.0,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser,20240412,RepareTherapeutics,SOW01,Other,,,,230810_A01902_0186_AH5VV7DSX7,TATTAACATG,Unknown,Research,2023-08-12 17:37:40-07:00,Sirius_v1.0,,Sirius-1.1.0-RLS,,,Guardant 360,REP_SOW01,,,Sirius_LDT_Soft_v2.0,,,,A0800786,cgp
4,A0800787,B00459145,10050383,Cycle 1 Day 1,SUCCESS,,208.0,3.5,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser,20240412,RepareTherapeutics,SOW01,Other,,,,230810_A01902_0186_AH5VV7DSX7,ATTTAAGGAC,Unknown,Research,2023-08-12 17:37:40-07:00,Sirius_v1.0,,Sirius-1.1.0-RLS,,,Guardant 360,REP_SOW01,,,Sirius_LDT_Soft_v2.0,,,,A0800787,cgp


In [19]:
# Check the samples with missing runid
df_missing_runid = df_complete_2[df_complete_2["runid"].isna()]
df_missing_runid.shape

(4008, 38)

In [20]:
# Query data from "Integration Database" as there are some runid missing in the "Production Database" "gh_sample" database
# This is based on Jessica Kurata's confluence page https://guardanthealth.atlassian.net/wiki/spaces/LUN/pages/1329006199/How+To+Fetch+Data+From+the+Sample+Tracker
import sys
import pandas as pd
import psycopg2 as pg2

# Fetch db credentials
sys.path.append("/ghds/projects/mrd/analyses/notebooks/jkurata/gh_sample_tracker/sampletracker/sampletracker/credentials/")
from env import SAMP_DB_VIEW_USERNAME, SAMP_DB_VIEW_PASSWORD

def sql_query_to_df(query):
    """Run an SQL query on the Guardant PostgreSQL server and return the
    result as a pandas dataframe.
    Args:
        query (str): SQL query string.
    Returns:
        pandas.core.frame.DataFrame: pandas dataframe containing SQL
        query.
    """
    connect = pg2.connect(host="ghbi-live-lunar-sample-metadata-db.clrdmmintk6b.us-west-2.rds.amazonaws.com",
                          database="lunar_sample_metadata_db",
                          user=SAMP_DB_VIEW_USERNAME,
                          password=SAMP_DB_VIEW_PASSWORD)
    df = pd.read_sql_query(query, connect)
    connect.close()
    
    return df

In [21]:
# Drop any NaN values and convert to unique IDs (nan values will be dropped as it will break the SQL query)
unique_ids = df_missing_runid['GHSampleID'].dropna().unique()

# Check if unique_ids is empty to prevent errors
if len(unique_ids) == 0:
    # Handle case with no IDs gracefully
    df_intergration_database = pd.DataFrame()
    print("No valid IDs to query.")
else:
    # Format the tuple correctly for SQL query
    # For a single ID, make it ('ID',) instead of ('ID')
    ids_tuple = tuple(unique_ids)
    if len(ids_tuple) == 1:
        ids_tuple = f"('{ids_tuple[0]}')"
    else:
        ids_tuple = str(ids_tuple)  # This will ensure tuples are formatted with commas

    # Define the SQL query using the formatted tuple
    query = f"SELECT * FROM sample_summary WHERE run_sample_id IN {ids_tuple}"

    # Run the query using your existing function to convert SQL query to DataFrame
    df_intergration_database = sql_query_to_df(query)

print(df_intergration_database.shape)
df_intergration_database.head()

(3999, 29)


Unnamed: 0,cohort_id,patient_id,cancer_type,cancer_subtype,stage_group,recur,days_to_recurrence,days_to_last_follow_up,visit_name,specimen_type,input_ng,run_sample_id,panel_name,sequenced_sample_qc,mbd_sample_qc,sequenced_sample_description,sequenced_sample_comment,runid,flowcell_qc,sample_id,batch_id,assay_version,patient_comment,extraction_method,fragmentation_method,size_selection,sample_type,sample_well_id,ex_quant_well_id
0,,,,,,,,,,,30.0,B00143335,sirius_v1.0,,,,,220622_A01744_0010_BHVJTCDSX3,,B00143335,PROD-19,Sirius_EAP,,,,,clinical,H7,
1,,,,,,,,,,,30.0,B00143364,sirius_v1.0,,,,,220622_A01744_0010_BHVJTCDSX3,,B00143364,PROD-19,Sirius_EAP,,,,,clinical,A6,
2,,,,,,,,,,,13.46,B00143313,sirius_v1.0,,,,,220622_A01744_0010_BHVJTCDSX3,,B00143313,PROD-19,Sirius_EAP,,,,,clinical,H4,
3,,,,,,,,,,,30.0,B00143316,sirius_v1.0,,,,,220622_A01744_0010_BHVJTCDSX3,,B00143316,PROD-19,Sirius_EAP,,,,,clinical,A5,
4,,,,,,,,,,,30.0,B00143332,sirius_v1.0,,,,,220622_A01744_0010_BHVJTCDSX3,,B00143332,PROD-19,Sirius_EAP,,,,,clinical,A8,


In [22]:
# Merge the runid and other information from sample_summary in Integration database
df_integration_database_selected = df_intergration_database[['run_sample_id', 'runid', 'cancer_type']]

#rename the columns to indicate the caller name
df_integration_database_selected = df_integration_database_selected.rename(columns={"run_sample_id": "GHSampleID", "cancer_type": "cancerother"})

# Merge the runid and other information from sample_summary in Integration database
df_complete_3 = df_complete_2.merge(df_integration_database_selected, on="GHSampleID", how="left")

# Handle the 'runid' and 'cancerother' duplication issue caused by merging
df_complete_3['runid'] = np.where(df_complete_3['runid_y'].notna(), df_complete_3['runid_y'], df_complete_3['runid_x'])
df_complete_3['cancerother'] = np.where(df_complete_3['cancerother_y'].notna(), df_complete_3['cancerother_y'], df_complete_3['cancerother_x'])

# Drop the original 'runid_x', 'runid_y', 'cancerother_x', and 'cancerother_y' columns
df_complete_3 = df_complete_3.drop(columns=['runid_x', 'runid_y', 'cancerother_x', 'cancerother_y'])

# Drop nan
df_complete_3 = df_complete_3[~df_complete_3["runid"].isna()]

print("Total size of the merged data is " + str(df_complete_3.shape[0]) + "\n") 
print("Total size of the merged data with missing runid is " + str(df_complete_3.runid.isnull().sum()) + "\n") 
print("Total size of the merged data with missing GHSampleID is " + str(df_complete_3.GHSampleID.isnull().sum()) + "\n") 

df_complete_3.head()

Total size of the merged data is 22208

Total size of the merged data with missing runid is 0

Total size of the merged data with missing GHSampleID is 0



Unnamed: 0,GHRequestID,GHSampleID,Patient_ID,Visit_name,Sample_status,Sample_comment,cfDNA_ng,Plasma_ml_input,Cancertype,csv_file_name,file_creator,report_date,Account,SOW_num,cancer_category,cancerstage,cancertype,sample_index,gender,sample_type,timestamp,panel,control_lot,bip_version,git_version,run_mode,product,project,cancer_type,patient_age,parameter_set,lunar_caller,comment,provided_tumor_type,request_id,use_case,runid,cancerother
0,A0800783,B00459142,10010348,Screening,SUCCESS,,85.28,4.0,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser,20240412,RepareTherapeutics,SOW01,Other,,,CAGGCGTTGC,Unknown,Research,2023-08-12 17:37:40-07:00,Sirius_v1.0,,Sirius-1.1.0-RLS,,,Guardant 360,REP_SOW01,,,Sirius_LDT_Soft_v2.0,,,,A0800783,cgp,230810_A01902_0186_AH5VV7DSX7,
1,A0800784,B00459144,10040388,Cycle 1 Day 1,SUCCESS,,421.72,2.5,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser,20240412,RepareTherapeutics,SOW01,Other,,,GCAAAGAGCT,Unknown,Research,2023-08-12 17:37:40-07:00,Sirius_v1.0,,Sirius-1.1.0-RLS,,,Guardant 360,REP_SOW01,,,Sirius_LDT_Soft_v2.0,,,,A0800784,cgp,230810_A01902_0186_AH5VV7DSX7,
2,A0800785,B00459143,10040457,Cycle 1 Day 1,SUCCESS,Methylation Suppression Warning; Low Diversity Warning,0.31,1.5,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser,20240412,RepareTherapeutics,SOW01,Other,,,ATTACCATGT,Unknown,Research,2023-08-12 17:37:40-07:00,Sirius_v1.0,,Sirius-1.1.0-RLS,,,Guardant 360,REP_SOW01,,,Sirius_LDT_Soft_v2.0,,,,A0800785,cgp,230810_A01902_0186_AH5VV7DSX7,
3,A0800786,B00459146,10040474,Cycle 1 Day 1,SUCCESS,,23.61,3.0,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser,20240412,RepareTherapeutics,SOW01,Other,,,TATTAACATG,Unknown,Research,2023-08-12 17:37:40-07:00,Sirius_v1.0,,Sirius-1.1.0-RLS,,,Guardant 360,REP_SOW01,,,Sirius_LDT_Soft_v2.0,,,,A0800786,cgp,230810_A01902_0186_AH5VV7DSX7,
4,A0800787,B00459145,10050383,Cycle 1 Day 1,SUCCESS,,208.0,3.5,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser,20240412,RepareTherapeutics,SOW01,Other,,,ATTTAAGGAC,Unknown,Research,2023-08-12 17:37:40-07:00,Sirius_v1.0,,Sirius-1.1.0-RLS,,,Guardant 360,REP_SOW01,,,Sirius_LDT_Soft_v2.0,,,,A0800787,cgp,230810_A01902_0186_AH5VV7DSX7,


In [23]:
# TODO: There are samples with missing runid. Need to check 
df_miss_runid = df_complete_3[df_complete_3["runid"].isna()]
df_miss_runid.csv_file_name.value_counts()
print(df_miss_runid.shape)

df_complete_3 = df_complete_3[~df_complete_3["runid"].isna()]
print("Total size of the merged data with missing runid is " + str(df_complete_3.runid.isnull().sum()) + "\n") 
print("Total size of the merged data with missing GHSampleID is " + str(df_complete_3.GHSampleID.isnull().sum()) + "\n") 

df_complete_3.shape

(0, 38)
Total size of the merged data with missing runid is 0

Total size of the merged data with missing GHSampleID is 0



(22208, 38)

In [34]:
# Prepare the "fc_dir" columns for the data
df_complete_4 = df_complete_3.copy()

# Define the path
# Path to original BIP output dir
fc_dir_all = "/ghsfa/ivd/flowcentral"
fc_dir_omni = "/ghsfa/projects/omni_v2/flowcells"

# Initialize a list to store rows that no BIP ouput found in either of the sources above
rows_no_found = []

# Initialize new columns in the DataFrame
df_complete_4['fc_dir'] = ''

# Iterate over the DataFrame rows
for index, row in df_complete_4.iterrows():
    runid = row["runid"]

    # Find original folder
    pattern_ori = f"{fc_dir_all}/*{runid}*"
    folders = glob.glob(pattern_ori)
    if folders:
        df_complete_4.loc[index, 'fc_dir'] = folders[0]
    else:
        # Search in the alternative directory if no folder found in the first directory
        pattern_omni = f"{fc_dir_omni}/*{runid}*"
        folders_omni = glob.glob(pattern_omni)
        if folders_omni:
            df_complete_4.loc[index, 'fc_dir'] = folders_omni[0]
        else:
            # Print warning if no folder is found in either location
            rows_no_found.append(row)
            # print(f"No suitable folder found for {runid}")
            
# Create a DataFrame with rows where folders were not found in any sources
df_no_fc_dir = pd.DataFrame(rows_no_found)

df_complete_4.head()


Unnamed: 0,GHRequestID,GHSampleID,Patient_ID,Visit_name,Sample_status,Sample_comment,cfDNA_ng,Plasma_ml_input,Cancertype,csv_file_name,file_creator,report_date,Account,SOW_num,cancer_category,cancerstage,cancertype,sample_index,gender,sample_type,timestamp,panel,control_lot,bip_version,git_version,run_mode,product,project,cancer_type,patient_age,parameter_set,lunar_caller,comment,provided_tumor_type,request_id,use_case,runid,cancerother,fc_dir
0,A0800783,B00459142,10010348,Screening,SUCCESS,,85.28,4.0,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser,20240412,RepareTherapeutics,SOW01,Other,,,CAGGCGTTGC,Unknown,Research,2023-08-12 17:37:40-07:00,Sirius_v1.0,,Sirius-1.1.0-RLS,,,Guardant 360,REP_SOW01,,,Sirius_LDT_Soft_v2.0,,,,A0800783,cgp,230810_A01902_0186_AH5VV7DSX7,,/ghsfa/ivd/flowcentral/230810_A01902_0186_AH5VV7DSX7.01df2ec7-cf8c-418b-b0dd-500f5a5a1a60.20230812103230
1,A0800784,B00459144,10040388,Cycle 1 Day 1,SUCCESS,,421.72,2.5,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser,20240412,RepareTherapeutics,SOW01,Other,,,GCAAAGAGCT,Unknown,Research,2023-08-12 17:37:40-07:00,Sirius_v1.0,,Sirius-1.1.0-RLS,,,Guardant 360,REP_SOW01,,,Sirius_LDT_Soft_v2.0,,,,A0800784,cgp,230810_A01902_0186_AH5VV7DSX7,,/ghsfa/ivd/flowcentral/230810_A01902_0186_AH5VV7DSX7.01df2ec7-cf8c-418b-b0dd-500f5a5a1a60.20230812103230
2,A0800785,B00459143,10040457,Cycle 1 Day 1,SUCCESS,Methylation Suppression Warning; Low Diversity Warning,0.31,1.5,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser,20240412,RepareTherapeutics,SOW01,Other,,,ATTACCATGT,Unknown,Research,2023-08-12 17:37:40-07:00,Sirius_v1.0,,Sirius-1.1.0-RLS,,,Guardant 360,REP_SOW01,,,Sirius_LDT_Soft_v2.0,,,,A0800785,cgp,230810_A01902_0186_AH5VV7DSX7,,/ghsfa/ivd/flowcentral/230810_A01902_0186_AH5VV7DSX7.01df2ec7-cf8c-418b-b0dd-500f5a5a1a60.20230812103230
3,A0800786,B00459146,10040474,Cycle 1 Day 1,SUCCESS,,23.61,3.0,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser,20240412,RepareTherapeutics,SOW01,Other,,,TATTAACATG,Unknown,Research,2023-08-12 17:37:40-07:00,Sirius_v1.0,,Sirius-1.1.0-RLS,,,Guardant 360,REP_SOW01,,,Sirius_LDT_Soft_v2.0,,,,A0800786,cgp,230810_A01902_0186_AH5VV7DSX7,,/ghsfa/ivd/flowcentral/230810_A01902_0186_AH5VV7DSX7.01df2ec7-cf8c-418b-b0dd-500f5a5a1a60.20230812103230
4,A0800787,B00459145,10050383,Cycle 1 Day 1,SUCCESS,,208.0,3.5,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser,20240412,RepareTherapeutics,SOW01,Other,,,ATTTAAGGAC,Unknown,Research,2023-08-12 17:37:40-07:00,Sirius_v1.0,,Sirius-1.1.0-RLS,,,Guardant 360,REP_SOW01,,,Sirius_LDT_Soft_v2.0,,,,A0800787,cgp,230810_A01902_0186_AH5VV7DSX7,,/ghsfa/ivd/flowcentral/230810_A01902_0186_AH5VV7DSX7.01df2ec7-cf8c-418b-b0dd-500f5a5a1a60.20230812103230


In [35]:
# Save file 
df_complete_4.to_csv(f"{work_dir}/df_complete_4_05212024.csv", index=False)
df_no_fc_dir.to_csv(f"{work_dir}/df_no_fc_dir_05212024.csv", index=False)

In [36]:
# Drop missing fc_dir
df_complete_4 = pd.read_csv(f"{work_dir}/df_complete_4_05212024.csv", header = 0)
df_no_fc_dir = pd.read_csv(f"{work_dir}/df_no_fc_dir_05212024.csv", header = 0)

df_complete_4_fc_dir_dropna = df_complete_4[df_complete_4["fc_dir"] != ""]
print("Total size of RUO summary with fc_dir info is " + str(df_complete_4_fc_dir_dropna.shape[0]) + "\n") 
print("Total size of RUO summary with fc_dir missing is " + str(df_no_fc_dir.shape[0]) + "\n") 
df_no_fc_dir.head()

Total size of RUO summary with fc_dir info is 22208

Total size of RUO summary with fc_dir missing is 30



Unnamed: 0,GHRequestID,GHSampleID,Patient_ID,Visit_name,Sample_status,Sample_comment,cfDNA_ng,Plasma_ml_input,Cancertype,csv_file_name,file_creator,report_date,Account,SOW_num,cancer_category,cancerstage,cancertype,sample_index,gender,sample_type,timestamp,panel,control_lot,bip_version,git_version,run_mode,product,project,cancer_type,patient_age,parameter_set,lunar_caller,comment,provided_tumor_type,request_id,use_case,runid,cancerother,fc_dir
0,A0621543,B00258552,10361005,Phase 3: Cycle 1 Day 15,SUCCESS,,51.545,2.0,,20230113_Pfizer_SOW46_71Samples_InfinityReport.csv,mcai,20230113,Pfizer,SOW46,Other,,,,,,,,,,,,,,,,,,,,,,221220_A01422_0280_AHKWMFDSX5,,
1,A0621544,B00258554,10361006,Phase 3: Cycle 1 Day 15,SUCCESS,,50.635,2.0,,20230113_Pfizer_SOW46_71Samples_InfinityReport.csv,mcai,20230113,Pfizer,SOW46,Other,,,,,,,,,,,,,,,,,,,,,,221220_A01422_0280_AHKWMFDSX5,,
2,A0621545,B00258540,10371002,Phase 3: Cycle 1 Day 15,SUCCESS,,18.005,2.0,,20230113_Pfizer_SOW46_71Samples_InfinityReport.csv,mcai,20230113,Pfizer,SOW46,Other,,,,,,,,,,,,,,,,,,,,,,221220_A01422_0280_AHKWMFDSX5,,
3,A0621546,B00258542,10371003,Phase 3: Cycle 1 Day 15,SUCCESS,,29.835,2.0,,20230113_Pfizer_SOW46_71Samples_InfinityReport.csv,mcai,20230113,Pfizer,SOW46,Other,,,,,,,,,,,,,,,,,,,,,,221220_A01422_0280_AHKWMFDSX5,,
4,A0621547,B00258544,10371004,Phase 3: Cycle 1 Day 15,SUCCESS,,41.86,2.0,,20230113_Pfizer_SOW46_71Samples_InfinityReport.csv,mcai,20230113,Pfizer,SOW46,Other,,,,,,,,,,,,,,,,,,,,,,221220_A01422_0280_AHKWMFDSX5,,


In [19]:
# TODO: Check the samples with missing fc_dir based on its runid
print("Total size of RUO summary with fc_dir missing is " + str(df_no_fc_dir.shape[0]) + "\n") 


Total size of RUO summary with fc_dir missing is 30



## Group cancer type and define baseline samples

In [25]:
# import re

# df_complete_5 = df_complete_4_fc_dir_dropna.copy()

# # Regex patterns for each cancer type
# regex_patterns = {
#     'breast': r'(?i)\bbreast|DCIS|ductal|infiltrating|Triple\s*Negative|TNBC',
#     'prostate': r'(?i)\bprostate|Castration-Resistant|mCRPC',
#     'esophageal': r'(?i)\besophagus|esophageal',
#     'ovarian': r'(?i)\bovarian|BRCA1|Sertoli-Leydig',
#     'lung': r'(?i)\blung|NSCLC|SCLC|ADC|SCC|ALK-expressing|adenosquamous|bronchioloalveolar|mixed subtype|papillary type|invasive|small cell|non[-\s]*small[-\s]*cell',
#     'bladder': r'(?i)\bbladder|urothelial|transitional\s*cell',
#     'pancreatic': r'(?i)\bpancreas|pancreatic|adenocarcinoma',
#     'hepatocellular': r'(?i)\bhepatocellular|cholangiocarcinoma|hepatocholangiocarcinoma|intrahepatic',
#     'crc': r'(?i)\bcrc|colorectal|colon|rectal|mucinous|rectum',
#     'Head_and_Neck': r'(?i)\bhead\s*and\s*neck|HNSCC|adenocarcinoma\sof\ssalivary\sgland',
#     'renal': r'(?i)\brenal|kidney',
#     'gastric': r'(?i)\bgastric|stomach',
#     'endometrial': r'(?i)\bendometrial|uterine|leiomyosarcoma|endometrium',
#     'melanoma': r'(?i)\bmelanoma',
#     'Gastrointestinal': r'(?i)\bgastrointestinal|gastrooesophageal|gastroesophageal\sjunction|small\sbowel'
# }

# # Function to categorize cancer based on regex patterns
# def categorize_cancer(text):
#     if pd.isnull(text):
#         return 'unknown'
#     for cancer_type, pattern in regex_patterns.items():
#         if re.search(pattern, text):
#             return cancer_type
#     if re.search(r'(?i)unknown', text):  # Check if 'unknown' is anywhere in the text
#         return 'unknown'
#     return 'other'  # Assign 'other' if no patterns match and it's not 'unknown'

# # Apply the categorization function to the cancerother column
# df_complete_5['new_defined_cancer_type'] = df_complete_5['cancerother'].apply(categorize_cancer)


# df_complete_5['new_defined_cancer_type'].value_counts()


unknown             11816
lung                 3625
endometrial          2142
crc                   908
melanoma              666
breast                632
bladder               439
other                 434
renal                 372
pancreatic            310
prostate              281
hepatocellular        149
esophageal            143
ovarian               101
gastric                91
Head_and_Neck          12
Gastrointestinal       11
Name: new_defined_cancer_type, dtype: int64

In [27]:
df_test = df_complete_5[df_complete_5["csv_file_name"] == "20240215_MerckKGaA_SOW31_Batch6_3Samples_InfinityReport.csv"]
df_test.shape

(2, 40)

In [37]:
import re

df_complete_5 = df_complete_4_fc_dir_dropna.copy()

# Combine 'cancerother' and 'Cancertype' columns
df_complete_5['cancerother'] = df_complete_5.apply(
    lambda row: row['Cancertype'] if pd.isnull(row['cancerother']) or row['cancerother'].lower() == 'unknown' else row['cancerother'],
    axis=1
)

# Regex patterns for each cancer type
regex_patterns = {
    'breast': r'(?i)\bbreast|DCIS|ductal|infiltrating|Triple\s*Negative|TNBC',
    'prostate': r'(?i)\bprostate|Castration-Resistant|mCRPC',
    'esophageal': r'(?i)\besophagus|esophageal',
    'ovarian': r'(?i)\bovarian|BRCA1|Sertoli-Leydig',
    'lung': r'(?i)\blung|NSCLC|SCLC|ADC|SCC|ALK-expressing|adenosquamous|bronchioloalveolar|mixed subtype|papillary type|invasive|small cell|non[-\s]*small[-\s]*cell',
    'bladder': r'(?i)\bbladder|urothelial|transitional\s*cell',
    'pancreatic': r'(?i)\bpancreas|pancreatic|adenocarcinoma',
    'hepatocellular': r'(?i)\bhepatocellular|cholangiocarcinoma|hepatocholangiocarcinoma|intrahepatic',
    'crc': r'(?i)\bcrc|colorectal|colon|rectal|mucinous|rectum',
    'Head_and_Neck': r'(?i)\bhead\s*and\s*neck|HNSCC|adenocarcinoma\sof\ssalivary\sgland',
    'renal': r'(?i)\brenal|kidney',
    'gastric': r'(?i)\bgastric|stomach',
    'endometrial': r'(?i)\bendometrial|uterine|leiomyosarcoma|endometrium',
    'melanoma': r'(?i)\bmelanoma',
    'Gastrointestinal': r'(?i)\bgastrointestinal|gastrooesophageal|gastroesophageal\sjunction|small\sbowel'
}

# Function to categorize cancer based on regex patterns
def categorize_cancer(text):
    if pd.isnull(text):
        return 'unknown'
    for cancer_type, pattern in regex_patterns.items():
        if re.search(pattern, text):
            return cancer_type
    if re.search(r'(?i)unknown', text):  # Check if 'unknown' is anywhere in the text
        return 'unknown'
    return 'other'  # Assign 'other' if no patterns match and it's not 'unknown'

# Apply the categorization function to the cancerother column
df_complete_5['new_defined_cancer_type'] = df_complete_5['cancerother'].apply(categorize_cancer)


df_complete_5['new_defined_cancer_type'].value_counts()


unknown             11309
lung                 3901
endometrial          2146
crc                   960
breast                763
melanoma              668
other                 483
bladder               443
renal                 375
pancreatic            338
prostate              291
hepatocellular        163
esophageal            147
ovarian               102
gastric                91
Head_and_Neck          16
Gastrointestinal       12
Name: new_defined_cancer_type, dtype: int64

In [38]:
# Check the cancer category for "other"
df_complete_other_cancer = df_complete_5[df_complete_5["new_defined_cancer_type"] == "other"]
df_complete_other_cancer.shape

unique_other_cancer_values = df_complete_other_cancer['cancerother'].unique()
unique_other_cancer_values_list = unique_other_cancer_values.tolist()
print(unique_other_cancer_values_list)

['GEA', 'Merkel cell carcinoma (MCC)', 'biliary tract cancer', 'Acinic cell carcinoma', 'MTC', 'Papillary thyroid', 'Metastatic Uveal', 'Metastatic Sertoli Cell Testicular Tumor', 'Large Cell Neuroendocrine', 'Squamous cell carcinoma', 'Synovial sarcoma', 'Large cell neuroendocrine carcinoma', 'Granuloma', 'Benign tumor', 'Carcinoid tumor', 'NUT CARCINOMA', 'Pleomorphic carcinoma', 'Endobronchial carcinoid tumor', 'ASC', 'Pleomorphic adenoma', 'Atypical carcinoid tumor', 'Consistent with atypical carcinoid', 'LCNEC', 'MTC ', 'Liposarcoma', 'Glioblastoma', 'Right frontal glioblastoma', 'Synovialosarcoma', 'Peritoneal Carcinosis', 'pending', 'Extranodal NK/T-cell lymphoma', 'Peripheral T-cell Lymphoma NOS', 'Angioimmunoblastic T-cell lymphoma', 'Associated Immunodeficiency-related LPD', 'Extranodal NK T-Cell Lymphoma (Nasal Type', 'Diffuse Large B Cell Lymphoma', 'Classical Hodgkin lymphoma', 'Ampullary carcinoma', 'Pulmonary Pleomorphic carcinoma', 'mesothelioma', 'Brain Cancer, Gliobla

In [39]:
# Baseline defination is based on Kyle's script: /ghsfa/projects/omni_v2/users/kchang/pancancerTF/clean_visit_name.py
import re

def categorize_visit(visit_name):
    if pd.isna(visit_name):
        return 'unknown'
    
    visit_name_clean = str(visit_name).strip().upper()

    # Consolidated Baseline pattern using regular expression
    baseline_pattern = r'\b(pre-op|pre[- ]surgery|pre[- ]op|pre[- ]treatment|pre[- ]treatment|baseline|base|screen|screening|prescreening|scr|scrn|phase 3: cycle 1 day 1|c1d1|c1d1pre|c1d1 part 2-4|p1c1d1|v001d1|0|1|timepoint1|v001d1uso|t1|scrn|c1d1$|c1$|^w1$|week0$|week1$|wk0$|visit01|^v1$|pretreat|^c0$|^c0d1$|^1d1$|^d1$|^d1week1$|^d1visit1$)\b'
    if re.search(baseline_pattern, visit_name_clean, re.IGNORECASE):
        return 'Baseline'

    # Treatment categorization rules using regular expressions
    treatment_pattern = r'\b(treatment|escalation|c\d+d\d+|mono|ccrt|maintc|tx naïve|maintc1d1|maintc8d1|maint|BC2D1|AC2D1)\b'
    if re.search(treatment_pattern, visit_name_clean, re.IGNORECASE):
        return 'Treatment'

    # Post-op categorization rules using regular expressions
    post_op_pattern = r'\b(post-op|post-surgery|post-resection)\b'
    if re.search(post_op_pattern, visit_name_clean, re.IGNORECASE):
        return 'Post-op'

    # Patterns for unknown category
    unknown_pattern = r'\b(progression|pd|unknown|discontinuation|end of treatment)\b'
    if re.search(unknown_pattern, visit_name_clean, re.IGNORECASE):
        return 'unknown'

    # Catch-all other category for unmatched cases
    return 'other'

# Apply the categorize_visit function to the 'Visit_name' column
df_complete_5['treatment'] = df_complete_5['Visit_name'].apply(categorize_visit)

# Verify mapping and count values
df_complete_5['treatment'].value_counts()


Baseline     9140
Treatment    5715
other        5112
unknown      1559
Post-op       682
Name: treatment, dtype: int64

In [40]:
# Check the treatment category for "Treatment"
df_complete_treatment = df_complete_5[df_complete_5["treatment"] == "Treatment"]

unique_visit_other_values = df_complete_treatment['Visit_name'].unique()
unique_visit_other_values_list = unique_visit_other_values.tolist()
print(unique_visit_other_values_list)

['End of Treatment', 'C2D1', 'C3D1', 'C1D15', 'C5D1', 'C6D1', 'C3D1*', 'C2D1*', 'QD: C2D1', 'C3D1 PART 2-4', 'C2D1 PREDOSE', 'AMD2 C5D1', 'C9D1', 'C7D1', 'ON-TREATMENT', 'On-treatment', 'TREATMENT CHG/ PROGESSION', 'C13D1', 'C02D01', 'C01D01', 'C07D01', 'C03D01', 'C05D01', 'C2D8', 'C07D1', 'C11D1', 'C16D1', 'C4D1', 'C44D1', 'C47D1', 'C18D1', 'C22D1', 'C25D1', 'C29D1', 'C34D1', 'C38D1', 'C42D1', 'C50D1', 'C55D1', 'C59D1', 'C17D1', 'C1D8', 'C23D1', 'C28D1', 'C31D1', 'C35D1', 'C39D1', 'Tx Naïve', 'C2D1 7dy+/7dy PREDOSE', 'B1 C2D1 PREDOSE', 'C10D1', 'C8D1', 'C05D1', 'C14D1', 'C3D1 PART 2-4*', '3 weeks post treatment', 'C15D1', 'MAINTC8D1', 'MAINTC1D1', 'ARM C MAINT C1/8 D1', 'MAINT C1/8 D1', 'MAINT C3 D1', 'C4D1 PREDOSE', '30-DAY FOLLOW-UP ESCALATION', 'C3D1 EXPANSION', 'AC2D1', 'BC2D1', 'C1D22', 'C3D22', 'C8D15', 'c1d2', 'C9D1*', 'C12D1', 'C3D1 ESCALATION', 'B1 C2D15 PREDOSE', 'B1 C3D1  PREDOSE', 'C19D1', 'C20D1', 'C21D1', 'C04D01', 'C06D01', 'C10D01', 'C11D01', 'C02D1', 'PH1 C4D1', 'C2D3

## Prepare the MBD path where saves ....msre_caller_mr_features.hdr.tsv file 

### Note that: 
1. ....msre_caller_mr_features.hdr.tsv region_id should be labeled as merge_0, merge_1.... Only BIP output after 04/2024 has the correct version of ....msre_caller_mr_features.hdr.tsv file
2. In Sai's MB caller rerun folder, some old batches were generated with the correct ....msre_caller_mr_features.hdr.tsv file. This is the dir:  "/ghsfa/projects/omni_v2/users/schen/methylation_data/s3-mcm-mbd-counts/mb_crc_12cpg_regular"
3. For the samples whose ....msre_caller_mr_features.hdr.tsv file could not be found either of the above 2 sources. MB caller will be applied on those to generate the ...msre_caller_mr_features.hdr.tsv file. The output dir is: output_dir = f"{workdir}/MB_lung_v4_all_output"

### Rerun MB caller to generate the ...msre_caller_mr_features.hdr.tsv file (For some samples that miss msre_caller_mr_features.hdr.tsv file) 

In [None]:
import epicaller
import epicaller.methylation.methyl_caller

def run_mb_lung_v4_caller_wrapper_slurm(sample_meta_df, mr_model_file, workdir):
    region_file='/screening/notebooks/yhe/repo/bip-products/models/Lunar2-0.2/methylation_caller/mafband_v2023_06_30/v05_classification_space.tsv'
    control_region_file='/screening/notebooks/yhe/repo/bip-products/models/Lunar2-0.2/methylation_caller/mafband_v2023_06_30/v05_positive_control_regions.tsv'
    lr_model_file='/screening/notebooks/yhe/repo/bip-products/models/Lunar2-0.2/methylation_caller/mafband_v2023_06_30/msre_lr_model.tsv'

    jobs_dir = f"{workdir}/MB_lung_v4_all_jobs"
    output_dir = f"{workdir}/MB_lung_v4_all_output"

    # Check and create directories if not exists
    if not os.path.exists(jobs_dir):
        os.makedirs(jobs_dir)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for _, row in sample_meta_df.iterrows():
        run_sample_id = row["GHSampleID"]
        fc_dir = row["fc_dir"]
        flowcell_id = row["runid"]
        mbd_hyper_molecules = f"{fc_dir}/{run_sample_id}/{run_sample_id}.mbd_hyper_molecules.tsv"
        mbd_residual_molecules = f"{fc_dir}/{run_sample_id}/{run_sample_id}.mbd_residual_molecules.tsv"
        
        # Check if the molecule file exists
        if not os.path.exists(mbd_hyper_molecules):
            continue

        job_file = f"{jobs_dir}/MB_call_{run_sample_id}.job"
        shebang_content = (f"#!/bin/bash\n"
                           f"#SBATCH --job-name=MB_call_{run_sample_id}\n"
                           f"#SBATCH --nodes=1\n"
                           f"#SBATCH --ntasks=1\n"
                           f"#SBATCH --cpus-per-task=1\n"
                           f"#SBATCH --mem=30gb\n"
                           f"#SBATCH --partition=dev\n"
                           f"#SBATCH --time=00:10:00\n"
                           f"#SBATCH --output={jobs_dir}/MB_call_{run_sample_id}.log\n\n")
        
        with open(job_file, "w") as job_output:
            job_output.write(shebang_content)
            job_output.write("python3 -m epicaller.methylation.methyl_caller " + 
                             f"--hyper-molecule-file {mbd_hyper_molecules} " + 
                             f"--residual-molecule-file {mbd_residual_molecules} " + 
                             f"--region-file {region_file} " +
                             f"--control-region-file {control_region_file} " + 
                             f"--lr-model-file {lr_model_file} " + 
                             f"--mr-model-file {mr_model_file} " +
                             "--min-frag-len 120 --max-frag-len 240 --min-cg-count 12 --max-cg-count 30 --cutsite-count-cutoff 1 --min-mapq 60 " + 
                             f"--sample-id {run_sample_id} --flowcell-id {flowcell_id} --outdir {output_dir}/")
            # print(job_output)    
        os.system(f"sbatch {job_file}")


In [None]:
#run the function to generate all the MB Lung v4 caller ouputs for df_complete_MB_rerun
# df_complete_MB_rerun was generated based on 05062024 run where all the samples were reran MB caller as long as the MBD file was not found in Sai's rerun folder
run_mb_lung_v4_caller_wrapper_slurm(df_complete_MB_rerun, 
                      "/home/byounggreenwald/LungPan_v4_Caller.model_file.98_spec.tsv", 
                      f"{work_dir}")

In [None]:
#run the function to generate all the MB Lung v4 caller ouputs for df_complete_MB_rerun
# df_no_mbd_dir was generated from the next block of cells to complete the MBD files list in the folder f"{workdir}/MB_lung_v4_all_output"
run_mb_lung_v4_caller_wrapper_slurm(df_no_mbd_dir, 
                      "/home/byounggreenwald/LungPan_v4_Caller.model_file.98_spec.tsv", 
                      f"{work_dir}")

### Identify MBD path for CSO caller methly data prep

In [41]:
from datetime import datetime

# Prepare the "mbd_dir" columns for the data and need to make sure not only the mbd_dir exits but also it include the ....msre_caller_mr_features.hdr.tsv file
df_complete_6 = df_complete_5.copy()

# Path setup
# Path to MB caller rerun by Sai which include the correct "....msre_caller_mr_features.hdr.tsv" for CSO caller and the path to MB rerun dir for this project fc_dir_mb_rerun = "/ghsfa/projects/omni_v2/users/schen/methylation_data/s3-mcm-mbd-counts/mb_crc_12cpg_regular"
fc_dir_mb_rerun = "/ghsfa/projects/omni_v2/users/schen/methylation_data/s3-mcm-mbd-counts/mb_crc_12cpg_regular" # Sai's MB rerun folder
mb_rerun_new = f"{work_dir}/MB_lung_v4_all_output" # Haiyang's

# Initialize a list to store rows that no ....msre_caller_mr_features.hdr.tsv file found in either of the sources above
rows_no_found_mbd = []

# Initialize new columns in the DataFrame
df_complete_6['mbd_dir'] = ''
df_complete_6['unique_sample_id'] = ''

# Date handling
# If the "report_date" is later than April 2024, the original BIP fc_dir should include the correct ....msre_caller_mr_features.hdr.tsv file to use
df_complete_6['report_date'] = pd.to_datetime(df_complete_6['report_date'], format='%Y%m%d', errors='coerce')
target_date = pd.Timestamp('2024-04-01')

for index, row in df_complete_6.iterrows():
    runid = row["runid"]
    unique_sample_id = row["GHSampleID"] + "_" + runid
    df_complete_6.loc[index, 'unique_sample_id'] = unique_sample_id

    # File checks to make sure not only the folder exists but also the MBD file. Some folders has .gz version
    mbd_file = f"{row['GHSampleID']}.msre_caller_mr_features.hdr.tsv"
    mbd_file_gz = f"{mbd_file}.gz"
    found = False

    # If the "report_date" is later than April 2024, the original BIP fc_dir should include the correct ....msre_caller_mr_features.hdr.tsv file to use
    if pd.notna(row['report_date']):
        if row['report_date'] > target_date:
            # Check original BIP fc_dir
            mbd_path = f"{row['fc_dir']}/{row['GHSampleID']}/{mbd_file}"
            mbd_path_gz = f"{row['fc_dir']}/{row['GHSampleID']}/{mbd_file_gz}"
            if glob.glob(mbd_path) or glob.glob(mbd_path_gz):
                df_complete_6.loc[index, "mbd_dir"] = row['fc_dir'] + "/" + row['GHSampleID']
                found = True
        else:
            # Check mb_rerun_new directory
            mbd_path_new = f"{mb_rerun_new}/{mbd_file}"
            mbd_path_new_gz = f"{mb_rerun_new}/{mbd_file_gz}"
            if glob.glob(mbd_path_new) or glob.glob(mbd_path_new_gz):
                df_complete_6.loc[index, "mbd_dir"] = mb_rerun_new
                found = True

            # Check fc_dir_mb_rerun directory if not found
            if not found:
                pattern_rerun = f"{fc_dir_mb_rerun}/*{runid}*"
                folders = glob.glob(pattern_rerun)
                if folders:
                    mbd_path_rerun = f"{folders[0]}/{mbd_file}"
                    mbd_path_rerun_gz = f"{folders[0]}/{mbd_file_gz}"
                    if glob.glob(mbd_path_rerun) or glob.glob(mbd_path_rerun_gz):
                        df_complete_6.loc[index, "mbd_dir"] = folders[0]
                        found = True

        # If no folder or file is found, log the row
        if not found:
            df_complete_6.loc[index, "mbd_dir"] = np.nan
            rows_no_found_mbd.append(row)
            # print(f"No suitable folder or file found for {unique_sample_id}")

# Create a DataFrame with rows where no suitable folder or file was found
df_no_mbd_dir = pd.DataFrame(rows_no_found_mbd)
print("Missing mbd_dir:", df_no_mbd_dir.shape[0])
print("NaN in mbd_dir:", df_complete_6['mbd_dir'].isnull().sum())

df_complete_6.head()


Missing mbd_dir: 219
NaN in mbd_dir: 219


Unnamed: 0,GHRequestID,GHSampleID,Patient_ID,Visit_name,Sample_status,Sample_comment,cfDNA_ng,Plasma_ml_input,Cancertype,csv_file_name,file_creator,report_date,Account,SOW_num,cancer_category,cancerstage,cancertype,sample_index,gender,sample_type,timestamp,panel,control_lot,bip_version,git_version,run_mode,product,project,cancer_type,patient_age,parameter_set,lunar_caller,comment,provided_tumor_type,request_id,use_case,runid,cancerother,fc_dir,new_defined_cancer_type,treatment,mbd_dir,unique_sample_id
0,A0800783,B00459142,10010348,Screening,SUCCESS,,85.28,4.0,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser,2024-04-12,RepareTherapeutics,SOW01,Other,,,CAGGCGTTGC,Unknown,Research,2023-08-12 17:37:40-07:00,Sirius_v1.0,,Sirius-1.1.0-RLS,,,Guardant 360,REP_SOW01,,,Sirius_LDT_Soft_v2.0,,,,A0800783,cgp,230810_A01902_0186_AH5VV7DSX7,,/ghsfa/ivd/flowcentral/230810_A01902_0186_AH5VV7DSX7.01df2ec7-cf8c-418b-b0dd-500f5a5a1a60.20230812103230,unknown,Baseline,,B00459142_230810_A01902_0186_AH5VV7DSX7
1,A0800784,B00459144,10040388,Cycle 1 Day 1,SUCCESS,,421.72,2.5,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser,2024-04-12,RepareTherapeutics,SOW01,Other,,,GCAAAGAGCT,Unknown,Research,2023-08-12 17:37:40-07:00,Sirius_v1.0,,Sirius-1.1.0-RLS,,,Guardant 360,REP_SOW01,,,Sirius_LDT_Soft_v2.0,,,,A0800784,cgp,230810_A01902_0186_AH5VV7DSX7,,/ghsfa/ivd/flowcentral/230810_A01902_0186_AH5VV7DSX7.01df2ec7-cf8c-418b-b0dd-500f5a5a1a60.20230812103230,unknown,Baseline,,B00459144_230810_A01902_0186_AH5VV7DSX7
2,A0800785,B00459143,10040457,Cycle 1 Day 1,SUCCESS,Methylation Suppression Warning; Low Diversity Warning,0.31,1.5,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser,2024-04-12,RepareTherapeutics,SOW01,Other,,,ATTACCATGT,Unknown,Research,2023-08-12 17:37:40-07:00,Sirius_v1.0,,Sirius-1.1.0-RLS,,,Guardant 360,REP_SOW01,,,Sirius_LDT_Soft_v2.0,,,,A0800785,cgp,230810_A01902_0186_AH5VV7DSX7,,/ghsfa/ivd/flowcentral/230810_A01902_0186_AH5VV7DSX7.01df2ec7-cf8c-418b-b0dd-500f5a5a1a60.20230812103230,unknown,Baseline,,B00459143_230810_A01902_0186_AH5VV7DSX7
3,A0800786,B00459146,10040474,Cycle 1 Day 1,SUCCESS,,23.61,3.0,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser,2024-04-12,RepareTherapeutics,SOW01,Other,,,TATTAACATG,Unknown,Research,2023-08-12 17:37:40-07:00,Sirius_v1.0,,Sirius-1.1.0-RLS,,,Guardant 360,REP_SOW01,,,Sirius_LDT_Soft_v2.0,,,,A0800786,cgp,230810_A01902_0186_AH5VV7DSX7,,/ghsfa/ivd/flowcentral/230810_A01902_0186_AH5VV7DSX7.01df2ec7-cf8c-418b-b0dd-500f5a5a1a60.20230812103230,unknown,Baseline,,B00459146_230810_A01902_0186_AH5VV7DSX7
4,A0800787,B00459145,10050383,Cycle 1 Day 1,SUCCESS,,208.0,3.5,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser,2024-04-12,RepareTherapeutics,SOW01,Other,,,ATTTAAGGAC,Unknown,Research,2023-08-12 17:37:40-07:00,Sirius_v1.0,,Sirius-1.1.0-RLS,,,Guardant 360,REP_SOW01,,,Sirius_LDT_Soft_v2.0,,,,A0800787,cgp,230810_A01902_0186_AH5VV7DSX7,,/ghsfa/ivd/flowcentral/230810_A01902_0186_AH5VV7DSX7.01df2ec7-cf8c-418b-b0dd-500f5a5a1a60.20230812103230,unknown,Baseline,,B00459145_230810_A01902_0186_AH5VV7DSX7


In [42]:
# Save data
# df_complete_5.to_csv(f"{work_dir}/df_complete_5_05152024.csv", index=False)
df_complete_6.to_csv(f"{work_dir}/df_all_ruo_05212024.csv", index=False)
df_no_mbd_dir.to_csv(f"{work_dir}/df_no_mbd_dir_05212024.csv", index=False)

df_no_mbd_dir.head()

Unnamed: 0,GHRequestID,GHSampleID,Patient_ID,Visit_name,Sample_status,Sample_comment,cfDNA_ng,Plasma_ml_input,Cancertype,csv_file_name,file_creator,report_date,Account,SOW_num,cancer_category,cancerstage,cancertype,sample_index,gender,sample_type,timestamp,panel,control_lot,bip_version,git_version,run_mode,product,project,cancer_type,patient_age,parameter_set,lunar_caller,comment,provided_tumor_type,request_id,use_case,runid,cancerother,fc_dir,new_defined_cancer_type,treatment,mbd_dir,unique_sample_id
0,A0800783,B00459142,10010348,Screening,SUCCESS,,85.28,4.0,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser,2024-04-12,RepareTherapeutics,SOW01,Other,,,CAGGCGTTGC,Unknown,Research,2023-08-12 17:37:40-07:00,Sirius_v1.0,,Sirius-1.1.0-RLS,,,Guardant 360,REP_SOW01,,,Sirius_LDT_Soft_v2.0,,,,A0800783,cgp,230810_A01902_0186_AH5VV7DSX7,,/ghsfa/ivd/flowcentral/230810_A01902_0186_AH5VV7DSX7.01df2ec7-cf8c-418b-b0dd-500f5a5a1a60.20230812103230,unknown,Baseline,,
1,A0800784,B00459144,10040388,Cycle 1 Day 1,SUCCESS,,421.72,2.5,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser,2024-04-12,RepareTherapeutics,SOW01,Other,,,GCAAAGAGCT,Unknown,Research,2023-08-12 17:37:40-07:00,Sirius_v1.0,,Sirius-1.1.0-RLS,,,Guardant 360,REP_SOW01,,,Sirius_LDT_Soft_v2.0,,,,A0800784,cgp,230810_A01902_0186_AH5VV7DSX7,,/ghsfa/ivd/flowcentral/230810_A01902_0186_AH5VV7DSX7.01df2ec7-cf8c-418b-b0dd-500f5a5a1a60.20230812103230,unknown,Baseline,,
2,A0800785,B00459143,10040457,Cycle 1 Day 1,SUCCESS,Methylation Suppression Warning; Low Diversity Warning,0.31,1.5,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser,2024-04-12,RepareTherapeutics,SOW01,Other,,,ATTACCATGT,Unknown,Research,2023-08-12 17:37:40-07:00,Sirius_v1.0,,Sirius-1.1.0-RLS,,,Guardant 360,REP_SOW01,,,Sirius_LDT_Soft_v2.0,,,,A0800785,cgp,230810_A01902_0186_AH5VV7DSX7,,/ghsfa/ivd/flowcentral/230810_A01902_0186_AH5VV7DSX7.01df2ec7-cf8c-418b-b0dd-500f5a5a1a60.20230812103230,unknown,Baseline,,
3,A0800786,B00459146,10040474,Cycle 1 Day 1,SUCCESS,,23.61,3.0,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser,2024-04-12,RepareTherapeutics,SOW01,Other,,,TATTAACATG,Unknown,Research,2023-08-12 17:37:40-07:00,Sirius_v1.0,,Sirius-1.1.0-RLS,,,Guardant 360,REP_SOW01,,,Sirius_LDT_Soft_v2.0,,,,A0800786,cgp,230810_A01902_0186_AH5VV7DSX7,,/ghsfa/ivd/flowcentral/230810_A01902_0186_AH5VV7DSX7.01df2ec7-cf8c-418b-b0dd-500f5a5a1a60.20230812103230,unknown,Baseline,,
4,A0800787,B00459145,10050383,Cycle 1 Day 1,SUCCESS,,208.0,3.5,,20240412_RepareTherapeutics_SOW01_Batch4_22Samples_InfinityReport.csv,produser,2024-04-12,RepareTherapeutics,SOW01,Other,,,ATTTAAGGAC,Unknown,Research,2023-08-12 17:37:40-07:00,Sirius_v1.0,,Sirius-1.1.0-RLS,,,Guardant 360,REP_SOW01,,,Sirius_LDT_Soft_v2.0,,,,A0800787,cgp,230810_A01902_0186_AH5VV7DSX7,,/ghsfa/ivd/flowcentral/230810_A01902_0186_AH5VV7DSX7.01df2ec7-cf8c-418b-b0dd-500f5a5a1a60.20230812103230,unknown,Baseline,,


In [43]:
# Readin data and remove missing "mbd_dir"
df_complete_6 = pd.read_csv(f"{work_dir}/df_all_ruo_05212024.csv", header = 0)

df_complete_7 = df_complete_6[~df_complete_6["mbd_dir"].isna()]
print(df_complete_7.shape)
df_complete_7.head()

(21082, 43)


Unnamed: 0,GHRequestID,GHSampleID,Patient_ID,Visit_name,Sample_status,Sample_comment,cfDNA_ng,Plasma_ml_input,Cancertype,csv_file_name,file_creator,report_date,Account,SOW_num,cancer_category,cancerstage,cancertype,sample_index,gender,sample_type,timestamp,panel,control_lot,bip_version,git_version,run_mode,product,project,cancer_type,patient_age,parameter_set,lunar_caller,comment,provided_tumor_type,request_id,use_case,runid,cancerother,fc_dir,new_defined_cancer_type,treatment,mbd_dir,unique_sample_id
22,A0941021,B00602822,71-12-001,C2 D1 PREDOSE EXP AMD4&5,SUCCESS,,13.1,2.5,,20240227_Merus_SOW02Amendment1_Batch1_4Samples_InfinityReport.csv,produser,2024-02-27,Merus,SOW02Amendment1,Other,,,CTGCAAACCG,Unknown,Research,2024-02-23 06:52:08-08:00,Sirius_v1.0,,Sirius-1.1.2-RLS,,,Guardant 360,MRS_02_Am1,,,Sirius_LDT_Soft_v2.0,,,,A0941021,cgp,240221_A01020_0744_BHTLNVDSX7,,/ghsfa/ivd/flowcentral/240221_A01020_0744_BHTLNVDSX7.34a6b23f-5443-4694-9adc-7f8d35e7efa5.20240222224807,unknown,other,/ghsfa/projects/pharma/projects/sirius_pharma/hazhang_projects/Treatement_Effect_RUOMRD_CSO_call_05052024/MB_lung_v4_all_output,B00602822_240221_A01020_0744_BHTLNVDSX7
23,A0941022,B00602820,71-12-001,C1 D1 PREDOSE EXP AMD3,SUCCESS,,4.81,2.5,,20240227_Merus_SOW02Amendment1_Batch1_4Samples_InfinityReport.csv,produser,2024-02-27,Merus,SOW02Amendment1,Other,,,ATGGTGATAA,Unknown,Research,2024-02-23 06:52:08-08:00,Sirius_v1.0,,Sirius-1.1.2-RLS,,,Guardant 360,MRS_02_Am1,,,Sirius_LDT_Soft_v2.0,,,,A0941022,cgp,240221_A01020_0744_BHTLNVDSX7,,/ghsfa/ivd/flowcentral/240221_A01020_0744_BHTLNVDSX7.34a6b23f-5443-4694-9adc-7f8d35e7efa5.20240222224807,unknown,other,/ghsfa/projects/pharma/projects/sirius_pharma/hazhang_projects/Treatement_Effect_RUOMRD_CSO_call_05052024/MB_lung_v4_all_output,B00602820_240221_A01020_0744_BHTLNVDSX7
24,A0941023,B00602819,71-12-003,C2 D1 PREDOSE EXP AMD4&5,SUCCESS,,14.14,3.0,,20240227_Merus_SOW02Amendment1_Batch1_4Samples_InfinityReport.csv,produser,2024-02-27,Merus,SOW02Amendment1,Other,,,CAGAGGGAAC,Unknown,Research,2024-02-23 06:52:08-08:00,Sirius_v1.0,,Sirius-1.1.2-RLS,,,Guardant 360,MRS_02_Am1,,,Sirius_LDT_Soft_v2.0,,,,A0941023,cgp,240221_A01020_0744_BHTLNVDSX7,,/ghsfa/ivd/flowcentral/240221_A01020_0744_BHTLNVDSX7.34a6b23f-5443-4694-9adc-7f8d35e7efa5.20240222224807,unknown,other,/ghsfa/projects/pharma/projects/sirius_pharma/hazhang_projects/Treatement_Effect_RUOMRD_CSO_call_05052024/MB_lung_v4_all_output,B00602819_240221_A01020_0744_BHTLNVDSX7
25,A0941024,B00602818,71-12-003,C1 D1 PREDOSE EXP AMD4&5,SUCCESS,,12.74,2.5,,20240227_Merus_SOW02Amendment1_Batch1_4Samples_InfinityReport.csv,produser,2024-02-27,Merus,SOW02Amendment1,Other,,,ACATGGCCGG,Unknown,Research,2024-02-23 06:52:08-08:00,Sirius_v1.0,,Sirius-1.1.2-RLS,,,Guardant 360,MRS_02_Am1,,,Sirius_LDT_Soft_v2.0,,,,A0941024,cgp,240221_A01020_0744_BHTLNVDSX7,,/ghsfa/ivd/flowcentral/240221_A01020_0744_BHTLNVDSX7.34a6b23f-5443-4694-9adc-7f8d35e7efa5.20240222224807,unknown,other,/ghsfa/projects/pharma/projects/sirius_pharma/hazhang_projects/Treatement_Effect_RUOMRD_CSO_call_05052024/MB_lung_v4_all_output,B00602818_240221_A01020_0744_BHTLNVDSX7
26,A1015862,B00655978,397-6001-00170,P2C2D1,SUCCESS,,28.08,,,20240513_IDEAYABiosciences_SOW03_Batch5_30Samples_InfinityReport.csv,produser,2024-05-13,IDEAYABiosciences,SOW03,Other,,,CAGAGGGAAC,Unknown,Research,2024-05-06 11:19:28-07:00,Sirius_v1.0,,Sirius-1.1.4-RLS,,,Guardant 360,IDA_03,,,Sirius_LDT_Soft_v2.0,,,,A1015862,cgp,240504_A01422_0592_AH3L2GDSXC,,/ghsfa/ivd/flowcentral/240504_A01422_0592_AH3L2GDSXC,unknown,other,/ghsfa/ivd/flowcentral/240504_A01422_0592_AH3L2GDSXC/B00655978,B00655978_240504_A01422_0592_AH3L2GDSXC


In [44]:
# Generate a cohort-level dataset for cancer-type filling 
print("Total # of this RUO summary data missing cancer type is " + str(df_complete_7.cancerother.isnull().sum()) + "\n") 

df_cancer_missing = df_complete_7[df_complete_7["new_defined_cancer_type"] == "unknown"]

df_cancer_missing_cohort = df_cancer_missing.drop_duplicates(subset = "csv_file_name")
df_cancer_missing_cohort.shape


Total # of this RUO summary data missing cancer type is 11132



(206, 43)

In [45]:
# Generate a cohort-level dataset for cancer-type filling 
print("Total # of this RUO summary data missing cancer type is " + str(df_complete_5.cancerother.isnull().sum()) + "\n") 

df_cancer_missing = df_complete_5[df_complete_5["new_defined_cancer_type"] == "unknown"]

df_cancer_missing_cohort = df_cancer_missing.drop_duplicates(subset = "csv_file_name")
df_cancer_missing_cohort.shape

df_cancer_missing_cohort.to_csv(f"{work_dir}/df_cancer_type_missing_by_cohort_05.csv", index=False)


Total # of this RUO summary data missing cancer type is 11302



(212, 41)

In [33]:
df_cancer_missing_cohort.to_csv(f"{work_dir}/df_cancer_type_missing_by_cohort_05.csv", index=False)
