In [33]:
import pandas as pd

#define functions for processing featureCount and experimental design

def process_keytable(filepath_keytable, filepath_output, sep_keytable="\t", GCS=False, print_debug=False):
    """Load a sample keytable, normalise its ID columns and save it as TSV.

    Parameters
    ----------
    filepath_keytable : path or buffer
        Keytable source, read with ``pd.read_csv``.
    filepath_output : path or buffer
        Destination for the processed table (tab separated, index included).
    sep_keytable : str
        Field separator of the input keytable.
    GCS : bool
        Accepted for interface compatibility; not used by this function.
    print_debug : bool
        When true, print the ``SampleID`` value stored under index label 0.

    Returns
    -------
    pandas.DataFrame
        The processed keytable.
    """
    table = pd.read_csv(filepath_keytable, sep=sep_keytable)

    lab_id_missing = "SampleID-Lab" not in table.columns
    uses_underscore_header = "Sample_ID" in table.columns

    # TODO: generalise -- the lab ID is taken to be the last 10 characters
    # of the "Description" column whenever no explicit lab ID column exists.
    if lab_id_missing:
        table["SampleID-Lab"] = table["Description"].str[-10:]

    # Use a single spelling for the sequencing sample ID column.
    if uses_underscore_header:
        table = table.rename(columns={"Sample_ID": "SampleID"})

    if print_debug:
        print(table.loc[0, "SampleID"])

    table.to_csv(path_or_buf=filepath_output, sep="\t")
    return table
    
def get_ID_dict(keytable_df):
    """Map each sequencing sample ID to its lab sample ID.

    Parameters
    ----------
    keytable_df : pandas.DataFrame
        Keytable holding a ``"SampleID"`` and a ``"SampleID-Lab"`` column.

    Returns
    -------
    dict
        ``{SampleID: SampleID-Lab}`` with one entry per row. If a
        ``SampleID`` occurs more than once, the last row wins.
    """
    # Pair the two columns positionally instead of looking each row up by
    # index label: label lookups return a Series (unhashable) as soon as the
    # index contains duplicate labels, e.g. after filtering or concatenation.
    return dict(zip(keytable_df["SampleID"], keytable_df["SampleID-Lab"]))

def process_featureCounts(filepath_featureCount, filepath_featureCount_relabeled, ID_dict, sep_featureCount="\t", isolate=False, isolate_string="NA", GCS=False, print_debug=False):
    """Relabel the sample columns of a merged featureCounts table and save it.

    The sample columns are sorted alphabetically, truncated to their first
    8 characters and then renamed through ``ID_dict``. The two annotation
    columns are moved to the front of the table.

    Parameters
    ----------
    filepath_featureCount : path or buffer
        Merged featureCounts table, read with ``pd.read_csv``.
    filepath_featureCount_relabeled : path or buffer
        Destination for the relabeled table (tab separated, index included).
    ID_dict : dict
        Maps truncated (8 character) sample labels to lab sample IDs.
    sep_featureCount : str
        Field separator of the input table.
    isolate : bool
        When true, keep only the columns whose label does not contain
        ``isolate_string``.
    isolate_string : str
        Pattern handed to ``isolate_columns`` when ``isolate`` is true.
    GCS : bool
        Accepted for interface compatibility; not used by this function.
    print_debug : bool
        When true, print the sample column labels found in the input.

    Returns
    -------
    pandas.DataFrame
        The relabeled featureCounts table.
    """
    featureCount = pd.read_csv(filepath_featureCount, sep=sep_featureCount)
    sorted_columns = sorted(featureCount.columns)

    # Pick the annotation columns by name. Relying on them being the last two
    # labels after sorting breaks as soon as a sample label sorts after
    # "Geneid" (for example any label starting with "H" or "S").
    annotation_columns = ["Geneid", "gene_name"]
    if all(column in sorted_columns for column in annotation_columns):
        sample_names = [column for column in sorted_columns if column not in annotation_columns]
    else:
        # Fallback for tables with other annotation headers: keep the
        # previous positional rule (last two labels after sorting).
        annotation_columns = sorted_columns[-2:]
        sample_names = sorted_columns[:-2]
    if print_debug:
        print(pd.Index(sample_names))

    # Put the annotation columns in front *before* renaming so that the
    # selection is done on the original, unique labels.
    featureCount = featureCount[annotation_columns + sample_names]

    # First shorten each sample label to its 8 character prefix, then
    # translate the prefix to the lab sample ID.
    file_dict = {name: name[0:8] for name in sample_names}
    featureCount = featureCount.rename(columns=file_dict).rename(columns=ID_dict)

    # isolate columns if string specified
    if isolate:
        # returns columns without string
        # TODO: generalize to add option to return columns with string
        featureCount = isolate_columns(featureCount, isolate_string)

    # write relabeled featureCounts to tsv file
    featureCount.to_csv(path_or_buf=filepath_featureCount_relabeled, sep="\t")

    return featureCount
    
def generate_experiment_design_table(keytable, filepath_exp_design, keytable_column = "SampleID-Lab",
                                     info_list=["Cell Line", "Inhibition Status","CRISPR", "MRTX", "BI", "SHP2i", "Time Point", "Population"], isolate=False, isolate_string="NA"):
    """Build the experiment design table from the lab sample IDs and save it.

    Parameters
    ----------
    keytable : pandas.DataFrame
        Keytable containing the column named by ``keytable_column``.
    filepath_exp_design : path or buffer
        Destination for the design table (tab separated, index included).
    keytable_column : str
        Column holding the lab sample IDs that are decoded.
    info_list : list of str
        Names of the sample attributes to derive; each becomes one column
        filled by ``extract_info_exp_design``. The default list is only
        iterated, never modified.
    isolate : bool
        When true, keep only the rows whose lab sample ID does not contain
        ``isolate_string``.
    isolate_string : str
        Pattern handed to ``isolate_rows`` when ``isolate`` is true.

    Returns
    -------
    pandas.DataFrame
        The experiment design table.
    """
    exp_design = pd.DataFrame()
    exp_design[keytable_column] = keytable[keytable_column]

    # create sample info columns
    for info in info_list:
        # Bind ``info`` as a default so the lambda does not depend on the
        # loop variable after this iteration.
        exp_design[info] = exp_design[keytable_column].apply(
            lambda lab_id, info=info: extract_info_exp_design(lab_id, info)
        )

    if isolate:
        # Filter on the column that actually holds the lab IDs; a hard-coded
        # "SampleID-Lab" fails whenever ``keytable_column`` is overridden.
        # returns rows with lab sample ID not containing string, generalize later
        exp_design = isolate_rows(exp_design, isolate_string, keytable_column)

    exp_design.to_csv(path_or_buf=filepath_exp_design, sep="\t")
    return exp_design
    
def extract_info_exp_design(ID, info):
    """Decode one attribute of a sample from its lab ID.

    The lab ID is read positionally: characters 2-3 give the cell line,
    4-5 the CRISPR code, 6-7 the inhibitor code and 8-9 a two-digit number.
    IDs that are not exactly 10 characters long get fixed fallback values
    for every attribute derived from the inhibitor code or the number.

    Parameters
    ----------
    ID : str
        Lab sample ID.
    info : str
        One of "Cell Line", "CRISPR", "Inhibition Status", "MRTX", "BI",
        "SHP2i", "Time Point" or "Population".

    Returns
    -------
    str, bool or int depending on ``info``; ``None`` for any other ``info``.
    """
    # Attributes that do not depend on the ID length.
    if info == "Cell Line":
        return ID[2:4]
    if info == "CRISPR":
        return ID[4:6]

    has_full_code = len(ID) == 10

    if info == "Inhibition Status":
        # IDs of any other length are padded with "XX" in place of an inhibitor code.
        return ID[4:8] if has_full_code else ID[4:6] + "XX"

    # Each inhibitor is flagged by one letter inside the inhibitor code.
    for inhibitor, letter in (("MRTX", "M"), ("BI", "B"), ("SHP2i", "S")):
        if info == inhibitor:
            return has_full_code and letter in ID[6:8]

    if info == "Time Point":
        if not has_full_code:
            return 0
        sample_number = int(ID[8:10])
        if sample_number > 4:
            return 72
        # Numbers 1-4: 0 when the inhibitor code is "XX", otherwise 6.
        return 0 if ID[6:8] == "XX" else 6

    if info == "Population":
        if not has_full_code:
            return "A"
        return "A" if int(ID[8:10]) in (1, 2, 5, 6) else "B"

    return None

def get_gene_list(filepath_gene_list):
    """Read a gene list from a text file with one gene name per line.

    Surrounding whitespace is stripped from every line, and blank or
    whitespace-only lines are dropped, so padded names still match the
    ``gene_name`` values they are compared against later.

    Parameters
    ----------
    filepath_gene_list : path
        Text file to read.

    Returns
    -------
    list of str
        Gene names in file order.
    """
    with open(filepath_gene_list) as f:
        stripped_lines = [line.strip() for line in f.read().splitlines()]
    # Single pass filter instead of repeatedly scanning and removing "".
    return [gene for gene in stripped_lines if gene]

def make_featureCount_genelist(featureCount, gene_list, filepath):
    """Restrict a featureCounts table to the genes of a list and save it.

    Genes of ``gene_list`` that do not occur in the ``"gene_name"`` column
    are printed and left out.

    Parameters
    ----------
    featureCount : pandas.DataFrame
        Table with a ``"gene_name"`` column.
    gene_list : iterable of str
        Gene names to keep.
    filepath : path or buffer
        Destination for the filtered table (tab separated, index included).

    Returns
    -------
    pandas.DataFrame
        Rows of ``featureCount`` whose ``gene_name`` is in ``gene_list``.
    """
    # check that genes are named in featureCount
    good_genes = []
    print("These genes are not listed in featureCount and will not be included in the analysis:")
    # Build the lookup once; rebuilding and scanning the full gene_name list
    # for every gene makes the check O(len(gene_list) * len(featureCount)).
    known_genes = set(featureCount["gene_name"].tolist()) if gene_list else set()
    for gene in gene_list:
        if gene in known_genes:
            good_genes.append(gene)
        else:
            print(gene)
    print("All other genes in the list will be included.")
    # TODO: also print the number of genes in the list
    # make featureCount dataframe with existing genes
    featureCount_genes = featureCount[featureCount["gene_name"].isin(good_genes)]
    featureCount_genes.to_csv(path_or_buf=filepath, sep="\t")
    return featureCount_genes

def isolate_rows(df, string, column_name, contains = False):
    """Select rows by whether a column's text matches ``string``.

    With ``contains=False`` (the default) the rows whose value in
    ``column_name`` does NOT contain ``string`` are returned; with
    ``contains=True`` the rows that do contain it are returned. Matching
    is done by ``Series.str.contains`` with its default settings.
    """
    value_matches = df[column_name].str.contains(string)
    wanted = value_matches == contains
    return df[wanted]

def isolate_columns(df, string, contains = False):
    """Select columns by whether their label matches ``string``.

    With ``contains=False`` (the default) the columns whose label does NOT
    contain ``string`` are returned; with ``contains=True`` the columns
    whose label does contain it are returned. Matching is done by
    ``Index.str.contains`` with its default settings.
    """
    label_matches = df.columns.str.contains(string)
    wanted = label_matches == contains
    return df.loc[:, wanted]


In [21]:
# H23: load the comma separated keytable and build the SampleID -> lab ID map.
# TODO: generalize for isolating samples and different keytable formats
keytable_H23 = process_keytable(
    "keytables/2020_09_02-H23-keytable.csv",
    "20210113_H23_keytable.tsv",
    sep_keytable=",",
)
ID_dict_H23 = get_ID_dict(keytable_H23)

# Relabel the H23 featureCounts, leaving out columns whose label contains "ES35".
featureCount_H23 = process_featureCounts(
    "featureCounts/runs_20210113-results_featureCounts_merged_gene_counts.txt",
    "20210113_featureCount_H23.tsv",
    ID_dict_H23,
    isolate=True,
    isolate_string="ES35",
)

# Matching experiment design table, leaving out rows whose lab ID contains "ES35".
# NOTE(review): this file is dated 20210103 while the other H23 outputs in
# this cell are dated 20210113 -- confirm which date is intended.
exp_design_H23 = generate_experiment_design_table(
    keytable_H23,
    "20210103_experiment_design_H23.tsv",
    isolate=True,
    isolate_string="ES35",
)

In [23]:
# H358: load the keytable (default tab separator) and build the SampleID -> lab ID map.
keytable_H358 = process_keytable(
    "keytables/H358_keytable.txt",
    "20210209_H358_keytable.tsv",
)
ID_dict_H358 = get_ID_dict(keytable_H358)

# Relabel the H358 featureCounts; no columns are filtered out here.
featureCount_H358 = process_featureCounts(
    "featureCounts/runs_20210209-results_featureCounts_merged_gene_counts.txt",
    "20210209_featureCount_H358.tsv",
    ID_dict_H358,
)

# Experiment design table for all H358 samples.
exp_design_H358 = generate_experiment_design_table(
    keytable_H358,
    "20210209_experiment_design_H358.tsv",
)

In [24]:
# H358 relabeled featureCounts, displayed as the starting point for the
# 72 hr MRTX vs MRTX + BI pathway analysis.
featureCount_H358

Unnamed: 0,Geneid,gene_name,ES58NTXX01,ES58NTXX02,ES58NTXX03,ES58NTXX04,ES58S1XX01,ES58S1XX02,ES58S1XX03,ES58S1XX04,...,ES58S1MX03,ES58S1MX04,ES58S2MX01,ES58S2MX02,ES58S2MX03,ES58S2MX04,ES58S2MB01,ES58S2MB02,ES58S2MB03,ES58S2MB04
0,ENSG00000223972,DDX11L1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ENSG00000227232,WASH7P,240,238,205,263,215,226,242,266,...,229,205,188,216,227,261,251,259,180,266
2,ENSG00000243485,MIR1302-10,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ENSG00000237613,FAM138A,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ENSG00000268020,OR4G4P,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63672,ENSG00000224240,CYCSP49,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
63673,ENSG00000227629,SLC25A15P1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
63674,ENSG00000237917,PARP4P1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
63675,ENSG00000231514,FAM58CP,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
# Keep only the 72 hour samples of the H358 design table. The table is cast
# to the "string" dtype first so that isolate_rows can text-match the
# "Time Point" values.
exp_design_H358_72 = isolate_rows(
    exp_design_H358.astype("string"),
    "72",
    "Time Point",
    contains=True,
)

# Remove samples with SHP2i: drop every row whose "Inhibition Status"
# contains "S".
# NOTE(review): "Inhibition Status" also carries the CRISPR code, so this
# appears to drop the S1/S2 samples as well -- confirm that is intended.
exp_design_H358_72_BI = isolate_rows(
    exp_design_H358_72,
    "S",
    "Inhibition Status",
)
exp_design_H358_72_BI

Unnamed: 0,SampleID-Lab,Cell Line,Inhibition Status,CRISPR,MRTX,BI,SHP2i,Time Point,Population
16,ES58NTMX05,58,NTMX,NT,True,False,False,72,A
17,ES58NTMX06,58,NTMX,NT,True,False,False,72,A
18,ES58NTMX07,58,NTMX,NT,True,False,False,72,B
19,ES58NTMX08,58,NTMX,NT,True,False,False,72,B
35,ES58NTMB05,58,NTMB,NT,True,True,False,72,A
36,ES58NTMB06,58,NTMB,NT,True,True,False,72,A
37,ES58NTMB07,58,NTMB,NT,True,True,False,72,B
38,ES58NTMB08,58,NTMB,NT,True,True,False,72,B


In [37]:
# Save the filtered 72 hr H358 design table as a tab separated file (index included).
exp_design_H358_72_BI.to_csv(path_or_buf = "20210209_experiment_design_H358_72_MB.tsv", sep="\t")