In [None]:
%load_ext blackcellmagic

In [None]:
import pandas as pd

# define functions for processing featureCount and experimental design


def process_keytable(
    filepath_keytable,
    filepath_output,
    sep_keytable="\t",
    GCS=False,
    print_debug=False,
):
    """Load a sample keytable, add the lab sample ID column, and save it as TSV.

    Parameters
    ----------
    filepath_keytable : path of the delimited keytable file to read.
    filepath_output : path the processed table is written to (tab-separated,
        DataFrame index included).
    sep_keytable : field separator of the input file (tab by default).
    GCS : accepted for interface compatibility; not used by this function.
    print_debug : when true, print the "SampleID" value at index label 0.

    Returns
    -------
    pandas.DataFrame
        The keytable with the "Sample_ID" column renamed to "SampleID" and a
        "SampleID-Lab" column appended.
    """
    table = pd.read_csv(filepath_keytable, sep=sep_keytable)

    # TODO: generalize -- the lab ID is taken to be the trailing 10 characters
    # of the "Description" column.
    table["SampleID-Lab"] = table["Description"].str.slice(start=-10)
    table = table.rename(columns={"Sample_ID": "SampleID"})

    if print_debug:
        print(table["SampleID"][0])

    table.to_csv(filepath_output, sep="\t")
    return table


def get_ID_dict(keytable_df):
    """Build a mapping from sequencing sample ID to lab sample ID.

    Parameters
    ----------
    keytable_df : pandas.DataFrame with "SampleID" and "SampleID-Lab" columns.

    Returns
    -------
    dict
        ``{SampleID: SampleID-Lab}`` with one entry per row; if a SampleID
        occurs more than once, the last row wins.
    """
    # Pair the two columns positionally rather than looking each row up by its
    # index label: a label lookup returns a Series (unhashable as a dict key)
    # when the index contains duplicate labels, e.g. after concatenating
    # tables without resetting the index.
    return dict(zip(keytable_df["SampleID"], keytable_df["SampleID-Lab"]))


def process_featureCounts(
    filepath_featureCount,
    filepath_featureCount_relabeled,
    ID_dict,
    sep_featureCount="\t",
    GCS=False,
    print_debug=False,
):
    """Relabel the sample columns of a featureCounts table and save it as TSV.

    Parameters
    ----------
    filepath_featureCount : path of the delimited featureCounts file to read.
    filepath_featureCount_relabeled : path the relabeled table is written to
        (tab-separated, DataFrame index included).
    ID_dict : mapping applied to the column names after they have been
        truncated to their first 8 characters.
    sep_featureCount : field separator of the input file (tab by default).
    GCS : accepted for interface compatibility; not used by this function.
    print_debug : when true, print the column names treated as samples.

    Returns
    -------
    pandas.DataFrame
        The relabeled table with the two trailing (sorted) columns moved to
        the front.
    """
    counts = pd.read_csv(filepath_featureCount, sep=sep_featureCount)

    # Order the columns by name; everything except the last two sorted columns
    # is treated as a sample column.
    # NOTE(review): this assumes the gene ID / gene name columns sort after
    # every sample column -- confirm against the input files.
    counts = counts.reindex(sorted(counts.columns), axis=1)
    sample_names = counts.columns[:-2]
    if print_debug:
        print(sample_names)

    # First cut each sample column name down to its 8-character prefix, then
    # translate that prefix through ID_dict.
    prefix_by_name = {}
    for name in sample_names:
        prefix_by_name[name] = name[:8]
    counts = counts.rename(columns=prefix_by_name).rename(columns=ID_dict)

    # Move the two trailing columns to the front.
    ordered = counts.columns.tolist()
    counts = counts[ordered[-2:] + ordered[:-2]]

    # Write the relabeled table to a TSV file.
    counts.to_csv(filepath_featureCount_relabeled, sep="\t")

    return counts


def generate_experiment_design_table(
    keytable,
    filepath_exp_design,
    keytable_column="SampleID-Lab",
    info_list=[
        "Cell Line",
        "Inhibition Status",
        "CRISPR",
        "MRTX",
        "BI",
        "SHP2i",
        "Time Point",
        "Population",
    ],
):
    """Derive an experiment design table from the lab sample IDs and save it.

    Parameters
    ----------
    keytable : pandas.DataFrame containing the ``keytable_column`` column.
    filepath_exp_design : path the design table is written to (tab-separated,
        DataFrame index included).
    keytable_column : name of the column holding the lab sample IDs.
    info_list : names of the fields to extract from each ID; each one becomes
        a column. The default list is only iterated, never modified.

    Returns
    -------
    pandas.DataFrame
        One row per keytable row: the ID column followed by one column per
        entry of ``info_list``.
    """
    lab_ids = keytable[keytable_column]
    design = pd.DataFrame()
    design[keytable_column] = lab_ids

    # One column per requested field, each decoded from the lab ID string.
    for field in info_list:
        design[field] = design[keytable_column].apply(
            extract_info_exp_design, args=(field,)
        )

    design.to_csv(filepath_exp_design, sep="\t")
    return design


def extract_info_exp_design(ID, info):
    """Decode one field of sample information from a lab ID string.

    Fields are read from fixed character positions of ``ID``:

    * "Cell Line" -> characters 2-3, "CRISPR" -> characters 4-5
      (independent of the ID length).
    * For IDs that are exactly 10 characters long, "Inhibition Status" is
      characters 4-7; "MRTX" / "BI" / "SHP2i" report whether "M" / "B" / "S"
      occurs in characters 6-7; "Time Point" and "Population" are derived
      from the integer in characters 8-9.
    * For any other length, defaults are returned: the CRISPR code followed by
      "XX" for "Inhibition Status", False for the three drug flags, 0 for
      "Time Point" and "A" for "Population".

    Returns None when ``info`` is not one of the recognised field names.
    Raises ValueError if a 10-character ID has a non-integer suffix and
    "Time Point" or "Population" is requested.
    """
    # Fields that do not depend on the ID length.
    if info == "Cell Line":
        return ID[2:4]
    if info == "CRISPR":
        return ID[4:6]

    is_full_id = len(ID) == 10

    if info == "Inhibition Status":
        return ID[4:8] if is_full_id else ID[4:6] + "XX"

    # Drug flags: the drug's letter must appear in the two-character
    # treatment code of a full-length ID.
    drug_letters = (("MRTX", "M"), ("BI", "B"), ("SHP2i", "S"))
    for drug, letter in drug_letters:
        if info == drug:
            return is_full_id and letter in ID[6:8]

    if info == "Time Point":
        if not is_full_id:
            return 0
        suffix = int(ID[8:10])
        if suffix > 4:
            return 72
        # Suffix 1-4: time point 0 for the "XX" treatment code, 6 otherwise.
        return 0 if ID[6:8] == "XX" else 6

    if info == "Population":
        if not is_full_id:
            return "A"
        return "A" if int(ID[8:10]) in (1, 2, 5, 6) else "B"

    # Unrecognised field name.
    return None

In [None]:
# Import the H23 keytable and featureCounts, relabel the count columns with the
# lab sample IDs, and build the experiment design table.

keytable_H23 = process_keytable(
    "keytables/data_keytable.2020.10.27.csv",
    "keytables/20210113_H23_keytable.tsv",
)
# Use the keytable loaded in this cell. The bare name `keytable` is not defined
# anywhere in the visible notebook, so referring to it raised NameError on a
# fresh kernel (or silently picked up a stale variable from an earlier session).
ID_dict_H23 = get_ID_dict(keytable_H23)
# NOTE(review): the keytable above is a ".csv" read with the default tab
# separator, while this ".txt" file is read with "," -- confirm both
# separators match the actual files.
featureCount_H23 = process_featureCounts(
    "featureCounts/runs_20210113-results_featureCounts_merged_gene_counts.txt",
    "featureCounts/20210113_featureCount_H23.tsv",
    ID_dict_H23,
    sep_featureCount=",",
)
# NOTE(review): this output name is dated 20210103 while the other H23 files
# use 20210113 -- confirm whether that is intentional.
exp_design_H23 = generate_experiment_design_table(
    keytable_H23, "20210103_experiment_design_H23.tsv"
)

In [None]:
# H358 run (marked by the author as the one that works).
# Import the H358 keytable and featureCounts, relabel the count columns with
# the lab sample IDs, and build the experiment design table.

keytable_H358 = process_keytable(
    filepath_keytable="keytables/H358_keytable.txt",
    filepath_output="keytables/20210209_H358_keytable.tsv",
)

ID_dict_H358 = get_ID_dict(keytable_df=keytable_H358)

featureCount_H358 = process_featureCounts(
    filepath_featureCount=(
        "featureCounts/runs_20210209-results_featureCounts_merged_gene_counts.txt"
    ),
    filepath_featureCount_relabeled="featureCounts/20210209_featureCount_H358.tsv",
    ID_dict=ID_dict_H358,
)

exp_design_H358 = generate_experiment_design_table(
    keytable=keytable_H358,
    filepath_exp_design="20210209_experiment_design_H358.tsv",
)