In [51]:
import pandas as pd

In [62]:
# read in response for comparisons
response = pd.read_csv('response/BeatAML2.csv', converters={'cell_line_name': str})

## Gene expression

In [3]:
# create gene expression feature file
gene_exp = pd.read_csv("gene_expression/beataml_waves1to4_norm_exp_dbgap.txt", sep="\t", index_col=0)
gene_exp = gene_exp.reset_index().drop(columns=["stable_id", "description", "biotype"]).set_index("display_label")
gene_exp_processed = gene_exp.T
gene_exp_processed = gene_exp_processed.reset_index().rename(columns={"index": "dbgap_rnaseq_sample"})
gene_exp_processed.iloc[0:5, 0:5]

display_label,dbgap_rnaseq_sample,TSPAN6,DPM1,SCYL3,C1orf112
0,BA2392R,1.52367,7.107711,3.362605,3.881649
1,BA2611R,1.856566,6.865282,3.32094,4.045935
2,BA2506R,1.701307,6.889932,3.407834,3.904311
3,BA2430R,1.870627,6.96912,3.265446,4.057645
4,BA2448R,1.699496,6.884921,3.428475,3.931659


In [4]:
mapping = pd.read_excel("beataml_waves1to4_sample_mapping.xlsx", sheet_name='sample_map')
mapping.head()

Unnamed: 0,patientId,dbgap_subject_id,labId,dbgap_rnaseq_sample,rna_seq_id,rna_control,rna_include_in_analysis,dbgap_dnaseq_sample,dna_seq_id,dna_capture_type,dna_include_in_analysis,analysisDrug
0,0,2096,00-00002,BA2392R,00-00002,Healthy pooled CD34+,yes,,,,,
1,0,2096,00-00003,BA2611R,00-00003,Healthy pooled CD34+,yes,,,,,
2,0,2096,00-00004,BA2506R,00-00004,Healthy pooled CD34+,yes,,,,,
3,0,2096,00-00005,BA2430R,00-00005,Healthy pooled CD34+,yes,,,,,
4,0,2096,00-00006,BA2448R,00-00006,Healthy pooled CD34+,yes,,,,,


In [5]:
gene_exp_processed = gene_exp_processed.merge(mapping[['dbgap_subject_id', 'dbgap_rnaseq_sample']], on='dbgap_rnaseq_sample', how='left')
gene_exp_processed = gene_exp_processed.rename(columns={'dbgap_subject_id': 'cell_line_name'})
gene_exp_processed = gene_exp_processed.drop(columns=['dbgap_rnaseq_sample'])
gene_exp_processed['cell_line_name'] = gene_exp_processed['cell_line_name'].astype(str)
gene_exp_processed = gene_exp_processed.set_index('cell_line_name')
gene_exp_processed.iloc[0:5, 0:5]

Unnamed: 0_level_0,TSPAN6,DPM1,SCYL3,C1orf112,FGR
cell_line_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2096,1.52367,7.107711,3.362605,3.881649,4.571577
2096,1.856566,6.865282,3.32094,4.045935,4.433159
2096,1.701307,6.889932,3.407834,3.904311,4.576913
2096,1.870627,6.96912,3.265446,4.057645,4.468458
2096,1.699496,6.884921,3.428475,3.931659,4.650867


In [6]:
unique_response_samples = set(response['cell_line_name'])
unique_gex_samples = set(gene_exp_processed.index)
print(f"Gene expression data missing for {len(unique_response_samples - unique_gex_samples)}/{(len(unique_gex_samples))} response samples")
print(f"Response data missing for {len(unique_gex_samples - unique_response_samples)}/{(len(unique_response_samples))} response samples")

Gene expression data missing for 82/633 response samples
Response data missing for 146/569 response samples


In [7]:
# filter to retain only cell_line_names in intersection of unique_response_samples, unique_gex_samples
gene_exp_processed = gene_exp_processed.loc[list(unique_response_samples & unique_gex_samples)]

In [8]:
# check genes
gene_list = pd.read_csv('../gene_lists/landmark_genes.csv', index_col=0)
new_gene_names = {
    "AARS": "AARS1",
    "SPATA5L1": "AFG2B",
    "KIAA0100": "BLTP2",
    "UHRF1BP1L": "BLTP3B",
    "KIAA2026": "BRD10",
    "BACH1": "BRIP1",
    "NARFL": "CIAO3",
    "CLECL1": "CLECL1P",
    "FAM69A": "DIPK1A",
    "TCTEX1D4": "DYNLT4",
    "FAM102B": "EEIG2",
    "EPRS": "EPRS1",
    "ALS2CR12": "FLACC1",
    "KIAA0355": "GARRE1",
    "WRB": "GET1",
    "TSTA3": "GFUS",
    "H2AFV": "H2AZ2",
    "HIST1H2BK": "H2BC12",
    "HIST2H2BE": "H2BC21",
    "IARS": "IARS1",
    "SSFA2": "ITPRID2",
    "KIAA0556": "KATNIP",
    "HK2": "KLK2",
    "MARCH3": "MARCHF3",
    "CLGN": "MMP1",
    "MAL": "MRTFA",
    "ARMC4": "ODAD2",
    "QARS": "QARS1",
    "SKIV2L": "SKIC2",
    "WDR61": "SKIC8",
    "DIRC2": "SLC49A4",
    "FAM19A2": "TAFA2",
    "TARSL2": "TARS3",
    "PAPD7": "TENT4A",
    "FAM57A": "TLCD3A",
    "FAM57B": "TLCD3B",
    "VNN3": "VNN3P",
    "WARS": "WARS1",
}
unmapped_genes = {"ALK", "BRDT", "ERBB4", "EDNRA", "EPHA3", "GRB7", "NR1H4", "PRR15L", "ROS1", "SNAP25", "SSTR1", "TBX2"}
new_genes = [new_gene_names.get(g, g) for g in list(gene_exp_processed.columns)]
gene_exp_processed.columns = new_genes
missing_genes = set(gene_list["Symbol"]) - set(gene_exp_processed.columns) - unmapped_genes
missing_genes

set()

In [9]:
gene_list2 = pd.read_csv('../gene_lists/drug_target_genes_all_drugs.csv')
missing_genes2 = set(gene_list2["Symbol"]) - set(gene_exp_processed.columns) - set(unmapped_genes)
missing_genes2

set()

In [16]:
gene_list3 = pd.read_csv('../gene_lists/gene_list_paccmann_network_prop.csv')
missing_genes3 = set(gene_list3["Symbol"]) - set(gene_exp_processed.columns) - set(unmapped_genes)
len(missing_genes3) # this is too many to map

342

In [17]:
gene_exp_processed.to_csv('gene_expression/gene_expression.csv')

## Mutation data

In [35]:
mut = pd.read_csv('mutation/beataml_wes_wv1to4_mutations_dbgap.txt', sep='\t')
mut.head()

Unnamed: 0,dbgap_sample_id,capture_type,seqnames,pos_start,pos_end,ref,alt,genotyper,tumor_only,total_reads,...,cdna_position,cds_position,protein_position,amino_acids,codons,existing_variation,refseq,sift,polyphen,exac_af
0,BA2336D,NexteraV1.2,4,106156042,106156043,TC,T,varscan,1,151,...,1804/10166,944/6009,315/2002,S/X,tCc/tc,,,,,
1,BA2336D,NexteraV1.2,4,106190829,106190830,AG,A,varscan,1,74,...,4968/10166,4108/6009,1370/2002,G/X,Ggg/gg,rs756348991,,,,3.7e-05
2,BA2336D,NexteraV1.2,5,170837543,170837543,C,CTCTG,varscan,1,59,...,1160-1161/1758,859-860/885,287/294,L/LCX,ctc/cTCTGtc,rs758959453&COSM158604,NM_002520.6,,,8e-06
3,BA2643D,NexteraV1.2,11,32456651,32456652,GC,G,varscan,1,51,...,525/3122,240/1554,80/517,L/X,ctG/ct,,NM_024426.4&NM_024424.3,,,
4,BA2643D,NexteraV1.2,2,25457242,25457242,C,T,mutect,1,28,...,2983/4380,2645/2739,882/912,R/H,cGc/cAc,rs147001633&COSM52944&COSM442676,NM_175629.2,deleterious(0),probably_damaging(0.993),0.000593


In [36]:
# we only want mutation yes/no per gene
mut = mut[['dbgap_sample_id', 'symbol']].drop_duplicates()
# merge with mapping
mut = mut.merge(mapping[['dbgap_subject_id', 'dbgap_dnaseq_sample']], left_on='dbgap_sample_id', right_on='dbgap_dnaseq_sample', how='left')
mut = mut[['dbgap_subject_id', 'symbol']].drop_duplicates()
mut.columns = ['cell_line_name', 'symbol']
mut['cell_line_name'] = mut['cell_line_name'].astype(str)
unique_mut_samples = set(mut['cell_line_name'])
print(f"Mutation data missing for {len(unique_response_samples - unique_mut_samples)}/{(len(unique_response_samples))} response samples")
print(f"Response data missing for {len(unique_mut_samples - unique_response_samples)}/{(len(unique_mut_samples))} mutation samples")

Mutation data missing for 39/569 response samples
Response data missing for 226/756 mutation samples


In [37]:
# filter to retain only cell_line_names in intersection of unique_response_samples, unique_mut_samples
mut = mut.set_index('cell_line_name')
mut = mut.loc[list(unique_response_samples & unique_mut_samples)]

In [39]:
wide_mut_df = (
    mut.reset_index().assign(value=True)
      .pivot(index='cell_line_name', columns='symbol', values='value')
      .fillna(False)
      .astype(bool)
)
wide_mut_df.iloc[0:5, 0:5]

  .fillna(False)


symbol,A1CF,A2M,A4GALT,AADAC,AADACL3
cell_line_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2003,False,False,False,False,False
2005,False,False,False,False,False
2007,False,False,False,False,False
2008,False,False,False,False,False
2010,False,False,False,False,False


In [47]:
print(f'Missing landmark genes: {len(set(gene_list["Symbol"]) - set(wide_mut_df.columns))}/{len(gene_list)}')
print(f'Missing drug target genes: {len(set(gene_list2["Symbol"]) - set(wide_mut_df.columns))}/{len(gene_list2)}')
print(f'Missing paccman genes: {len(set(gene_list3["Symbol"]) - set(wide_mut_df.columns))}/{len(gene_list3)}')

Missing landmark genes: 760/867
Missing drug target genes: 196/242
Missing paccman genes: 1701/1957


In [42]:
wide_mut_df.to_csv('mutation/mutations.csv')

## Other files

In [63]:
# create cell line names file
cell_line_names = response.drop_duplicates("cell_line_name")[["cell_line_name"]]
cell_line_names["tissue"] = "Blood"
cell_line_names.to_csv("cell_line_names.csv", index=False)

In [64]:
# create drug names file
drug_names = response.drop_duplicates("pubchem_id")[["pubchem_id", "drug_name"]]
drug_names.to_csv("drug_names.csv", index=False)