In [1]:
import pandas as pd

## Expression data

"PDTCs derived from ten of the PDTX models were extensively characterized using WES, sWGS, and RNAexp (microarrays)."

"The PDTX samples were comprehensively molecularly characterized at several passages using sWGS (for CNAs), WES (for single nucleotide variations [SNVs]), reduced-representation bisulfite sequencing (“RRBS”) (for DNA methylation), and RNAexp (for global expression and pathway activity profiling)."

→ The microarray data contains more samples than the drug-response data because it also covers the PDTX samples, not only the PDTCs.

In [2]:
# Expression matrix: the first file column becomes the row index (gene symbols),
# every remaining column is one sample.
microarray = pd.read_csv(
    'gene_expression/ExpressionSamples.txt',
    sep='\t',
    index_col=0,
)
microarray.shape

(22044, 153)

In [3]:
# How many response samples can be matched to expression samples by exact name?
responses = pd.read_csv('response/Bruna.csv')
response_samples = set(responses["cell_line_name"])
expression_samples = set(microarray.columns)
exact_name_overlap = response_samples.intersection(expression_samples)

print(f'All responses samples {len(response_samples)}')
print(f'All expression samples {len(expression_samples)}')
print(f'Intersection {len(exact_name_overlap)}')
print(exact_name_overlap)

All responses samples 37
All expression samples 153
Intersection 3
{'HCI009-X4C', 'STG282-X1C', 'STG282-X1CR'}


In [4]:
### Export cell line names and drug names
cell_line_names = set(responses['cell_line_name'])
# One row per distinct (pubchem_id, drug_name) pair.
drug_names = responses[['pubchem_id', 'drug_name']].drop_duplicates()
# Sort the names before building the frame: the iteration order of a set of
# strings changes between interpreter sessions, which would make the row order
# of the exported file differ from run to run.
cell_line_name_df = pd.DataFrame({'cell_line_name': sorted(cell_line_names), 'tissue': 'Breast'})

drug_names.to_csv('drug_names.csv', index=False)
cell_line_name_df.to_csv('cell_line_names.csv', index=False)

In [5]:
# Distinct sample names on the response side and on the expression side,
# as Series so that an ID normalizer can be applied element-wise.
all_resp_samples = pd.Series(responses["cell_line_name"].unique().tolist())
all_micro_samples = pd.Series(microarray.columns.tolist())

In [6]:
import re
def normalize_id(x):
    """Reduce a sample ID to its leading ``<MODEL>-X<passage>`` part.

    The ID must start with upper-case letters/digits, followed by ``-X`` and
    at least one digit (e.g. ``"STG282-X1CR1"`` -> ``"STG282-X1"``). IDs that
    do not start with that pattern are returned unchanged.
    """
    found = re.match(r'([A-Z0-9]+-X\d+)', x)
    if found is None:
        return x
    return found.group(1)

# Model+passage keys for both sample collections
resp_key = all_resp_samples.apply(normalize_id)
micro_key = all_micro_samples.apply(normalize_id)

# Candidate pairs: every response sample is kept (left join); microarray
# samples sharing the same key are listed next to it.
resp_lookup = pd.DataFrame({'resp': all_resp_samples, 'key': resp_key})
micro_lookup = pd.DataFrame({'micro': all_micro_samples, 'key': micro_key})
matches = pd.merge(left=resp_lookup, right=micro_lookup, on='key', how='left')

In [7]:
# Hand-written lookup: microarray sample ID (key) -> response cell line name (value).
# It is used to rename the microarray columns. Response samples without a
# microarray counterpart are listed in the comments between the entries.
mapping = {
    "HCI001-X4": "HCI001-X4C",
    "HCI001-X5": "HCI001-X5C",
    "HCI002-X2": "HCI002-X2C",
    # no microarray match for "HCI002-X2CR"
    "HCI002-X3R": "HCI002-X3C", # NOTE(review): suffix differs (R vs. C) - might be a bad match
    "HCI002-X6": "HCI002-X6C",
    "HCI005-X2": "HCI005-X2C",
    "HCI008-X1": "HCI008-X1C",
    "HCI009-X1": "HCI009-X1C",
    "HCI009-X4C": "HCI009-X4C", # exact match
    # no microarray match for "HCI009-X7C"
    "HCI010-X2": "HCI010-X2C",
    "HCI010-X2R": "HCI010-X2CR",
    "HCI011-X1": "HCI011-X1C",
    "IC007-X0": "IC007-X0C",
    "STG139-X13": "STG139-X13C",
    # no microarray matches for "STG139-X9C", "STG139M-X4C"
    "STG139M-X5": "STG139M-X5C",
    "STG143-X2": "STG143-X2C",
    "STG195-X3": "STG195-X3C",
    "STG201-X3": "STG201-X3C",
    "STG282-X1C": "STG282-X1C", # exact match
    "STG282-X1CR": "STG282-X1CR", # exact match
    "STG282-X1R1": "STG282-X1CR1",
    "STG282-X3": "STG282-X3C",
    "STG316-X1": "STG316-X1C",
    "STG316-X1R": "STG316-X1CR",
    "STG335-X1": "STG335-X1C",
    # no microarray matches for "STG335-X2C", "VHIO098-X1C", "VHIO169-X11C", "VHIO169-X3C",
    # "VHIO179-X1C", "VHIO244-X1C", "VHIO244-X1CR", "VHIO244-X2C"
}
# 12 of the 37 response samples have no microarray counterpart -> 25 matches expected.
# Rename the microarray sample columns to the response cell line names.
microarray = microarray.rename(columns=mapping)
# Sort the shared names so that the column order (and therefore the row order of
# the expression matrix exported later) is reproducible: iterating a set of
# strings yields a different order in different interpreter sessions.
shared_samples = sorted(set(responses["cell_line_name"]).intersection(set(microarray.columns)))
print(f'Intersection {len(shared_samples)}')
# only keep the samples (columns) that also have response data
microarray = microarray.loc[:, shared_samples]

Intersection 25


In [8]:
# Samples become rows; the row index is named 'cell_line_name'.
microarray_transposed = (
    microarray.transpose()
    .reset_index()
    .rename(columns={'index': 'cell_line_name'})
    .set_index('cell_line_name')
)

In [9]:
# Write the gene symbols of the expression matrix (its column labels) to disk.
symbol_column = {'Symbol': microarray_transposed.columns}
gene_names = pd.DataFrame(symbol_column)
gene_names.to_csv('gene_expression/gene_names.csv', index=False)

In [11]:
# Landmark gene list; its "Symbol" column is compared against the expression columns below.
gene_list = pd.read_csv('../gene_lists/landmark_genes.csv', index_col=0)

# Lookup used to rename gene symbols: symbol as found in the data (key) ->
# symbol to use instead (value).
# NOTE(review): a few keys below are, as far as I know, also current approved
# symbols of *different* genes, so the entry may relabel an unrelated gene and
# may collide with an existing column - please confirm each marked entry.
new_gene_names = {
    "AARS": "AARS1",
    "SPATA5L1": "AFG2B",
    "FAM123A": "AMER2",
    "C20orf3": "APMAP",
    "KIAA0100": "BLTP2",
    "UHRF1BP1L": "BLTP3B",
    "C20orf114": "BPIFB1",
    "KIAA2026": "BRD10",
    "BACH1": "BRIP1",  # NOTE(review): BACH1 looks like a gene of its own - confirm
    "KIAA0528": "C2CD5",
    "CAMSAP1L1": "CAMSAP2",
    "TMEM146": "CATSPERD",
    "C6orf97": "CCDC170",
    "WISP1": "CCN4",
    "CP110": "CCP110",
    "FAM190A": "CCSER1",
    "C16orf5": "CDIP1",
    "NARFL": "CIAO3",
    "ODF3L2": "CIMAP1D",
    "CLECL1": "CLECL1P",
    "PPPDE2": "DESI1",
    "FAM69A": "DIPK1A",
    "FAM69C": "DIPK1C",
    "TCTEX1D4": "DYNLT4",
    "FAM102B": "EEIG2",
    "C3orf25": "EFCAB12",
    "KIAA0494": "EFCAB14",
    "CCDC48": "EFCC1",
    "C15orf24": "EMC7",
    "C19orf63": "EMC10",
    "EPRS": "EPRS1",
    "C10orf84": "FAM204A",
    "C20orf108": "FAM210B",
    "C9orf25": "FAM219A",
    "ALS2CR12": "FLACC1",
    "KIAA0355": "GARRE1",
    "WRB": "GET1",
    "TSTA3": "GFUS",
    "H2AFV": "H2AZ2",
    "HIST1H2BK": "H2BC12",
    "GPR81": "HCAR1",
    "IARS": "IARS1",
    "IL1F7": "IL37",
    "SSFA2": "ITPRID2",
    "MYST3": "KAT6A",
    "MYST4": "KAT6B",
    "KIAA1267": "KANSL1",
    "C2orf67": "KANSL1L",
    "KIAA0556": "KATNIP",
    "KIAA1004": "KDM2A",
    "C6orf221": "KHDC3L",
    "HK2": "KLK2",  # NOTE(review): HK2 looks like a gene of its own (hexokinase 2) - confirm
    "GRP": "LGALSL",  # NOTE(review): GRP looks like a gene of its own - confirm
    "LOR": "LORICRIN",
    "MARCH3": "MARCHF3",
    "C17orf37": "MIEN1",
    "RAGE": "MOK",
    "C20orf20": "MRGBP",
    "BRP44": "MPC2",
    "HEATR7A": "MROH1",
    "C1orf129": "MROH9",
    "MAL": "MRTFA",  # NOTE(review): MAL looks like a gene of its own - confirm
    "MRPL15": "MRPL19",  # NOTE(review): MRPL15 looks like a gene of its own - confirm
    "MOSC2": "MTARC2",
    "C20orf7": "NDUFAF5",
    "METTL11A": "NTMT1",
    "KIAA1486": "NYAP2",
    "ARMC4": "ODAD2",
    "C12orf48": "PARPBP",
    "GPR44": "PTGDR2",
    "QARS": "QARS1",
    "RAD51L3": "RAD51D",
    "C9orf23": "RPP25L",
    "C1orf113": "SH3D21",
    "SKIV2L": "SKIC2",
    "WDR61": "SKIC8",
    "DIRC2": "SLC49A4",
    "ANKRD43": "SOWAHA",
    "FAM75D1": "SPATA31D1",
    "C17orf46": "SPATA32",
    "C1orf124": "SPRTN",
    "FAM48A": "SUPT20H",
    "C19orf39": "SWSAP1",
    "FAM19A2": "TAFA2",
    "TARSL2": "TARS3",
    "TCP10": "TCP10L3",
    "ODZ2": "TENM2",
    "PAPD7": "TENT4A",
    "C13orf27": "TEX30",
    "FAM57A": "TLCD3A",
    "FAM57B": "TLCD3B",
    "FAM70B": "TMEM255B",
    "C19orf6": "TMEM259",
    "C3orf77": "TOPAZ1",
    "C9orf167": "TOR4A",
    "C18orf10": "TPGS2",
    "C8orf83": "TRIQK",
    "C3orf78": "UQCC5",
    "PKM2": "PKM",
    "VNN3": "VNN3P",
    "WARS": "WARS1",
    "ZFP161": "ZBTB14",
    "FAM164C": "ZC2HC1C",
    "PRF1": "ZNF395",  # NOTE(review): PRF1 looks like a gene of its own (perforin 1) - confirm
}
# Gene-list symbols that are excluded from the "missing genes" checks
# (presumably symbols with no counterpart in the microarray data - confirm).
unmapped_genes = {"ADAT1", "APBB2", "ARFIP2", "CCDC85B", "CCNF", "CFLAR", "DNAJC15", "H2BC21", "HERC6", "HMGCR", "IKZF1", "MALT1", "MAPK13", "MKNK1", "MRPS16", "MYCBP", "NENF", "NFATC4", "NFKBIE", "NUP133", "PARP1", "POLR2K", "POP4", "PRSS23", "PXN", "RSU1", "RNMT", "SCARB1", "SNX6", "TLR4", "TSEN2", "UBE2J1"}

# Rename gene symbols in the expression matrix according to `new_gene_names`.
# A rename is skipped when its target symbol already exists as a column:
# renaming anyway would produce two columns with the same name (and drop the
# original symbol), which breaks label-based column selection downstream.
existing_symbols = set(microarray_transposed.columns)
new_genes = []
for old_symbol in microarray_transposed.columns:
    new_symbol = new_gene_names.get(old_symbol, old_symbol)
    # Symbols without an entry map to themselves and are therefore kept as-is.
    new_genes.append(old_symbol if new_symbol in existing_symbols else new_symbol)
microarray_transposed.columns = new_genes

# Landmark genes that are neither present after renaming nor listed as unmapped.
missing_genes = set(gene_list["Symbol"]) - set(microarray_transposed.columns) - set(unmapped_genes)
missing_genes

set()

In [12]:
# Same coverage check for the drug target gene list.
gene_list2 = pd.read_csv('../gene_lists/drug_target_genes_all_drugs.csv')
# Extend the set of symbols that are excluded from the check.
unmapped_genes |= {"CHUK", "EEF2K", "IRAK4", "MDM4", "MKNK1", "PARP1", "PIK3CG"}
missing_genes2 = set(gene_list2["Symbol"]).difference(
    set(microarray_transposed.columns),
    set(unmapped_genes),
)
missing_genes2

set()

In [13]:
# Same coverage check for the PaccMann network-propagation gene list.
gene_list3 = pd.read_csv('../gene_lists/gene_list_paccmann_network_prop.csv')
# Extend the set of symbols that are excluded from the check.
unmapped_genes |= {
    "AARD", "AFAP1L1", "AP5M1", "ATF7", "ATG12", "ATP10B", "C1RL", "CARD6",
    "CASR", "CDKN2AIPNL", "CEP97", "CHRNB1", "CLN5", "CYP1A2", "CYP27C1", "DCTN5",
    "DRAXIN", "DYDC2", "ENDOV", "EIF2S1", "FAM111B", "FAM216B", "FANCF", "FCRL4",
    "FGFBP3", "FHIT", "FMO2", "GALNT13", "GCFC2", "GPR160", "HAPLN4", "HAS1",
    "HHLA2", "HIF1AN", "HLA-DRB5", "IDI1", "IFT57", "IGFL3", "ILDR1", "KLHL23",
    "LONP2", "LSG1", "LYPD3", "MAST3", "MAST4", "MDFIC", "MED29", "MEGF8",
    "MFSD11", "NAALAD2", "NBL1", "NPW", "NRL", "OSGIN2", "PAPSS2", "PCDHB12",
    "PCYOX1L", "PEAR1", "POLR1G", "PRKRIP1", "PRR4", "RAB42", "RAB6B", "RHBDL2",
    "RUNDC3A", "S100A1", "SAMD9", "SH3BGRL2", "SIGLEC6", "SLC29A2", "SLFN13", "SPATA12",
    "SPRYD7", "STPG1", "SUV39H2", "TBX19", "TFCP2L1", "TLL2", "TMEM235", "TMEM86A",
    "TTC19", "TVP23A", "ULK2", "UNC45B", "UQCR11", "USP45", "WBP2NL", "ZFAND1",
    "ZFP1", "ZNF468", "ZNF490", "ZNF510", "ZNF600", "ZNF620", "ZNF71", "ZNF772",
    "ZSCAN22",
}
missing_genes3 = set(gene_list3["Symbol"]).difference(
    set(microarray_transposed.columns),
    set(unmapped_genes),
)
missing_genes3

set()

In [14]:
# Export the expression matrix; the index (cell_line_name) is written as the first column.
microarray_transposed.to_csv('gene_expression/gene_expression.csv')

## Copy number variations

Data source: sWGS (used for CNAs, see the quote in the expression section).

In [15]:
# Segment-level copy number table (tab-separated); sample IDs are in the "ID" column.
cnv = pd.read_csv(
    'cnv/CNASamples.txt',
    sep='\t',
)

In [16]:
# Model+passage key for every distinct CNV sample ID
cnv_ids = cnv["ID"].unique()
cnv_key = pd.Series(cnv_ids).apply(normalize_id)

# Candidate pairs: every response sample is kept (left join); CNV samples
# sharing the same key are listed next to it.
resp_lookup = pd.DataFrame({'resp': all_resp_samples, 'key': resp_key})
cnv_lookup = pd.DataFrame({'cnv': cnv_ids, 'key': cnv_key})
matches = pd.merge(left=resp_lookup, right=cnv_lookup, on='key', how='left')

# Hand-written lookup: CNV sample ID (key) -> response cell line name (value).
# Response samples without a CNV counterpart are listed in the comments
# between the entries.
mapping = {
    "HCI001-X4": "HCI001-X4C",
    "HCI001-X5": "HCI001-X5C",
    # no CNV match for "HCI002-X2C", "HCI002-X2CR"
    "HCI002-X3R": "HCI002-X3C", # NOTE(review): suffix differs - might be a bad match
    "HCI002-X6": "HCI002-X6C",
    "HCI005-X2R": "HCI005-X2C",  # NOTE(review): suffix differs - might be a bad match
    "HCI008-X1": "HCI008-X1C",
    "HCI009-X1R": "HCI009-X1C", # NOTE(review): suffix differs - might be a bad match
    "HCI009-X4C4": "HCI009-X4C", # NOTE(review): suffix differs - might be a bad match
    "HCI009-X7": "HCI009-X7C",
    "HCI010-X2": "HCI010-X2C",
    "HCI010-X2R": "HCI010-X2CR",
    "HCI011-X1": "HCI011-X1C",
    # no CNV match for "IC007-X0C"
    "STG139-X13": "STG139-X13C",
    # no CNV matches for "STG139-X9C", "STG139M-X4C"
    "STG139M-X5": "STG139M-X5C",
    "STG143-X2": "STG143-X2C",
    "STG195-X3": "STG195-X3C",
    "STG201-X3": "STG201-X3C",
    "STG282-X1": "STG282-X1C",
    "STG282-X1R": "STG282-X1CR",
    "STG282-X1R1": "STG282-X1CR1",
    # no CNV match for "STG282-X3C"
    "STG316-X1": "STG316-X1C",
    "STG316-X1R": "STG316-X1CR",
    "STG335-X1": "STG335-X1C",
    # no CNV matches for "STG335-X2C", "VHIO098-X1C", "VHIO169-X11C", "VHIO169-X3C",
    # "VHIO179-X1C", "VHIO244-X1C", "VHIO244-X1CR", "VHIO244-X2C"
}

# 14 of the 37 response samples have no CNV counterpart -> 23 matches expected.
# Translate every CNV sample ID into its response cell line name (NA when unmapped).
cnv["cell_line_name"] = cnv["ID"].apply(lambda sample_id: mapping.get(sample_id, pd.NA))

shared_samples = set(responses["cell_line_name"]).intersection(set(cnv["cell_line_name"]))
print(f'Intersection {len(shared_samples)}')
# keep only the segments (rows) whose sample could be mapped
is_unmapped = pd.isna(cnv["cell_line_name"])
cnv = cnv[~is_unmapped]
cnv.head()

Intersection 23


Unnamed: 0,ID,chrom,loc.start,loc.end,num.mark,seg.mean,call,cell_line_name
5294,HCI001-X4,1,900001,40600000,357,-0.599,HETD,HCI001-X4C
5295,HCI001-X4,1,40600001,119400000,743,0.001,NEUT,HCI001-X4C
5296,HCI001-X4,1,119400001,145700000,15,0.444,GAIN,HCI001-X4C
5297,HCI001-X4,1,146500001,149200000,9,0.714,GAIN,HCI001-X4C
5298,HCI001-X4,1,149900001,151500000,15,0.849,AMP,HCI001-X4C


In [17]:
# The discrete 'call' column is not needed for GISTIC reprocessing and the raw
# 'ID' has been replaced by 'cell_line_name', which becomes the first column of
# the exported tab-separated segment file.
cnv = cnv.drop(columns=['call', 'ID']).set_index('cell_line_name')
cnv.to_csv('cnv/bruna_cnv_for_gistic.seg.txt', sep='\t')

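Upload the exported segment file (`cnv/bruna_cnv_for_gistic.seg.txt`) to GISTIC and run it with the Human hg19 reference genome and default settings. The resulting `all_thresholded.by_genes.txt` is read from `cnv/` in the next cell.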

In [18]:
# Load the gene-level GISTIC result and turn it into a samples x genes table.
cnv_processed = (
    pd.read_csv('cnv/all_thresholded.by_genes.txt', sep='\t')
    .set_index("Gene Symbol")
    .drop(columns=['Locus ID', 'Cytoband'])
    .T
)
cnv_processed.index.name = 'cell_line_name'
# order the gene columns alphabetically, then expose cell_line_name as a column
gene_order = sorted(cnv_processed.columns)
cnv_processed = cnv_processed.reindex(gene_order, axis=1).reset_index()
cnv_processed.iloc[:5, :5]

Gene Symbol,cell_line_name,1/2-SBSRNA4,A1BG,A1BG-AS1,A1CF
0,HCI001-X4C,-1,-1,-1,1
1,HCI001-X5C,-1,-1,-1,1
2,HCI002-X3C,0,-1,-1,0
3,HCI002-X6C,0,0,0,0
4,HCI005-X2C,-1,1,1,-1


In [19]:
# Rename gene symbols in the CNV table according to `new_gene_names`, the same
# lookup that is applied to the expression data.
# A rename is skipped when its target symbol already exists as a column:
# renaming anyway would produce two columns with the same name (and drop the
# original symbol), which breaks label-based column selection downstream.
existing_symbols = set(cnv_processed.columns)
new_genes = []
for old_symbol in cnv_processed.columns:
    new_symbol = new_gene_names.get(old_symbol, old_symbol)
    # Labels without an entry (including 'cell_line_name') map to themselves.
    new_genes.append(old_symbol if new_symbol in existing_symbols else new_symbol)
cnv_processed.columns = new_genes

In [20]:
# Genes of the third gene list (gene_list3) that have no column in the CNV table
missing_genes = set(gene_list3["Symbol"]).difference(set(cnv_processed.columns))
missing_genes

{'AARD', 'AP5M1', 'DRAXIN', 'MTARC2', 'POLR1G', 'STPG1', 'TVP23A'}

In [21]:
# Export the samples x genes CNV table; cell_line_name is a regular column, so no index is written.
cnv_processed.to_csv('cnv/copy_number_variation_gistic.csv', index=False)

## Mutation

In [22]:
# Variant table (tab-separated); the first file column is used as the row index.
mut = pd.read_csv(
    'mutation/SNVsSamples.txt',
    sep='\t',
    index_col=0,
)
mut.head()

Unnamed: 0_level_0,ID,VAF,Depth,Location,Chrom,Pos,Ref,Alt,GERMLINE.1000G,Type,Exon,GenomicPos,Symbol,Repetitive,GERMLINE.DBSNP
Genotype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0/1,CAMBMT1-T2,0.25,8,chr10:100018799_C/A,chr10,100018799.0,C,A,NO,nonsynonymous SNV,"LOXL4:NM_032211:exon6:c.G888T:p.K296N,",exonic,LOXL4,NO,NO
0/1,CAMBMT1-T2,0.25,8,chr10:100018802_C/T,chr10,100018802.0,C,T,NO,synonymous SNV,"LOXL4:NM_032211:exon6:c.G885A:p.P295P,",exonic,LOXL4,NO,NO
0/1,HCI009-T,0.194444,36,chr10:100143399_C/T,chr10,100143399.0,C,T,NO,,,UTR3,PYROXD2(NM_032709:c.*156G>A),NO,NO
1/1,HCI004-X3,1.0,10,chr10:100174921_A/C,chr10,100174921.0,A,C,NO,,,UTR5,PYROXD2(NM_032709:c.-29T>G),NO,NO
0/1,AB551-T,0.222222,9,chr10:100174935_C/A,chr10,100174935.0,C,A,NO,,,UTR5,PYROXD2(NM_032709:c.-43G>T),NO,NO


In [23]:
# Only "is gene X mutated in sample Y" is needed: keep the distinct
# (sample ID, gene symbol) pairs and discard all other variant details.
mut = mut.loc[:, ['ID', 'Symbol']].drop_duplicates()

In [24]:
# Model+passage key for every distinct mutation sample ID
mut_ids = list(mut['ID'].unique())
mut_key = pd.Series(mut_ids).apply(normalize_id)

# Candidate pairs: every response sample is kept (left join); mutation samples
# sharing the same key are listed next to it. The sample column is labelled
# 'mut' (it was 'micro', copied from the microarray cell), in line with the
# 'cnv' label used for the CNV candidates.
matches = pd.merge(
    pd.DataFrame({'resp': all_resp_samples, 'key': resp_key}),
    pd.DataFrame({'mut': mut_ids, 'key': mut_key}),
    on='key',
    how='left'
)

In [25]:
# Hand-written lookup: mutation sample ID (key) -> response cell line name (value).
# Response samples without a mutation counterpart are listed in the comments
# between the entries.
mapping = {
    "HCI001-X4": "HCI001-X4C",
    "HCI001-X5": "HCI001-X5C",
    # no mutation match for "HCI002-X2C", "HCI002-X2CR"
    "HCI002-X3": "HCI002-X3C",
    "HCI002-X6": "HCI002-X6C",
    "HCI005-X2": "HCI005-X2C",
    "HCI008-X1": "HCI008-X1C",
    "HCI009-X1R": "HCI009-X1C", # NOTE(review): suffix differs - might be a bad match
    "HCI009-X4": "HCI009-X4C",
    "HCI009-X7": "HCI009-X7C",
    "HCI010-X2": "HCI010-X2C",
    "HCI010-X2R": "HCI010-X2CR",
    "HCI011-X1": "HCI011-X1C",
    "IC007-X0": "IC007-X0C",
    "STG139-X13": "STG139-X13C",
    # no mutation matches for "STG139-X9C", "STG139M-X4C"
    "STG139M-X5": "STG139M-X5C",
    "STG143-X2": "STG143-X2C",
    "STG195-X3": "STG195-X3C",
    "STG201-X3": "STG201-X3C",
    "STG282-X1": "STG282-X1C",
    "STG282-X1R": "STG282-X1CR",
    "STG282-X1R1": "STG282-X1CR1",
    # no mutation match for "STG282-X3C"
    "STG316-X1": "STG316-X1C",
    "STG316-X1R": "STG316-X1CR",
    "STG335-X1": "STG335-X1C",
    # no mutation matches for "STG335-X2C", "VHIO098-X1C", "VHIO169-X11C", "VHIO169-X3C",
    # "VHIO179-X1C", "VHIO244-X1C", "VHIO244-X1CR", "VHIO244-X2C"
}
# 13 of the 37 response samples have no mutation counterpart -> 24 matches expected.
# Translate every mutation sample ID into its response cell line name (NA when unmapped).
mut["cell_line_name"] = mut["ID"].apply(lambda sample_id: mapping.get(sample_id, pd.NA))

shared_samples = set(responses["cell_line_name"]).intersection(set(mut["cell_line_name"]))
print(f'Intersection {len(shared_samples)}')
# keep only the rows whose sample could be mapped; the raw ID is no longer needed
is_unmapped = pd.isna(mut["cell_line_name"])
mut = mut[~is_unmapped].drop(columns=['ID'])
mut.head()

Intersection 24


Unnamed: 0_level_0,Symbol,cell_line_name
Genotype,Unnamed: 1_level_1,Unnamed: 2_level_1
0/1,COX15(NM_078470:c.*18T>C),IC007-X0C
0/1,"COX15(NM_078470:exon6:c.583-1G>C,NM_004376:exo...",IC007-X0C
1/1,CUTC,STG316-X1CR
1/1,CUTC,STG316-X1C
0/1,ABCC2,IC007-X0C


In [26]:
# Strip the annotation that starts at the first "(" so that only the leading
# symbol text remains, then collapse the rows that became identical.
mut = (
    mut.assign(Symbol=mut['Symbol'].str.replace(r'\(.*', '', regex=True))
    .drop_duplicates()
)

In [27]:
# Boolean samples x genes matrix: True where the (cell line, gene) pair occurs in `mut`.
# The pivot holds True for observed pairs and NaN everywhere else, so `.notna()`
# yields the boolean matrix directly. This replaces `.fillna(False).astype(bool)`,
# which gives the same values but triggers pandas' object-dtype downcasting
# FutureWarning on `.fillna(False)`.
wide_mut_df = (
    mut.assign(value=True)
      .pivot(index='cell_line_name', columns='Symbol', values='value')
      .notna()
)
wide_mut_df.iloc[0:5, 0:5]

  .fillna(False)


Symbol,A1BG,A1BG-AS1,A1CF,A2ML1,A3GALT2
cell_line_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
HCI001-X4C,False,False,True,False,False
HCI001-X5C,False,False,True,False,False
HCI002-X3C,False,False,False,False,False
HCI002-X6C,False,False,False,False,False
HCI005-X2C,False,False,False,False,False


In [28]:
# For each gene list: how many of its symbols have no column in the mutation matrix
mutated_symbols = set(wide_mut_df.columns)
for label, genes in (
    ('landmark', gene_list),
    ('drug target', gene_list2),
    ('paccman', gene_list3),
):
    n_missing = len(set(genes["Symbol"]) - mutated_symbols)
    print(f'Missing {label} genes: {n_missing}/{len(genes)}')

Missing landmark genes: 556/867
Missing drug target genes: 138/242
Missing paccman genes: 1212/1957


In [26]:
# Export the boolean mutation matrix; the index (cell_line_name) is written as the first column.
wide_mut_df.to_csv('mutation/mutations.csv')