In [1]:
import pandas as pd

## What we need
- Gene node feature #genes x #features
- Model node feature #models x features
- Gene_model_edge_index_cn #edges x 2
- Gene_model_edge_index_mutations #edges x 2
- Gene_model_edge_index_geneexpressions #edges x 2
- Gene_model_edge_attributes_cn #edges x #attributes
- Gene_model_edge_attributes_mutations #edges x #attributes
- Gene_model_edge_attributes_geneexpressions #edges x #attributes

## Read human genome

In [2]:
human_genome = pd.read_csv("human_genome.tsv", sep="\t", header=None)

## Get gene symbols to human genome index dataframe (is in nested list semicolon, comma separated)

In [3]:
human_genome["human_genome_index"] = human_genome.index

In [4]:
sem_split = human_genome[3].str.split("; ")

In [5]:
comma_split = sem_split.map(lambda n: n[0]).str.split(",")

In [6]:
exploded_gene_symbols = comma_split.explode().str.strip().to_frame()

In [7]:
exploded_gene_symbols = exploded_gene_symbols.rename(columns={3:"symbol"})

In [8]:
exploded_gene_symbols["human_genome_index"] = exploded_gene_symbols.index

In [9]:
exploded_gene_symbols = exploded_gene_symbols.reset_index(drop=True)

## Read copy numbers

In [10]:
copy_numbers = pd.read_csv("WES_pureCN_CNV_genes_20220623.csv")

## Merge copy numbers with gene index to edge index

In [11]:
human_genome_index_merged_copy_numbers = pd.merge(exploded_gene_symbols, copy_numbers, on=["symbol","symbol"], how="right")

In [12]:
human_genome_index_merged_copy_numbers["index"] = human_genome_index_merged_copy_numbers.index

In [13]:
human_genome_index_merged_copy_numbers

Unnamed: 0,symbol,human_genome_index,model_name,model_id,gene_id,chr_name,chr_start,chr_end,total_copy_number,minor_copy_number,...,gene_mean,num_targets,focal,breakpoints,num_snps,gatk_mean_log2_copy_ratio,comment,source,data_type,index
0,ABCB1,8707,MEC-1,SIDM00001,SIDG00064,chr7,87503966,87600461,3.0,1.0,...,0.414125,28.0,False,0.0,1124.0,0.574127,POOR GOF (70.4%),Sanger,WES,0
1,ABL1,10859,MEC-1,SIDM00001,SIDG00150,chr9,130714099,130885948,3.0,1.0,...,0.472693,12.0,False,0.0,977.0,0.585569,POOR GOF (70.4%),Sanger,WES,1
2,BRD3,10921,MEC-1,SIDM00001,SIDG02504,chr9,134033313,134053689,3.0,1.0,...,0.705704,11.0,False,0.0,977.0,0.585569,POOR GOF (70.4%),Sanger,WES,2
3,CARD11,8252,MEC-1,SIDM00001,SIDG03455,chr7,2906361,2958783,3.0,1.0,...,0.696245,23.0,False,0.0,1124.0,0.574127,POOR GOF (70.4%),Sanger,WES,3
4,CDK6,8741,MEC-1,SIDM00001,SIDG04156,chr7,92614869,92833599,3.0,1.0,...,0.750720,7.0,False,0.0,1124.0,0.574127,POOR GOF (70.4%),Sanger,WES,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1142907,JAK3,19738,EA-hy926,SIDM01986,SIDG13376,chr19,17826466,17844694,4.0,1.0,...,0.172797,20.0,False,0.0,2308.0,0.209350,POOR GOF (75%),Sanger,WES,1142907
1142908,KAT6A,9602,EA-hy926,SIDM01986,SIDG13444,chr8,41931928,42049234,2.0,1.0,...,-0.783567,19.0,False,0.0,39.0,-0.812879,POOR GOF (75%),Sanger,WES,1142908
1142909,KAT6B,11438,EA-hy926,SIDM01986,SIDG13445,chr10,74842581,75031323,3.0,1.0,...,-0.176008,18.0,False,0.0,2542.0,-0.144490,POOR GOF (75%),Sanger,WES,1142909
1142910,KNL1,15959,EA-hy926,SIDM01986,SIDG13936,chr15,40602655,40662465,4.0,1.0,...,-0.109242,24.0,False,0.0,376.0,0.083666,POOR GOF (75%),Sanger,WES,1142910


## Generate nodes features for gene nodes

In [14]:
human_genome[0] = human_genome[0].astype("category")
human_genome[1] = human_genome[1].astype("category")
human_genome[2] = human_genome[2].astype("category")
cat_columns = human_genome.select_dtypes(['category']).columns
human_genome[cat_columns] = human_genome[cat_columns].apply(lambda x: x.cat.codes)

In [15]:
def z_scale(df_z_scaled, columns):
    for column in columns:
        df_z_scaled[column] = (df_z_scaled[column] - df_z_scaled[column].mean()) / df_z_scaled[column].std()   
    return df_z_scaled
z_scale(human_genome, [0,1,2])

Unnamed: 0,0,1,2,3,human_genome_index
0,-1.435618,0.983335,-0.020788,"MIR6859-1, hsa-mir-6859-1; microRNA 6859-1",0
1,-1.656741,0.983335,-0.136196,"MIR1302-2, MIRN1302-2, hsa-mir-1302-2; microRN...",1
2,1.243678,-0.403875,-0.104359,OR4F5; olfactory receptor family 4 subfamily F...,2
3,-1.447965,0.983335,-0.012534,"MIR6859-2, hsa-mir-6859-2; microRNA 6859-2",3
4,-1.250834,-0.403875,0.049518,uncharacterized LOC112268260,4
...,...,...,...,...,...
24684,-0.824442,-0.403875,1.582393,G antigen 7-like,24684
24685,-0.757796,-0.403875,0.831284,putative TAF11-like protein ENSP05220103537,24685
24686,-0.742643,-0.403875,1.158936,Uncharacterized LOC132932482,24686
24687,-0.741521,-0.403875,-1.188131,POC1B-DUSP6; POC1B-DUSP6 readthrough,24687


In [16]:
gene_features = human_genome.iloc[:, :3].values
gene_features

array([[-1.43561788,  0.98333456, -0.02078813],
       [-1.6567412 ,  0.98333456, -0.13619594],
       [ 1.24367835, -0.40387457, -0.1043593 ],
       ...,
       [-0.74264322, -0.40387457,  1.15893607],
       [-0.74152077, -0.40387457, -1.18813143],
       [-0.74138046, -0.40387457,  1.4434023 ]])

## Generate Model nodes

In [17]:
targets = pd.read_csv("target.csv",sep=";")

In [18]:
models = pd.read_csv("model_list_20240110.csv")

In [19]:
def remove_uninformative_columns(columns):
    ## removes columns where we have no duplicates (i.e., columns with all different categorical values like ids)
    filtered_columns = []
    for column in columns:
        if models.duplicated(subset=column).sum() > 0:
            filtered_columns.append(column)
    return filtered_columns

In [20]:
from matplotlib import pyplot as plt
columns_with_most_val_filled = models.columns[models.isnull().sum(axis=0)/ models.shape[0] <= 0.1]
filtered_columns = remove_uninformative_columns(columns_with_most_val_filled)

In [21]:
filtered_columns.append("model_id")

In [22]:
models = models[filtered_columns].fillna("Unkown")

In [23]:
def categorize_columns(df, columns):
    for column in columns:
        df[column] = df[column].astype("category")
    cat_columns = df.select_dtypes(['category']).columns
    df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)
    return df

In [24]:
columns_besides_id = list(filter(lambda x: x != "model_id", filtered_columns))
models = categorize_columns(models, columns_besides_id)
z_scale(models, columns_besides_id)

Unnamed: 0,sample_id,patient_id,model_name,tissue,cancer_type,cancer_type_ncit_id,tissue_status,sample_site,cancer_type_detail,model_type,growth_properties,species,gender,ethnicity,smoking_status,sample_treatment,crispr_ko_data,model_id
0,1.122774,1.137185,0.876928,0.743954,1.102971,-0.581066,-1.615904,-0.605693,0.930356,-0.268525,-0.958391,,1.993544,0.179845,0.252464,0.273661,-0.857184,SIDM01774
1,-0.654235,-0.637455,1.208447,0.082968,-0.913954,1.805786,0.510026,0.279862,1.231074,-0.268525,-0.958391,,-1.142286,-1.907582,0.252464,0.592958,1.166070,SIDM00192
2,0.796733,0.852343,1.261555,-1.239003,-0.387800,-1.295692,0.510026,-1.595431,-0.225033,-0.268525,-0.958391,,0.425629,0.179845,0.252464,0.592958,-0.857184,SIDM01447
3,0.624324,0.668695,-0.371903,0.413461,0.226047,0.062098,1.041509,0.175679,1.104456,-0.268525,-0.958391,,0.425629,0.875654,0.252464,0.273661,1.166070,SIDM01554
4,1.074977,1.097832,0.207451,-0.412771,-1.615493,1.448473,0.510026,0.019404,-1.491213,-0.268525,0.550803,,0.425629,0.179845,0.252464,0.273661,-0.857184,SIDM01689
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2150,1.687798,1.716239,0.664498,1.404939,0.138355,-1.109889,-1.615904,-1.426134,0.392229,-0.268525,-0.958391,,0.425629,0.179845,0.252464,0.273661,-0.857184,SIDM02155
2151,1.739009,1.770583,1.686415,-0.578017,0.138355,1.119745,-1.615904,-0.540579,1.864164,-0.268525,-0.958391,,-1.142286,0.179845,0.252464,0.273661,-0.857184,SIDM02157
2152,1.727059,1.757466,-1.009193,-0.412771,-1.001646,-1.195644,0.510026,-1.634499,-0.968914,-0.268525,-0.958391,,0.425629,0.179845,0.252464,-3.557894,-0.857184,SIDM02159
2153,1.341273,1.345195,1.189135,-0.082278,0.050662,-0.352385,0.510026,-0.840105,-0.953087,-0.268525,-0.958391,,1.993544,0.179845,0.252464,0.273661,1.166070,SIDM01339


In [27]:
models = models.rename(columns={"model_id":"SANGER_MODEL_ID"})

In [28]:
limited_drug_target = targets[targets["DRUG_ID"] == 1862]
limited_drug_target_filtered = limited_drug_target.loc[:, ["CELL_LINE_NAME", "SANGER_MODEL_ID", "MIN_CONC", "LN_IC50"]]
limited_drug_target_filtered = categorize_columns(limited_drug_target_filtered, ["CELL_LINE_NAME"])
z_scale(limited_drug_target_filtered, list(filter(lambda x: x not in["LN_IC50", "SANGER_MODEL_ID"], limited_drug_target_filtered)))

Unnamed: 0,CELL_LINE_NAME,SANGER_MODEL_ID,MIN_CONC,LN_IC50
174561,0.900416,SIDM01132,0.549643,-2.242366
174562,-1.632898,SIDM00848,0.549643,-1.531149
174563,-1.018328,SIDM00263,-1.817486,-1.779533
174564,-1.011182,SIDM00269,0.549643,-1.506878
174565,-0.982597,SIDM00203,0.549643,-1.253906
...,...,...,...,...
175525,1.289882,SIDM00214,0.549643,-1.756688
175526,1.304175,SIDM00194,0.549643,-0.712017
175527,1.307748,SIDM00193,-1.817486,-0.733869
175528,1.318467,SIDM00498,0.549643,-1.468859


In [29]:
model_drug_information = pd.merge(limited_drug_target_filtered, models, how="left", on="SANGER_MODEL_ID")

## Define Labels

In [30]:
node_labels = model_drug_information.pop("LN_IC50").values

In [31]:
model_feature_columns = list(filter(lambda x: x != "SANGER_MODEL_ID", model_drug_information.columns))
model_features = model_drug_information[model_feature_columns].values

In [32]:
human_genome_index_merged_copy_numbers = human_genome_index_merged_copy_numbers.rename(columns={"model_id": "SANGER_MODEL_ID"})

In [33]:
model_drug_information["model_idx"] = model_drug_information.index

In [34]:
model_gene_merge = pd.merge(human_genome_index_merged_copy_numbers, model_drug_information, on="SANGER_MODEL_ID", how="right")

In [35]:
model_drug_information[model_drug_information["model_idx"] == 0]

Unnamed: 0,CELL_LINE_NAME,SANGER_MODEL_ID,MIN_CONC,sample_id,patient_id,model_name,tissue,cancer_type,cancer_type_ncit_id,tissue_status,...,cancer_type_detail,model_type,growth_properties,species,gender,ethnicity,smoking_status,sample_treatment,crispr_ko_data,model_idx
0,0.900416,SIDM01132,0.549643,-1.120252,-1.117189,0.867272,-1.239003,0.927586,-0.752576,0.510026,...,1.183592,-0.268525,-0.958391,,0.425629,0.875654,0.252464,0.273661,-0.857184,0


In [36]:
model_to_edge_copy_number_edge_index = model_gene_merge.loc[:, ["model_idx", "human_genome_index"]].values

## Edge attributes for model_id -> gene_id (copy number edges)

In [37]:
reduced_model_gene_attributes = human_genome_index_merged_copy_numbers.loc[:, ['total_copy_number',
       'minor_copy_number', 'cn_category', 'seg_mean', 'gene_mean', 'num_targets', 'num_snps', 'gatk_mean_log2_copy_ratio']]

### Categorize

In [38]:
reduced_model_gene_attributes['cn_category'] = reduced_model_gene_attributes['cn_category'].astype("category")
reduced_model_gene_attributes_cat_columns = reduced_model_gene_attributes.select_dtypes(['category']).columns
reduced_model_gene_attributes[reduced_model_gene_attributes_cat_columns] = reduced_model_gene_attributes[reduced_model_gene_attributes_cat_columns].apply(lambda x: x.cat.codes)

In [39]:
z_scale(reduced_model_gene_attributes, reduced_model_gene_attributes.columns)
copy_number_edge_attributes = reduced_model_gene_attributes.fillna(0).values

## Read mutations

In [40]:
mutations = pd.read_csv("mutations_summary_20221018.csv")

In [41]:
mutations = mutations.loc[:, ["gene_symbol", "model_id", "protein_mutation", "rna_mutation", "cdna_mutation", "cancer_driver", "cancer_predisposition_variant", "effect", "vaf"]]

In [42]:
columns_besides_ids = list(filter(lambda x: x not in ["model_id", "gene_symbol"], mutations.columns))
mutations = categorize_columns(mutations, columns_besides_ids)

In [43]:
mutations = mutations.rename(columns={"gene_symbol":"symbol"})

In [61]:
model_drug_information

Unnamed: 0,CELL_LINE_NAME,SANGER_MODEL_ID,MIN_CONC,sample_id,patient_id,model_name,tissue,cancer_type,cancer_type_ncit_id,tissue_status,...,cancer_type_detail,model_type,growth_properties,species,gender,ethnicity,smoking_status,sample_treatment,crispr_ko_data,model_idx
0,0.900416,SIDM01132,0.549643,-1.120252,-1.117189,0.867272,-1.239003,0.927586,-0.752576,0.510026,...,1.183592,-0.268525,-0.958391,,0.425629,0.875654,0.252464,0.273661,-0.857184,0
1,-1.632898,SIDM00848,0.549643,-1.576026,-1.544452,-1.644874,-1.569495,-0.563184,0.233608,0.510026,...,-0.509924,-0.268525,-0.958391,,-1.142286,0.179845,0.252464,0.273661,1.166070,1
2,-1.018328,SIDM00263,-1.817486,-0.480119,-0.455681,-1.137939,-1.569495,-0.563184,0.233608,0.510026,...,-0.509924,-0.268525,1.305400,,0.425629,0.179845,0.252464,0.273661,1.166070,2
3,-1.011182,SIDM00269,0.549643,-0.476705,-0.451933,-1.134720,-1.569495,-0.563184,0.233608,0.510026,...,-0.509924,-0.268525,1.305400,,0.425629,0.179845,0.252464,0.273661,-0.857184,3
4,-0.982597,SIDM00203,0.549643,-0.707153,-0.689926,-1.121846,-1.569495,-0.563184,0.233608,-1.615904,...,-0.509924,-0.268525,1.305400,,-1.142286,0.179845,0.252464,0.273661,-0.857184,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
964,1.289882,SIDM00214,0.549643,-0.659356,-0.643077,1.253508,0.082968,-0.913954,-0.066535,-1.615904,...,-0.921432,-0.268525,-0.958391,,0.425629,-1.907582,0.252464,-3.557894,-0.857184,964
965,1.304175,SIDM00194,0.549643,-0.657649,-0.641203,1.274429,0.082968,-0.913954,1.805786,-1.615904,...,1.231074,-0.268525,-0.958391,,0.425629,-1.907582,0.252464,0.592958,1.166070,965
966,1.307748,SIDM00193,-1.817486,-0.655942,-0.639329,1.292132,0.082968,-0.913954,-0.066535,0.510026,...,-0.921432,-0.268525,-0.958391,,0.425629,-1.907582,0.252464,0.592958,1.166070,966
967,1.318467,SIDM00498,0.549643,-0.090918,-0.073393,1.308225,0.082968,-0.913954,0.619506,0.510026,...,-1.269632,-0.268525,-0.958391,,-1.142286,-1.907582,0.252464,0.592958,1.166070,967


In [63]:
import numpy as np

gene_mutations_merge = pd.merge(exploded_gene_symbols, mutations, on="symbol", how="right")
gene_mutations_merge = gene_mutations_merge.rename(columns={"model_id": "SANGER_MODEL_ID"})
gene_mutations_merge
gene_mutations_model_df = pd.merge(gene_mutations_merge, model_drug_information, on = "SANGER_MODEL_ID", how = "right")
gene_mutations_model_edge_index = gene_mutations_model_df.loc[:, ["human_genome_index", "model_idx"]].values.astype(np.int64)

  gene_mutations_model_edge_index = gene_mutations_model_df.loc[:, ["human_genome_index", "model_idx"]].values.astype(np.int64)


In [77]:
gene_mutations_model_edge_attributes = z_scale(mutations.iloc[: ,2:], mutations.iloc[: ,2:].columns).values

## Gene expression data

In [83]:
rna_seq_df = pd.read_csv("rnaseq_tpm_20220624.csv",index_col=0, header = None)

  rna_seq_df = pd.read_csv("rnaseq_tpm_20220624.csv",index_col=0, header = None)


In [86]:
mutations

Unnamed: 0,symbol,model_id,protein_mutation,rna_mutation,cdna_mutation,cancer_driver,cancer_predisposition_variant,effect,vaf
0,KRAS,SIDM02080,4171,4498,1878,1,0,3,3195
1,TP53,SIDM02080,948,6978,5692,1,0,4,3195
2,SMAD4,SIDM02080,18,1385,7155,1,0,0,3195
3,PREX2,SIDM02080,1255,2399,1539,1,0,3,685
4,APC,SIDM02066,5323,3040,2667,1,0,1,3195
...,...,...,...,...,...,...,...,...,...
12248,VHL,SIDM01335,2020,599,3808,1,0,3,3099
12249,RB1,SIDM01444,6469,2139,1594,1,0,4,3081
12250,TP53,SIDM01444,5936,5869,4075,1,0,3,2134
12251,TP53,SIDM01444,356,7356,6203,1,0,3,1319


In [116]:
rna_seq_df_transposed = rna_seq_df.transpose().iloc[0:, :]
new_header = rna_seq_df_transposed.iloc[0, :] 
rna_seq_df_transposed = rna_seq_df_transposed.iloc[1:, :] 
rna_seq_df_transposed.columns = new_header
rna_seq_df_transposed = rna_seq_df_transposed.iloc[:, [0, *list(range(5, new_header.shape[0]))]]
rna_seq_df_transposed = rna_seq_df_transposed.rename(columns={rna_seq_df_transposed.columns[0]: "SANGER_MODEL_ID"})
rna_seq_df_transposed_stacked = rna_seq_df_transposed.set_index('SANGER_MODEL_ID').stack().reset_index(name='ExpressionValue')

In [136]:
rna_seq_df_transposed_stacked = rna_seq_df_transposed_stacked.rename(columns={1:'symbol'})
gene_expression_merge = pd.merge(exploded_gene_symbols, rna_seq_df_transposed_stacked, on = "symbol")
gene_expression_model = pd.merge(gene_expression_merge, model_drug_information, on="SANGER_MODEL_ID")

In [125]:
gene_expression_model_edge_index = gene_expression_model.loc[:,["human_genome_index", "model_idx"]].values.astype(np.int64)
gene_expression_model_edge_index

array([[    0,   206],
       [    0,   648],
       [    0,   862],
       ...,
       [24677,   290],
       [24677,   286],
       [24677,   959]])

In [137]:
gene_expression_model["ExpressionValue"] = gene_expression_model["ExpressionValue"].values.astype(np.float16)

  gene_expression_model["ExpressionValue"] = gene_expression_model["ExpressionValue"].values.astype(np.float16)


In [138]:
gene_expression_model_edge_attributes = gene_expression_model["ExpressionValue"].values
gene_expression_model_edge_attributes

array([0.24, 0.  , 0.  , ..., 0.  , 0.  , 0.  ], dtype=float16)

In [139]:
(gene_expression_model_edge_attributes == 0).sum()

5043949

In [133]:
pivot_targets = pd.pivot_table(targets, values="LN_IC50", index=["SANGER_MODEL_ID"],
                       columns=['DRUG_ID'])

pd.merge(targets.loc[:, ["SANGER_MODEL_ID",'PUTATIVE_TARGET', 'PATHWAY_NAME', 'MIN_CONC', 'MAX_CONC']], pivot_targets, on=["SANGER_MODEL_ID","SANGER_MODEL_ID"])

Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,...,2175,2177,2359,2360,2361,2362,2438,2439,2498,2499
0,GDSC2,401,18945558,683667,PFSK-1,SIDM01132,MB,1003,Camptothecin,TOP1,...,,2.366388,,,,,,,7.846626,10.613759
1,GDSC2,401,18945796,684052,A673,SIDM00848,UNCLASSIFIED,1003,Camptothecin,TOP1,...,,1.723942,5.240599,4.432934,4.716167,4.285911,7.793623,9.156372,7.568813,7.063871
2,GDSC2,401,18946078,684057,ES5,SIDM00263,UNCLASSIFIED,1003,Camptothecin,TOP1,...,4.879849,1.901766,,,,,,,,
3,GDSC2,401,18946335,684059,ES7,SIDM00269,UNCLASSIFIED,1003,Camptothecin,TOP1,...,,2.133421,4.578782,3.893851,3.137208,2.719145,8.871504,9.302207,6.655762,9.441838
4,GDSC2,401,18946617,684062,EW-11,SIDM00203,UNCLASSIFIED,1003,Camptothecin,TOP1,...,,1.880307,5.137711,4.863771,5.007426,5.389040,10.462131,9.801610,8.228663,10.817398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242031,GDSC2,401,19187490,1659928,SNU-175,SIDM00216,COREAD,2499,N-acetyl cysteine,Metabolism,...,,1.310061,5.423870,5.397164,5.401524,4.973343,9.264933,9.211173,7.445385,10.134495
242032,GDSC2,401,19187943,1660034,SNU-407,SIDM00214,COREAD,2499,N-acetyl cysteine,Metabolism,...,,2.628609,5.042005,5.055076,4.918666,4.122141,11.121460,9.528520,8.344004,8.575555
242033,GDSC2,401,19188201,1660035,SNU-61,SIDM00194,COREAD,2499,N-acetyl cysteine,Metabolism,...,,2.866949,6.130028,5.649833,4.392357,4.382946,10.879932,10.092187,7.743591,10.520666
242034,GDSC2,401,19188741,1674021,SNU-C5,SIDM00498,COREAD,2499,N-acetyl cysteine,Metabolism,...,,2.653533,6.151265,6.022426,5.661197,4.820414,9.652950,10.181355,7.305220,10.701430


In [79]:
~pivot_targets.isnull()

DRUG_ID,1003,1004,1005,1006,1007,1008,1009,1010,1011,1012,...,2175,2177,2359,2360,2361,2362,2438,2439,2498,2499
SANGER_MODEL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SIDM00003,True,True,True,True,True,False,False,True,True,True,...,False,False,False,False,False,False,False,False,False,False
SIDM00023,True,False,False,False,True,True,True,True,True,True,...,True,True,False,False,False,False,False,False,False,False
SIDM00040,True,False,False,False,True,True,True,True,True,True,...,True,True,False,False,False,False,False,False,False,False
SIDM00041,True,False,False,False,True,True,True,True,True,True,...,True,True,False,False,False,False,False,False,False,False
SIDM00042,True,True,True,True,True,True,False,True,True,True,...,False,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SIDM01248,True,True,True,True,True,True,False,True,True,True,...,False,True,True,True,True,True,True,True,True,True
SIDM01251,True,True,True,True,True,False,False,True,True,True,...,False,True,True,True,True,True,True,True,True,True
SIDM01259,True,True,True,True,True,True,False,True,True,True,...,False,True,True,True,True,True,True,True,True,True
SIDM01261,True,False,False,False,True,True,True,True,True,True,...,True,True,False,False,False,False,False,False,False,False


## Questions
- Which features from mutations and genes I listed are irrelevant?

## Problems
	
 duplicate model_id + gene_id
 model_name	model_id	symbol	gene_id	chr_name	chr_start	chr_end	total_copy_number	minor_copy_number	loh	...	seg_mean	gene_mean	num_targets	focal	breakpoints	num_snps	gatk_mean_log2_copy_ratio	comment	source	data_type
0	MEC-1	SIDM00001	ABCB1	SIDG00064	chr7	87503966	87600461	3.0	1.0	False	...	0.599167	0.414125	28.0	False	0.0	1124.0	0.574127	POOR GOF (70.4%)	Sanger	WES
844	MEC-1	SIDM00001	ABCB1	SIDG00064	chr7	87503966	87600461	3.0	1.0	False	...	0.551774	0.692527	

-> pre-filter? highest POF in comment? or take both?