## Useful libraries

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import torch_geometric

In [2]:
## TODO: https://medium.com/math-simplified/checking-for-linear-dependence-in-sympy-9776b66dbe75

## Utility functions

In [3]:
def categorize_columns(df, columns):
    """Integer-encode categorical columns of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place (callers in this notebook rely on that).
    columns : iterable
        Column labels that are first cast to the ``category`` dtype.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with every category-typed column replaced by its
        category codes (missing values become ``-1``).
    """
    for name in columns:
        df[name] = df[name].astype("category")
    ## every category-typed column is encoded, including ones that were categorical before the call
    for name in df.select_dtypes(include=["category"]).columns:
        df[name] = df[name].cat.codes
    return df

In [4]:
def z_scale(df_z_scaled, columns): ## is a bit better for neural networks
    """Standardise the given columns of ``df_z_scaled`` in place (zero mean, unit std).

    Parameters
    ----------
    df_z_scaled : pandas.DataFrame
        Frame to scale. It is modified in place (callers in this notebook rely on that).
    columns : iterable
        Labels of the numeric columns to standardise.

    Returns
    -------
    pandas.DataFrame
        The same ``df_z_scaled`` object.

    Notes
    -----
    A column whose standard deviation is zero or undefined (constant column or a
    single row) is only centred instead of being divided by the std; dividing
    would fill the column with NaN/inf and poison every downstream tensor.
    """
    for column in columns:
        values = df_z_scaled[column]
        centered = values - values.mean()
        spread = values.std()
        if pd.isna(spread) or spread == 0:
            ## degenerate column: centring already yields all zeros, skip the division
            df_z_scaled[column] = centered
        else:
            df_z_scaled[column] = centered / spread
    return df_z_scaled

In [5]:
from pandas.api.types import is_numeric_dtype
def remove_uninformative_columns(df, columns):
    """Return the labels from ``columns`` that carry usable information.

    A column is dropped when all of its values are identical, or when it is
    non-numeric and every value is different (e.g. identifiers). Numeric columns
    with at least two distinct values are always kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame the columns are looked up in; it is not modified.
    columns : iterable
        Candidate column labels.

    Returns
    -------
    list
        The retained labels, in the order they were given.
    """
    def _is_informative(column):
        ## fewer than two distinct values: the column is constant (or empty)
        if len(df[column].unique()) < 2:
            return False
        has_repeated_values = bool(df.duplicated(subset=column).any())
        return has_repeated_values or is_numeric_dtype(df[column])

    return [column for column in columns if _is_informative(column)]

## What we need
- Gene node feature #genes x #features
- Model node feature #models x #features (for now includes drug information and the drug IC50 target for simplicity) (more complicated than I expected)
  -> later edge attribute prediction?
- Gene_model_edge_index_cn #edges x 2
- Gene_model_edge_index_mutations #edges x 2
- Gene_model_edge_index_geneexpressions #edges x 2
- Gene_model_edge_attributes_cn #edges x #attributes
- Gene_model_edge_attributes_mutations #edges x #attributes
- Gene_model_edge_attributes_geneexpressions #edges x #attributes

  Het-GAT for regression task with MSE as loss

## Read human genome

In [6]:
## Load the genome table; the file has no header row, so columns are labelled 0..3.
human_genome = pd.read_csv("human_genome.tsv", sep="\t", header=None) ## merging via gene symbol
## Keep the row position as an explicit column so it can serve as the gene node index after joins.
human_genome["human_genome_index"] = human_genome.index
human_genome.head()

Unnamed: 0,0,1,2,3,human_genome_index
0,hsa:102466751,miRNA,1:complement(17369..17436),"MIR6859-1, hsa-mir-6859-1; microRNA 6859-1",0
1,hsa:100302278,miRNA,1:30366..30503,"MIR1302-2, MIRN1302-2, hsa-mir-1302-2; microRN...",1
2,hsa:79501,CDS,1:65419..71585,OR4F5; olfactory receptor family 4 subfamily F...,2
3,hsa:102465909,miRNA,1:complement(187891..187958),"MIR6859-2, hsa-mir-6859-2; microRNA 6859-2",3
4,hsa:112268260,CDS,1:complement(365134..382235),uncharacterized LOC112268260,4


## Get gene symbol to human genome index dataframe (the symbols are a comma-separated list placed before the first semicolon of the description column)

In [7]:
def get_gene_idx_to_gene_symbol(human_genome):
    """Build a lookup table from gene symbol to row index of ``human_genome``.

    Column ``3`` is read as ``"SYM1, SYM2; description"``: the part before the
    first ``"; "`` is split on commas and every symbol gets its own row.

    Parameters
    ----------
    human_genome : pandas.DataFrame
        Genome table whose column ``3`` holds the symbol/description strings.

    Returns
    -------
    pandas.DataFrame
        Columns ``symbol`` (whitespace-stripped) and ``human_genome_index`` (the
        index label of the originating row), with a fresh 0..n-1 index.
    """
    symbol_part = human_genome[3].str.split("; ").map(lambda parts: parts[0])
    ## one row per alias; explode keeps the originating row label in the index
    symbols = symbol_part.str.split(",").explode().str.strip()
    lookup = symbols.rename("symbol").rename_axis("human_genome_index").reset_index()
    return lookup[["symbol", "human_genome_index"]]

In [8]:
## Symbol -> human_genome_index lookup, used below to attach gene node indices to the per-model tables.
gene_idx_to_gene_symbol_df = get_gene_idx_to_gene_symbol(human_genome)

## Generate node features for gene nodes

In [9]:
def get_gene_features(human_genome):
    """Build the gene node feature matrix from the first three genome columns.

    The columns are integer-encoded as categories and then z-scaled.

    Parameters
    ----------
    human_genome : pandas.DataFrame
        Genome table; only its first three columns are used. The frame is left
        untouched: the encoding runs on a copy, so the raw values stay available
        and re-running the cell gives the same result.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(human_genome), 3)``.
    """
    feature_columns = human_genome.columns[:3]
    ## copy first: categorize_columns and z_scale both modify their input in place
    gene_feature_frame = human_genome.loc[:, feature_columns].copy()
    gene_feature_frame = categorize_columns(gene_feature_frame, feature_columns)
    gene_feature_frame = z_scale(gene_feature_frame, feature_columns)
    return gene_feature_frame.values

In [10]:
## Feature matrix for the gene nodes; row i belongs to human_genome_index i.
gene_features = get_gene_features(human_genome)

In [11]:
## (#genes, #features)
gene_features.shape

(24689, 3)

## Generate node features for models (with target) and target labels

In [12]:
## Drug response table (semicolon separated) and the model (cell line) annotation list.
targets = pd.read_csv("target.csv",sep=";")
models = pd.read_csv("model_list_20240110.csv")
models.head()

Unnamed: 0,model_id,sample_id,patient_id,parent_id,model_name,synonyms,tissue,cancer_type,cancer_type_ncit_id,tissue_status,...,msh6_expression_by_ihc,braf_mutation_identified,braf_expression_by_ihc,pik3ca_mutation_identified,pten_expression_by_ihc,pten_mutation_identified,kras_mutation_identified,mismatch_repair_status,preoperative_ce_alevel,crispr_ko_data
0,SIDM01774,SIDS01659,SIDP01578,,PK-59,,Pancreas,Pancreatic Carcinoma,C3850,Metastasis,...,,,,,,,,,,False
1,SIDM00192,SIDS00612,SIDP00541,,SNU-1033,,Large Intestine,Colorectal Carcinoma,C9383,Tumour,...,,,,,,,,,,True
2,SIDM01447,SIDS01466,SIDP01347,,SNU-466,,Central Nervous System,Glioblastoma,C3058,Tumour,...,,,,,,,,,,False
3,SIDM01554,SIDS01363,SIDP01247,,IST-MES-2,,Lung,Mesothelioma,C45662,Unknown,...,,,,,,,,,,True
4,SIDM01689,SIDS01631,SIDP01557,,MUTZ-5,,Haematopoietic and Lymphoid,B-Lymphoblastic Leukemia,C8644,Tumour,...,,,,,,,,,,False


In [13]:
def get_transformed_merged_drug_model_df(models, targets, drug_id=1862, max_missing_fraction=0.1):
    """Join encoded model annotations with the response rows of a single drug.

    Parameters
    ----------
    models : pandas.DataFrame
        Model annotation table; must contain ``model_id``. It is not modified.
    targets : pandas.DataFrame
        Drug response table with at least ``DRUG_ID``, ``CELL_LINE_NAME``,
        ``SANGER_MODEL_ID``, ``MIN_CONC`` and ``LN_IC50``. It is not modified.
    drug_id : int, optional
        ``DRUG_ID`` whose response rows are kept (default 1862).
    max_missing_fraction : float, optional
        Model columns with a larger share of missing values are dropped (default 0.1).

    Returns
    -------
    pandas.DataFrame
        One row per (drug response, model) match with z-scaled, integer-encoded
        features, the raw ``LN_IC50`` target, ``SANGER_MODEL_ID`` and a
        ``model_idx`` column holding the row position (the model node index).
    """
    missing_fraction = models.isnull().mean(axis=0)
    columns_with_most_val_filled = models.columns[missing_fraction <= max_missing_fraction]
    filtered_columns = remove_uninformative_columns(models, columns_with_most_val_filled) ##all categorical
    if "model_id" not in filtered_columns:
        filtered_columns.append("model_id") ## required for later join
    print(filtered_columns)
    ## "Unknown" (previously misspelled) so missing values share the category of explicit "Unknown" entries
    models = models[filtered_columns].fillna("Unknown")
    columns_besides_id = [column for column in filtered_columns if column != "model_id"]
    models = categorize_columns(models, columns_besides_id)
    models = z_scale(models, columns_besides_id)
    models = models.rename(columns={"model_id":"SANGER_MODEL_ID"}) ##rename for join

    ## copy the selection: the encoding below works in place and must not touch `targets`
    limited_drug_target_filtered = targets.loc[
        targets["DRUG_ID"] == drug_id, ##filter specific drug
        ["CELL_LINE_NAME", "SANGER_MODEL_ID", "MIN_CONC", "LN_IC50"],
    ].copy()
    limited_drug_target_filtered = categorize_columns(limited_drug_target_filtered, ["CELL_LINE_NAME"])
    ## the regression target and the join key stay unscaled
    drug_feature_columns = [
        column for column in limited_drug_target_filtered.columns
        if column not in ("LN_IC50", "SANGER_MODEL_ID")
    ]
    limited_drug_target_filtered = z_scale(limited_drug_target_filtered, drug_feature_columns)

    model_drug_information = pd.merge(limited_drug_target_filtered, models, how="inner", on="SANGER_MODEL_ID") ## merge drug and model information
    model_drug_information["model_idx"] = model_drug_information.index ##store index for later joins to retrieve edge index
    return model_drug_information

In [14]:
## Model node table: encoded model annotations joined with the drug response rows; also prints the kept model columns.
merged_drug_model_df = get_transformed_merged_drug_model_df(models,  targets)

['sample_id', 'patient_id', 'model_name', 'tissue', 'cancer_type', 'cancer_type_ncit_id', 'tissue_status', 'sample_site', 'cancer_type_detail', 'model_type', 'growth_properties', 'gender', 'ethnicity', 'smoking_status', 'sample_treatment', 'crispr_ko_data', 'model_id']


In [15]:
def get_target_label_and_model_with_target_feature(model_drug_information):
    """Split the merged model/drug table into regression labels and node features.

    Parameters
    ----------
    model_drug_information : pandas.DataFrame
        Table containing ``LN_IC50`` plus feature columns; ``model_idx`` and
        ``SANGER_MODEL_ID`` are treated as bookkeeping columns. The frame is not
        modified, so the call can be repeated safely (e.g. when re-running the cell).

    Returns
    -------
    tuple of numpy.ndarray
        ``(target_labels, model_and_target_features)``: the ``LN_IC50`` values and
        the matrix of all remaining non-bookkeeping columns.
    """
    non_feature_columns = ("LN_IC50", "model_idx", "SANGER_MODEL_ID")
    ## read instead of pop: removing the column from the caller's frame broke a second call
    target_labels = model_drug_information["LN_IC50"].to_numpy(copy=True)
    model_feature_columns = [
        column for column in model_drug_information.columns
        if column not in non_feature_columns
    ]
    model_and_target_features = model_drug_information[model_feature_columns].values
    return (target_labels, model_and_target_features)

In [16]:
## Regression labels (LN_IC50) and the feature matrix of the model nodes.
target_labels, model_and_target_features = get_target_label_and_model_with_target_feature(merged_drug_model_df)

In [17]:
## (#models,) and (#models, #features)
target_labels.shape, model_and_target_features.shape

((969,), (969, 18))

## Generate edge indices and edge attributes

### Read copy numbers

In [18]:
## Gene-level copy number calls; each row links a model (model_id) to a gene (symbol).
copy_numbers = pd.read_csv("WES_pureCN_CNV_genes_20220623.csv")
copy_numbers.head()

Unnamed: 0,model_name,model_id,symbol,gene_id,chr_name,chr_start,chr_end,total_copy_number,minor_copy_number,loh,...,seg_mean,gene_mean,num_targets,focal,breakpoints,num_snps,gatk_mean_log2_copy_ratio,comment,source,data_type
0,MEC-1,SIDM00001,ABCB1,SIDG00064,chr7,87503966,87600461,3.0,1.0,False,...,0.599167,0.414125,28.0,False,0.0,1124.0,0.574127,POOR GOF (70.4%),Sanger,WES
1,MEC-1,SIDM00001,ABL1,SIDG00150,chr9,130714099,130885948,3.0,1.0,False,...,0.599167,0.472693,12.0,False,0.0,977.0,0.585569,POOR GOF (70.4%),Sanger,WES
2,MEC-1,SIDM00001,BRD3,SIDG02504,chr9,134033313,134053689,3.0,1.0,False,...,0.599167,0.705704,11.0,False,0.0,977.0,0.585569,POOR GOF (70.4%),Sanger,WES
3,MEC-1,SIDM00001,CARD11,SIDG03455,chr7,2906361,2958783,3.0,1.0,False,...,0.599167,0.696245,23.0,False,0.0,1124.0,0.574127,POOR GOF (70.4%),Sanger,WES
4,MEC-1,SIDM00001,CDK6,SIDG04156,chr7,92614869,92833599,3.0,1.0,False,...,0.599167,0.75072,7.0,False,0.0,1124.0,0.574127,POOR GOF (70.4%),Sanger,WES


### Get edge index (model-[CopyNumbers]->gene)

In [19]:
def get_gene_and_model_merged_df(gene_idx_to_gene_symbol_df, merged_drug_model_df, df):
    """Attach gene and model node information to a per-(model, gene) table.

    Parameters
    ----------
    gene_idx_to_gene_symbol_df : pandas.DataFrame
        Lookup table joined on ``symbol``.
    merged_drug_model_df : pandas.DataFrame
        Model table joined on ``SANGER_MODEL_ID``.
    df : pandas.DataFrame
        Measurement table with a ``symbol`` column and either a ``model_id`` or a
        ``SANGER_MODEL_ID`` column.

    Returns
    -------
    pandas.DataFrame
        Inner join of the three tables: only rows whose gene symbol and model id
        are both known survive.
    """
    return (
        gene_idx_to_gene_symbol_df
        .merge(df, on="symbol", how="inner")
        .rename(columns={"model_id": "SANGER_MODEL_ID"}) ##rename for join
        .merge(merged_drug_model_df, on="SANGER_MODEL_ID", how="inner")
    )

In [20]:
## One row per copy-number record whose gene symbol and model id are both known.
model_gene_merged_copy_number_df = get_gene_and_model_merged_df(gene_idx_to_gene_symbol_df, merged_drug_model_df, copy_numbers)
## Each row is a (model_idx, human_genome_index) pair, i.e. the array is (#edges, 2).
## NOTE(review): torch_geometric expects edge_index as (2, #edges) — transpose before handing it to the graph.
model_to_gene_copy_number_edge_index = model_gene_merged_copy_number_df.loc[:, ["model_idx", "human_genome_index"]].values.astype(np.int64)
model_to_gene_copy_number_edge_index.shape

(887292, 2)

### Get edge attributes (model-[CopyNumbers]->gene)

In [21]:
## Copy-number columns used as edge attributes; cn_category is the only non-numeric one.
copy_number_attribute_columns = [
    'total_copy_number', 'minor_copy_number', 'cn_category', 'seg_mean',
    'gene_mean', 'num_targets', 'num_snps', 'gatk_mean_log2_copy_ratio',
]
reduced_model_gene_attributes = (
    model_gene_merged_copy_number_df.loc[:, copy_number_attribute_columns]
    .pipe(categorize_columns, ["cn_category"])
    .fillna(0)  ## missing numeric values are replaced by 0 before scaling
)
reduced_model_gene_attributes = z_scale(reduced_model_gene_attributes, reduced_model_gene_attributes.columns)
copy_number_edge_attributes = reduced_model_gene_attributes.values
copy_number_edge_attributes.shape

(887292, 8)

### Read mutations

In [22]:
## Mutation summary; gene_symbol is renamed to symbol so the table can be joined like the other tables.
mutations = pd.read_csv("mutations_summary_20221018.csv")
mutations = mutations.rename(columns={"gene_symbol": "symbol"})
mutations.head()

Unnamed: 0,gene_id,symbol,model_id,protein_mutation,rna_mutation,cdna_mutation,cancer_driver,cancer_predisposition_variant,effect,vaf,coding,source,model_name
0,SIDG13960,KRAS,SIDM02080,p.Q61H,r.373a>c,c.183A>C,True,False,missense,1.0,True,Sanger,HCM-SANG-1308-C25
1,SIDG38363,TP53,SIDM02080,p.E204*,r.800g>u,c.610G>T,True,False,nonsense,1.0,True,Sanger,HCM-SANG-1308-C25
2,SIDG35070,SMAD4,SIDM02080,p.?,r.1494-2a>g,c.956-2A>G,True,False,ess_splice,1.0,True,Sanger,HCM-SANG-1308-C25
3,SIDG25750,PREX2,SIDM02080,p.E553K,r.2006g>a,c.1657G>A,True,False,missense,0.2911,True,Sanger,HCM-SANG-1308-C25
4,SIDG01214,APC,SIDM02066,p.R790fs*8,r.2425_2426delAG,c.2366_2367delAG,True,False,frameshift,1.0,True,Sanger,HCM-SANG-1313-C18


### Get edge index (model-[Mutations]->gene)

In [23]:
## One row per mutation record whose gene symbol and model id are both known.
model_gene_merged_mutations_df = get_gene_and_model_merged_df(gene_idx_to_gene_symbol_df, merged_drug_model_df, mutations)
## Each row is a (model_idx, human_genome_index) pair, i.e. the array is (#edges, 2).
## NOTE(review): torch_geometric expects edge_index as (2, #edges) — transpose before handing it to the graph.
model_to_gene_mutations_edge_index = model_gene_merged_mutations_df.loc[:, ["model_idx", "human_genome_index"]].values.astype(np.int64)
model_to_gene_mutations_edge_index.shape

(9183, 2)

In [24]:
## Sanity check: count missing values per mutation attribute column before they are encoded.
model_gene_merged_mutations_df.loc[:, ["protein_mutation", "rna_mutation", "cdna_mutation", "cancer_driver", "cancer_predisposition_variant", "effect", "vaf"]].isnull().sum()

protein_mutation                 0
rna_mutation                     0
cdna_mutation                    0
cancer_driver                    0
cancer_predisposition_variant    0
effect                           0
vaf                              0
dtype: int64

### Get edge attributes (model-[Mutations]->gene)

In [25]:
## Every mutation attribute except the numeric vaf is integer-encoded before scaling.
mutation_categorical_columns = ["protein_mutation", "rna_mutation", "cdna_mutation", "cancer_driver", "cancer_predisposition_variant", "effect"]
mutation_attribute_frame = model_gene_merged_mutations_df.loc[:, mutation_categorical_columns + ["vaf"]]
mutation_attribute_frame = categorize_columns(mutation_attribute_frame, mutation_categorical_columns)
mutations_edge_attributes = z_scale(mutation_attribute_frame, mutation_attribute_frame.columns).values
mutations_edge_attributes.shape

(9183, 7)

### Read Gene expression data

In [26]:
## Read without a header row: the first lines of the file are metadata rows (model_id, model_name, ...),
## so they are kept as data and untangled in the reshaping step.
rna_seq_df = pd.read_csv("rnaseq_tpm_20220624.csv",index_col=0, header = None,low_memory=False)
rna_seq_df.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,1423,1424,1425,1426,1427,1428,1429,1430,1431,1432
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
model_id,,SIDM00001,SIDM00002,SIDM00003,SIDM00005,SIDM00006,SIDM00007,SIDM00008,SIDM00009,SIDM00011,...,SIDM02071,SIDM02072,SIDM02073,SIDM02074,SIDM02075,SIDM02076,SIDM02077,SIDM02078,SIDM02079,SIDM02080
model_name,,MEC-1,NBsusSR,M14,MDA-MB-134-VI,MCC26,MCC13,MCAS,MC-1010,Malme-3M,...,HCM-SANG-1325-C15,HCM-SANG-1331-C18,HCM-SANG-1337-C18,HCM-SANG-1326-C15,HCM-SANG-1300-C18,HCM-SANG-1322-C15,HCM-SANG-1332-C18,HCM-SANG-1095-C25,HCM-SANG-1336-C15,HCM-SANG-1308-C25
dataset_name,,Sanger & Broad Cell Lines RNASeq,Sanger & Broad Cell Lines RNASeq,Sanger & Broad Cell Lines RNASeq,Sanger & Broad Cell Lines RNASeq,Sanger & Broad Cell Lines RNASeq,Sanger & Broad Cell Lines RNASeq,Sanger & Broad Cell Lines RNASeq,Sanger & Broad Cell Lines RNASeq,Sanger & Broad Cell Lines RNASeq,...,Sanger Organoid RNASeq,Sanger Organoid RNASeq,Sanger Organoid RNASeq,Sanger Organoid RNASeq,Sanger Organoid RNASeq,Sanger Organoid RNASeq,Sanger Organoid RNASeq,Sanger Organoid RNASeq,Sanger Organoid RNASeq,Sanger Organoid RNASeq
data_source,,Broad,Sanger,Sanger,Broad,Sanger,Sanger,Broad,Sanger,Broad,...,Sanger,Sanger,Sanger,Sanger,Sanger,Sanger,Sanger,Sanger,Sanger,Sanger
gene_id,symbol,,,,,,,,,,...,,,,,,,,,,


In [27]:
def get_transformed_df_in_model_gene_value_format(rna_seq_df):
    """Reshape the wide RNA-seq table into one row per (model, gene) pair.

    Parameters
    ----------
    rna_seq_df : pandas.DataFrame
        Wide table whose rows become columns after the transpose.
        NOTE(review): presumably loaded with ``index_col=0, header=None`` so that the
        first five rows are metadata (model_id, model_name, dataset_name, data_source,
        gene_id/symbol header) and the first data column holds the gene symbols —
        confirm against the input file.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with ``SANGER_MODEL_ID`` and ``ExpressionValue`` columns plus the
        stacked gene level, which is renamed from ``1`` to ``symbol``.
        NOTE(review): the rename only takes effect if the stacked level is actually named 1
        (it inherits the label of the row used as header) — TODO confirm.
    """
    ## after the transpose each row is one column of the input; .iloc[0:, :] keeps every row
    rna_seq_df_transposed = rna_seq_df.transpose().iloc[0:, :]
    ## the first transposed row supplies the new column labels
    new_header = rna_seq_df_transposed.iloc[0, :] 
    rna_seq_df_transposed = rna_seq_df_transposed.iloc[1:, :] 
    rna_seq_df_transposed.columns = new_header
    ## keep column 0 (renamed to SANGER_MODEL_ID below) and everything from position 5 on; positions 1-4 are dropped
    rna_seq_df_transposed = rna_seq_df_transposed.iloc[:, [0, *list(range(5, new_header.shape[0]))]]
    rna_seq_df_transposed = rna_seq_df_transposed.rename(columns={rna_seq_df_transposed.columns[0]: "SANGER_MODEL_ID"})
    ## wide -> long: one row per (SANGER_MODEL_ID, column label) with the cell value as ExpressionValue
    rna_seq_df_transposed_stacked = rna_seq_df_transposed.set_index('SANGER_MODEL_ID').stack().reset_index(name='ExpressionValue')
    rna_seq_df_transposed_stacked = rna_seq_df_transposed_stacked.rename(columns={1:'symbol'})
    return rna_seq_df_transposed_stacked

In [28]:
## Long-format expression table: one row per (model, gene symbol) with its expression value.
transformed_seq_data = get_transformed_df_in_model_gene_value_format(rna_seq_df)

### Get edge index (model-[Expression]->gene)

In [29]:
## One row per expression value whose gene symbol and model id are both known.
model_gene_merged_exprssion_df = get_gene_and_model_merged_df(gene_idx_to_gene_symbol_df, merged_drug_model_df, transformed_seq_data)
## Source column first: the edge type is model -[expression]-> gene, so rows must be
## (model_idx, human_genome_index) like the copy-number and mutation edge indices.
## The previous (gene, model) order made model indices be read as gene indices and vice versa.
gene_expression_model_edge_index = model_gene_merged_exprssion_df.loc[:,["model_idx", "human_genome_index"]].values.astype(np.int64)
gene_expression_model_edge_index.shape

(22177890, 2)

### Get edge attributes (model-[Expression]->gene)

In [30]:
import torch
## Use the GPU when one is present, otherwise fall back to the CPU so the cell still runs
## on machines without CUDA (the result is identical, only slower).
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
gene_expression_torch = torch.from_numpy(model_gene_merged_exprssion_df.loc[:, ["ExpressionValue"]].astype(float).values).to(device)
## z-scale over all expression edges at once
gene_expression_model_edge_attributes_torch = (gene_expression_torch - gene_expression_torch.mean()) / gene_expression_torch.std()
gene_expression_model_edge_attributes = gene_expression_model_edge_attributes_torch.cpu().numpy()
gene_expression_model_edge_attributes.shape

(22177890, 1)

## Create graph

In [31]:
from torch_geometric.data import HeteroData

data = HeteroData()
## Node features: the numpy arrays are float64, torch layers default to float32.
data['model'].x = torch.from_numpy(model_and_target_features).float()
data['gene'].x = torch.from_numpy(gene_features).float()

## torch_geometric expects edge_index with shape (2, #edges): row 0 = source (model) indices,
## row 1 = destination (gene) indices. The arrays built above are (#edges, 2), hence the transpose.
## NOTE(review): each array must list model_idx in its first column to match the
## ('model', ..., 'gene') edge types — confirm the column order of the expression edge index.
data['model', 'copy_number', 'gene'].edge_index = torch.from_numpy(model_to_gene_copy_number_edge_index).t().contiguous()
data['model', 'mutation', 'gene'].edge_index = torch.from_numpy(model_to_gene_mutations_edge_index).t().contiguous()
data['model', 'expression', 'gene'].edge_index = torch.from_numpy(gene_expression_model_edge_index).t().contiguous()

## Edge attributes stay (#edges, #attributes); only the dtype is aligned with the node features.
data['model', 'copy_number', 'gene'].edge_attr = torch.from_numpy(copy_number_edge_attributes).float()
data['model', 'mutation', 'gene'].edge_attr = torch.from_numpy(mutations_edge_attributes).float()
data['model', 'expression', 'gene'].edge_attr = torch.from_numpy(gene_expression_model_edge_attributes).float()

## Old code

In [133]:
## Legacy exploration (not used by the graph construction): LN_IC50 as a model x drug matrix.
pivot_targets = pd.pivot_table(targets, values="LN_IC50", index=["SANGER_MODEL_ID"],
                       columns=['DRUG_ID'])

## Attach the per-drug LN_IC50 columns to selected target metadata.
## NOTE(review): the join key is listed twice in `on`; a single "SANGER_MODEL_ID" is presumably what was meant.
pd.merge(targets.loc[:, ["SANGER_MODEL_ID",'PUTATIVE_TARGET', 'PATHWAY_NAME', 'MIN_CONC', 'MAX_CONC']], pivot_targets, on=["SANGER_MODEL_ID","SANGER_MODEL_ID"])

Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,...,2175,2177,2359,2360,2361,2362,2438,2439,2498,2499
0,GDSC2,401,18945558,683667,PFSK-1,SIDM01132,MB,1003,Camptothecin,TOP1,...,,2.366388,,,,,,,7.846626,10.613759
1,GDSC2,401,18945796,684052,A673,SIDM00848,UNCLASSIFIED,1003,Camptothecin,TOP1,...,,1.723942,5.240599,4.432934,4.716167,4.285911,7.793623,9.156372,7.568813,7.063871
2,GDSC2,401,18946078,684057,ES5,SIDM00263,UNCLASSIFIED,1003,Camptothecin,TOP1,...,4.879849,1.901766,,,,,,,,
3,GDSC2,401,18946335,684059,ES7,SIDM00269,UNCLASSIFIED,1003,Camptothecin,TOP1,...,,2.133421,4.578782,3.893851,3.137208,2.719145,8.871504,9.302207,6.655762,9.441838
4,GDSC2,401,18946617,684062,EW-11,SIDM00203,UNCLASSIFIED,1003,Camptothecin,TOP1,...,,1.880307,5.137711,4.863771,5.007426,5.389040,10.462131,9.801610,8.228663,10.817398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242031,GDSC2,401,19187490,1659928,SNU-175,SIDM00216,COREAD,2499,N-acetyl cysteine,Metabolism,...,,1.310061,5.423870,5.397164,5.401524,4.973343,9.264933,9.211173,7.445385,10.134495
242032,GDSC2,401,19187943,1660034,SNU-407,SIDM00214,COREAD,2499,N-acetyl cysteine,Metabolism,...,,2.628609,5.042005,5.055076,4.918666,4.122141,11.121460,9.528520,8.344004,8.575555
242033,GDSC2,401,19188201,1660035,SNU-61,SIDM00194,COREAD,2499,N-acetyl cysteine,Metabolism,...,,2.866949,6.130028,5.649833,4.392357,4.382946,10.879932,10.092187,7.743591,10.520666
242034,GDSC2,401,19188741,1674021,SNU-C5,SIDM00498,COREAD,2499,N-acetyl cysteine,Metabolism,...,,2.653533,6.151265,6.022426,5.661197,4.820414,9.652950,10.181355,7.305220,10.701430


In [79]:
## Boolean mask: True where a model has an LN_IC50 measurement for the drug.
~pivot_targets.isnull()

DRUG_ID,1003,1004,1005,1006,1007,1008,1009,1010,1011,1012,...,2175,2177,2359,2360,2361,2362,2438,2439,2498,2499
SANGER_MODEL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SIDM00003,True,True,True,True,True,False,False,True,True,True,...,False,False,False,False,False,False,False,False,False,False
SIDM00023,True,False,False,False,True,True,True,True,True,True,...,True,True,False,False,False,False,False,False,False,False
SIDM00040,True,False,False,False,True,True,True,True,True,True,...,True,True,False,False,False,False,False,False,False,False
SIDM00041,True,False,False,False,True,True,True,True,True,True,...,True,True,False,False,False,False,False,False,False,False
SIDM00042,True,True,True,True,True,True,False,True,True,True,...,False,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SIDM01248,True,True,True,True,True,True,False,True,True,True,...,False,True,True,True,True,True,True,True,True,True
SIDM01251,True,True,True,True,True,False,False,True,True,True,...,False,True,True,True,True,True,True,True,True,True
SIDM01259,True,True,True,True,True,True,False,True,True,True,...,False,True,True,True,True,True,True,True,True,True
SIDM01261,True,False,False,False,True,True,True,True,True,True,...,True,True,False,False,False,False,False,False,False,False


## Questions
- Which features from mutations and genes I listed are irrelevant?

## Problems
	
 duplicate model_id + gene_id
 model_name	model_id	symbol	gene_id	chr_name	chr_start	chr_end	total_copy_number	minor_copy_number	loh	...	seg_mean	gene_mean	num_targets	focal	breakpoints	num_snps	gatk_mean_log2_copy_ratio	comment	source	data_type
0	MEC-1	SIDM00001	ABCB1	SIDG00064	chr7	87503966	87600461	3.0	1.0	False	...	0.599167	0.414125	28.0	False	0.0	1124.0	0.574127	POOR GOF (70.4%)	Sanger	WES
844	MEC-1	SIDM00001	ABCB1	SIDG00064	chr7	87503966	87600461	3.0	1.0	False	...	0.551774	0.692527	

-> pre-filter? keep only the row with the highest GOF (goodness of fit, see the `comment` column)? or take both?