In [37]:
import os
import pandas as pd
import numpy as np

In [38]:
# Lineages for which a per-lineage DataFrame is built; the results are
# collected in lineage_df_dict, keyed by lineage name.
# Note: some source tables hold DepMap IDs in a column that is not named
# "DepMap_ID", so they need cleaning when they are loaded.
lineage_df_dict = {}
lineages = [
    "kidney",
    "breast",
    "blood",
    "liver",
    "urinary_tract",
    "colorectal",
    "ovary",
    "pancreas",
]

In [39]:
# Feature tables to load; each name must appear in the file name of the CSV
# that holds it. The loaded tables are collected in feature_df_dict, keyed by
# feature name, before being combined into the per-lineage DataFrames.
feature_df_dict = {}
features = [
    "effect",
    "dependency",
    "expression",
    "gene_cn",
]

In [40]:
# Sample metadata table; its 'DepMap_ID' and 'lineage' columns are joined onto
# every feature table to label each sample row with its lineage.
sample_info_df = pd.read_csv('data/sample_info.csv')

In [41]:
folder_path = "data/"

# For every CSV in the data folder whose name contains a feature name, label
# each sample row with its lineage and average all value columns per lineage.
# The result (one row per lineage) is stored under the feature's name.
for file_name in os.listdir(folder_path):
    is_csv = file_name.endswith('.csv')
    for feature in features:
        if not (is_csv and feature in file_name):
            continue

        raw_df = pd.read_csv(os.path.join(folder_path, file_name))

        # Keep only the first space-separated token of every header
        # (presumably strips a trailing "(id)" suffix from gene columns -- confirm).
        raw_df.columns = [column.split(' ')[0] for column in raw_df.columns]

        # The first column is treated as the sample id whatever its header is,
        # so that it can be joined against the sample metadata.
        id_column = raw_df.columns[0]
        labelled_df = (
            raw_df
            .rename(columns={id_column: "DepMap_ID"})
            .merge(sample_info_df[['DepMap_ID', 'lineage']], on='DepMap_ID', how='left')
            .drop("DepMap_ID", axis=1)
        )

        feature_df_dict[feature] = labelled_df.pivot_table(index='lineage', aggfunc='mean')

In [42]:
# Build one table per lineage with a 'gene' column plus one column per feature
# (the per-lineage mean of that feature for each gene).
#
# Each table is assembled in a local variable and assigned once, so the cell
# no longer relies on whether lineage_df_dict already contains the lineage.
# Re-running the cell therefore rebuilds the tables instead of merging the
# same feature in twice (which produced suffixed columns such as "effect_x").
for lineage in lineages:
    lineage_table = None
    for feature in features:
        feature_means = feature_df_dict[feature]

        # Single row for this lineage, transposed so that genes become rows
        # and the only column is named after the lineage.
        temp_df = feature_means.loc[feature_means.index == lineage].transpose()
        temp_df.index.name = 'gene'

        if lineage_table is None:
            # First feature: start the table with 'gene' as a regular column.
            lineage_table = pd.DataFrame({feature: temp_df[lineage]}, index=temp_df.index)
            lineage_table.index.name = 'gene'
            lineage_table = lineage_table.reset_index()
        else:
            # Later features: join on gene, then keep only the genes that
            # have a value for every feature merged so far.
            temp_df = temp_df.rename(columns={lineage: feature})
            lineage_table = pd.merge(lineage_table, temp_df, on="gene", how="outer").dropna()

    # With an empty feature list there is nothing to store for this lineage.
    if lineage_table is not None:
        lineage_df_dict[lineage] = lineage_table

In [43]:
# Spot-check the merged per-gene feature table for one lineage.
lineage_df_dict['colorectal']

Unnamed: 0,gene,effect,dependency,expression,gene_cn
0,A1BG,-0.015041,0.043599,0.590738,1.020882
2,A1CF,-0.068805,0.083945,1.262503,1.019519
3,A2M,0.001877,0.035082,0.285868,1.105415
5,A2ML1,0.034582,0.030788,0.309722,1.105017
9,A3GALT2,-0.046617,0.062978,0.009425,0.895263
...,...,...,...,...,...
59260,ZYG11A,-0.020647,0.046334,0.178135,0.892073
59262,ZYG11B,-0.094004,0.087456,2.296709,0.892073
59263,ZYX,-0.015553,0.045229,6.060374,1.218197
59265,ZZEF1,-0.065964,0.070773,3.277261,0.920887


In [46]:
# Maps the prefix of each training-set file
# ("training_sets_genes/<prefix>_training_genes_set.csv") to the key of the
# matching table in lineage_df_dict.
training_lineages = {"kidney" : "kidney",
                     "breast" : "breast",
                     "leukemia" : "blood",
                     "liver" : "liver",
                     "bladder" : "urinary_tract",
                     "colon" : "colorectal",
                     "ovarian" : "ovary",
                     "pancreatic" : "pancreas"}

# Number of chunks the negative genes are split into. Each chunk is combined
# with all positive genes to form one training sample (presumably chosen so
# that every sample is roughly balanced -- confirm against the label counts).
N_NEGATIVE_SPLITS = 10

# NOTE: this cell rewrites lineage_df_dict[...] (training genes removed,
# 'gene' moved to the index), so it must run exactly once after the
# per-lineage tables have been built.
training_dfs = {}
for cancer_type, lineage_name in training_lineages.items():
    # Labelled training genes joined to this lineage's feature columns.
    # Genes without feature data get 0 in every feature column.
    training_df = (
        pd.read_csv(f"training_sets_genes/{cancer_type}_training_genes_set.csv")
        .merge(lineage_df_dict[lineage_name], on="gene", how="left")
        .fillna(0)
    )

    # Drop all training genes from the prediction set. Filtering on the gene
    # name avoids joining on float feature columns and cannot leak training
    # columns (such as 'label') into the prediction table.
    prediction_df = lineage_df_dict[lineage_name]
    is_training_gene = prediction_df["gene"].isin(training_df["gene"])
    lineage_df_dict[lineage_name] = prediction_df.loc[~is_training_gene].set_index("gene")

    training_df = training_df.set_index("gene")
    positives = training_df[training_df["label"] == "positive"]
    negatives = training_df[training_df["label"] == "negative"]

    # Split row positions rather than the DataFrame itself: np.array_split on
    # a DataFrame goes through the deprecated DataFrame.swapaxes and emits a
    # FutureWarning. The resulting chunks are the same contiguous row ranges.
    chunk_positions = np.array_split(np.arange(len(negatives)), N_NEGATIVE_SPLITS)
    training_dfs[cancer_type] = [
        pd.concat([positives, negatives.iloc[positions]])
        for positions in chunk_positions
    ]

  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)


In [45]:
# Inspect the list of training samples built for the leukemia training set.
training_dfs['leukemia']

[            label    effect  dependency  expression   gene_cn
 gene                                                         
 ERG      positive -0.139621    0.175980    2.580228  1.165784
 TCL1A    positive  0.004238    0.036908    2.204333  0.996280
 CCND1    positive -0.200037    0.209095    1.523227  1.027428
 FLT3     positive -0.181786    0.178584    2.408897  1.176579
 HOXA11   positive -0.197790    0.199927    1.024018  0.976899
 ...           ...       ...         ...         ...       ...
 S100A13  negative  0.065586    0.022958    3.877467  1.085398
 THUMPD2  negative -0.072392    0.077413    3.796867  1.000263
 PDS5B    negative  0.000031    0.056013    4.343029  1.013530
 RAB32    negative  0.053501    0.021129    3.034897  1.116097
 AEBP1    negative -0.091084    0.088426    3.931467  0.982171
 
 [456 rows x 5 columns],
            label    effect  dependency  expression   gene_cn
 gene                                                        
 ERG     positive -0.139621   