# Data Filtration Criteria

#### Response Data:
- Filtered data must form a plateaus shape (S-shape), this allows a better measure of IC50, the dataset also assumes a sigmoidal shape, so it is important to filter those out.

#### Cell Line Data:
- Remove redundant features

In [209]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
_FOLDER = "../data/"

### Response Data

In [210]:
def FilterResponsesLEQOne(drug_curves, resp_labels):
    drug_curves = drug_curves.copy()
    moreThan1 = []
    for lbl in resp_labels:
        if sum(drug_curves[lbl]>1)>0:
            moreThan1.extend(drug_curves[drug_curves[lbl]>1].index)
    lessThan1 = set(drug_curves.index) - set(moreThan1)
    drug_curves = drug_curves.loc[lessThan1, :].copy()
    
    return drug_curves

def FilterResponsesPlateau(drug_curves, resp_labels, tol=0.05):
    drug_curves = drug_curves.copy()
    drug_curves["dif_first"]=abs(drug_curves[resp_labels[0]] - drug_curves[resp_labels[1]])
    drug_curves["dif_last"]=abs(drug_curves[resp_labels[-1]] - drug_curves[resp_labels[-2]])
    drug_curves = drug_curves[(drug_curves["dif_first"]<= tol) & (drug_curves["dif_last"]<= tol)]
    drug_curves.drop(['dif_first', 'dif_last'], axis=1)
    drug_curves = drug_curves.drop(columns=['dif_first', 'dif_last'])

    
    return drug_curves

def FilterPlateauLocation(drug_curves, resp_labels, firstLowerLim=0.8, lastUpperLim=0.2):
    drug_curves = drug_curves[(drug_curves[resp_labels[1]] > firstLowerLim) & (drug_curves[resp_labels[-1]] < lastUpperLim)]
    return drug_curves



In [211]:

drug_curves = pd.read_csv(_FOLDER+"normalised_dose_response_data.csv")
conc_labels = ["fd_num_"+str(i) for i in range(10)]
resp_labels = ['norm_cells_'+str(i) for i in range(10)]

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [212]:
filteredLessThan1 = FilterResponsesLEQOne(drug_curves, resp_labels)
filteredLessThan1.shape

(63325, 44)

In [213]:
filterPlateau = FilterResponsesPlateau(drug_curves, resp_labels)
filterPlateau.shape

(32974, 44)

In [214]:
filterPlateau = FilterPlateauLocation(filterPlateau, resp_labels)
filterPlateau.shape

(6111, 44)

In [222]:
drop_col = ["slope_"+str(i) for i in range(9)] +  ["per_slope_change_"+str(i) for i in range(8)] + ['Unnamed: 0','FOLD_DILUTION']
filterPlateau = filterPlateau.drop(columns=drop_col)

KeyError: "['slope_0' 'slope_1' 'slope_2' 'slope_3' 'slope_4' 'slope_5' 'slope_6'\n 'slope_7' 'slope_8' 'per_slope_change_0' 'per_slope_change_1'\n 'per_slope_change_2' 'per_slope_change_3' 'per_slope_change_4'\n 'per_slope_change_5' 'per_slope_change_6' 'per_slope_change_7'\n 'Unnamed: 0' 'FOLD_DILUTION'] not found in axis"

In [216]:
# Remove drugs which have less than 10 profiles
drugIDs = list(filterPlateau['DRUG_ID'].unique())
drugIDs = np.squeeze(drugIDs)
toDropDrugs = []
for drugID in drugIDs:
    if(len(filterPlateau[filterPlateau['DRUG_ID'] == drugID])) < 10:
        toDropDrugs.append(drugID)
print(toDropDrugs)
filterPlateau = filterPlateau[~filterPlateau['DRUG_ID'].isin(toDropDrugs)]
filterPlateau

[292, 309, 32, 185, 283, 223, 299, 303, 38, 52, 329, 304, 260, 34, 62, 293, 56, 155, 55, 222, 326, 51, 111, 64, 277, 147, 207, 265, 262, 312, 282, 310, 175, 71, 290, 224, 153, 199, 295, 266, 294, 249, 261, 226, 221, 29, 37, 156, 177, 255, 5, 1032, 1054, 1015, 1021, 1014, 1060, 1170, 1001, 1061, 1047, 1036, 1042, 1019, 1066, 1013, 1133]


Unnamed: 0,CELL_LINE_NAME,COSMIC_ID,DRUG_ID,DRUGID_COSMICID,MAX_CONC,fd_num_0,fd_num_1,fd_num_2,fd_num_3,fd_num_4,...,norm_cells_0,norm_cells_1,norm_cells_2,norm_cells_3,norm_cells_4,norm_cells_5,norm_cells_6,norm_cells_7,norm_cells_8,norm_cells_9
13,HDQ-P1,1290922,344,344_1290922,20.00,0,0.111111,0.222222,0.333333,0.444444,...,1,1.031800,1.071021,1.043573,0.979583,0.885889,0.811989,0.709893,0.043555,0.006031
28,HDQ-P1,1290922,136,136_1290922,16.00,0,0.111111,0.222222,0.333333,0.444444,...,1,0.988789,1.069224,0.802887,0.812852,0.576978,0.445945,0.340813,0.112626,0.101430
85,NMC-G1,908449,170,170_908449,16.00,0,0.111111,0.222222,0.333333,0.444444,...,1,0.986038,0.926005,0.942195,0.976929,0.645822,0.096651,0.033273,0.019794,0.017003
112,NMC-G1,908449,331,331_908449,10.24,0,0.111111,0.222222,0.333333,0.444444,...,1,0.983466,0.963489,0.843691,0.795708,0.612518,0.325654,0.238841,0.229492,0.187305
134,JHH-2,1240157,170,170_1240157,16.00,0,0.111111,0.222222,0.333333,0.444444,...,1,1.032505,1.004855,0.992337,0.905533,0.135428,0.028327,0.035355,0.027296,0.017211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191209,TUR,909773,1022,1022_909773,2.00,0,0.111111,0.222222,0.333333,0.444444,...,1,1.013508,0.969201,1.052870,0.982243,0.905577,0.341719,0.091523,0.056897,0.069821
191219,KOPN-8,1330933,1012,1012_1330933,10.00,0,0.111111,0.222222,0.333333,0.444444,...,1,1.044845,1.135474,0.942377,0.806087,0.564381,0.328416,0.157977,0.045599,0.055092
191411,RKN,1298539,1057,1057_1298539,0.25,0,0.111111,0.222222,0.333333,0.444444,...,1,0.995979,0.911080,0.799250,0.789640,0.540819,0.310359,0.308473,0.224216,0.195527
191448,NCI-H187,688007,1011,1011_688007,2.00,0,0.111111,0.222222,0.333333,0.444444,...,1,0.971240,0.857163,0.673827,0.303307,0.162973,0.071088,0.062577,0.043378,0.043058


In [217]:
filterPlateau.to_csv(_FOLDER +'filteredResponses.csv', index=False)

### Merging of Cell Lines with Filtered Response Data

In [218]:
cellLinesFeatures = pd.read_csv(_FOLDER +"Cell_Line_Features_PANCAN_simple_MOBEM.tsv", sep="\t")
# TRANSPOSE
cellLinesFeatures = pd.DataFrame(data= cellLinesFeatures[cellLinesFeatures.columns[1:]].values.T,
                          index= cellLinesFeatures.columns[1:], columns= cellLinesFeatures[cellLinesFeatures.columns[0]].values)
cellLinesFeatures.index = np.array(cellLinesFeatures.index, dtype = "int")

# Prepare for merge
cellLinesFeatures.index.name = 'COSMIC_ID'
mergedDF = pd.merge(left=filterPlateau, right = cellLinesFeatures, on = "COSMIC_ID")

In [219]:
mergedDF

Unnamed: 0,CELL_LINE_NAME,COSMIC_ID,DRUG_ID,DRUGID_COSMICID,MAX_CONC,fd_num_0,fd_num_1,fd_num_2,fd_num_3,fd_num_4,...,chr9:104248247-104249501(C9orf125)_HypMET,"chr9:115875199-115875738(C9orf109, C9orf110)_HypMET",chr9:123555399-123555899(FBXW2)_HypMET,chr9:140310894-140312457(EXD3)_HypMET,chr9:21974578-21975306(CDKN2A)_HypMET,chr9:35756948-35757339(MSMP)_HypMET,chr9:35791584-35791924(NPR2)_HypMET,chr9:4984543-4985630(JAK2)_HypMET,chr9:86571047-86572027(C9orf64)_HypMET,chr9:98783216-98784364(NCRNA00092)_HypMET
0,HDQ-P1,1290922,344,344_1290922,20.00,0,0.111111,0.222222,0.333333,0.444444,...,0,0,0,0,0,0,0,0,0,0
1,HDQ-P1,1290922,136,136_1290922,16.00,0,0.111111,0.222222,0.333333,0.444444,...,0,0,0,0,0,0,0,0,0,0
2,HDQ-P1,1290922,170,170_1290922,16.00,0,0.111111,0.222222,0.333333,0.444444,...,0,0,0,0,0,0,0,0,0,0
3,NMC-G1,908449,170,170_908449,16.00,0,0.111111,0.222222,0.333333,0.444444,...,0,0,0,0,0,0,0,0,1,0
4,NMC-G1,908449,331,331_908449,10.24,0,0.111111,0.222222,0.333333,0.444444,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5579,TC-YIK,946357,1011,1011_946357,2.00,0,0.111111,0.222222,0.333333,0.444444,...,0,0,0,0,0,0,0,0,0,0
5580,MKN45,925340,1149,1149_925340,5.00,0,0.111111,0.222222,0.333333,0.444444,...,0,0,0,0,0,0,0,0,0,0
5581,EC-GI-10,753555,1004,1004_753555,0.10,0,0.111111,0.222222,0.333333,0.444444,...,0,0,0,0,0,0,0,0,0,0
5582,IGROV-1,905968,1031,1031_905968,0.20,0,0.111111,0.222222,0.333333,0.444444,...,0,0,0,0,0,0,0,0,0,0


In [220]:
mergedDF.to_csv(_FOLDER +'filteredResponsesWithCCL.csv', index=False)