# Data Filtration Criteria

#### Response Data:
- Filtered data must have an S-shape with a plateau at each end. This allows a better measure of IC50; a sigmoidal shape is also assumed for the dataset, so it is important to filter out curves that do not follow it.

#### Cell Line Data:
- Remove redundant features (features that are zero for every cell line)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Folder (relative path) that the data files are read from and the filtered output is written to.
_FOLDER = "../data/"

### Response Data

In [2]:
def FilterResponsesLEQOne(drug_curves, resp_labels):
    """Return the curves whose responses never exceed 1.

    A row is discarded when any of its ``resp_labels`` columns holds a value
    strictly greater than 1 (values equal to 1 are kept). Missing values
    compare as "not greater than 1" and therefore never disqualify a row.

    Args:
        drug_curves: DataFrame with one dose-response curve per row.
        resp_labels: Names of the response columns to check.

    Returns:
        A new DataFrame (the input is not modified) with the offending rows
        removed. All columns and the original row order are preserved.
    """
    resp_cols = list(resp_labels)
    exceeds_one = (drug_curves[resp_cols] > 1).any(axis=1).to_numpy(dtype=bool)

    # Filter by index label rather than by position so that, if the index
    # contains duplicate labels, every row sharing a flagged label is removed.
    flagged_labels = drug_curves.index[exceeds_one]
    keep = ~drug_curves.index.isin(flagged_labels)

    # A boolean mask is used instead of a set indexer: sets are unordered and
    # recent pandas versions reject them in ``.loc``.
    return drug_curves.loc[keep].copy()

def FilterResponsesPlateau(drug_curves, resp_labels, tol=0.05):
    """Return the curves that are flat at both ends.

    A curve is kept when the absolute difference between its first two
    responses and the absolute difference between its last two responses are
    both at most ``tol``.

    Args:
        drug_curves: DataFrame with one dose-response curve per row.
        resp_labels: Response column names in dose order; needs at least two.
        tol: Largest allowed absolute difference between neighbouring
            responses at either end of the curve.

    Returns:
        A new DataFrame containing only the rows that pass both checks. The
        input is not modified and no helper columns are added.
    """
    first_step = (drug_curves[resp_labels[0]] - drug_curves[resp_labels[1]]).abs()
    last_step = (drug_curves[resp_labels[-1]] - drug_curves[resp_labels[-2]]).abs()

    # The differences are kept as standalone Series rather than temporary
    # columns, so same-named columns already in the input are left untouched.
    flat_at_both_ends = (first_step <= tol) & (last_step <= tol)
    return drug_curves.loc[flat_at_both_ends].copy()

def FilterPlateauLocation(drug_curves, resp_labels, firstLowerLim=0.8, lastUpperLim=0.2):
    """Return the curves that start high and end low.

    A row is kept when its second response (``resp_labels[1]``) is strictly
    above ``firstLowerLim`` and its last response (``resp_labels[-1]``) is
    strictly below ``lastUpperLim``.

    Args:
        drug_curves: DataFrame with one dose-response curve per row.
        resp_labels: Response column names in dose order.
        firstLowerLim: Exclusive lower bound for the second response.
        lastUpperLim: Exclusive upper bound for the last response.

    Returns:
        The rows of ``drug_curves`` that satisfy both conditions.
    """
    second_label, final_label = resp_labels[1], resp_labels[-1]
    starts_high = drug_curves[second_label] > firstLowerLim
    ends_low = drug_curves[final_label] < lastUpperLim
    return drug_curves[starts_high & ends_low]



In [3]:

# Load the normalised dose-response table and build the names of the ten
# concentration columns and ten response columns (suffixes 0-9).
drug_curves = pd.read_csv(f"{_FOLDER}normalised_dose_response_data.csv")
conc_labels = [f"fd_num_{point}" for point in range(10)]
resp_labels = [f"norm_cells_{point}" for point in range(10)]

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [4]:
# Remove every curve that has at least one response above 1, then show the resulting shape.
filteredLessThan1 = FilterResponsesLEQOne(drug_curves, resp_labels)
filteredLessThan1.shape

(63325, 44)

In [5]:
# Keep only curves whose first two and last two responses differ by at most the default tol (0.05).
# NOTE(review): this starts from the unfiltered `drug_curves`, not `filteredLessThan1`, so the
# "responses <= 1" filter never feeds into `filterPlateau` — confirm this is intended.
filterPlateau = FilterResponsesPlateau(drug_curves, resp_labels)
filterPlateau.shape

(32974, 44)

In [6]:
# Narrow further to curves whose second response is above 0.8 and whose last response is
# below 0.2 (the function's default limits), then show the resulting shape.
filterPlateau = FilterPlateauLocation(filterPlateau, resp_labels)
filterPlateau.shape

(6111, 44)

In [7]:
# Display the curves that passed the plateau filters.
filterPlateau

Unnamed: 0.1,Unnamed: 0,CELL_LINE_NAME,COSMIC_ID,DRUG_ID,DRUGID_COSMICID,FOLD_DILUTION,MAX_CONC,fd_num_0,fd_num_1,fd_num_2,...,per_slope_change_7,slope_0,slope_1,slope_2,slope_3,slope_4,slope_5,slope_6,slope_7,slope_8
13,13,HDQ-P1,1290922,344,344_1290922,2,20.00,0,0.111111,0.222222,...,-0.943687,0.286201,0.352989,-0.247035,-0.575905,-0.843254,-0.665096,-0.918862,-5.997048,-0.337710
28,28,HDQ-P1,1290922,136,136_1290922,2,16.00,0,0.111111,0.222222,...,-0.950934,-0.100903,0.723914,-2.397031,0.089688,-2.122865,-1.179300,-0.946190,-2.053681,-0.100765
85,85,NMC-G1,908449,170,170_908449,2,16.00,0,0.111111,0.222222,...,-0.7929,-0.125655,-0.540301,0.145714,0.312603,-2.979963,-4.942543,-0.570398,-0.121309,-0.025123
112,112,NMC-G1,908449,331,331_908449,2,10.24,0,0.111111,0.222222,...,3.51241,-0.148804,-0.179796,-1.078181,-0.431847,-1.648713,-2.581774,-0.781318,-0.084141,-0.379681
134,134,JHH-2,1240157,170,170_1240157,2,16.00,0,0.111111,0.222222,...,0.251402,0.292542,-0.248847,-0.112661,-0.781234,-6.930943,-0.963911,0.063247,-0.072531,-0.090765
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191209,224526,TUR,909773,1022,1022_909773,2,2.00,0,0.111111,0.222222,...,-1.37326,0.121573,-0.398764,0.753023,-0.635646,-0.689993,-5.074723,-2.251761,-0.311639,0.116321
191219,224539,KOPN-8,1330933,1012,1012_1330933,2,10.00,0,0.111111,0.222222,...,-1.08447,0.403606,0.815657,-1.737873,-1.226606,-2.175354,-2.123688,-1.533946,-1.011404,0.085434
191411,224854,RKN,1298539,1057,1057_1298539,2,0.25,0,0.111111,0.222222,...,-0.65951,-0.036193,-0.764091,-1.006470,-0.086489,-2.239384,-2.074141,-0.016975,-0.758317,-0.258199
191448,224913,NCI-H187,688007,1011,1011_688007,2,2.00,0,0.111111,0.222222,...,-0.983344,-0.258839,-1.026693,-1.650025,-3.334681,-1.263007,-0.826960,-0.076599,-0.172793,-0.002878


In [8]:
# Remove the nine slope_* and eight per_slope_change_* columns from the filtered curves.
drop_col = [
    *(f"slope_{k}" for k in range(9)),
    *(f"per_slope_change_{k}" for k in range(8)),
]
filterPlateau = filterPlateau.drop(columns=drop_col)

In [9]:
# Write the filtered curves to disk; index=False keeps the row index out of the CSV.
filterPlateau.to_csv(_FOLDER +'filteredResponses.csv', index=False)

### Merging of Cell Lines with Filtered Response Data

In [23]:
# Load the tab-separated cell-line feature table and peek at the column labelled '909701'.
# Selecting a single column yields a Series, so the trailing `.T` has no effect on what is shown.
cellLines = pd.read_csv(_FOLDER +"Cell_Line_Features_PANCAN_simple_MOBEM.tsv", sep="\t")
cellLines['909701'].T

0       0
1       0
2       0
3       0
4       0
       ..
1068    0
1069    0
1070    0
1071    0
1072    0
Name: 909701, Length: 1073, dtype: int64

In [11]:
# Transpose so the former column labels become the row index and the former rows become columns.
# NOTE: not idempotent — re-running this cell transposes the frame back again.
cellLines = cellLines.T

In [12]:
# Display the transposed table.
cellLines

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1063,1064,1065,1066,1067,1068,1069,1070,1071,1072
Unnamed: 0,ABCB1_mut,ABL2_mut,ACACA_mut,ACVR1B_mut,ACVR2A_mut,ADCY1_mut,AFF4_mut,AHCTF1_mut,AHNAK_mut,AKAP9_mut,...,chr9:104248247-104249501(C9orf125)_HypMET,"chr9:115875199-115875738(C9orf109, C9orf110)_H...",chr9:123555399-123555899(FBXW2)_HypMET,chr9:140310894-140312457(EXD3)_HypMET,chr9:21974578-21975306(CDKN2A)_HypMET,chr9:35756948-35757339(MSMP)_HypMET,chr9:35791584-35791924(NPR2)_HypMET,chr9:4984543-4985630(JAK2)_HypMET,chr9:86571047-86572027(C9orf64)_HypMET,chr9:98783216-98784364(NCRNA00092)_HypMET
1287381,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
924100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
910924,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
687561,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
909701,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
753620,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
905965,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1299061,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [63]:
# Use the first row of the transposed table (the feature names) as the column labels.
cellLines.columns=cellLines.iloc[0]
# Snapshot of the feature names; taken before any all-zero columns are removed, so it
# still lists every feature.
cellLineFeatures = list(cellLines.columns)
# print (cellLines[list(cellLines.columns)] != 0)

In [66]:
# Drop the first row, which only repeats the feature names now used as column labels.
# NOTE: not idempotent — each re-run of this cell removes one more row.
cellLines = cellLines.tail(-1)
cellLines

Unnamed: 0,ABCB1_mut,ABL2_mut,ACACA_mut,ACVR1B_mut,ACVR2A_mut,ADCY1_mut,AFF4_mut,AHCTF1_mut,AHNAK_mut,AKAP9_mut,...,chr9:104248247-104249501(C9orf125)_HypMET,"chr9:115875199-115875738(C9orf109, C9orf110)_HypMET",chr9:123555399-123555899(FBXW2)_HypMET,chr9:140310894-140312457(EXD3)_HypMET,chr9:21974578-21975306(CDKN2A)_HypMET,chr9:35756948-35757339(MSMP)_HypMET,chr9:35791584-35791924(NPR2)_HypMET,chr9:4984543-4985630(JAK2)_HypMET,chr9:86571047-86572027(C9orf64)_HypMET,chr9:98783216-98784364(NCRNA00092)_HypMET
1287381,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
924100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
910924,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
687561,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1287706,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
909701,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
753620,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
905965,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1299061,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [67]:
# Keep only the columns that hold a non-zero value in at least one row,
# i.e. drop features that are zero everywhere.
cellLines = cellLines.loc[:, (cellLines != 0).any(axis=0)]

In [68]:
# Display the table after removing the all-zero feature columns.
cellLines

Unnamed: 0,ABCB1_mut,ABL2_mut,ACACA_mut,ACVR1B_mut,ACVR2A_mut,ADCY1_mut,AFF4_mut,AHCTF1_mut,AHNAK_mut,AKAP9_mut,...,chr9:104248247-104249501(C9orf125)_HypMET,"chr9:115875199-115875738(C9orf109, C9orf110)_HypMET",chr9:123555399-123555899(FBXW2)_HypMET,chr9:140310894-140312457(EXD3)_HypMET,chr9:21974578-21975306(CDKN2A)_HypMET,chr9:35756948-35757339(MSMP)_HypMET,chr9:35791584-35791924(NPR2)_HypMET,chr9:4984543-4985630(JAK2)_HypMET,chr9:86571047-86572027(C9orf64)_HypMET,chr9:98783216-98784364(NCRNA00092)_HypMET
1287381,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
924100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
910924,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
687561,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1287706,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
909701,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
753620,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
905965,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1299061,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
