# Set data path

In [1]:
# Directory containing the input CSV files read by the cells below
# (a relative path, so it is resolved against the current working directory).
datapath = "../../data"

# Load the CRISPR data file

In [2]:
import pandas as pd
import numpy as np
import os
# Read the gene-effect table. The unnamed first column is renamed to 'gene',
# every column label is cut at its first space, and the frame is then indexed
# on 'gene' and transposed.
df = (
    pd.read_csv(os.path.join(datapath, "CRISPRGeneEffect.csv"))
    .rename(columns={'Unnamed: 0': 'gene'})
    .rename(columns=lambda x: x.split(' ')[0])
    .set_index('gene')
    .T
)
# Report how many entries of the table are missing.
print(f'{df.isna().sum().sum()} NaN over {len(df)*len(df.columns)} values')
df

739493 NaN over 20287300 values


gene,ACH-000001,ACH-000004,ACH-000005,ACH-000007,ACH-000009,ACH-000011,ACH-000012,ACH-000013,ACH-000015,ACH-000017,...,ACH-002693,ACH-002710,ACH-002785,ACH-002799,ACH-002800,ACH-002834,ACH-002847,ACH-002922,ACH-002925,ACH-002926
A1BG,-0.122637,0.019756,-0.107208,-0.031027,0.008888,0.022670,-0.096631,0.049811,-0.099040,-0.044896,...,-0.072582,-0.033722,-0.053881,-0.060617,0.025795,-0.055721,-0.009973,-0.025991,-0.127639,-0.068666
A1CF,0.025881,-0.083640,-0.023211,-0.137850,-0.146566,-0.057743,-0.024440,-0.158811,-0.070409,-0.115830,...,-0.237311,-0.108704,-0.114864,-0.042591,-0.132627,-0.121228,-0.119813,-0.007706,-0.040705,-0.107530
A2M,0.034217,-0.060118,0.200204,0.067704,0.084471,0.079679,0.041922,-0.003968,-0.029389,0.024537,...,-0.065940,0.079277,0.069333,0.030989,0.249826,0.072790,0.044097,-0.038468,0.134556,0.067806
A2ML1,-0.128082,-0.027417,0.116039,0.107988,0.089419,0.227512,0.039121,0.034778,0.084594,-0.003710,...,0.101541,0.038977,0.066599,0.043809,0.064657,0.021916,0.041358,0.236576,-0.047984,0.112071
A3GALT2,-0.031285,-0.036116,-0.172227,0.007992,0.065109,-0.130448,0.028947,-0.120875,-0.052288,-0.336776,...,0.005374,-0.144070,-0.256227,-0.116473,-0.294305,-0.221940,-0.146565,-0.239690,-0.116114,-0.149897
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A,-0.289724,0.032983,-0.201273,-0.100344,-0.112703,0.013401,0.005124,-0.089180,-0.005409,-0.070396,...,-0.296880,-0.084936,-0.128569,-0.110504,-0.087171,0.024959,-0.119911,-0.079342,-0.043555,-0.045115
ZYG11B,-0.062972,-0.410392,-0.178877,-0.462160,-0.598698,-0.296421,-0.131949,-0.145737,-0.216393,-0.257916,...,-0.332415,-0.193408,-0.327408,-0.257879,-0.349111,0.015259,-0.289412,-0.347484,-0.335270,-0.307900
ZYX,0.074180,0.113156,-0.055349,-0.001555,0.095877,0.067705,-0.109147,-0.034886,-0.137350,0.029457,...,-0.005090,-0.218960,-0.053033,-0.041612,-0.057478,-0.306562,-0.195097,-0.085302,-0.208063,0.070671
ZZEF1,0.111244,0.234388,-0.002161,-0.325964,-0.026742,-0.232453,-0.164482,-0.175850,-0.168087,-0.284838,...,-0.188751,-0.120449,-0.267081,0.006148,-0.189602,-0.148368,-0.206400,-0.095965,-0.094741,-0.187813


# Load the map between cell lines and tissues

In [3]:
# Model metadata table; its 'OncotreeLineage' column provides the lineage
# (tissue) label of each entry.
df_map = pd.read_csv(os.path.join(datapath, "Model.csv"))
#print(df_map[['OncotreeLineage']].value_counts())
# Distinct lineage values present in the metadata (set order is not meaningful).
tissues = list(set(df_map[['OncotreeLineage']].values.ravel()))

In [4]:
from HELPpy.utility.selection import filter_crispr_by_model

# Filter the CRISPR table using the model metadata, grouping by
# 'OncotreeLineage'. minlines=10 is presumably the minimum number of cell
# lines a group needs in order to be kept — confirm against the HELPpy docs.
# Note that `df` is rebound to the filtered frame.
df = filter_crispr_by_model(
    df,
    df_map,
    line_group='OncotreeLineage',
    minlines=10,
)
df

gene,ACH-000001,ACH-000004,ACH-000005,ACH-000007,ACH-000009,ACH-000011,ACH-000012,ACH-000013,ACH-000015,ACH-000017,...,ACH-002693,ACH-002710,ACH-002785,ACH-002799,ACH-002800,ACH-002834,ACH-002847,ACH-002922,ACH-002925,ACH-002926
A1BG,-0.122637,0.019756,-0.107208,-0.031027,0.008888,0.022670,-0.096631,0.049811,-0.099040,-0.044896,...,-0.072582,-0.033722,-0.053881,-0.060617,0.025795,-0.055721,-0.009973,-0.025991,-0.127639,-0.068666
A1CF,0.025881,-0.083640,-0.023211,-0.137850,-0.146566,-0.057743,-0.024440,-0.158811,-0.070409,-0.115830,...,-0.237311,-0.108704,-0.114864,-0.042591,-0.132627,-0.121228,-0.119813,-0.007706,-0.040705,-0.107530
A2M,0.034217,-0.060118,0.200204,0.067704,0.084471,0.079679,0.041922,-0.003968,-0.029389,0.024537,...,-0.065940,0.079277,0.069333,0.030989,0.249826,0.072790,0.044097,-0.038468,0.134556,0.067806
A2ML1,-0.128082,-0.027417,0.116039,0.107988,0.089419,0.227512,0.039121,0.034778,0.084594,-0.003710,...,0.101541,0.038977,0.066599,0.043809,0.064657,0.021916,0.041358,0.236576,-0.047984,0.112071
A3GALT2,-0.031285,-0.036116,-0.172227,0.007992,0.065109,-0.130448,0.028947,-0.120875,-0.052288,-0.336776,...,0.005374,-0.144070,-0.256227,-0.116473,-0.294305,-0.221940,-0.146565,-0.239690,-0.116114,-0.149897
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A,-0.289724,0.032983,-0.201273,-0.100344,-0.112703,0.013401,0.005124,-0.089180,-0.005409,-0.070396,...,-0.296880,-0.084936,-0.128569,-0.110504,-0.087171,0.024959,-0.119911,-0.079342,-0.043555,-0.045115
ZYG11B,-0.062972,-0.410392,-0.178877,-0.462160,-0.598698,-0.296421,-0.131949,-0.145737,-0.216393,-0.257916,...,-0.332415,-0.193408,-0.327408,-0.257879,-0.349111,0.015259,-0.289412,-0.347484,-0.335270,-0.307900
ZYX,0.074180,0.113156,-0.055349,-0.001555,0.095877,0.067705,-0.109147,-0.034886,-0.137350,0.029457,...,-0.005090,-0.218960,-0.053033,-0.041612,-0.057478,-0.306562,-0.195097,-0.085302,-0.208063,0.070671
ZZEF1,0.111244,0.234388,-0.002161,-0.325964,-0.026742,-0.232453,-0.164482,-0.175850,-0.168087,-0.284838,...,-0.188751,-0.120449,-0.267081,0.006148,-0.189602,-0.148368,-0.206400,-0.095965,-0.094741,-0.187813


# Select some tissues
In this section we select only the cell lines of a specific tissue. Once the CRISPR data file is reduced to a subset of the cell lines, we check whether any row (gene) has NaN in all of its cells. If so, we remove those rows (genes) before applying the labelling algorithm.

We start labelling genes for the `Kidney` tissue...

... then we do it for the `Lung` tissue ...

In [5]:
from HELPpy.utility.selection import select_cell_lines, delrows_with_nan_percentage
from HELPpy.models.labelling import labelling  # imported here but not called in this cell

# Tissue whose cell lines are analysed in the rest of the notebook.
tissue = 'Lung'

# Columns (cell lines) of df that the metadata associates with this tissue.
cell_lines = select_cell_lines(df, df_map, [tissue])
print(f"Selecting {len(cell_lines)} cell-lines")

# Drop rows (genes) according to their share of missing values. perc=95 is
# presumably the NaN-percentage threshold for removal, not "all NaN" only —
# confirm against the HELPpy docs.
df_nonan = delrows_with_nan_percentage(
    df[cell_lines],
    perc=95,
)

Selecting 119 cell-lines


In [6]:
from HELPpy.preprocess.imputer import imputer_knn
# Fill the missing values left in df_nonan. Judging by its name the helper
# uses k-nearest-neighbours imputation — TODO confirm in HELPpy.
df_imp = imputer_knn(df_nonan)
df_imp

gene,ACH-000012,ACH-000015,ACH-000021,ACH-000029,ACH-000030,ACH-000035,ACH-000143,ACH-000161,ACH-000176,ACH-000187,...,ACH-001386,ACH-001549,ACH-002035,ACH-002051,ACH-002077,ACH-002156,ACH-002522,ACH-002526,ACH-002531,ACH-002650
A1BG,-0.096631,-0.099040,-0.115407,-0.010927,0.141588,-0.126955,-0.074537,-0.005710,0.019219,-0.055518,...,-0.010757,-0.104869,-0.185896,0.017155,-0.076636,-0.171598,0.035977,-0.172501,-0.033574,-0.056369
A1CF,-0.024440,-0.070409,-0.181106,-0.069569,-0.235507,-0.041039,-0.201004,-0.068009,-0.071535,-0.007234,...,-0.153395,-0.058616,-0.170929,-0.042142,-0.106737,0.050156,-0.055682,0.266869,-0.090805,-0.104222
A2M,0.041922,-0.029389,-0.020465,0.001538,0.012365,0.029849,0.028630,0.119970,-0.017309,-0.073195,...,0.111502,0.044334,0.219383,0.078776,0.034575,0.088341,0.244112,-0.063692,0.006704,0.118243
A2ML1,0.039121,0.084594,-0.025030,0.059103,0.060239,0.037083,-0.038309,0.116496,0.251856,0.090424,...,0.112639,-0.001246,0.037265,0.153954,-0.083971,0.336780,0.143028,-0.043568,0.042942,0.151456
A3GALT2,0.028947,-0.052288,-0.109040,-0.236308,-0.130461,0.071862,-0.092754,-0.222069,0.042737,-0.134045,...,-0.281219,-0.160658,-0.198531,-0.043895,-0.400027,-0.156647,-0.159812,0.071649,-0.201247,-0.114598
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A,0.005124,-0.005409,-0.066436,0.054278,-0.072610,-0.107669,0.068643,-0.033630,-0.152424,-0.017307,...,-0.040707,-0.066158,-0.010060,-0.067393,-0.163113,-0.150823,-0.100644,0.224465,0.048791,-0.012245
ZYG11B,-0.131949,-0.216393,-0.368909,-0.130836,-0.329001,-0.353797,-0.207713,-0.120313,-0.188664,-0.036838,...,-0.266762,-0.198401,-0.377490,-0.066518,-0.273882,-0.233026,-0.260728,-0.230549,-0.234106,-0.215543
ZYX,-0.109147,-0.137350,-0.127135,0.080878,-0.011675,0.015859,0.021795,-0.055467,-0.076744,0.057199,...,0.001814,0.045149,0.013886,0.059225,-0.201100,0.105959,-0.182085,-0.122084,0.065003,-0.063077
ZZEF1,-0.164482,-0.168087,-0.249770,-0.169796,-0.177205,-0.068669,0.079571,-0.115627,-0.050663,-0.229357,...,-0.216575,-0.168758,-0.211387,-0.361345,-0.135362,-0.381383,-0.198027,-0.162206,-0.165964,-0.240763


# AgglomerativeClustering

In [7]:
from sklearn.cluster import AgglomerativeClustering

# Hierarchical clustering of the rows of df_imp into two groups.
model = AgglomerativeClustering(n_clusters=2)
model.fit(df_imp)
# After fitting, the per-row cluster ids are exposed as `labels_`.
yhat = model.labels_
# Cluster ids together with the number of rows assigned to each.
np.unique(yhat, return_counts=True)

(array([0, 1]), array([ 1415, 16516]))

In [8]:
# Display the cluster label assigned to each row of df_imp.
yhat

array([1, 1, 1, ..., 1, 1, 1])

# Birch

In [None]:
from sklearn.cluster import Birch

# Build and fit the estimator in one step (fit returns the estimator itself).
model = Birch(threshold=0.01, n_clusters=2).fit(df_imp)
# Predict a cluster id for every row.
yhat = model.predict(df_imp)
# Cluster ids together with the number of rows assigned to each.
np.unique(yhat, return_counts=True)

(array([0, 1]), array([ 1415, 16516]))

# MeanShift

In [None]:
from sklearn.cluster import MeanShift

# Default-parameter MeanShift, fitted in one step (fit returns the estimator).
model = MeanShift().fit(df_imp)
# Predict a cluster id for every row.
yhat = model.predict(df_imp)

# OPTICS

In [9]:
from sklearn.cluster import OPTICS

# NOTE(review): scikit-learn documents `eps` as used only when
# cluster_method='dbscan'. With the default cluster_method ('xi') the eps=0.8
# below does not influence the result — confirm which method was intended.
model = OPTICS(
    min_samples=10,
    eps=0.8,
)
# Fit the model and obtain the per-row cluster ids in one call.
yhat = model.fit_predict(df_imp)
# Cluster ids together with the number of rows assigned to each.
np.unique(yhat, return_counts=True)

(array([0]), array([17931]))

# GaussianMixture

In [None]:
# GaussianMixture is provided by sklearn.mixture, not sklearn.cluster;
# importing it from sklearn.cluster raises ImportError.
from sklearn.mixture import GaussianMixture

# Two-component Gaussian mixture. random_state pins the otherwise random
# initialisation so that re-running the cell yields the same assignment.
model = GaussianMixture(n_components=2, random_state=0)
model.fit(df_imp)
# assign a cluster to each example
yhat = model.predict(df_imp)
# Component ids together with the number of rows assigned to each.
np.unique(yhat, return_counts=True)