In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from collections import Counter

from sklearn import preprocessing
from sklearn.metrics.cluster import adjusted_rand_score
import itertools
import random

import scipy
from tqdm import tqdm

plt.ion()
plt.show()
import pickle
import os
import sys
sys.path.append("..")


http://www.bushmanlab.org/links/genelists

In [2]:
rdf = pd.DataFrame(columns = ["Dataset", "Subspace No", "Subspace size", "Nb Cancer Genes", 
                              "% Cancer Genes", "Genes"])

In [4]:
an_genes_df = pd.read_csv("../data/rna_data/allOnco_May2018.tsv", sep = "\t", low_memory=False)
an_genes = an_genes_df["symbol"].values

# Load input data

In [19]:
filename ="BRCA"
clustering = "gmm"
method = "adapted_ratkowsky_lance"

filename ="KIRP" #


data = pd.read_pickle(f"../data/rna_data/{filename}.pkl")
# solutions = pd.read_pickle(f"data/rna_data/{filename}_solutions.pkl")
solutions = pd.read_pickle(f"../data/{filename}_{clustering}_{method}.pkl")

In [20]:
input_genes = data.columns[:-1]
input_genes = np.array([g.split('|')[0] for g in input_genes])

In [21]:
found = np.array([gi in input_genes for gi in an_genes])
print(Counter(found))

Counter({True: 2324, False: 255})


# How many important genes  are in top 3000 by supervision

In [22]:
found = np.array([gi in an_genes for gi in input_genes])
print(Counter(found))

Counter({False: 15614, True: 2324})


# Find genes which are documented to be cancer related, but they are not in the dataset

In [23]:
cancer_genes = np.where(found == True)[0]
cancer_genes.shape

(2324,)

In [24]:
solution_subspaces = solutions["features"].values
all_subspaces= np.unique(np.concatenate(solution_subspaces))

In [25]:
cancer_genes_not_in_subspaces = np.setdiff1d(cancer_genes, all_subspaces)
cancer_genes_not_in_subspaces.shape

(2199,)

In [26]:
all_subspaces = solution_subspaces[1]
cancer_genes_not_in_subspaces = np.intersect1d(cancer_genes, all_subspaces)
len(cancer_genes_not_in_subspaces)

0

In [27]:
from scipy.cluster.hierarchy import fcluster, linkage

In [28]:
truth = data["y"].values
data = data.drop("y", axis = 1).values
n_clusters = len(np.unique(truth))

Find close clusters of features

In [None]:
Z = linkage(data.T[cancer_genes_not_in_subspaces], method='complete', metric="correlation")

pred = fcluster(Z, 0.1, criterion='distance')

In [None]:
feature_clusters = []
for c, _ in Counter(pred).most_common()[:3]:
    feature_clusters.append(cancer_genes_not_in_subspaces[np.where(pred ==c)[0]])
feature_clusters

In [None]:
np.save("data/rna_data/BRCA_close_gene_clusters1.npy", feature_clusters)

# Analyze subspace results

In [29]:
solution_subspaces = solutions["features"].values
solution_subspaces = [ input_genes[s] for s in solution_subspaces]


# %%capture cap

all_subspaces= np.unique(np.concatenate(solution_subspaces))
found = np.array([gi in an_genes for gi in all_subspaces])
print(f"From {len(all_subspaces)} selected genes across {len(solution_subspaces)} subspaces, " + 
     f"{Counter(found)[True]} are in annotated genes set")

From 811 selected genes across 10 subspaces, 125 are in annotated genes set


In [30]:
found_by_subspace = []
for i, s in enumerate(solution_subspaces):
#     tmp = subspace_match[subspace_match["subspace"] == i][["ari", "additional_data"]].values[0]
#     print(f"\n\nSubspace {i} of size {len(s)}, best corresponds to {tmp[1]} with match ari {tmp[0]}")
    print(f"\n\nSubspace {i} of size {len(s)}")
    found = np.array([gi in an_genes for gi in s])
    found_idx = np.where(found == True)[0]
    if len(found_idx) > 0:
        print(f"{len(found_idx)} genes found: {s[found_idx]}, responsible for: " )
        gene_func = an_genes_df[an_genes_df["symbol"].isin(s[found_idx])]["name"].values
        print(gene_func)
        perc =round(len(found_idx)/len(s), 2)
        rdf.loc[rdf.shape[0]] = [filename, i, len(s), len(found_idx), perc, " ,".join(s[found_idx])]

# with open('bc_genes.txt', 'w') as f:
#     f.write(cap.stdout)



Subspace 0 of size 15
6 genes found: ['CD79A' 'FCRL5' 'PIM2' 'POU2AF1' 'TNFRSF13B' 'TNFRSF17'], responsible for: 
['TNF receptor superfamily member 13B'
 'Pim-2 proto-oncogene, serine/threonine kinase'
 'TNF receptor superfamily member 17' 'CD79a molecule'
 'POU class 2 associating factor 1' 'Fc receptor like 5']


Subspace 1 of size 14


Subspace 2 of size 153
21 genes found: ['BCL2L14' 'CALCA' 'DACH2' 'EYA4' 'FOXE1' 'GATA3' 'GPC5' 'GRHL2' 'GRM1'
 'KDR' 'LMO3' 'OPCML' 'PRDM16' 'RASL11B' 'RHBG' 'RHCG' 'SCNN1B' 'SLC4A1'
 'SSTR2' 'TEK' 'TIE1'], responsible for: 
['glypican 5' 'forkhead box E1'
 'opioid binding protein/cell adhesion molecule like'
 'EYA transcriptional coactivator and phosphatase 4'
 'Rh family B glycoprotein (gene/pseudogene)' 'Rh family C glycoprotein'
 'solute carrier family 4 member 1 (Diego blood group)' 'PR/SET domain 16'
 'RAS like family 11 member B' 'calcitonin related polypeptide alpha'
 'kinase insert domain receptor'
 'tyrosine kinase with immunoglobulin lik

In [31]:
rdf

Unnamed: 0,Dataset,Subspace No,Subspace size,Nb Cancer Genes,% Cancer Genes,Genes
0,BRCA,1,40,9,0.23,"AFF3 ,ERBB4 ,ESR1 ,GATA3 ,GREB1 ,INPP4B ,MYB ,..."
1,BRCA,3,35,6,0.17,"BCL11A ,CMTM7 ,FOXC1 ,MAPK4 ,NFIB ,WNT6"
2,BRCA,7,6,1,0.17,XAF1
3,BRCA,8,2,1,0.5,PAX8
4,KIRP,0,15,6,0.4,"CD79A ,FCRL5 ,PIM2 ,POU2AF1 ,TNFRSF13B ,TNFRSF17"
5,KIRP,2,153,21,0.14,"BCL2L14 ,CALCA ,DACH2 ,EYA4 ,FOXE1 ,GATA3 ,GPC..."
6,KIRP,3,55,7,0.13,"AIFM1 ,CUBN ,HNF1A ,HNF4A ,LRP2 ,RHOBTB1 ,SLC9..."
7,KIRP,5,157,36,0.23,"AFAP1L2 ,CDA ,CDH13 ,CSPG4 ,DLGAP1 ,EBF1 ,ELN ..."
8,KIRP,6,107,24,0.22,"AGAP2 ,BCL11B ,CCR4 ,CD38 ,CD74 ,CXCR3 ,GFI1 ,..."
9,KIRP,7,152,16,0.11,"AXIN2 ,CD200 ,CDC25C ,ELF3 ,ERC2 ,FAIM ,GALNT5..."


In [32]:
rdf.to_csv("../reports/annotated_genes.csv")

In [29]:
rdf.to_excel("../reports/annotated_genes.xlsx")

In [None]:
# rdf.to_excel("reports/annotated_genes.xlsx")