In [101]:
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import mannwhitneyu
import seaborn as sns

In [25]:
# Sample-level metadata table; later cells read its 'project',
# 'sample_type' and 'sample' columns.
meta = pd.read_csv(
    '../data/processed/toil_tcga_sample_info_140722.csv'
)

In [5]:
# Annotation table; later cells use its 'ensembl_trs_id' and
# 'ensembl_gene_id' columns to map transcripts to genes.
annot = pd.read_csv(
    '../data/processed/ensembl_annotation_050722.csv',
    low_memory=False,
)

In [37]:
def tcga_group(ctype, iscancer=True):
    """Return the sample identifiers of one project as a list.

    Reads the module-level ``meta`` table and keeps the rows whose
    ``project`` equals ``ctype`` and whose ``sample_type`` is
    ``'primary tumor'`` (when ``iscancer`` is truthy) or
    ``'normal tissue'`` (otherwise).

    Parameters
    ----------
    ctype : value compared against ``meta['project']``.
    iscancer : bool, default True
        Selects tumor samples when truthy, normal samples otherwise.

    Returns
    -------
    list
        Values of ``meta['sample']`` for the matching rows, in table order.
    """
    wanted_type = 'primary tumor' if iscancer else 'normal tissue'
    row_mask = (meta['project'] == ctype) & (meta['sample_type'] == wanted_type)
    return meta.loc[row_mask, 'sample'].to_list()

In [66]:
def prefilter(df, min_trs_tpm=3):
    """Drop weakly expressed transcripts and uninformative genes.

    The filter runs in two stages:

    1. A transcript is kept only if its maximum value over all sample
       columns is strictly greater than ``min_trs_tpm``.
    2. A gene is kept only if, after stage 1, it still has more than one
       transcript AND the maximum over samples of its summed transcript
       values is strictly greater than a data-driven threshold. That
       threshold is computed on the *unfiltered* input: per-gene sums,
       zeros treated as missing, median over genes within each sample,
       then the mean of those per-sample medians.

    Parameters
    ----------
    df : pandas.DataFrame
        Expression matrix whose index levels are named ``ensembl_gene_id``
        and ``ensembl_trs_id`` and whose columns are samples. Values are
        presumably TPM (the original variable names say so) -- confirm.
    min_trs_tpm : float, default 3
        Per-transcript cut-off applied to the maximum across samples.
        The default reproduces the previously hard-coded value.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, indexed again by
        ``['ensembl_gene_id', 'ensembl_trs_id']``. The input is not modified.
    """
    df = df.reset_index()

    # Gene-level threshold, derived from the full (unfiltered) table.
    gene_tpm_all = df.drop('ensembl_trs_id', axis=1).groupby('ensembl_gene_id').sum()
    gene_tpm_threshold = (
        gene_tpm_all
        .replace(0.0, np.nan)          # zeros must not pull the median down
        .median(axis=0, skipna=True)   # one median per sample column
        .mean()                        # single scalar threshold
    )

    # Stage 1: per-transcript filter on the maximum across samples.
    trs_tpm_max = (
        df.drop('ensembl_gene_id', axis=1)
        .set_index('ensembl_trs_id')
        .max(axis=1)
    )
    expressed_trs = trs_tpm_max[trs_tpm_max > min_trs_tpm].index
    df = df[df['ensembl_trs_id'].isin(expressed_trs)]

    # Stage 2: per-gene filter, computed on the surviving transcripts.
    genes_n_trs = df.groupby('ensembl_gene_id').size()
    genes_tpm_max = (
        df.drop('ensembl_trs_id', axis=1)
        .groupby('ensembl_gene_id')
        .sum()
        .max(axis=1)
    )
    multi_trs_genes = genes_n_trs[genes_n_trs > 1].index
    expressed_genes = genes_tpm_max[genes_tpm_max > gene_tpm_threshold].index
    genes_ok = multi_trs_genes.intersection(expressed_genes)

    df = df[df['ensembl_gene_id'].isin(genes_ok)]

    return df.set_index(['ensembl_gene_id', 'ensembl_trs_id'])

In [None]:
def iso_plot():
    """Unimplemented placeholder; takes no arguments and returns None.

    NOTE(review): the name suggests an isoform plot is planned here --
    TODO implement or remove.
    """
    return None

In [102]:
def analyze_prevalence(data, ctype):
    """Compare every transcript pair of each gene between normal and tumor.

    For each gene (first index level of ``data``) and each unordered pair
    of its transcripts ``(t1, t2)``, the per-sample values are projected
    onto the unit vector ``(cos(-pi/4), sin(-pi/4))``, i.e.
    ``(t1 - t2) / sqrt(2)``. The projection is computed separately for the
    normal and the tumor samples of ``ctype`` (as returned by
    ``tcga_group``) and summarised.

    Parameters
    ----------
    data : pandas.DataFrame
        Expression matrix with a two-level index (gene, transcript) and one
        column per sample. Every sample returned by ``tcga_group`` must be
        a column of ``data``; otherwise pandas raises ``KeyError``.
    ctype : value forwarded to ``tcga_group`` to select the project.

    Returns
    -------
    pandas.DataFrame
        One row per transcript pair with columns ``ensembl_gene_id``,
        ``trs_1``, ``trs_2``, ``mean_norm``, ``std_norm``, ``mean_cancer``,
        ``std_cancer``, ``score`` (product of the two means) and
        ``mwu_pval`` (Mann-Whitney U p-value, normal vs tumor projection).
        Numeric columns are floats; the frame is empty when no gene has at
        least two transcripts.
    """
    columns = ['ensembl_gene_id', 'trs_1', 'trs_2', 'mean_norm', 'std_norm',
               'mean_cancer', 'std_cancer', 'score', 'mwu_pval']

    # Loop-invariant values: the sample lists, the projection weights and
    # the gene count do not depend on the gene or transcript pair.
    norm_samples = tcga_group(ctype, False)
    cancer_samples = tcga_group(ctype)
    w1 = np.cos(-np.pi/4)
    w2 = np.sin(-np.pi/4)
    n_genes = len(data.index.get_level_values(0).unique())

    # Rows are collected in a list and turned into a frame once at the end;
    # appending to a DataFrame row by row copies the frame on every insert.
    records = []

    for i, (gene, part) in enumerate(data.groupby(level=0), start=1):
        # 1-based progress counter, so the last gene reads "n / n".
        print(f'{i} / {n_genes}    ', end='\r')

        # A gene with fewer than two transcripts has no pair to compare.
        if len(part) < 2:
            continue

        part = part.droplevel(0)
        norm_part = part[norm_samples]
        cancer_part = part[cancer_samples]

        for t1, t2 in combinations(part.index, 2):
            nx = w1 * norm_part.loc[t1].to_numpy() + w2 * norm_part.loc[t2].to_numpy()
            cx = w1 * cancer_part.loc[t1].to_numpy() + w2 * cancer_part.loc[t2].to_numpy()

            nx_mean = np.mean(nx)
            nx_std = np.std(nx)

            cx_mean = np.mean(cx)
            cx_std = np.std(cx)

            score = nx_mean * cx_mean

            # Index 1 of the result is the p-value.
            mwu = mannwhitneyu(nx, cx)[1]

            records.append([gene, t1, t2, nx_mean, nx_std, cx_mean, cx_std, score, mwu])

    return pd.DataFrame(records, columns=columns)

In [67]:
# Expression table for the COAD cohort (file name: toil_tcga_coad.csv).
data = pd.read_csv(
    '../data/processed/toil/toil_tcga_coad.csv'
)

In [68]:
# The 'sample' column is used as the transcript identifier from here on.
data = data.rename(columns={'sample': 'ensembl_trs_id'})

In [69]:
# Attach the gene ID of every transcript by looking it up in `annot`;
# transcripts without a match get NaN.
data = data.assign(
    ensembl_gene_id=data['ensembl_trs_id'].map(
        annot.set_index('ensembl_trs_id')['ensembl_gene_id']
    )
)

In [70]:
# Discard rows that have no gene ID.
data = data.dropna(subset=['ensembl_gene_id'])

In [71]:
# Index rows by (gene, transcript) and sort so label-based slicing works.
data = (
    data
    .set_index(['ensembl_gene_id', 'ensembl_trs_id'])
    .sort_index()
)

In [72]:
# Apply the transcript- and gene-level expression filter.
data = prefilter(df=data)

In [103]:
# Run the pairwise transcript comparison on two genes for the COAD cohort.
r = analyze_prevalence(
    data=data.loc[['ENSG00000000003', 'ENSG00000104635']],
    ctype='COAD',
)

1 / 2    

In [104]:
# Display the per-pair result table returned by analyze_prevalence.
r

Unnamed: 0,ensembl_gene_id,trs_1,trs_2,mean_norm,std_norm,mean_cancer,std_cancer,score,mwu_pval
0,ENSG00000000003,ENST00000373020,ENST00000496771,37.609334,11.291865,52.57575,32.149144,1977.338953,0.01024639
1,ENSG00000000003,ENST00000373020,ENST00000612152,36.710112,10.971987,51.273506,31.343782,1882.256149,0.01051037
2,ENSG00000000003,ENST00000373020,ENST00000614008,38.091376,11.375713,53.251013,32.437654,2028.40437,0.009392419
3,ENSG00000000003,ENST00000496771,ENST00000612152,-0.899222,0.478344,-1.302244,1.256977,1.171007,0.1581501
4,ENSG00000000003,ENST00000496771,ENST00000614008,0.482043,0.430218,0.675262,0.667571,0.325505,0.1560736
5,ENSG00000000003,ENST00000612152,ENST00000614008,1.381264,0.664074,1.977507,1.468018,2.73146,0.04055022
6,ENSG00000104635,ENST00000289952,ENST00000359741,-18.388126,7.327453,-3.621837,4.394899,66.598789,2.2574619999999998e-21
7,ENSG00000104635,ENST00000289952,ENST00000381237,-7.846472,3.651358,-24.695977,12.606843,193.776287,2.217774e-20
8,ENSG00000104635,ENST00000289952,ENST00000517370,1.597549,0.891432,0.021066,0.326965,0.033653,2.6313700000000002e-23
9,ENSG00000104635,ENST00000289952,ENST00000520644,1.940754,0.99805,0.160427,0.482326,0.311349,5.536366e-23


In [97]:
# Display the filtered expression rows of the two genes analysed above.
data.loc[['ENSG00000000003', 'ENSG00000104635']]

Unnamed: 0_level_0,Unnamed: 1_level_0,TCGA-D5-5538-01,TCGA-F4-6854-01,TCGA-AA-3511-01,TCGA-QG-A5YX-01,TCGA-G4-6320-01,TCGA-CM-6164-01,TCGA-A6-A567-01,TCGA-AA-3516-11,TCGA-DM-A0X9-01,TCGA-AY-A69D-01,...,TCGA-AA-3514-11,TCGA-DM-A282-01,TCGA-CM-4744-01,TCGA-AZ-6605-01,TCGA-AZ-6601-01,TCGA-D5-6927-01,TCGA-AY-A71X-01,TCGA-A6-6782-01,TCGA-CM-4751-01,TCGA-AA-3660-01
ensembl_gene_id,ensembl_trs_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
ENSG00000000003,ENST00000373020,172.07475,98.57059,76.28783,114.15002,55.8186,139.30369,63.38973,44.7113,88.92294,130.66138,...,88.03978,140.7889,67.31138,71.16913,49.08025,30.74017,48.69057,72.88147,40.12874,79.94755
ENSG00000000003,ENST00000496771,6.40978,3.58009,0.72998,0.92002,1.11003,1.88,1.64003,0.33999,0.70999,4.83998,...,0.78,0.64,0.72998,0.78,0.78997,1.04002,1.08003,1.01001,0.50001,1.22003
ENSG00000000003,ENST00000612152,5.05011,5.31006,2.85996,2.99991,2.93999,5.81005,2.33996,1.90005,3.42993,6.84993,...,2.70995,6.38008,1.51998,2.61997,2.14002,1.12001,2.68005,3.69996,1.32999,1.55
ENSG00000000003,ENST00000614008,1.66006,0.53001,0.0,0.27001,0.0,0.0,0.0,0.0,0.1,0.57001,...,0.42,0.07,0.24,0.0,0.0,0.07,0.0,0.0,0.0,0.84998
ENSG00000104635,ENST00000289952,0.36001,0.41,0.80001,0.64999,0.04,0.54,0.50001,3.21998,0.07,1.46004,...,1.99001,0.0,0.06,0.0,0.15,0.84998,0.07,0.23001,0.31999,0.31999
ENSG00000104635,ENST00000359741,3.18998,6.30008,7.06993,16.55979,1.97996,4.42008,0.81001,36.48067,6.06019,7.73024,...,22.73017,2.9699,3.18998,4.49986,6.25006,6.40978,2.61997,2.40995,5.85981,3.54011
ENSG00000104635,ENST00000381237,28.28076,33.55968,48.78854,42.59069,19.43016,54.81025,29.19907,7.20001,42.77116,46.83026,...,15.50011,17.7595,46.931,32.99913,51.90034,61.28136,14.67005,27.75062,16.39986,29.77958
ENSG00000104635,ENST00000517370,0.2,0.38999,0.75001,0.53001,0.1,0.58999,0.22,0.27999,0.47,1.96001,...,0.25001,0.13,0.03,0.08,0.72998,1.27003,0.27999,0.15,0.38999,0.08
ENSG00000104635,ENST00000520644,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.64999,0.0,0.0,0.0,0.0,0.0
