In [136]:
import pandas as pd
import numpy as np
import os
import EpiClockInvasiveBRCA.src.util as epi_util
from EpiClockInvasiveBRCA.src.consts import consts

# Input folder for this notebook: the 'Ringner' subdirectory of the official input dir.
proj_dir = os.path.join(
    consts['official_indir'],
    'Ringner',
)

In [137]:
# Expression matrix; the file's first column becomes the row index.
series_matrix_path = os.path.join(proj_dir, 'GSE25307_series_matrix.txt')
gene_expr = pd.read_table(series_matrix_path, index_col=0)

# The header file is read without a header row and transposed, so the entries of
# its first column become column labels. Rows are then keyed by GEO accession and
# the remaining column(s) squeezed down to a Series where possible.
series_header_path = os.path.join(proj_dir, 'GSE25307_series_matrix_header.txt')
sample_mapper = (
    pd.read_table(series_header_path, index_col=0, header=None)
    .T
    .set_index('!Sample_geo_accession')
    .squeeze('columns')
)

# Relabel the expression columns through sample_mapper.
gene_expr = gene_expr.rename(columns=sample_mapper)

In [156]:
# Load the probe annotation table here so this display cell also works on a
# fresh kernel (Restart & Run All); without it the name is not yet bound when
# this cell is reached and the cell fails with a NameError.
probe_annot = pd.read_table(os.path.join(proj_dir, 'GSE25307_SupplementaryProbeAnnotations.txt'), index_col=0)
probe_annot

Unnamed: 0_level_0,CLONEID,GSE25307_hitBLAT,GSE25307_geneSymbol,GSE25307_description
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
31,H300005900,uc001vft.1,AK124707,"AK124707 : Homo sapiens cDNA FLJ37307 fis, clo..."
32,H200003146,NM_052902,STK11IP,STK11IP : LKB1 interacting protein
33,H300007286,NM_001042462,TRAPPC5,TRAPPC5 : trafficking protein particle complex 5
34,H200011174,NM_030935,TSC22D4,TSC22D4 : TSC22 domain family 4
53,H300021579,NM_003707,RUVBL1,RUVBL1 : RuvB-like 1
...,...,...,...,...
54905,H200000731,NM_002104,GZMK,GZMK : granzyme K precursor
54906,H200001515,NM_001008660,PICALM,PICALM : phosphatidylinositol-binding clathrin...
54907,H200000755,NM_000281,PCBD1,PCBD1 : pterin-4 alpha-carbinolamine dehydratase
54909,H200001057,NM_001079864,TAX1BP1,TAX1BP1 : Tax1 (human T-cell leukemia virus ty...


In [138]:
# Probe annotations, indexed by the file's first column.
probe_annot_path = os.path.join(proj_dir, 'GSE25307_SupplementaryProbeAnnotations.txt')
probe_annot = pd.read_table(probe_annot_path, index_col=0)

# Gene symbol for every row of the expression matrix (raises KeyError if a row
# label is missing from the annotation table).
gene_name = probe_annot.loc[gene_expr.index, 'GSE25307_geneSymbol']

# Remember the sample columns before the helper 'geneSymbol' column is attached,
# then average all rows that share a symbol.
title_cols = gene_expr.columns
gene_expr = (
    gene_expr
    .assign(geneSymbol=gene_expr.index.map(gene_name))
    .groupby('geneSymbol')[title_cols]
    .mean()
)

In [139]:
# Center every row of gene_expr on its own mean across columns. The row means
# are computed once and subtracted along the row axis, instead of being
# recomputed inside a per-column lambda.
relative_gene_expr = gene_expr.sub(gene_expr.mean(axis=1), axis=0)

# Sanity check: centered rows must average to ~0. The absolute value is needed so
# that a large negative mean cannot slip through the tolerance test.
assert (relative_gene_expr.mean(axis=1).abs() < 1e-5).all(), \
    'row means of relative_gene_expr are not ~0 after centering'

In [24]:
# Sample annotation table: the first line of the file is skipped and the first
# column is used as the index.
samp_annot = pd.read_table(
    os.path.join(proj_dir, 'GSE25307_sample_annotations.txt'),
    index_col=0,
    skiprows=1,
)

In [105]:
# Clinical table for the cohort; 'grade' is forced to string on read.
Ringner_clinical_dir = os.path.join(consts['official_indir'], 'Ringner')
clinical = pd.read_table(
    os.path.join(Ringner_clinical_dir, 'cohort.T2.clinical.txt'),
    index_col=0,
    dtype={'grade': str},
)

# Row labels flagged by 'in_analysis_dataset', and the 'Title' of each of them.
ductal_samples = clinical.index[clinical['in_analysis_dataset']].values
ductal_titles = clinical.loc[ductal_samples, 'Title']

# Fraction of those titles found among sample_mapper's values (1.0 means all).
ductal_titles.isin(sample_mapper).mean()

1.0

In [42]:
# clinical.merge(samp_annot['PAM50-classification'], left_on='Title', right_index=True, how='left').groupby('genefu.pam50.subtype')['PAM50-classification'].value_counts().unstack()

In [155]:
## Gene modules

# Read the Fredlund et al. gene modules: one tab-separated line per module,
# first field = module name, remaining fields = its genes.
gene_module_dict = {}
with open(os.path.join(consts['official_indir'], 'misc', 'Fredlund.gene.modules.txt'), 'r') as f:
    for line in f:
        module_name, *module_genes = line.rstrip().split('\t')
        gene_module_dict[module_name] = module_genes

# One Series per module: the mean relative expression, per sample, over the
# module genes that are present in the expression matrix.
mod_expr_list = []

for module, markers in gene_module_dict.items():
    is_present = np.isin(markers, gene_expr.index)
    genes_used = np.array(markers)[is_present]
    # Report how many of the module's genes could not be matched.
    if len(markers) != genes_used.shape[0]:
        print(f'{len(markers) - genes_used.shape[0]} genes not found from the {module} module')
    avg_module_expression = relative_gene_expr.loc[genes_used].mean(axis=0)
    avg_module_expression.name = module
    mod_expr_list.append(avg_module_expression)

# Samples as rows, modules as columns.
mod_expr = pd.concat(mod_expr_list, axis=1)

12 genes not found from the Stroma module
14 genes not found from the Lipid module
33 genes not found from the IR module
9 genes not found from the Mitotic Checkpoint module
9 genes not found from the Mitotic Progression module
7 genes not found from the Basal module
2 genes not found from the Early Response module
2 genes not found from the SR module


In [157]:
# Keep only the columns listed in ductal_titles, write them out tab-separated,
# and show the resulting frame.
out_path = os.path.join(proj_dir, 'Lund_gene_expr_withSymbols.txt')
out_data = gene_expr[ductal_titles]
out_data.to_csv(out_path, sep='\t')
out_data

Unnamed: 0_level_0,TAX577191,TAX577393,TAX577411,TAX577543,TAX577042,TAX577505,TAX577448,TAX577245,TAX577566,TAX577386,...,TAX577158,TAX577554,TAX577334,TAX577284,TAX577423,TAX577354,TAX577223,TAX577240,TAX577036,TAX577210
geneSymbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2'-PDE,0.7452,-0.1512,-0.5328,0.2427,-0.0819,-0.2210,0.4052,-0.359896,0.3446,0.5387,...,0.0460,0.1253,0.6141,-0.6309,0.8549,-0.6969,-0.0415,-0.0332,0.5184,0.6845
76P,0.2147,0.1555,0.8774,0.0054,-0.1874,-0.3053,0.0772,-0.296900,0.6200,-0.4169,...,-0.9683,0.6999,-0.6183,-0.7011,1.0270,1.0040,0.3768,-0.3437,-0.3226,-0.2492
A1BG,-0.7369,0.1930,0.3552,0.3120,-0.3295,-0.2034,0.1255,0.650197,0.4808,-1.2440,...,0.4461,0.0449,-0.4189,-0.1669,0.0132,0.7221,0.6337,-0.1283,-0.4627,-0.0651
A2BP1,-0.4812,-0.7510,0.6040,-0.9365,-0.0105,-0.7851,-0.6253,-0.149097,-0.7304,-0.4558,...,-0.9274,-0.7724,0.8003,-0.3987,0.0348,1.8570,-0.9031,-0.4234,0.4693,-0.2046
A2M,-1.2000,1.2960,-0.3077,0.5633,-0.0369,-0.1476,-1.2920,-0.479903,-0.5928,-0.4058,...,-0.9326,-0.2339,2.8460,-0.1542,-0.5527,-0.5613,-0.9490,-0.9288,0.2112,-0.0552
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
keratin 19,0.2776,-6.5590,-0.9534,-3.4920,-1.0780,0.4087,-0.2548,0.928504,-0.6307,-0.7792,...,1.1960,-0.2072,-3.3080,-0.7013,0.3195,-0.2081,3.2480,-0.3956,-0.3009,1.4100
pp9943,0.6392,0.6675,-0.7691,0.6864,0.5910,0.2402,0.5785,0.171300,1.1050,1.9860,...,-0.0999,0.6321,-0.5290,0.7756,0.1497,-0.7347,0.2909,-0.0691,-0.0973,-0.1033
tiga1,-0.0006,0.1016,-0.2041,0.4131,-0.6149,0.6816,0.6704,-0.548604,-0.2400,0.2293,...,-0.4702,0.6875,-0.3896,0.3311,0.7317,-0.5474,-0.7627,-0.0094,-0.2933,-1.2720
tmp_locus_39,0.3587,-0.1325,-0.0950,0.8296,-0.0593,0.2462,-0.3285,0.219193,0.1225,1.0350,...,0.2233,0.2605,-0.0247,0.4627,0.0159,-0.4504,0.1377,-0.0591,0.3669,-1.2720


In [146]:
# Module averages computed in this notebook, with rows selected by the 'Title'
# of each sample in ductal_samples.
a = mod_expr.loc[clinical.loc[ductal_samples, 'Title']]
# Columns of the clinical table named after the same modules, for the same
# samples. NOTE(review): presumably precomputed module scores kept for
# comparison against `a` — confirm.
b = clinical.loc[ductal_samples, gene_module_dict.keys()]