## Generation of Archer and Cavalli Signatures for GSVA
Used to generate data for Fig 2

Author: Max Gold

In [12]:
import os
import pandas as pd
from scipy import stats
from sklearn.preprocessing import quantile_transform

In [3]:
# Directory containing the input files; every load below joins a file name onto it.
base_folder = '../data/'

## Proteomics

In [4]:
## Load proteomics
# The first column of the file becomes the row index.
prot_path = os.path.join(base_folder, "archer_prot_norm.csv.gz")
pdf = pd.read_csv(prot_path, index_col=0)

In [5]:
## Load metadata
meta_path = os.path.join(base_folder, "Archer_metadata.xlsx")
mdf = pd.read_excel(meta_path, index_col=0)

# Row labels annotated as SHHa / SHHb in the 'Proteome' column, restricted to
# labels that are also present in the proteomics table.
prot_ids = set(pdf.index)
shha = prot_ids & set(mdf.loc[mdf['Proteome'] == 'SHHa'].index)
shhb = prot_ids & set(mdf.loc[mdf['Proteome'] == 'SHHb'].index)

In [6]:
## A vs. B t-test for every gene
# `shha` / `shhb` are Python sets. pandas >= 2.0 raises TypeError when a set is
# used as a .loc indexer, so the rows are selected with a boolean mask instead.
# The selection does not depend on the column, so it is done once, outside the
# loop; rows keep the order they have in `pdf`.
a_rows = pdf.loc[pdf.index.isin(shha)]
b_rows = pdf.loc[pdf.index.isin(shhb)]

# One independent two-sample t-test per column of `pdf`.
# t > 0 means the SHHa mean is larger than the SHHb mean for that column.
tdl = []
for gene in pdf.columns:
    t, pval = stats.ttest_ind(a_rows[gene], b_rows[gene])
    tdl.append([t, pval])
tdf = pd.DataFrame(tdl, index=pdf.columns, columns=['t', 'p'])

In [7]:
## Get top 200 genes for both and save to CSV
n_top = 200

# Largest t first -> SHHa signature; smallest t first -> SHHb signature.
adf = tdf.sort_values(by='t', ascending=False).iloc[:n_top]
bdf = tdf.sort_values(by='t').iloc[:n_top]

# Export is currently disabled; uncomment to write the two signature tables.
# adf.to_csv('shha_prot_sigs.csv')
# bdf.to_csv('shhb_prot_sigs.csv')

## Cavalli

In [8]:
## Import cav_df
cav_path = os.path.join(base_folder, 'cavalli_expression.txt.gz')
symbol_col = 'HGNC_symbol_from_ensemblv77'
# Annotation columns that are removed so only expression values remain.
badcols = ['Description', 'EnsemblGeneID_from_ensemblv77', 'HGNC_symbol_from_ensemblv77', 'HGNC_ID_from_ensemblv77']

cav_df = pd.read_table(cav_path, index_col=0)
# Re-key the rows by HGNC symbol, then discard the annotation columns.
cav_df = cav_df.set_index(symbol_col, drop=False).drop(columns=badcols)
# Rows sharing a symbol are averaged; the transpose turns the former columns
# into rows, which is how later cells index this table.
cav_df = cav_df.groupby(level=0).mean().transpose()

In [9]:
# Annotation sheet with 'Subgroup' and 'Subtype' columns; the first spreadsheet
# row is skipped and the first column becomes the index.
ann_path = os.path.join(base_folder, "cavalli_subtypes.xlsx")
cav_ann = pd.read_excel(ann_path, index_col=0, skiprows=[0])

  warn(msg)


In [10]:
# Index labels of every SHH-subgroup entry in the annotation table.
cav_shh = cav_ann.index[(cav_ann['Subgroup'] == 'SHH').to_numpy()].tolist()

# Index labels of each individual SHH subtype.
subtype_col = cav_ann['Subtype']
cav_gamma, cav_beta, cav_delta, cav_alpha = (
    cav_ann.index[(subtype_col == label).to_numpy()].tolist()
    for label in ('SHH_gamma', 'SHH_beta', 'SHH_delta', 'SHH_alpha')
)

In [13]:
## quantile normalize
def get_qt(xdf):
    """Quantile-transform each row of ``xdf`` onto a uniform distribution.

    Calls ``quantile_transform`` with ``axis=1`` (row-wise), one quantile per
    column of ``xdf`` and a fixed ``random_state`` of 0.

    Returns a new DataFrame carrying the same index and columns as ``xdf``.
    """
    transformed = quantile_transform(
        xdf,
        axis=1,
        n_quantiles=xdf.shape[1],
        output_distribution='uniform',
        random_state=0,
    )
    return pd.DataFrame(transformed, index=xdf.index, columns=xdf.columns)


# Only the SHH rows are normalised.
shh_expr = cav_df.loc[cav_shh]
qcav = get_qt(shh_expr)



In [14]:
# Display name of each SHH subtype -> list of its row labels.
xd = {
    'Gamma': cav_gamma,
    'Beta': cav_beta,
    'Delta': cav_delta,
    'Alpha': cav_alpha,
}

In [15]:
## calculate t-values for each group vs. other SHH
# btd maps each subtype name to a DataFrame with columns 'gene', 't', 'p',
# one row per column of `qcav`, in `qcav` column order.
btd = {}
genes = list(qcav.columns)
for k, v in xd.items():
    in_group = qcav.loc[v].to_numpy()   # rows belonging to this subtype
    others = qcav.drop(v).to_numpy()    # every remaining row of qcav
    # axis=0 runs one independent two-sample t-test per column in a single
    # call, replacing a Python-level loop over every gene.
    # t > 0 means the subtype mean is larger than the mean of the other rows.
    t_vals, p_vals = stats.ttest_ind(in_group, others, axis=0)
    btd[k] = pd.DataFrame({'gene': genes, 't': t_vals, 'p': p_vals})
        

In [16]:
## Output top 200 genes as possible signature
# The file export is disabled. When enabled it writes, per subtype, the 'gene'
# values of the 200 rows with the largest t, one per line.
for subtype in btd:
#     with open(subtype + '_cavalli_sigs_200.csv', 'w') as fh:
#         top = btd[subtype].sort_values('t', ascending=False).head(200)
#         for gene in top['gene']:
#             fh.write(gene + '\n')
    print(subtype, end=', ')

Gamma, Beta, Delta, Alpha, 