# Figure 1 and 2 (GSEA across tissues)

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
from sklearn.metrics import explained_variance_score
import pandas as pd
import numpy as np
from plotnine import *
from data_utils.data_handling import DataHandler
from scipy.stats import pearsonr, spearmanr

In [None]:
import rpy2
%load_ext rpy2.ipython

In [None]:
%%R
suppressPackageStartupMessages({
  library(data.table)
  library(ggplot2)
  library(ggthemes)
  library(ggrepel)
  library(ggpointdensity)
  library(viridis)
  library(cowplot)
})


In [None]:
data_path = '../../data/'

## Processing and loading data

Let's load the codon frequencies per transcript:

The DataHandler class computes sequence features and other attributes of each transcript from sequence data loaded into a file (loaded_data_gencode19)

In [None]:
data_handler = DataHandler(data_path+'loaded_data_gencode19')

In [None]:
codon_freq_df = data_handler.get_codon_ratio()
codon_freq_df

In [None]:
codons_to_use = list(codon_freq_df.columns.drop(['TAA', 'TAG', 'TGA'])) # Stop codons not considered

In [None]:
chromosome_df = data_handler.get_mrna_features(include_non_aug_start=False).loc[:, ['chromosome']]
chromosome_df

In [None]:
codon_freq_df = codon_freq_df.join(chromosome_df, how='inner')
codon_freq_df

Relative mRNA half-lives per subtissue

In [None]:
ei_centered_df = pd.read_csv(data_path+'gtex_ei_ratio_subtissues_85_percent_non_nas.csv', index_col=0)
ei_centered_df

In [None]:
# For every row: spread between the 90th and the 10th percentile of its values across columns.
decile_span_per_row = ei_centered_df.apply(
    lambda row: row.quantile(.9) - row.quantile(.1), axis=1)
# Median spread over all rows, back-transformed with 2** into a fold change (shown as the cell output).
2 ** decile_span_per_row.median()

Typically mRNA stability varies 2.7-fold between tissues (median fold change between lowest and highest decile)

Reference decoding rates (geometric mean of the codon decoding rates estimated in HEK293 (Tuller et al)):

In [None]:
# Space-separated file without header: first column is the transcript id (used as index),
# second column is stored as 'mtdr'.
# data_path already ends with '/', so no leading slash is added to the file name
# (consistent with the other loads in this notebook).
dec_rates_df = pd.read_csv(data_path + 'gencode19_avg_dec_rate', sep=' ', names=['transcript_id','mtdr'], index_col=0)
dec_rates_df

In [None]:
ei_centered_df.join(dec_rates_df, how='inner').to_csv('../../figures/figure_data/fig1/ei_centered_dec_rates_subtissues.csv')

Major-tissue-level EI ratios

In [None]:
ei_ratio_major_tissue_df = pd.read_csv(data_path+'gtex_ei_ratio_major_tissues_85_percent_non_nas.csv', index_col=0)
ei_ratio_major_tissue_df

In [None]:
all_atributes = pd.read_csv(data_path + 'GTEx_Annotations.csv')

In [None]:
# One row per distinct (SMTSD, SMTS) pair found in the annotation table,
# with the two columns renamed to 'subtissue' and 'major tissue'.
tissue_subtissue_df = (
    all_atributes[['SMTSD', 'SMTS']]
    .drop_duplicates()
    .rename(columns={'SMTSD':'subtissue', 'SMTS':'major tissue'})
)
tissue_subtissue_df

## Visualizing codon frequency vs EI ratio

In [None]:
model_data_df = ei_centered_df.join(codon_freq_df, how='inner')
model_data_df

In [None]:
def quantiles_to_string(quantile):
    """Render an interval as a percentage-range label such as '1.25-2.50%'.

    `quantile` is any object exposing numeric `left` and `right` attributes
    (e.g. the intervals produced by `pd.qcut`). Both bounds are multiplied by
    100 and printed with two decimals; a left bound that is not >= 0 is shown as 0.
    """
    if quantile.left >= 0:
        lower_pct = quantile.left * 100
    else:
        # qcut extends the lowest bin slightly below the minimum; report it as 0.
        lower_pct = 0.0
    upper_pct = quantile.right * 100
    return '{:.2f}-{:.2f}%'.format(lower_pct, upper_pct)

In [None]:
def boxplot_tissues_codon_freq(df, tissues_of_interest, codon, quantiles=25, xlab=None, ylab=None, legend_position=(.75,.825)):
    """Plot, per codon-frequency bin, the median of ``2**value`` for each tissue.

    Despite its name this draws lines and points with error bars (one line per
    tissue), not box plots.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column ``codon`` and one column per tissue of interest.
        Tissue values are exponentiated with ``2**`` before aggregation
        (presumably they are log2 relative half-lives -- confirm with the data source).
    tissues_of_interest : sequence of str
        Tissue columns to plot. Only two colours are defined in the colour scale.
    codon : str
        Column of ``df`` holding the codon frequency that is binned with ``pd.qcut``.
    quantiles : int
        Number of bins requested from ``pd.qcut`` (duplicate edges are dropped).
    xlab, ylab : str or None
        Axis labels; default to ``'<codon> frequency'`` and ``'Half-life fold change'``.
    legend_position : tuple
        Passed to the plotnine theme.

    Returns
    -------
    The plotnine ``ggplot`` object.
    """
    tissues_of_interest = list(tissues_of_interest)  # accept any sequence, not only lists
    quantile_col = f'{codon}_freq_quantile'

    # Explicit copy: columns are added and overwritten below, which must neither
    # touch the caller's frame nor trigger pandas' SettingWithCopyWarning.
    tissues_codon_df = df.loc[:, [codon] + tissues_of_interest].copy()
    tissues_codon_df[quantile_col] = pd.qcut(tissues_codon_df[codon], q=quantiles, duplicates='drop')
    # Replace each bin by its 'left-right%' label (bounds of the codon frequency bin times 100).
    tissues_codon_df[quantile_col] = tissues_codon_df[quantile_col].apply(quantiles_to_string)

    tissues_codon_df.loc[:, tissues_of_interest] = 2**tissues_codon_df.loc[:, tissues_of_interest]

    # NOTE: the column is called 'std' but holds std / sqrt(n), i.e. the standard error of the mean per bin.
    tissues_std = tissues_codon_df.groupby(quantile_col)[tissues_of_interest].apply(lambda group: group.std()/np.sqrt(len(group))).reset_index()
    tissues_std = tissues_std.melt(id_vars=quantile_col, value_name='std', var_name='tissue').set_index([quantile_col, 'tissue'])

    tissues_median = tissues_codon_df.groupby(quantile_col)[tissues_of_interest].median().reset_index()
    tissues_median = tissues_median.melt(id_vars=quantile_col, value_name='median', var_name='tissue').set_index([quantile_col, 'tissue'])

    # One row per (bin, tissue) with both the 'std' and the 'median' column.
    tissues_q_median_df = pd.concat([tissues_std, tissues_median], axis=1).reset_index()

    xlab = codon + ' frequency' if xlab is None else xlab
    ylab = 'Half-life fold change' if ylab is None else ylab

    p = (ggplot(tissues_q_median_df, aes(x=quantile_col, y='median', color='tissue', group='tissue'))
            + geom_line(size=1)
            + geom_point(size=2.5)
            + geom_errorbar(aes(x=quantile_col, ymin='median-std', ymax='median+std'),size=1)
            + theme_bw()
            + theme(axis_text_x = element_text(angle=45), text=element_text(size=14), figure_size=(10,8), legend_position=legend_position,  legend_title = element_text(size = 20),
                     axis_title_x=element_text(size=19), axis_title_y=element_text(size=20),
                    panel_border = element_blank(),
                    panel_background = element_blank())
            + scale_y_continuous(breaks=[0.5,1,1.5,2],limits=[0.5,2])
            + labs(x=xlab, y=ylab, color='')
            + scale_color_manual(values=["#A7C7E7", "#D26081"]))

    return p

In [None]:
codon='CTG'
tissues_of_interest=['Skin - Not Sun Exposed (Suprapubic)', 'Brain - Frontal Cortex (BA9)']
plot_df = ei_centered_df.join(codon_freq_df, how='inner')

In [None]:
def boxplot_tissues_codon_freq2(df, tissues_of_interest, codon, quantiles=25, xlab=None, ylab=None, legend_position=(.75,.825)):
    """Plot per-bin mean tissue values against the mean codon frequency of each bin.

    For every codon-frequency bin (``pd.qcut``) and tissue, the mean value and its
    standard error are computed on the input scale and drawn as ``2**mean`` with
    error bars ``2**(mean -/+ se)`` on a log10 y axis. A linear fit
    (``geom_smooth(method='lm')``) of ``2**value`` on the un-binned codon frequency
    is drawn per tissue as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column ``codon`` and one column per tissue of interest
        (presumably log2 relative half-lives -- confirm with the data source).
    tissues_of_interest : sequence of str
        Tissue columns to plot. Only two colours are defined in the colour scale.
    codon : str
        Column of ``df`` holding the codon frequency.
    quantiles : int
        Number of bins requested from ``pd.qcut`` (duplicate edges are dropped).
    xlab, ylab : str or None
        Axis labels; default to ``'<codon> frequency'`` and ``'Half-life fold change'``.
    legend_position : tuple
        Passed to the plotnine theme.

    Returns
    -------
    The plotnine ``ggplot`` object.
    """
    tissues_of_interest = list(tissues_of_interest)  # accept any sequence, not only lists
    quantile_col = f'{codon}_freq_quantile'

    # Get quantiles for codon frequencies and their respective mean.
    # Explicit copy: a column is added below, which must neither touch the caller's
    # frame nor trigger pandas' SettingWithCopyWarning.
    tissues_codon_df = df.loc[:, [codon] + tissues_of_interest].copy()
    tissues_codon_df[quantile_col] = pd.qcut(tissues_codon_df[codon],q=quantiles, duplicates='drop')
    avg_codon_freq_df = tissues_codon_df.groupby([quantile_col]).mean()[codon].to_frame().rename(
        {codon:'mean_freq'}, axis=1)
    # Long format (one row per observation and tissue) used for the linear fit.
    tissues_codon_melted_df = tissues_codon_df.melt(id_vars=[codon, quantile_col], var_name='tissue', value_name='rel_hl')

    # Standard error of the mean (stored in a column named 'std') and mean per bin and tissue.
    tissues_std = tissues_codon_df.groupby(quantile_col)[tissues_of_interest].apply(lambda group: group.std()/np.sqrt(len(group))).reset_index()
    tissues_std = tissues_std.melt(id_vars=quantile_col, value_name='std', var_name='tissue').set_index([quantile_col, 'tissue'])

    tissues_mean = tissues_codon_df.groupby(quantile_col)[tissues_of_interest].mean().reset_index()
    tissues_mean = tissues_mean.melt(id_vars=quantile_col, value_name='mean', var_name='tissue').set_index([quantile_col, 'tissue'])

    tissues_q_mean_df = pd.concat([tissues_std, tissues_mean], axis=1).reset_index()

    # Attach the mean codon frequency of each bin, used as the x position of points and error bars.
    tissues_q_mean_df = tissues_q_mean_df.merge(
        avg_codon_freq_df, left_on=quantile_col,right_index=True)

    xlab = codon + ' frequency' if xlab is None else xlab
    ylab = 'Half-life fold change' if ylab is None else ylab

    p = (ggplot(tissues_q_mean_df, aes(x='mean_freq', y='2**mean',color='tissue',group='tissue'))
            + geom_smooth(data=tissues_codon_melted_df, 
                          mapping=aes(x=codon, y='2**rel_hl', color='tissue'), method='lm')
            + geom_errorbar(aes(x='mean_freq', ymin='2**(mean-std)', ymax='2**(mean+std)'),size=1,width = .0025)
            + geom_point(size=2.5)
            + theme_bw()
            + theme(text=element_text(size=14), figure_size=(10,8), legend_position=legend_position,  
                    legend_title = element_text(size = 20),
                     axis_title_x=element_text(size=19), axis_title_y=element_text(size=20),
                    panel_border = element_blank(),
                    panel_background = element_blank())
            + scale_y_log10(breaks=[0.5,1,1.5,2])
            # y limits are given as log10 values because the y scale is log10.
            + coord_cartesian(xlim=(0,0.1),ylim=(np.log10(0.5),np.log10(2)))
            + labs(x=xlab, y=ylab, color='')
            + scale_color_manual(values=["#A7C7E7", "#D26081"]))

    return p

In [None]:
p = boxplot_tissues_codon_freq2(plot_df, tissues_of_interest, codon=codon, quantiles=15, ylab='Relative mRNA half-life',
                          xlab='\nCUG frequency', legend_position=(.75,.85))
p

In [None]:
p.save('../../figures/plots/fig1/cug_rel_hl_skin_brain.png')

In [None]:
p = boxplot_tissues_codon_freq(plot_df, tissues_of_interest, codon=codon, quantiles=15, ylab='Relative mRNA half-life',
                          xlab='\nCUG frequency', legend_position=(.75,.85))
p

## Computing COMD coefficient and evaluating the predictive power of reference decoding rate on relative mRNA HL

In [None]:
model_data_df = ei_centered_df.join(chromosome_df, how='inner')
model_data_df = model_data_df.join(dec_rates_df, how='inner')
model_data_df

In [None]:
model_data_df.mtdr.quantile(.96) - model_data_df.mtdr.quantile(.04) 

~1 codon per second corresponds to the difference between the 4th and the 96th percentile of the reference decoding rate (i.e. between the slowest and the fastest 4% of transcripts).

### Predictive power of reference decoding rate on mRNA half-life

In [None]:
def uni_var_linreg_tissues(tissues, train_df, test_df, feature):
    """Fit one single-feature linear regression per tissue and evaluate it.

    For each tissue, rows with a missing tissue value or feature are dropped, a
    ``LinearRegression`` of the tissue column on ``feature`` is fitted on
    ``train_df`` and evaluated (explained variance, Pearson r between prediction
    and ground truth) on ``test_df``.

    Parameters
    ----------
    tissues : iterable of str
        Column names of the response variables.
    train_df : pandas.DataFrame
        Must contain ``feature`` and every tissue column.
    test_df : pandas.DataFrame or None
        Evaluation data with the same columns. If ``None``, the model is
        evaluated on the rows it was trained on.
    feature : str
        Name of the single predictor column.

    Returns
    -------
    coefs_df : pandas.DataFrame
        Columns ``tissue`` and ``coef`` (fitted slope), sorted by ``coef``.
    test_preds_df : pandas.DataFrame
        Columns ``ground_truth``, ``prediction`` and ``tissue`` for all evaluated rows.
    model_df : pandas.DataFrame
        Columns ``tissue``, ``model``, ``exp_var_test`` and ``r_test``.
    """
    tissues = list(tissues)  # materialise once: iterated below and reused in the result frames

    models_dict = {'tissue': tissues, 'model': [], 'exp_var_test': [], 'r_test':[]}
    test_preds_list = []
    coefs_list = []

    for tissue in tissues:

        train_tissue_df = train_df.loc[:, [tissue, feature]].dropna()
        if test_df is None:
            # Evaluate on the training rows, on a copy so that adding the
            # 'prediction' column below does not modify the training frame.
            test_tissue_df = train_tissue_df.copy()
        else:
            test_tissue_df = test_df.loc[:, [tissue, feature]].dropna().copy()

        model = LinearRegression()
        model.fit(train_tissue_df[feature].values.reshape(-1, 1), train_tissue_df[tissue].values)

        test_tissue_df['prediction'] = model.predict(test_tissue_df[feature].values.reshape(-1, 1))
        exp_var_test = explained_variance_score(test_tissue_df[tissue].values, test_tissue_df['prediction'])
        pearson_test = pearsonr(test_tissue_df[tissue].values, test_tissue_df['prediction'])[0]

        models_dict['model'].append(model)
        models_dict['r_test'].append(pearson_test)
        models_dict['exp_var_test'].append(exp_var_test)

        test_tissue_preds_df = test_tissue_df.loc[:, [tissue, 'prediction']].rename({tissue:'ground_truth'}, axis=1)
        test_tissue_preds_df['tissue'] = tissue
        test_preds_list.append(test_tissue_preds_df)
        coefs_list.append(model.coef_[0])

    coefs_df = pd.DataFrame({'tissue':tissues, 'coef':coefs_list}).sort_values(by='coef')
    test_preds_df = pd.concat(test_preds_list)
    model_df = pd.DataFrame(models_dict)

    return coefs_df, test_preds_df, model_df

Let's compute the linear regression on random chromosome test sets in order to get an estimate of the error in the performance metrics

In [None]:
n_test_sets = 10
# Seeded generator so that the sampled chromosome splits (and the resulting
# mean / std of the performance metrics) are reproducible across runs.
rng = np.random.default_rng(0)
all_chromosomes = model_data_df['chromosome'].unique()

model_df_list = []
for test_set_i in range(n_test_sets):

    bad_test_set_size = True
    while bad_test_set_size:
        # Sample from 3 to 6 chromosomes to belong to the test set, the size of test has to be between 0.2 and 0.25 of the 
        # total amount of points. `high` is exclusive, hence 7 to allow 6 chromosomes.
        n_test_chromosomes = rng.integers(low=3, high=7)
        test_chromosomes = rng.choice(all_chromosomes, size=n_test_chromosomes, replace=False)
        in_test_set = model_data_df['chromosome'].isin(test_chromosomes)
        train_df = model_data_df[~in_test_set]
        test_df = model_data_df[in_test_set]

        test_all_ratio = len(test_df)/len(model_data_df)
        if 0.2 <= test_all_ratio <= 0.25:
            bad_test_set_size = False
            print(test_chromosomes)
            print('test set ratio:', test_all_ratio)

    coefs_df, test_preds_df, model_df = uni_var_linreg_tissues(tissues=ei_centered_df.columns, train_df=train_df, test_df=test_df, feature='mtdr')

    # Index by tissue and tag every metric column with the test-set number so that
    # the per-split frames can be concatenated side by side.
    model_df.set_index('tissue', inplace=True)
    model_df.columns = model_df.columns + f'_test_set_{test_set_i}'
    model_df_list.append(model_df)

In [None]:
models_df = pd.concat(model_df_list, axis=1)

In [None]:
# Select only the per-split columns ('r_test_test_set_<i>'). A looser '^r_test' pattern would
# also match the 'r_test_std' / 'r_test_mean' summary columns added here, which would leak the
# std into the mean (and both into the statistics when the cell is re-run).
r_test_split_cols = models_df.columns[models_df.columns.str.contains(r'^r_test_test_set_\d+$')]
r_test_per_split = models_df.loc[:, r_test_split_cols]
models_df['r_test_std'] = r_test_per_split.std(axis=1)
models_df['r_test_mean'] = r_test_per_split.mean(axis=1)

In [None]:
# Select only the per-split columns ('exp_var_test_test_set_<i>'). A plain 'exp_var_test'
# substring match would also pick up the 'exp_var_test_std' / 'exp_var_test_mean' summary
# columns added here, which would leak the std into the mean.
exp_var_split_cols = models_df.columns[models_df.columns.str.contains(r'^exp_var_test_test_set_\d+$')]
exp_var_per_split = models_df.loc[:, exp_var_split_cols]
models_df['exp_var_test_std'] = exp_var_per_split.std(axis=1)
models_df['exp_var_test_mean'] = exp_var_per_split.mean(axis=1)

In [None]:
# Bring 'tissue' back as a regular column and order its categories by the mean
# test-set correlation so that the bars are drawn in increasing order.
models_df = models_df.reset_index()
tissues_by_mean_r = models_df.sort_values(by='r_test_mean')['tissue']
models_df['tissue'] = pd.Categorical(values=models_df['tissue'].values,
                                     categories=tissues_by_mean_r, ordered=True)

# Bar = mean Pearson r over the test sets, error bar = +/- one standard deviation.
(ggplot(models_df, aes('tissue', 'r_test_mean'))
    + geom_col(fill='grey', color='white')
    + geom_errorbar(aes(ymin='r_test_mean-r_test_std', ymax='r_test_mean+r_test_std')) 
    + theme_bw()
    + theme(panel_grid_major_x = element_blank(),
            panel_background = element_blank(),
            axis_line=element_line(), 
            axis_text_x = element_text(angle=50, size=9,hjust = 1), figure_size=(9,6))
    + labs(y='r test set'))

### Computing the COMD coefficient

#### Subtissue level

To do it we are going to consider the whole dataset and not divide into sets.

In [None]:
coefs_df, test_preds_df, model_df = uni_var_linreg_tissues(tissues=ei_centered_df.columns, train_df=model_data_df, test_df=None, feature='mtdr')

In [None]:
# The fitted slope per subtissue is the COMD coefficient; rename() returns a new frame,
# so coefs_df itself keeps its 'coef' column.
comd_df = coefs_df.rename(columns={'coef':'comd_coef'})

In [None]:
comd_df.to_csv(data_path + 'comd_coef_subtissues.csv')
comd_df.to_csv('../../figures/figure_data/fig1/comd_coef_subtissues.csv')

In [None]:
model_df.reset_index(inplace=True)
model_df['tissue'] = pd.Categorical(values=model_df['tissue'].values, 
                                    categories=model_df.sort_values(by='r_test')['tissue'], ordered=True)

(ggplot(model_df, aes('tissue', 'r_test'))
    + geom_col(fill='grey', color='white')
    + theme_bw()
    + theme(panel_grid_major_x = element_blank(),
            panel_background = element_blank(),
            axis_line=element_line(), 
            axis_text_x = element_text(angle=50, size=9,hjust = 1), figure_size=(9,6))
    + labs(y='r all data'))

In [None]:
coefs_df['tissue'] = pd.Categorical(values=coefs_df['tissue'].values, categories=coefs_df.sort_values(by='coef')['tissue'], ordered=True)

(ggplot(coefs_df, aes('tissue', 'coef'))
    + geom_point(color='green', size=3)
    + scale_y_continuous(limits=(-0.6,0.7))
    + theme_bw()
    + theme( panel_grid_major = element_blank(), legend_title = element_text(size = 20),
                    #panel_grid_minor = element_blank(),
                    #panel_border = element_blank(),
                    panel_background = element_blank(),
                axis_line=element_line(), 
           axis_text_x = element_text(angle=50, size=10,hjust = 1), figure_size=(12,4), axis_title_x=element_text(size=16), axis_title_y=element_text(size=16),
         axis_text_y = element_text(size=11))
)

#### Major tissue level

In [None]:
model_data_df = ei_ratio_major_tissue_df.join(chromosome_df, how='inner')
model_data_df = model_data_df.join(dec_rates_df, how='inner')
model_data_df

To do it we are going to consider the whole dataset and not divide into sets.

In [None]:
coefs_df, test_preds_df, model_df = uni_var_linreg_tissues(tissues=ei_ratio_major_tissue_df.columns, train_df=model_data_df, test_df=None, feature='mtdr')

In [None]:
comd_major_tissue_df = coefs_df.copy()

In [None]:
comd_major_tissue_df.rename({'coef':'comd_coef'}, axis=1, inplace=True)

In [None]:
comd_major_tissue_df.to_csv(data_path + 'comd_coef_tissue.csv')

In [None]:
comd_major_tissue_df.to_csv('../../figures/figure_data/fig1/comd_coef_tissue.csv')

In [None]:
model_df.reset_index(inplace=True)
# Order the tissue categories by their correlation so that the bars are drawn in increasing order.
model_df['tissue'] = pd.Categorical(values=model_df['tissue'].values, 
                                    categories=model_df.sort_values(by='r_test')['tissue'], ordered=True)

# The major-tissue models were fitted and evaluated on the whole dataset (test_df=None),
# so the axis is labelled 'r all data', like the corresponding subtissue plot.
(ggplot(model_df, aes('tissue', 'r_test'))
    + geom_col(fill='grey', color='white')
    + theme_bw()
    + theme(panel_grid_major_x = element_blank(),
            panel_background = element_blank(),
            axis_line=element_line(), 
            axis_text_x = element_text(angle=50, size=9,hjust = 1), figure_size=(9,6))
    + labs(y='r all data'))

In [None]:
coefs_df['tissue'] = pd.Categorical(values=coefs_df['tissue'].values, categories=coefs_df.sort_values(by='coef')['tissue'], ordered=True)

(ggplot(coefs_df, aes('tissue', 'coef'))
    + geom_point(color='green', size=3)
    + scale_y_continuous(limits=(-0.6,0.6))
    + theme_bw()
    + theme( panel_grid_major = element_blank(), legend_title = element_text(size = 10),
                    #panel_grid_minor = element_blank(),
                    #panel_border = element_blank(),
                    panel_background = element_blank(),
                axis_line=element_line(), 
           axis_text_x = element_text(angle=50, size=10,hjust = 1), figure_size=(8,3), axis_title_x=element_text(size=16), axis_title_y=element_text(size=16),
         axis_text_y = element_text(size=11)))

## Heatmap of the slope between codon frequency and relative mRNA half-life 

In [None]:
codon_ei_df = ei_centered_df.join(codon_freq_df, how='inner')
codon_ei_df

Let's compute the slope of codon frequency vs HL

In [None]:
tissues = ei_centered_df.columns

# Slope of a univariate linear regression of each tissue column on each codon frequency.
# The slopes are collected in a dict and turned into a frame once, instead of merging
# a new frame per tissue. The loop variables have their own names so that the
# notebook-level `codon` used by the plotting cells is not overwritten.
slopes_by_tissue = {}
for slope_tissue in tissues:
    tissue_slopes = []
    for slope_codon in codons_to_use:
        # Only transcripts with both a codon frequency and a value for this tissue.
        codon_tissue_df = codon_ei_df.loc[:, [slope_codon, slope_tissue]].dropna()
        slope_model = LinearRegression()
        slope_model.fit(codon_tissue_df.loc[:, slope_codon].values.reshape(-1, 1),
                        codon_tissue_df.loc[:, slope_tissue])
        tissue_slopes.append(slope_model.coef_[0])
    slopes_by_tissue[slope_tissue] = tissue_slopes

# One row per codon: a 'codon' column followed by one slope column per tissue.
all_tissue_coefs_df = pd.DataFrame({'codon': list(codons_to_use), **slopes_by_tissue})

Order tissues by their COMD coefficient value:

In [None]:
all_tissue_coefs_df = all_tissue_coefs_df.set_index('codon').loc[:, comd_df.tissue]

In [None]:
import seaborn as sns

In [None]:
sns.clustermap(all_tissue_coefs_df, xticklabels=True, yticklabels=True, figsize=(11,11),method='average', row_cluster=True, col_cluster=False, 
                       cmap="RdBu_r", cbar_kws={'label': 'estimated effect\n on HL fold change'})

Loading codon optimality and speed metrics:

In [None]:
cdt_df = pd.read_csv(data_path+'cdt_tuller_organisms.csv', sep=';').dropna().set_index('Codon').loc[:, ['H. sapiens  HEK293']]
cdt_df.rename({'H. sapiens  HEK293':'decoding_time'}, axis=1, inplace=True)
cdt_df['decoding rate (HEK293)'] = 1/cdt_df['decoding_time']
csc_df = pd.read_csv(data_path+'csc_wu_etal.csv')
csc_df['average csc']=csc_df.loc[:, ['293T_endo', 'HeLa_endo', 'RPE_endo', '293T_ORFome','K562_ORFome']].mean(axis=1) 
csc_df.set_index('codon', inplace=True)

In [None]:
codon_metrics_df = pd.concat([cdt_df, csc_df], join='inner', axis=1)
codon_metrics_df

In [None]:
tissues = ei_centered_df.columns
# Codon-by-tissue slopes plus the HEK293 decoding rate of each codon, ordered by the slope in one reference tissue.
r_plot_df = all_tissue_coefs_df.join(codon_metrics_df.loc[:, 'decoding rate (HEK293)'], how='inner')
r_plot_df = r_plot_df.sort_values(by= 'Skin - Sun Exposed (Lower leg)')
sat_value=10
# Saturate the slopes to [-sat_value, sat_value]; clip() is the vectorised equivalent of the
# former element-wise applymap (which is deprecated in recent pandas). NaNs are left untouched.
r_plot_df.loc[:, tissues] = r_plot_df.loc[:, tissues].clip(lower=-sat_value, upper=sat_value)
r_plot_df.to_csv('../../figures/figure_data/fig1/codon_subtissue_effects_rplot.csv')

## Correlation between estimated codon effects per tissue and codon optimality metrics

In [None]:
metrics_of_interest = ['293T_endo', 'HeLa_endo', 'RPE_endo', '293T_ORFome','K562_ORFome','average csc','decoding rate (HEK293)']
all_tissue_coefs_df = all_tissue_coefs_df.join(codon_metrics_df.loc[:, metrics_of_interest], how='inner')

In [None]:
all_tissue_coefs_df.reset_index(inplace=True)
all_tissue_coefs_df

In [None]:
# Spearman correlation between all numeric columns (tissue slopes and codon metrics).
# The string 'codon' column is excluded explicitly: pandas >= 2.0 no longer drops
# non-numeric columns silently in corr() and raises instead.
cor_df = all_tissue_coefs_df.select_dtypes(include='number').corr(method='spearman')

In [None]:
# Correlation heatmap
import matplotlib.pyplot as plt
sns.set(font_scale=2)
# Spearman correlation between all numeric columns; the string 'codon' column is excluded
# explicitly because pandas >= 2.0 raises on non-numeric columns in corr().
# (Previously stored under the misspelled, unused name `corr_df`.)
cor_df = all_tissue_coefs_df.select_dtypes(include='number').corr(method='spearman')
fig, ax = plt.subplots(figsize=(15,30)) 

# Only the columns from position 49 onwards are shown -- presumably the codon metric
# columns that follow the tissue columns; TODO confirm that there are exactly 49 tissues.
cor_plot = sns.heatmap(cor_df.iloc[:,49:], cmap='RdBu', ax=ax, annot=True, cbar_kws={'label': 'correlation (\u03c1)','shrink': 0.2}, annot_kws={"fontsize":20})

In [None]:
fig.savefig("../../figures/plots/fig1/cor_optimality_dec_rate_codon_effects.png", dpi=200,bbox_inches="tight")

## Gene set enrichment analysis COMD coefficient vs gene expression

Loading tpms per transcript and tissue from gtex

In [None]:
tpm_df = pd.read_csv(data_path + 'gtex_official_median_tpm_per_tissue.csv').drop('gene_id', axis=1)
tpm_df

In [None]:
tpm_df = tpm_df.set_index('Description').T

In [None]:
tpm_df = tpm_df.merge(tissue_subtissue_df, left_index=True, right_on='subtissue').drop(['subtissue'], axis=1)
tpm_df

In [None]:
tpm_df = tpm_df.groupby('major tissue').median().T.reset_index().rename({'index':'Description'}, axis=1).set_index('Description')
tpm_df

Consider genes with TPM>1 across all tissues

In [None]:
# Mask every entry that is not > 1 with NaN, then drop all rows containing a NaN:
# only rows whose value exceeds 1 in every column are kept.
tpm_df = tpm_df.where(tpm_df > 1).dropna()
tpm_df

This gene appears more than once with the same name (snoRNA)

In [None]:
duplicated_genes = tpm_df.loc[tpm_df.index.duplicated()].index.unique()
duplicated_genes

Remove duplicated genes:

In [None]:
tpm_df = tpm_df.loc[tpm_df.index.difference(duplicated_genes)].T
tpm_df

In [None]:
exp_comd_df = comd_major_tissue_df.merge(tpm_df, left_on='tissue', right_index=True)
exp_comd_df

In [None]:
# Spearman correlation, across tissues, between the COMD coefficient and the TPM of each gene.
rho_list = [
    float(spearmanr(exp_comd_df['comd_coef'], exp_comd_df[gene])[0])
    for gene in tpm_df.columns
]

rho_df = pd.DataFrame({'gene': tpm_df.columns, 'rho': rho_list})
rho_df

In [None]:
rho_df = rho_df.dropna().sort_values('rho', ascending=False)
rho_df

Prepare the data table for input into the GSEA prerank function:

In [None]:
rho_df.columns = [0,1]
rho_df.reset_index(drop=True,inplace=True)
rho_df

In [None]:
# Pre-ranked gene set enrichment analysis on the gene ranking in rho_df
# (genes ordered by their correlation with the COMD coefficient) against the
# 'GO_Biological_Process_2018' gene set library.
import gseapy as gp
pre_res = gp.prerank(rnk=rho_df, gene_sets='GO_Biological_Process_2018',
                     processes=40,  # number of worker processes; NOTE(review): adjust to the available cores
                     permutation_num=40000,
                     # results are written below this directory as svg; fixed seed for the permutations
                     outdir='gsea_results/gtex_major_tissue_comd_coef_gsea', format='svg', seed=123)

There are genes with the same spearman correlation. Untying them is arbitrary, but the rank of those genes with respect to all is still the same. After running gsea untying the list by adding small random noise, the results barely change. 

In [None]:
pre_res.res2d.drop('Name', axis=1).set_index('Term').to_csv(data_path + 'gsea_human_comd_results.csv')

In [None]:
from gseapy.plot import gseaplot

term = 'mitochondrial ATP synthesis coupled electron transport (GO:0042775)'
gseaplot(rank_metric=pre_res.ranking, term=term, 
         **pre_res.results[term], figsize=(3.5,5.5), cmap='RdBu', ofname='../../figures/plots/fig2/mito_atp_syn_gsea.svg')

In [None]:
from gseapy.plot import gseaplot

term = 'mitochondrial ATP synthesis coupled electron transport (GO:0042775)'
gseaplot(rank_metric=pre_res.ranking, term=term, 
         **pre_res.results[term], figsize=(3.5,5.5), cmap='RdBu')

In [None]:
pre_res.res2d.sort_values(by="NES", ascending=True).iloc[:40]

In [None]:
pre_res.res2d.sort_values(by="NES", ascending=False).iloc[:40]

In [None]:
exp_comd_df.to_csv('../../figures/figure_data/fig2/tpm_comd_major_tissue.csv')

In [None]:
adjust_text_dict = {
    'expand_points': (0, 0),
    'arrowprops': {
        'arrowstyle': '-',
        'color': 'k',
        'lw':0.3
    }
}
gene='MT-RNR2'
(ggplot(exp_comd_df, aes('comd_coef', gene, label='tissue'))
    + geom_point(size=1, color="dodgerblue")
    + theme_bw()
    + theme(figure_size=(6, 6), axis_line=element_line(),#panel_grid_major = element_blank(),
                    #panel_grid_minor = element_blank(),
                    #panel_border = element_blank(),
                    panel_background = element_blank())
    + scale_y_log10()
    + geom_text(aes(label='tissue'), adjust_text = adjust_text_dict, size=7)
    + labs( y=gene+' TPM'))

In [None]:
adjust_text_dict = {
    'expand_points': (0, 0),
    'arrowprops': {
        'arrowstyle': '-',
        'color': 'k',
        'lw':0.3
    }
}
gene='NDUFB3'
(ggplot(exp_comd_df, aes('comd_coef', gene, label='tissue'))
    + geom_point(size=1, color="dodgerblue")
    + theme_bw()
    + theme(figure_size=(6, 6), axis_line=element_line(),#panel_grid_major = element_blank(),
                    #panel_grid_minor = element_blank(),
                    #panel_border = element_blank(),
                    panel_background = element_blank())
    + scale_y_log10()
    + geom_text(aes(label='tissue'), adjust_text = adjust_text_dict, size=7)
    + labs( y=gene+' TPM'))

## GSEA all results: human - between and across tissues plus mouse

In [None]:
pre_res_mouse = pd.read_csv(data_path + 'gsea_tabula_muris_comd_coef_results.csv',index_col=0)
pre_res_human = pd.read_csv(data_path +'gsea_human_comd_results.csv',index_col=0)

In [None]:
pre_res_mouse['tissue'] = 'mouse'
pre_res_human['tissue'] = 'human'

In [None]:
pre_res_human.reset_index()[pre_res_human.reset_index()['Term'].str.contains('translation')]

In [None]:
pre_res_tissues_df = pd.read_csv(data_path + 'gsea_within_tissues_human_comd_coef.csv', index_col=0).set_index('Term')
pre_res_tissues_df

In [None]:
pre_res_list = [pre_res_human, pre_res_mouse, pre_res_tissues_df]

In [None]:
fdr_thresh=0.01

# For each GSEA result table keep the terms with FDR q-value <= fdr_thresh, ordered by NES;
# tables without any term passing the threshold are left out.
# The loop variable is deliberately not called `pre_res`: that name holds the gseapy
# prerank result used by earlier cells and must not be overwritten.
pre_res_filtered_list = []
for gsea_table in pre_res_list:
    significant_terms = gsea_table[gsea_table['FDR q-val']<=fdr_thresh].sort_values(by='NES')
    if len(significant_terms) == 0:
        continue
    pre_res_filtered_list.append(significant_terms)

In [None]:
pre_res_all_df = pd.concat(pre_res_filtered_list, join='outer').reset_index()
pre_res_all_df

In [None]:
# Wide table of NES values: one row per 'tissue' label, one column per GO term.
pre_res_all_piv = pre_res_all_df.pivot(columns='Term', values='NES',index='tissue')
# Keep only the text before '(G', i.e. strip the trailing '(GO:...)' identifier.
# NOTE(review): the space preceding '(G' is kept, so labels may end with a blank -- confirm this is intended.
pre_res_all_piv.columns = pd.Series(pre_res_all_piv.columns).apply(lambda el: el.split('(G')[0])
# Transpose: terms become rows, 'tissue' labels become columns.
pre_res_all_piv = pre_res_all_piv.T
pre_res_all_piv

Select pathways common to both human and mouse (across tissues only)

In [None]:
# Keep only the pathways that have a value in both the 'human' and the 'mouse' column,
# then upper-case those two column names.
pre_res_all_piv_filtered = (
    pre_res_all_piv
    .dropna(subset=['human', 'mouse'])
    .rename(columns={'human':'HUMAN', 'mouse':'MOUSE'})
)

In [None]:
%%R -w 1200 -h 900 -i pre_res_all_piv_filtered
library(devtools)
library(pheatmap)
pheatmap(pre_res_all_piv_filtered, na_col = "grey", cluster_rows=FALSE, cluster_cols=FALSE, fontsize=12)

Remove tissues with all NAs:

In [None]:
pre_res_all_piv_filtered = pre_res_all_piv_filtered.dropna(axis=1,thresh=1)
pre_res_all_piv_filtered

In [None]:
pre_res_all_piv_filtered.to_csv('../../figures/figure_data/fig2/common_pathways_mouse_human_within_tissues.csv')