In [None]:
#imports
import pandas as pd
import ast
import matplotlib
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
# Seed NumPy's global random state once, up front, so that the random
# resampling performed later in the notebook is repeatable.
SEED = 133
np.random.seed(SEED)

In [None]:
# Expression data downloaded from GEO accession GSE50049:
#   https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE50049
# Reference:
#   Artieri CG, Fraser HB. Evolution at two levels of gene expression in yeast.
#   Genome Res 2014 Mar;24(3):411-21. PMID: 24318729

EXPRESSION_CSV = '/Users/clairedubin/spur/publishable_data/external_datasets/GSE50049_PROCESSED_DATA_10samples.csv'

df = pd.read_csv(EXPRESSION_CSV)
df.head()

In [None]:
#keep only the genes that were tested in dxy
DXY_CSV = '/Users/clairedubin/spur/peroxisomes/dxy/raw_data/1WineEuropean_dxy_110619.csv'

# Row labels live in the unnamed first column of the dxy table.
dxy_labels = pd.read_csv(DXY_CSV)['Unnamed: 0']
# The third '_'-separated field of each label is what gets matched against SGD.
dxy_genes = dxy_labels.str.split('_', expand=True)[2].tolist()

tested_in_dxy = df['SGD'].isin(dxy_genes)
df = df[tested_in_dxy]
df.head()

In [None]:
# readME from website:

# SCER_GENE	S. cerevisiae gene name according to the annotation of Scannell et al. 2011
# SPAR_GENE	S. paradoxus gene name according to the annotation of Scannell et al. 2011
# SGD	Saccharomyces Genome Database gene identifier ('NA' indicates that the gene is present in the Scannell et al. (2011) annotation, but absent from the SGD)
# SGD_SYSTEMATIC_ID	SGD systematic ID
# STANDARD_NAME	Standard three letter designation
# SCER_MAPPABLE_BASES	Number of unmasked bases in the Scer ortholog
# SPAR_MAPPABLE_BASES	Number of unmasked bases in the Spar ortholog
# HYBRID_SCER_MRNA_REP1_TOTCOV	The total base level coverage of the Scer allele in replicate 1 of the mRNA fraction
# HYBRID_SCER_MRNA_REP2_TOTCOV	as above, replicate 2
# HYBRID_SPAR_MRNA_REP1_TOTCOV	The total base level coverage of the Spar allele in replicate 1 of the mRNA fraction (Note that all total coverage Spar samples are scaled such that the total number of bases covered are the same between species)
# HYBRID_SPAR_MRNA_REP2_TOTCOV	as above, replicate 2
# HYBRID_MRNA_REP1_RAT	The started (coverage + 1) Log2 Sc/Sp ratio for mRNA fraction replicate 1
# HYBRID_MRNA_REP2_RAT	as above, replicate 2
# PARENTAL_SCER_MRNA_REP1_TOTCOV	The total base level coverage of the Scer allele in replicate 1 of the mRNA fraction
# PARENTAL_SCER_MRNA_REP2_TOTCOV	as above, replicate 2
# PARENTAL_SPAR_MRNA_REP1_TOTCOV	The total base level coverage of the Spar allele in replicate 1 of the mRNA fraction (Note that all total coverage Spar samples are scaled such that the total number of bases covered are the same between species)
# PARENTAL_SPAR_MRNA_REP2_TOTCOV	as above, replicate 2
# PARENTAL_MRNA_REP1_RAT	The started (coverage + 1) Log2 Sc/Sp ratio for mRNA fraction replicate 1
# PARENTAL_MRNA_REP2_RAT	as above, replicate 2
# HYBRID_SCER_RIBO_REP1_TOTCOV	The total base level coverage of the Scer allele in replicate 1 of the Ribo fraction
# HYBRID_SCER_RIBO_REP2_TOTCOV	as above, replicate 2
# HYBRID_SPAR_RIBO_REP1_TOTCOV	The total base level coverage of the Spar allele in replicate 1 of the Ribo fraction
# HYBRID_SPAR_RIBO_REP2_TOTCOV	as above, replicate 2
# HYBRID_RIBO_REP1_RAT	The started (coverage + 1) Log2 Sc/Sp ratio for Ribo fraction replicate 1
# HYBRID_RIBO_REP2_RAT	as above, replicate 2
# PARENTAL_SCER_RIBO_REP1_TOTCOV	The total base level coverage of the Scer allele in replicate 1 of the Ribo fraction
# PARENTAL_SCER_RIBO_REP2_TOTCOV	as above, replicate 2
# PARENTAL_SPAR_RIBO_REP1_TOTCOV	The total base level coverage of the Spar allele in replicate 1 of the Ribo fraction
# PARENTAL_SPAR_RIBO_REP2_TOTCOV	as above, replicate 2
# PARENTAL_RIBO_REP1_RAT	The started (coverage + 1) Log2 Sc/Sp ratio for Ribo fraction replicate 1
# PARENTAL_RIBO_REP2_RAT	as above, replicate 2
# MRNA_CIS_TEST	≥ 100 reads mapping to both orthologs in both replicates in the hybrid mRNA fraction (minimum to test for significant differences) (1 = yes, 0 = no)
# TRANSL_CIS_TEST	≥ 100 reads mapping to both orthologs in both replicates in the hybrid mRNA and Ribo fractions (minimum to test for significant differences) (1 = yes, 0 = no)
# PARENTAL_TEST	≥ 100 reads mapping to both orthologs in both replicates in all hybrid and parental mRNA fractions, and no orthologs differentially expressed due to mating type or ploidy state (1 = yes, 0 = no)
# MRNA_CIS_MAX_P	Max p value (2 tailed) obtained from the permutation approach testing for significant mRNA divergence in cis
# MRNA_CIS_DIR	Species with higher expression level
# MRNA_CIS_NO_TEST	Failed to test because: LOW_COV, < 100 reads in appropriate samples; REPS_DISAGREE, replicates disagree in direction
# TRANSL_CIS_MAX_P	Max p value (2 tailed) obtained from the permutation approach testing for significant translational divergence in cis
# TRANSL_CIS_DIR	Species with higher expression level
# TRANSL_CIS_NO_TEST	Failed to test because: LOW_COV, < 100 reads in appropriate samples; REPS_DISAGREE, replicates disagree in direction
# MRNA_TRANS_MAX_P	Max p value (2 tailed) obtained from the permutation approach testing for significant mRNA divergence in trans
# MRNA_TRANS_DIR	Species with higher expression level
# MRNA_TRANS_NO_TEST	Failed to test because: LOW_COV, < 100 reads in appropriate samples; REPS_DISAGREE, replicates disagree in direction
# TRANSL_TRANS_MAX_P	Max p value (2 tailed) obtained from the permutation approach testing for significant translational divergence in trans
# TRANSL_TRANS_DIR	Species with higher expression level
# TRANSL_TRANS_NO_TEST	Failed to test because: LOW_COV, < 100 reads in appropriate samples; REPS_DISAGREE, replicates disagree in direction
# MRNA_CIS_FDR_ADJ_P	FDR adjusted pvalue for the mRNA cis test
# TRANSL_CIS_FDR_ADJ_P	FDR adjusted pvalue for the translational cis test
# MRNA_TRANS_FDR_ADJ_P	FDR adjusted pvalue for the mRNA trans test
# TRANSL_TRANS_FDR_ADJ_P	FDR adjusted pvalue for the translational trans test
# MRNA_CIS_SCER	Significant mRNA divergence in cis where Scer is the more highly expressed allele? (1 = yes, 0 = no)
# MRNA_CIS_SPAR	Significant mRNA divergence in cis where Spar is the more highly expressed allele? (1 = yes, 0 = no)
# TRANSL_CIS_SCER	Significant translational divergence in cis where Scer is the more highly translated allele? (1 = yes, 0 = no)
# TRANSL_CIS_SPAR	Significant translational divergence in cis where Spar is the more highly translated allele? (1 = yes, 0 = no)
# MRNA_TRANS_SCER	Significant mRNA divergence in trans where Scer is the more highly expressed allele? (1 = yes, 0 = no)
# MRNA_TRANS_SPAR	Significant mRNA divergence in trans where Spar is the more highly expressed allele? (1 = yes, 0 = no)
# TRANSL_TRANS_SCER	Significant translational divergence in trans where Scer is the more highly translated allele? (1 = yes, 0 = no)
# TRANSL_TRANS_SPAR	Significant translational divergence in trans where Spar is the more highly translated allele? (1 = yes, 0 = no)
# HYBRID_SCER_MRNA_REP1_RPKM	Reads per Kilobase per Million Mapped Reads mapping to the Scer allele in the hybrid mRNA fraction, replicate 1
# HYBRID_SCER_MRNA_REP2_RPKM	as above, replicate 2
# HYBRID_SPAR_MRNA_REP1_RPKM	Reads per Kilobase per Million Mapped Reads mapping to the Spar allele in the hybrid mRNA fraction, replicate 1
# HYBRID_SPAR_MRNA_REP2_RPKM	as above, replicate 2
# HYBRID_SCER_RIBO_REP1_RPKM	Reads per Kilobase per Million Mapped Reads mapping to the Scer allele in the hybrid Ribo fraction, replicate 1
# HYBRID_SCER_RIBO_REP2_RPKM	as above, replicate 2
# HYBRID_SPAR_RIBO_REP1_RPKM	Reads per Kilobase per Million Mapped Reads mapping to the Spar allele in the hybrid Ribo fraction, replicate 1
# HYBRID_SPAR_RIBO_REP2_RPKM	as above, replicate 2
# PARENTAL_SCER_MRNA_REP1_RPKM	Reads per Kilobase per Million Mapped Reads mapping to the Scer allele in the parental mRNA fraction, replicate 1
# PARENTAL_SCER_MRNA_REP2_RPKM	as above, replicate 2
# PARENTAL_SPAR_MRNA_REP1_RPKM	Reads per Kilobase per Million Mapped Reads mapping to the Spar allele in the parental mRNA fraction, replicate 1
# PARENTAL_SPAR_MRNA_REP2_RPKM	as above, replicate 2
# PARENTAL_SCER_RIBO_REP1_RPKM	Reads per Kilobase per Million Mapped Reads mapping to the Scer allele in the parental Ribo fraction, replicate 1
# PARENTAL_SCER_RIBO_REP2_RPKM	as above, replicate 2
# PARENTAL_SPAR_RIBO_REP1_RPKM	Reads per Kilobase per Million Mapped Reads mapping to the Spar allele in the parental Ribo fraction, replicate 1
# PARENTAL_SPAR_RIBO_REP2_RPKM	as above, replicate 2
# AFFECTED_BY_PLOIDY	Gene differentially expressed in different ploidy states according to Wu et al. (2010) (1 = yes, 0 = no)
# AFFTECTED_BY_MATING_TYPE	Gene differentially expressed in different mating types according to Galitski et al. (1999) (1 = yes, 0 = no)
# TATA	TATA containing promoter according to Tirosh et al. (2006) (1 = yes, 0 = no)
# TIROSH_2008_OPN	OPN, DPN, or NA promoter according to Tirosh et al. (2008) (1 = yes, 0 = no)
# CDS_PDIV	Percent divergence in CDS
# PROMOTER_PDIV	Percent divergence in Promoter (200 bp upstream of TSS)
# FIVE_UTR_PDIV	Percent divergence in 5 prime UTR
# THREE_UTR_PDIV	Percent divergence in 3 prime UTR
# FIVE_UTR_50BP_PDIV	Percent divergence in first 50 bp of 5 prime UTR
# KA	Non-synonymous substitutions per non-synonymous site between S. cerevisiae and S. paradoxus
# KS	Synonymous substitutions per synonymous site between S. cerevisiae and S. paradoxus
# OMEGA	KA/KS

In [None]:
#load GO annotation data as go_terms (indexed by gene), dropping 3 broad GO terms
#source: http://geneontology.org/docs/download-go-annotations/

GO_TERMS_CSV = '/Users/clairedubin/spur/publishable_data/external_datasets/go_terms.csv'
UNUSED_GO_COLUMNS = [0, 1, 3, 5, 6, 7, 8, 11, 12, 13, 14, 15]
BROAD_GO_TERMS = ['GO:0005575', 'GO:0008150', 'GO:0003674']

# The file has no header row, so columns are addressed by position.
go_terms = (
    pd.read_csv(GO_TERMS_CSV, header=None)
    .drop(columns=UNUSED_GO_COLUMNS)
    .rename(columns={2: 'sgd_name', 4: 'go_term', 9: 'gene_desc', 10: 'gene'})
)

# The gene field is '|'-separated; only its first entry is kept.
go_terms['gene'] = go_terms['gene'].str.split('|').map(lambda parts: parts[0])

go_terms = go_terms.drop_duplicates()
is_broad_term = go_terms['go_term'].isin(BROAD_GO_TERMS)
go_terms = go_terms[~is_broad_term].set_index('gene')
go_terms.head()

In [None]:
#load the list of essential genes as essential_genes
#source: http://www-sequence.stanford.edu/group/yeast_deletion_project/Essential_ORFs.txt

ESSENTIAL_CSV = '/Users/clairedubin/spur/publishable_data/external_datasets/essential.csv'

essential = pd.read_csv(ESSENTIAL_CSV, header=None)

# Column 1 is the one used as the gene identifier. Tab padding is removed
# first (and stored back on the frame), then space padding is removed for the
# final list; the two strips are kept in this order on purpose.
tab_trimmed = essential[1].str.strip('\t')
essential[1] = tab_trimmed
essential_genes = tab_trimmed.str.strip(' ').tolist()

In [None]:
#partition the expression data by gene essentiality

# Boolean mask computed once and reused for both halves of the split.
is_essential = df['SGD'].isin(essential_genes)

essential_df = df[is_essential]
non_essential_df = df[~is_essential]

In [None]:
#merge GO term and expression data

# Columns carried over from the expression table into the merged frame.
merged_expression_cols = [
    'SGD', 'STANDARD_NAME',
    'PARENTAL_RIBO_REP1_RAT', 'PARENTAL_RIBO_REP2_RAT',
    'PARENTAL_MRNA_REP1_RAT', 'PARENTAL_MRNA_REP2_RAT',
    'PARENTAL_TEST',
    'HYBRID_MRNA_REP1_RAT', 'HYBRID_MRNA_REP2_RAT',
    'HYBRID_RIBO_REP1_RAT', 'HYBRID_RIBO_REP2_RAT',
]

# go_terms is indexed by gene; match that index against the SGD column of df.
# .copy() makes the column subset an independent frame so the assignment below
# does not write into a slice.
merged = go_terms.merge(df, left_index=True, right_on='SGD')[['go_term'] + merged_expression_cols].copy()

# Flag essential genes through the SGD column. With left_index/right_on the
# merged frame's index carries df's row labels rather than gene IDs, so
# testing the index against essential_genes would never match.
merged['essential'] = merged['SGD'].isin(essential_genes)
merged.head()

In [None]:
#group by GO term: per-term mean and median of every ratio column, plus
#replicate-combined absolute summaries

# Columns summarised per GO term (PARENTAL_TEST is included, as before).
summary_cols = [
    'PARENTAL_RIBO_REP1_RAT', 'PARENTAL_RIBO_REP2_RAT',
    'PARENTAL_MRNA_REP1_RAT', 'PARENTAL_MRNA_REP2_RAT',
    'PARENTAL_TEST',
    'HYBRID_MRNA_REP1_RAT', 'HYBRID_MRNA_REP2_RAT',
    'HYBRID_RIBO_REP1_RAT', 'HYBRID_RIBO_REP2_RAT',
]

# Build the grouping once and restrict it to the columns that are actually
# reported, instead of aggregating every column and discarding most of them.
by_go_term = merged.groupby('go_term')[summary_cols]

medians = by_go_term.median().add_suffix('_median')
means = by_go_term.mean().add_suffix('_mean')

# For each (source, fraction) pair, average the two replicate summaries and
# take the absolute value. The pair order fixes the output column order.
source_fraction_pairs = [
    ('PARENTAL', 'MRNA'),
    ('HYBRID', 'MRNA'),
    ('PARENTAL', 'RIBO'),
    ('HYBRID', 'RIBO'),
]
for stat_name, stat_df in (('median', medians), ('mean', means)):
    for source, fraction in source_fraction_pairs:
        replicate_cols = [
            '{}_{}_REP{}_RAT_{}'.format(source, fraction, rep, stat_name)
            for rep in (1, 2)
        ]
        combined_col = '{}_{}_RAT_{}_comb'.format(source, fraction, stat_name)
        stat_df[combined_col] = stat_df[replicate_cols].mean(axis=1).abs()

# One row per GO term: all *_mean columns followed by all *_median columns.
go_term_groups = means.merge(medians, right_index=True, left_index=True)

go_term_groups.head()

In [None]:
#get GO:0005778 data

# go_term_groups, as built from the mean/median summaries, has no
# 'all_gene_list' column, so looking it up fails on a fresh kernel. Collect
# the genes annotated with the term directly from go_terms (indexed by gene).
# NOTE(review): assumes 'all_gene_list' meant "every gene annotated with this
# GO term" - confirm against wherever that column was originally produced.
go5778_genes = go_terms.index[go_terms['go_term'] == 'GO:0005778'].unique().tolist()

go5778_cols = [
    'SGD', 'STANDARD_NAME',
    'PARENTAL_RIBO_REP1_RAT', 'PARENTAL_RIBO_REP2_RAT',
    'PARENTAL_MRNA_REP1_RAT', 'PARENTAL_MRNA_REP2_RAT',
    'PARENTAL_TEST',
    'HYBRID_MRNA_REP1_RAT', 'HYBRID_MRNA_REP2_RAT',
    'HYBRID_RIBO_REP1_RAT', 'HYBRID_RIBO_REP2_RAT',
]

# Expression rows for the genes in this GO term, restricted to the ratio columns.
go5778_df = df.loc[df['SGD'].isin(go5778_genes), go5778_cols]
go5778_df.head()

In [None]:
#resample function for mRNA profiling

def resample_abs_median(go_term, n_iter=10000):
    """Resampling test on the absolute median mRNA log-ratio of a GO term.

    Reads the row for ``go_term`` from ``go_term_groups_75`` and then, for
    ``n_iter`` iterations, draws (with replacement) as many rows from
    ``essential_df`` as the row's ``tested_essential_count`` and the remaining
    rows from ``non_essential_df``, so that each random set has the same size
    as the term's ``tested_gene_list``. For each random set the medians of the
    two replicate columns are averaged and the absolute value is taken, for
    the parental and the hybrid mRNA ratios separately.

    NOTE(review): ``go_term_groups_75`` (and its ``tested_gene_list`` /
    ``tested_essential_count`` columns) is not defined in the visible cells of
    this notebook; it must already exist in the kernel.

    Parameters
    ----------
    go_term : str
        Label of the row to look up in ``go_term_groups_75``.
    n_iter : int, default 10000
        Number of random gene sets to draw.

    Returns
    -------
    list of float
        ``[parent_fraction, hybrid_fraction]``: the fraction of random sets
        whose statistic is strictly greater than the value stored for the term.
        The same values are also printed.

    Raises
    ------
    ValueError
        If ``n_iter`` is smaller than 1.
    """
    if n_iter < 1:
        raise ValueError('n_iter must be a positive integer')

    loc = go_term_groups_75.loc[go_term]
    # The gene list is stored as the string repr of a Python list.
    gene_list = ast.literal_eval(loc['tested_gene_list'])
    essential_count = int(loc['tested_essential_count'])
    non_essential_count = len(gene_list) - essential_count

    my_parent_median = loc['PARENTAL_MRNA_RAT_median_comb']
    my_hybrid_median = loc['HYBRID_MRNA_RAT_median_comb']

    # Only these columns are read inside the loop; selecting them once keeps
    # every resampled frame small without changing which rows are drawn.
    ratio_cols = ['PARENTAL_MRNA_REP1_RAT', 'PARENTAL_MRNA_REP2_RAT',
                  'HYBRID_MRNA_REP1_RAT', 'HYBRID_MRNA_REP2_RAT']
    essential_pool = essential_df[ratio_cols]
    non_essential_pool = non_essential_df[ratio_cols]

    parent_count = 0
    hybrid_count = 0

    for _ in range(n_iter):

        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported way to stack the two draws.
        sample = pd.concat([
            essential_pool.sample(n=essential_count, replace=True),
            non_essential_pool.sample(n=non_essential_count, replace=True),
        ])

        parent_sample_median = np.abs(np.mean([sample['PARENTAL_MRNA_REP1_RAT'].median(), sample['PARENTAL_MRNA_REP2_RAT'].median()]))
        hybrid_sample_median = np.abs(np.mean([sample['HYBRID_MRNA_REP1_RAT'].median(), sample['HYBRID_MRNA_REP2_RAT'].median()]))

        if parent_sample_median > my_parent_median:
            parent_count += 1
        if hybrid_sample_median > my_hybrid_median:
            hybrid_count += 1

    parent_fraction = parent_count / n_iter
    hybrid_fraction = hybrid_count / n_iter

    print(go_term, parent_fraction, hybrid_fraction)
    return [parent_fraction, hybrid_fraction]

In [None]:
# Run the mRNA resampling test for GO:0005778; the function prints the term
# with its two fractions and the cell displays the returned [parent, hybrid] list.
resample_abs_median('GO:0005778')

In [None]:
#resample function for ribosomal profiling

def resample_abs_median_ribo(go_term, n_iter=10000):
    """Resampling test on the absolute median Ribo log-ratio of a GO term.

    Same procedure as ``resample_abs_median`` but on the ribosomal profiling
    ratio columns. Reads the row for ``go_term`` from ``go_term_groups_75``
    and then, for ``n_iter`` iterations, draws (with replacement) as many rows
    from ``essential_df`` as the row's ``tested_essential_count`` and the
    remaining rows from ``non_essential_df``, so that each random set has the
    same size as the term's ``tested_gene_list``. For each random set the
    medians of the two replicate columns are averaged and the absolute value
    is taken, for the parental and the hybrid Ribo ratios separately.

    NOTE(review): ``go_term_groups_75`` (and its ``tested_gene_list`` /
    ``tested_essential_count`` columns) is not defined in the visible cells of
    this notebook; it must already exist in the kernel.

    Parameters
    ----------
    go_term : str
        Label of the row to look up in ``go_term_groups_75``.
    n_iter : int, default 10000
        Number of random gene sets to draw.

    Returns
    -------
    list of float
        ``[parent_fraction, hybrid_fraction]``: the fraction of random sets
        whose statistic is strictly greater than the value stored for the term.
        The same values are also printed.

    Raises
    ------
    ValueError
        If ``n_iter`` is smaller than 1.
    """
    if n_iter < 1:
        raise ValueError('n_iter must be a positive integer')

    loc = go_term_groups_75.loc[go_term]
    # The gene list is stored as the string repr of a Python list.
    gene_list = ast.literal_eval(loc['tested_gene_list'])
    essential_count = int(loc['tested_essential_count'])
    non_essential_count = len(gene_list) - essential_count

    my_parent_median = loc['PARENTAL_RIBO_RAT_median_comb']
    my_hybrid_median = loc['HYBRID_RIBO_RAT_median_comb']

    # Only these columns are read inside the loop; selecting them once keeps
    # every resampled frame small without changing which rows are drawn.
    ratio_cols = ['PARENTAL_RIBO_REP1_RAT', 'PARENTAL_RIBO_REP2_RAT',
                  'HYBRID_RIBO_REP1_RAT', 'HYBRID_RIBO_REP2_RAT']
    essential_pool = essential_df[ratio_cols]
    non_essential_pool = non_essential_df[ratio_cols]

    parent_count = 0
    hybrid_count = 0

    for _ in range(n_iter):

        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported way to stack the two draws.
        sample = pd.concat([
            essential_pool.sample(n=essential_count, replace=True),
            non_essential_pool.sample(n=non_essential_count, replace=True),
        ])

        parent_sample_median = np.abs(np.mean([sample['PARENTAL_RIBO_REP1_RAT'].median(), sample['PARENTAL_RIBO_REP2_RAT'].median()]))
        hybrid_sample_median = np.abs(np.mean([sample['HYBRID_RIBO_REP1_RAT'].median(), sample['HYBRID_RIBO_REP2_RAT'].median()]))

        if parent_sample_median > my_parent_median:
            parent_count += 1
        if hybrid_sample_median > my_hybrid_median:
            hybrid_count += 1

    parent_fraction = parent_count / n_iter
    hybrid_fraction = hybrid_count / n_iter

    print(go_term, parent_fraction, hybrid_fraction)
    return [parent_fraction, hybrid_fraction]

In [None]:
# Run the ribosomal-profiling resampling test for GO:0005778; the function prints
# the term with its two fractions and the cell displays the returned [parent, hybrid] list.
resample_abs_median_ribo('GO:0005778')