In [1]:
#imports
import pandas as pd
import ast
import matplotlib
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
# Seed NumPy's global random number generator with a fixed value so that random
# draws made later in the notebook are repeatable across fresh runs.
# NOTE(review): presumably the DataFrame.sample calls in the resampling cells
# (no random_state passed) rely on this global seed - confirm.
np.random.seed(133)

In [3]:
# Data from: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE50049
# Artieri CG, Fraser HB. Evolution at two levels of gene expression in yeast.
# Genome Res 2014 Mar;24(3):411-21. PMID: 24318729

# Read the processed expression table into `df` (one row per gene, see the
# column descriptions in the README cell) and show the first rows.
df = pd.read_csv('/Users/clairedubin/spur/publishable_data/external_datasets/GSE50049_PROCESSED_DATA_10samples.csv')
df.head()

Unnamed: 0,SCER_GENE,SPAR_GENE,SGD,SGD_SYSTEMATIC_ID,STANDARD_NAME,SCER_MAPPABLE_BASES,SPAR_MAPPABLE_BASES,HYBRID_SCER_MRNA_REP1_TOTCOV,HYBRID_SCER_MRNA_REP2_TOTCOV,HYBRID_SPAR_MRNA_REP1_TOTCOV,...,Unnamed: 105,Unnamed: 106,Unnamed: 107,Unnamed: 108,Unnamed: 109,Unnamed: 110,Unnamed: 111,Unnamed: 112,Unnamed: 113,Unnamed: 114
0,Scer_1.103,Spar_1.85,YAR015W,S000000070,ADE1,360,360,35762,40462,47396.27694,...,,,,,,,,,,
1,Scer_1.104,Spar_1.86,YAR018C,S000000071,KIN3,954,954,15799,13767,17607.84533,...,,,,,,,,,,
2,Scer_1.105,Spar_1.87,YAR019C,S000000072,CDC15,2510,2510,13859,14691,10806.91053,...,,,,,,,,,,
3,Scer_1.26,Spar_1.6,,,,1032,1021,1053,860,2545.019496,...,,,,,,,,,,
4,Scer_1.27,Spar_1.7,YAL061W,S000000057,BDH2,1045,1045,31915,42252,3238.30423,...,,,,,,,,,,


In [4]:
# Restrict the expression table to genes that appear in the dxy table.
# The gene identifiers are taken from the first ('Unnamed: 0') column of the
# dxy CSV and matched against the SGD column of `df`; `df` is overwritten
# with the filtered rows.
dxy_table = pd.read_csv('/Users/clairedubin/spur/publishable_data/raw_data/dxy_raw_081319.csv')
dxy_genes = dxy_table['Unnamed: 0'].tolist()
in_dxy = df['SGD'].isin(dxy_genes)
df = df[in_dxy]

In [5]:
# readME from website:

# SCER_GENE	S. cerevisiae gene name according to the annotation of Scannell et al. 2011
# SPAR_GENE	S. paradoxus gene name according to the annotation of Scannell et al. 2011
# SGD	Saccharomyces Genome Database gene identifier ('NA' indicates that the gene is present in the Scannell et al. (2011) annotation, but absent from the SGD)
# SGD_SYSTEMATIC_ID	SGD systematic ID
# STANDARD_NAME	Standard three letter designation
# SCER_MAPPABLE_BASES	Number of unmasked bases in the Scer ortholog
# SPAR_MAPPABLE_BASES	Number of unmasked bases in the Spar ortholog
# HYBRID_SCER_MRNA_REP1_TOTCOV	The total base level coverage of the Scer allele in replicate 1 of the mRNA fraction
# HYBRID_SCER_MRNA_REP2_TOTCOV	as above, replicate 2
# HYBRID_SPAR_MRNA_REP1_TOTCOV	The total base level coverage of the Spar allele in replicate 1 of the mRNA fraction (Note that all total coverage Spar samples are scaled such that the total number of bases covered are the same between species)
# HYBRID_SPAR_MRNA_REP2_TOTCOV	as above, replicate 2
# HYBRID_MRNA_REP1_RAT	The started (coverage + 1) Log2 Sc/Sp ratio for mRNA fraction replicate 1
# HYBRID_MRNA_REP2_RAT	as above, replicate 2
# PARENTAL_SCER_MRNA_REP1_TOTCOV	The total base level coverage of the Scer allele in replicate 1 of the mRNA fraction
# PARENTAL_SCER_MRNA_REP2_TOTCOV	as above, replicate 2
# PARENTAL_SPAR_MRNA_REP1_TOTCOV	The total base level coverage of the Spar allele in replicate 1 of the mRNA fraction (Note that all total coverage Spar samples are scaled such that the total number of bases covered are the same between species)
# PARENTAL_SPAR_MRNA_REP2_TOTCOV	as above, replicate 2
# PARENTAL_MRNA_REP1_RAT	The started (coverage + 1) Log2 Sc/Sp ratio for mRNA fraction replicate 1
# PARENTAL_MRNA_REP2_RAT	as above, replicate 2
# HYBRID_SCER_RIBO_REP1_TOTCOV	The total base level coverage of the Scer allele in replicate 1 of the Ribo fraction
# HYBRID_SCER_RIBO_REP2_TOTCOV	as above, replicate 2
# HYBRID_SPAR_RIBO_REP1_TOTCOV	The total base level coverage of the Spar allele in replicate 1 of the Ribo fraction
# HYBRID_SPAR_RIBO_REP2_TOTCOV	as above, replicate 2
# HYBRID_RIBO_REP1_RAT	The started (coverage + 1) Log2 Sc/Sp ratio for Ribo fraction replicate 1
# HYBRID_RIBO_REP2_RAT	as above, replicate 2
# PARENTAL_SCER_RIBO_REP1_TOTCOV	The total base level coverage of the Scer allele in replicate 1 of the Ribo fraction
# PARENTAL_SCER_RIBO_REP2_TOTCOV	as above, replicate 2
# PARENTAL_SPAR_RIBO_REP1_TOTCOV	The total base level coverage of the Spar allele in replicate 1 of the Ribo fraction
# PARENTAL_SPAR_RIBO_REP2_TOTCOV	as above, replicate 2
# PARENTAL_RIBO_REP1_RAT	The started (coverage + 1) Log2 Sc/Sp ratio for Ribo fraction replicate 1
# PARENTAL_RIBO_REP2_RAT	as above, replicate 2
# MRNA_CIS_TEST	≥ than 100 reads mapping to both orthologs in both replicates in the hybrid mRNA fraction (minimum to test for significant differences) (1 = yes, 0 = no)
# TRANSL_CIS_TEST	≥ than 100 reads mapping to both orthologs in both replicates in the hybrid mRNA and Ribo fractions (minimum to test for significant differences) (1 = yes, 0 = no)
# PARENTAL_TEST	≥ than 100 reads mapping to both orthologs in both replicates in all hybrid and parental mRNA fractions, and no orthologs differentially expressed due to mating type or ploidy state(1 = yes, 0 = no)
# MRNA_CIS_MAX_P	Max p value (2 tailed) obtained from the permutation approach testing for significant mRNA divergence in cis
# MRNA_CIS_DIR	Species with higher expression level
# MRNA_CIS_NO_TEST	Failed to test because: LOW_COV, < 100 reads in appropriate samples; REPS_DISAGREE, replicates disagree in direction
# TRANSL_CIS_MAX_P	Max p value (2 tailed) obtained from the permutation approach testing for significant translational divergence in cis
# TRANSL_CIS_DIR	Species with higher expression level
# TRANSL_CIS_NO_TEST	Failed to test because: LOW_COV, < 100 reads in appropriate samples; REPS_DISAGREE, replicates disagree in direction
# MRNA_TRANS_MAX_P	Max p value (2 tailed) obtained from the permutation approach testing for significant mRNA divergence in trans
# MRNA_TRANS_DIR	Species with higher expression level
# MRNA_TRANS_NO_TEST	Failed to test because: LOW_COV, < 100 reads in appropriate samples; REPS_DISAGREE, replicates disagree in direction
# TRANSL_TRANS_MAX_P	Max p value (2 tailed) obtained from the permutation approach testing for significant translational divergence in trans
# TRANSL_TRANS_DIR	Species with higher expression level
# TRANSL_TRANS_NO_TEST	Failed to test because: LOW_COV, < 100 reads in appropriate samples; REPS_DISAGREE, replicates disagree in direction
# MRNA_CIS_FDR_ADJ_P	FDR adjusted pvalue for the mRNA cis test
# TRANSL_CIS_FDR_ADJ_P	FDR adjusted pvalue for the translational cis test
# MRNA_TRANS_FDR_ADJ_P	FDR adjusted pvalue for the mRNA trans test
# TRANSL_TRANS_FDR_ADJ_P	FDR adjusted pvalue for the translational trans test
# MRNA_CIS_SCER	Significant mRNA divergence in cis where Scer is the more highly expressed allele? (1 = yes, 0 = no)
# MRNA_CIS_SPAR	Significant mRNA divergence in cis where Spar is the more highly expressed allele? (1 = yes, 0 = no)
# TRANSL_CIS_SCER	Significant translational divergence in cis where Scer is the more highly translated allele? (1 = yes, 0 = no)
# TRANSL_CIS_SPAR	Significant translational divergence in cis where Spar is the more highly translated allele? (1 = yes, 0 = no)
# MRNA_TRANS_SCER	Significant mRNA divergence in trans where Scer is the more highly expressed allele? (1 = yes, 0 = no)
# MRNA_TRANS_SPAR	Significant mRNA divergence in trans where Spar is the more highly expressed allele? (1 = yes, 0 = no)
# TRANSL_TRANS_SCER	Significant translational divergence in trans where Scer is the more highly translated allele? (1 = yes, 0 = no)
# TRANSL_TRANS_SPAR	Significant translational divergence in trans where Spar is the more highly translated allele? (1 = yes, 0 = no)
# HYBRID_SCER_MRNA_REP1_RPKM	Reads per Kilobase per Million Mapped Reads mapping to the Scer allele in the hybrid mRNA fraction, replicate 1
# HYBRID_SCER_MRNA_REP2_RPKM	as above, replicate 2
# HYBRID_SPAR_MRNA_REP1_RPKM	Reads per Kilobase per Million Mapped Reads mapping to the Spar allele in the hybrid mRNA fraction, replicate 1
# HYBRID_SPAR_MRNA_REP2_RPKM	as above, replicate 2
# HYBRID_SCER_RIBO_REP1_RPKM	Reads per Kilobase per Million Mapped Reads mapping to the Scer allele in the hybrid Ribo fraction, replicate 1
# HYBRID_SCER_RIBO_REP2_RPKM	as above, replicate 2
# HYBRID_SPAR_RIBO_REP1_RPKM	Reads per Kilobase per Million Mapped Reads mapping to the Spar allele in the hybrid Ribo fraction, replicate 1
# HYBRID_SPAR_RIBO_REP2_RPKM	as above, replicate 2
# PARENTAL_SCER_MRNA_REP1_RPKM	Reads per Kilobase per Million Mapped Reads mapping to the Scer allele in the parental mRNA fraction, replicate 1
# PARENTAL_SCER_MRNA_REP2_RPKM	as above, replicate 2
# PARENTAL_SPAR_MRNA_REP1_RPKM	Reads per Kilobase per Million Mapped Reads mapping to the Spar allele in the parental mRNA fraction, replicate 1
# PARENTAL_SPAR_MRNA_REP2_RPKM	as above, replicate 2
# PARENTAL_SCER_RIBO_REP1_RPKM	Reads per Kilobase per Million Mapped Reads mapping to the Scer allele in the parental Ribo fraction, replicate 1
# PARENTAL_SCER_RIBO_REP2_RPKM	as above, replicate 2
# PARENTAL_SPAR_RIBO_REP1_RPKM	Reads per Kilobase per Million Mapped Reads mapping to the Spar allele in the parental Ribo fraction, replicate 1
# PARENTAL_SPAR_RIBO_REP2_RPKM	as above, replicate 2
# AFFECTED_BY_PLOIDY	Gene differentially expressed in different ploidy states according to Wu et al. (2010) (1 = yes, 0 = no)
# AFFTECTED_BY_MATING_TYPE	Gene differentially expressed in different mating types according to Galitski et al. (1999) (1 = yes, 0 = no)
# TATA	TATA containing promoter according to Tirosh et al. (2006) (1 = yes, 0 = no)
# TIROSH_2008_OPN	OPN, DPN, or NA promoter according to Tirosh et al. (2008) (1 = yes, 0 = no)
# CDS_PDIV	Percent divergence in CDS
# PROMOTER_PDIV	Percent divergence in Promoter (200 bp upstream of TSS)
# FIVE_UTR_PDIV	Percent divergence in 5 prime UTR
# THREE_UTR_PDIV	Percent divergence in 3 prime UTR
# FIVE_UTR_50BP_PDIV	Percent divergence in first 50 bp of 5 prime UTR
# KA	Non-synonymous substitutions per non-synonymous site between S. cerevisiae and S. paradoxus
# KS	Synonymous substitutions per synonymous site between S. cerevisiae and S. paradoxus
# OMEGA	KA/KS

In [6]:
# Load the GO annotation table as `go_terms` (indexed by gene) and drop three
# broad GO terms.
# Source: http://geneontology.org/docs/download-go-annotations/

# GO identifiers excluded from the analysis.
broad_go_terms = ['GO:0005575', 'GO:0008150', 'GO:0003674']

# The file has no header row, so columns are addressed by position. Only
# columns 2, 4, 9 and 10 are kept and given readable names.
annotations = pd.read_csv('/Users/clairedubin/spur/publishable_data/external_datasets/go_terms.csv', header=None)
annotations = (
    annotations
    .drop(columns=[0, 1,3,5,6,7,8,11,12,13,14,15])
    .rename(columns={2: 'sgd_name', 4:'go_term', 9:'gene_desc', 10:'gene'})
)

# The 'gene' field is a '|'-separated string; keep only its first entry.
annotations['gene'] = [parts[0] for parts in annotations['gene'].str.split('|')]

# Remove exact duplicate rows, discard the broad terms, and index by gene.
annotations = annotations.drop_duplicates()
is_broad = annotations['go_term'].isin(broad_go_terms)
go_terms = annotations[~is_broad].set_index('gene')
go_terms.head()

Unnamed: 0_level_0,sgd_name,go_term,gene_desc
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
YDL159W,STE7,GO:0000187,Signal transducing MAP kinase kinase
YJL128C,PBS2,GO:0000187,MAP kinase kinase of the HOG signaling pathway
YOL144W,NOP8,GO:0003723,Nucleolar protein required for 60S ribosomal s...
YIL095W,PRK1,GO:0120133,Ser/Thr protein kinase
YER038W-A,FMP49,GO:0005739,Mitochondrial protein of unknown function


In [7]:
#load list of essential genes as essential_genes
#source: http://www-sequence.stanford.edu/group/yeast_deletion_project/Essential_ORFs.txt

# The file is read without a header row, so columns are labelled 0, 1, ...;
# column 1 is the one used as the gene identifier list.
essential = pd.read_csv('/Users/clairedubin/spur/publishable_data/external_datasets/essential.csv', header=None)
# First strip tab characters from both ends of each entry (stored back on the frame)...
essential[1] = essential[1].str.strip('\t')
# ...then strip spaces from both ends and collect the values as a plain list.
# NOTE(review): the two strips run in sequence, so an entry with mixed
# tab/space padding (e.g. space-then-tab) is not fully cleaned - confirm the
# file never contains that pattern.
essential_genes = essential[1].str.strip(' ').tolist()

In [8]:
# Partition the expression table by essentiality: a row goes to essential_df
# when its SGD identifier is in essential_genes, otherwise to non_essential_df.

is_essential = df['SGD'].isin(essential_genes)
essential_df = df[is_essential]
non_essential_df = df[~is_essential]

In [9]:
#merge GO term and expression data

# Columns kept from the GO-term / expression join.
merged_columns = ['go_term', 'SGD', 'STANDARD_NAME',
                  'PARENTAL_RIBO_REP1_RAT', 'PARENTAL_RIBO_REP2_RAT',
                  'PARENTAL_MRNA_REP1_RAT', 'PARENTAL_MRNA_REP2_RAT',
                  'PARENTAL_TEST',
                  'HYBRID_MRNA_REP1_RAT', 'HYBRID_MRNA_REP2_RAT',
                  'HYBRID_RIBO_REP1_RAT', 'HYBRID_RIBO_REP2_RAT']

# Join each GO annotation (indexed by gene) to that gene's expression row via
# the SGD column: one output row per (gene, GO term) pair.
# .copy() makes the column selection an independent frame so the assignment
# below cannot trigger a SettingWithCopyWarning.
merged = go_terms.merge(df, left_index=True, right_on='SGD')[merged_columns].copy()

# Flag essential genes by their SGD identifier. The merged frame's index holds
# df's integer row labels, not gene names, so testing the index against
# essential_genes would mark every row False.
merged['essential'] = merged['SGD'].isin(essential_genes)
merged.head()

Unnamed: 0,go_term,SGD,STANDARD_NAME,PARENTAL_RIBO_REP1_RAT,PARENTAL_RIBO_REP2_RAT,PARENTAL_MRNA_REP1_RAT,PARENTAL_MRNA_REP2_RAT,PARENTAL_TEST,HYBRID_MRNA_REP1_RAT,HYBRID_MRNA_REP2_RAT,HYBRID_RIBO_REP1_RAT,HYBRID_RIBO_REP2_RAT,essential
2917,GO:0000187,YDL159W,STE7,-0.207706,-0.310749,-0.35263,-0.240539,1,-0.289806,-0.145233,0.215161,0.46532,False
2917,GO:0004708,YDL159W,STE7,-0.207706,-0.310749,-0.35263,-0.240539,1,-0.289806,-0.145233,0.215161,0.46532,False
2917,GO:0071507,YDL159W,STE7,-0.207706,-0.310749,-0.35263,-0.240539,1,-0.289806,-0.145233,0.215161,0.46532,False
2917,GO:0016301,YDL159W,STE7,-0.207706,-0.310749,-0.35263,-0.240539,1,-0.289806,-0.145233,0.215161,0.46532,False
2917,GO:0000196,YDL159W,STE7,-0.207706,-0.310749,-0.35263,-0.240539,1,-0.289806,-0.145233,0.215161,0.46532,False


In [10]:
#group by GO term

# Per-gene columns that are summarised for every GO term.
ratio_cols = ['PARENTAL_RIBO_REP1_RAT', 'PARENTAL_RIBO_REP2_RAT',
              'PARENTAL_MRNA_REP1_RAT', 'PARENTAL_MRNA_REP2_RAT',
              'PARENTAL_TEST',
              'HYBRID_MRNA_REP1_RAT', 'HYBRID_MRNA_REP2_RAT',
              'HYBRID_RIBO_REP1_RAT', 'HYBRID_RIBO_REP2_RAT']

# Select the columns before aggregating so that only the needed columns are
# summarised (string and boolean columns are never passed to median/mean).
by_go_term = merged.groupby('go_term')[ratio_cols]
medians = by_go_term.median().add_suffix('_median')
means = by_go_term.mean().add_suffix('_mean')

# Combined mRNA statistic per GO term: average the two replicate summaries,
# then take the absolute value.
medians['PARENTAL_MRNA_RAT_median_comb'] = medians[['PARENTAL_MRNA_REP1_RAT_median','PARENTAL_MRNA_REP2_RAT_median']].mean(axis=1).abs()
means['PARENTAL_MRNA_RAT_mean_comb'] = means[['PARENTAL_MRNA_REP1_RAT_mean','PARENTAL_MRNA_REP2_RAT_mean']].mean(axis=1).abs()
medians['HYBRID_MRNA_RAT_median_comb'] = medians[['HYBRID_MRNA_REP1_RAT_median','HYBRID_MRNA_REP2_RAT_median']].mean(axis=1).abs()
means['HYBRID_MRNA_RAT_mean_comb'] = means[['HYBRID_MRNA_REP1_RAT_mean','HYBRID_MRNA_REP2_RAT_mean']].mean(axis=1).abs()

# One row per GO term: all *_mean columns followed by all *_median columns.
go_term_groups = means.merge(medians, right_index=True, left_index=True)

go_term_groups.head()

Unnamed: 0_level_0,PARENTAL_RIBO_REP1_RAT_mean,PARENTAL_RIBO_REP2_RAT_mean,PARENTAL_MRNA_REP1_RAT_mean,PARENTAL_MRNA_REP2_RAT_mean,PARENTAL_TEST_mean,HYBRID_MRNA_REP1_RAT_mean,HYBRID_MRNA_REP2_RAT_mean,HYBRID_RIBO_REP1_RAT_mean,HYBRID_RIBO_REP2_RAT_mean,PARENTAL_MRNA_RAT_mean_comb,...,PARENTAL_RIBO_REP2_RAT_median,PARENTAL_MRNA_REP1_RAT_median,PARENTAL_MRNA_REP2_RAT_median,PARENTAL_TEST_median,HYBRID_MRNA_REP1_RAT_median,HYBRID_MRNA_REP2_RAT_median,HYBRID_RIBO_REP1_RAT_median,HYBRID_RIBO_REP2_RAT_median,PARENTAL_MRNA_RAT_median_comb,HYBRID_MRNA_RAT_median_comb
go_term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GO:0000001,-0.121215,-0.096044,0.000722,0.00108,0.863636,-0.00907,0.061054,-0.056876,0.072088,0.000901,...,-0.040074,0.047576,0.015089,1.0,-0.04704,0.073144,-0.062581,0.094099,0.031332,0.013052
GO:0000002,-0.17799,-0.277574,-0.13396,-0.066189,0.7,-0.08695,-0.274024,-0.002295,0.023806,0.100075,...,-0.410592,-0.151805,-0.276117,1.0,-0.03694,-0.167769,-0.072713,-0.239177,0.213961,0.102354
GO:0000007,-0.288176,-0.265951,0.113453,0.307723,1.0,0.542926,0.606126,0.675669,0.721774,0.210588,...,-0.265951,0.113453,0.307723,1.0,0.542926,0.606126,0.675669,0.721774,0.210588,0.574526
GO:0000009,0.524813,-0.349243,0.210346,0.083977,1.0,0.030063,0.095009,-0.068826,-0.21568,0.147161,...,-0.246178,0.157696,-0.026375,1.0,0.000267,0.11277,0.065322,-0.198551,0.065661,0.056518
GO:0000010,-0.452014,-0.650996,0.138346,0.22712,1.0,-0.034363,0.150804,0.195357,-0.275807,0.182733,...,-0.650996,0.138346,0.22712,1.0,-0.034363,0.150804,0.195357,-0.275807,0.182733,0.05822


In [11]:
# Keep only GO terms for which tested_gene_counts / total_gene_counts exceeds
# 0.75 and total_gene_counts exceeds 10, then attach the per-term gene counts
# and gene lists to the expression summaries.
# NOTE(review): the input file name mentions a 50 percent cutoff while this
# cell applies 0.75 to the tested/total ratio - confirm the intended threshold.
go_term_other = pd.read_csv('/Users/clairedubin/spur/publishable_data/raw_data/population_test_data_by_go_term_50percentcutoff_081419.csv')

tested_fraction = go_term_other['tested_gene_counts']/go_term_other['total_gene_counts']
passes_cutoffs = (tested_fraction > .75) & (go_term_other['total_gene_counts'] > 10)
go_terms_to_use = go_term_other[passes_cutoffs]['go_term'].tolist()

# Expression summaries restricted to the retained GO terms.
go_term_groups_75 = go_term_groups[go_term_groups.index.isin(go_terms_to_use)]

# Output columns, in display order: counts and gene lists from go_term_other,
# then the mean, median and combined statistics from go_term_groups.
columns_to_keep = [
    'go_term', 'tested_gene_counts', 'total_gene_counts', 'all_gene_list',
    'tested_gene_list', 'tested_essential_count',
    'PARENTAL_RIBO_REP1_RAT_mean', 'PARENTAL_RIBO_REP2_RAT_mean',
    'PARENTAL_MRNA_REP1_RAT_mean', 'PARENTAL_MRNA_REP2_RAT_mean',
    'PARENTAL_TEST_mean', 'HYBRID_MRNA_REP1_RAT_mean',
    'HYBRID_MRNA_REP2_RAT_mean', 'HYBRID_RIBO_REP1_RAT_mean',
    'HYBRID_RIBO_REP2_RAT_mean', 'PARENTAL_RIBO_REP1_RAT_median',
    'PARENTAL_RIBO_REP2_RAT_median', 'PARENTAL_MRNA_REP1_RAT_median',
    'PARENTAL_MRNA_REP2_RAT_median', 'PARENTAL_TEST_median',
    'HYBRID_MRNA_REP1_RAT_median', 'HYBRID_MRNA_REP2_RAT_median',
    'HYBRID_RIBO_REP1_RAT_median', 'HYBRID_RIBO_REP2_RAT_median',
    'PARENTAL_MRNA_RAT_median_comb', 'PARENTAL_MRNA_RAT_mean_comb',
    'HYBRID_MRNA_RAT_median_comb', 'HYBRID_MRNA_RAT_mean_comb',
]

# Join on the GO term (a column on the left, the index on the right), keep
# the listed columns, and index the result by GO term.
joined = go_term_other.merge(go_term_groups_75, right_index=True, left_on='go_term')
go_term_groups_75 = joined[columns_to_keep].set_index('go_term')
go_term_groups_75



Unnamed: 0_level_0,tested_gene_counts,total_gene_counts,all_gene_list,tested_gene_list,tested_essential_count,PARENTAL_RIBO_REP1_RAT_mean,PARENTAL_RIBO_REP2_RAT_mean,PARENTAL_MRNA_REP1_RAT_mean,PARENTAL_MRNA_REP2_RAT_mean,PARENTAL_TEST_mean,...,PARENTAL_MRNA_REP2_RAT_median,PARENTAL_TEST_median,HYBRID_MRNA_REP1_RAT_median,HYBRID_MRNA_REP2_RAT_median,HYBRID_RIBO_REP1_RAT_median,HYBRID_RIBO_REP2_RAT_median,PARENTAL_MRNA_RAT_median_comb,PARENTAL_MRNA_RAT_mean_comb,HYBRID_MRNA_RAT_median_comb,HYBRID_MRNA_RAT_mean_comb
go_term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GO:0000001,22,28,"['YNL079C', 'YOR326W', 'YGL055W', 'YDL029W', '...","['YNL079C', 'YOR326W', 'YGL055W', 'YKL013C', '...",5,-0.121215,-0.096044,0.000722,0.001080,0.863636,...,0.015089,1.0,-0.047040,0.073144,-0.062581,0.094099,0.031332,0.000901,0.013052,0.025992
GO:0000011,15,19,"['YNL079C', 'YOR326W', 'YLR043C', 'YBR097W', '...","['YNL079C', 'YOR326W', 'YLR043C', 'YBR097W', '...",1,0.052366,-0.078237,0.323808,0.155813,0.937500,...,0.064459,1.0,-0.022488,-0.009274,0.017605,0.160123,0.218977,0.239810,0.015881,0.003404
GO:0000026,11,12,"['YDR483W', 'YOR099W', 'YBR015C', 'YJL186W', '...","['YDR483W', 'YOR099W', 'YBR015C', 'YJL186W', '...",0,0.045251,-0.477178,0.019515,-0.059482,0.909091,...,-0.006844,1.0,-0.052262,-0.171735,-0.239913,-0.180868,0.060677,0.019984,0.111999,0.152394
GO:0000045,21,25,"['YPL166W', 'YDR170C', 'YBR080C', 'YGL180W', '...","['YDR170C', 'YBR080C', 'YLR423C', 'YPR185W', '...",3,0.343854,0.018499,0.313832,0.304456,0.619048,...,0.285003,1.0,0.129863,-0.046142,0.085623,0.218258,0.260041,0.309144,0.041860,0.038147
GO:0000079,28,31,"['YAL040C', 'YIL050W', 'YOL001W', 'YMR199W', '...","['YAL040C', 'YIL050W', 'YOL001W', 'YMR199W', '...",3,-0.116185,-0.231966,-0.169354,-0.175454,0.821429,...,-0.118092,1.0,0.023056,-0.063608,-0.174732,0.102738,0.045328,0.172404,0.020276,0.040108
GO:0000082,21,26,"['YFL009W', 'YJL194W', 'YDR054C', 'YDL047W', '...","['YJL194W', 'YDR054C', 'YDL047W', 'YDL134C', '...",6,0.009763,0.062052,-0.068490,-0.063298,0.894737,...,0.087955,1.0,-0.014088,-0.118587,-0.089816,0.250056,0.097846,0.065894,0.066337,0.058769
GO:0000086,18,20,"['YFL009W', 'YDR054C', 'YMR036C', 'YGR108W', '...","['YDR054C', 'YMR036C', 'YGR108W', 'YPR119W', '...",8,-0.052692,-0.206074,-0.071057,-0.191753,0.833333,...,-0.172536,1.0,0.014353,0.020894,0.044525,-0.055518,0.093436,0.131405,0.017624,0.060880
GO:0000131,50,66,"['YPR165W', 'YBR109C', 'YFL005W', 'YHR023W', '...","['YPR165W', 'YFL005W', 'YHR023W', 'YAL041W', '...",16,0.054174,-0.030390,0.068423,0.095147,0.875000,...,-0.017115,1.0,0.036617,0.074223,-0.057708,0.118034,0.037262,0.081785,0.055420,0.029698
GO:0000132,12,14,"['YOR326W', 'YPL174C', 'YKR054C', 'YMR294W', '...","['YOR326W', 'YPL174C', 'YKR054C', 'YMR294W', '...",2,-0.008444,-0.188662,-0.040697,-0.197399,0.727273,...,-0.135055,1.0,-0.003724,0.035353,0.159463,0.193210,0.040443,0.119048,0.015815,0.030583
GO:0000142,11,12,"['YNL079C', 'YJR090C', 'YCL014W', 'YIL138C', '...","['YNL079C', 'YJR090C', 'YCL014W', 'YIL138C', '...",3,-0.119934,-0.228117,0.083162,-0.135170,1.000000,...,-0.072860,1.0,0.040189,0.127664,0.106395,0.149875,0.061149,0.026004,0.083927,0.058866


In [12]:
#get GO:0005778 data

# all_gene_list is stored as the string form of a Python list; parse it back
# into a real list of gene identifiers.
go5778_genes = ast.literal_eval(go_term_groups_75.loc['GO:0005778', 'all_gene_list'])

# Expression rows for those genes, restricted to the ratio columns of interest
# (rows and columns selected in a single .loc call).
go5778_columns = ['SGD', 'STANDARD_NAME',
                  'PARENTAL_RIBO_REP1_RAT', 'PARENTAL_RIBO_REP2_RAT',
                  'PARENTAL_MRNA_REP1_RAT', 'PARENTAL_MRNA_REP2_RAT',
                  'PARENTAL_TEST',
                  'HYBRID_MRNA_REP1_RAT', 'HYBRID_MRNA_REP2_RAT',
                  'HYBRID_RIBO_REP1_RAT', 'HYBRID_RIBO_REP2_RAT']
go5778_df = df.loc[df['SGD'].isin(go5778_genes), go5778_columns]
go5778_df.head()

Unnamed: 0,SGD,STANDARD_NAME,PARENTAL_RIBO_REP1_RAT,PARENTAL_RIBO_REP2_RAT,PARENTAL_MRNA_REP1_RAT,PARENTAL_MRNA_REP2_RAT,PARENTAL_TEST,HYBRID_MRNA_REP1_RAT,HYBRID_MRNA_REP2_RAT,HYBRID_RIBO_REP1_RAT,HYBRID_RIBO_REP2_RAT
8,YAL055W,PEX22,1.286962,1.067292,0.809173,0.507032,0,0.203273,-0.145614,0.805866,-1.401936
143,YJL210W,PEX2,1.064302,1.417218,0.759967,0.979919,1,0.071006,0.310039,-0.186378,0.265502
556,YKL188C,PXA2,-1.166742,-1.854301,1.102583,1.322758,0,1.613205,1.65645,-0.510632,0.982019
738,YLR191W,PEX13,0.362377,0.840829,0.021919,-0.031615,1,0.197299,-0.324963,0.605167,0.057199
852,YLR324W,PEX30,0.907319,1.046354,0.77152,0.691691,1,0.462139,0.376056,0.681517,0.677557


In [13]:
#resample function

def resample_old(go_term, n_iter=10000):
    """Resampling test on the signed mean mRNA ratio of one GO term.

    For the given GO term, the observed statistic is the average of the two
    replicate means (parental and hybrid separately) stored in
    go_term_groups_75. Each iteration draws, with replacement,
    `tested_essential_count` rows from essential_df and the remaining
    (len(tested_gene_list) - tested_essential_count) rows from
    non_essential_df, and computes the same statistic on the drawn rows.

    Parameters
    ----------
    go_term : str
        Index label of the GO term in go_term_groups_75.
    n_iter : int, default 10000
        Number of resampling iterations.

    Returns
    -------
    list of float
        [parent_fraction, hybrid_fraction]: the fraction of iterations whose
        resampled statistic is strictly greater than the observed one.

    Raises
    ------
    ValueError
        If n_iter is smaller than 1.
    """
    if n_iter < 1:
        raise ValueError('n_iter must be at least 1')

    loc = go_term_groups_75.loc[go_term]
    gene_list = ast.literal_eval(loc['tested_gene_list'])
    essential_count = int(loc['tested_essential_count'])
    non_essential_count = len(gene_list) - essential_count

    my_parent_mean = np.mean([loc['PARENTAL_MRNA_REP1_RAT_mean'], loc['PARENTAL_MRNA_REP2_RAT_mean']])
    my_hybrid_mean = np.mean([loc['HYBRID_MRNA_REP1_RAT_mean'], loc['HYBRID_MRNA_REP2_RAT_mean']])

    parent_count = 0
    hybrid_count = 0

    for _ in range(n_iter):

        # DataFrame.append was removed in pandas 2.0; pd.concat stacks the two
        # draws in the same order (essential rows first).
        sample = pd.concat([
            essential_df.sample(n=essential_count, replace=True),
            non_essential_df.sample(n=non_essential_count, replace=True),
        ])

        parent_sample_mean = np.mean([sample['PARENTAL_MRNA_REP1_RAT'].mean(), sample['PARENTAL_MRNA_REP2_RAT'].mean()])
        hybrid_sample_mean = np.mean([sample['HYBRID_MRNA_REP1_RAT'].mean(), sample['HYBRID_MRNA_REP2_RAT'].mean()])

        if parent_sample_mean > my_parent_mean:
            parent_count += 1
        if hybrid_sample_mean > my_hybrid_mean:
            hybrid_count += 1

    parent_fraction = parent_count/n_iter
    hybrid_fraction = hybrid_count/n_iter
    print(go_term, parent_fraction, hybrid_fraction)
    return [parent_fraction, hybrid_fraction]

In [18]:
#resample function

def resample_abs_mean(go_term, n_iter=10000):
    """Resampling test on the absolute mean mRNA ratio of one GO term.

    The observed statistics are the PARENTAL_MRNA_RAT_mean_comb and
    HYBRID_MRNA_RAT_mean_comb values stored for the GO term in
    go_term_groups_75. Each iteration draws, with replacement,
    `tested_essential_count` rows from essential_df and the remaining
    (len(tested_gene_list) - tested_essential_count) rows from
    non_essential_df, averages the two replicate means of the drawn rows and
    takes the absolute value.

    Parameters
    ----------
    go_term : str
        Index label of the GO term in go_term_groups_75.
    n_iter : int, default 10000
        Number of resampling iterations.

    Returns
    -------
    list of float
        [parent_fraction, hybrid_fraction]: the fraction of iterations whose
        resampled statistic is strictly greater than the observed one.

    Raises
    ------
    ValueError
        If n_iter is smaller than 1.
    """
    if n_iter < 1:
        raise ValueError('n_iter must be at least 1')

    loc = go_term_groups_75.loc[go_term]
    gene_list = ast.literal_eval(loc['tested_gene_list'])
    essential_count = int(loc['tested_essential_count'])
    non_essential_count = len(gene_list) - essential_count

    my_parent_mean = loc['PARENTAL_MRNA_RAT_mean_comb']
    my_hybrid_mean = loc['HYBRID_MRNA_RAT_mean_comb']

    parent_count = 0
    hybrid_count = 0

    for _ in range(n_iter):

        # DataFrame.append was removed in pandas 2.0; pd.concat stacks the two
        # draws in the same order (essential rows first).
        sample = pd.concat([
            essential_df.sample(n=essential_count, replace=True),
            non_essential_df.sample(n=non_essential_count, replace=True),
        ])

        parent_sample_mean = np.abs(np.mean([sample['PARENTAL_MRNA_REP1_RAT'].mean(), sample['PARENTAL_MRNA_REP2_RAT'].mean()]))
        hybrid_sample_mean = np.abs(np.mean([sample['HYBRID_MRNA_REP1_RAT'].mean(), sample['HYBRID_MRNA_REP2_RAT'].mean()]))

        if parent_sample_mean > my_parent_mean:
            parent_count += 1
        if hybrid_sample_mean > my_hybrid_mean:
            hybrid_count += 1

    parent_fraction = parent_count/n_iter
    hybrid_fraction = hybrid_count/n_iter
    print(go_term, parent_fraction, hybrid_fraction)
    return [parent_fraction, hybrid_fraction]

In [None]:
#resample function

def resample_abs_median(go_term, n_iter=10000):
    """Resampling test on the absolute median mRNA ratio of one GO term.

    The observed statistics are the PARENTAL_MRNA_RAT_median_comb and
    HYBRID_MRNA_RAT_median_comb values stored for the GO term in
    go_term_groups_75. Each iteration draws, with replacement,
    `tested_essential_count` rows from essential_df and the remaining
    (len(tested_gene_list) - tested_essential_count) rows from
    non_essential_df, averages the two replicate medians of the drawn rows
    and takes the absolute value.

    Parameters
    ----------
    go_term : str
        Index label of the GO term in go_term_groups_75.
    n_iter : int, default 10000
        Number of resampling iterations.

    Returns
    -------
    list of float
        [parent_fraction, hybrid_fraction]: the fraction of iterations whose
        resampled statistic is strictly greater than the observed one.

    Raises
    ------
    ValueError
        If n_iter is smaller than 1.
    """
    if n_iter < 1:
        raise ValueError('n_iter must be at least 1')

    loc = go_term_groups_75.loc[go_term]
    gene_list = ast.literal_eval(loc['tested_gene_list'])
    essential_count = int(loc['tested_essential_count'])
    non_essential_count = len(gene_list) - essential_count

    my_parent_median = loc['PARENTAL_MRNA_RAT_median_comb']
    my_hybrid_median = loc['HYBRID_MRNA_RAT_median_comb']

    parent_count = 0
    hybrid_count = 0

    for _ in range(n_iter):

        # DataFrame.append was removed in pandas 2.0; pd.concat stacks the two
        # draws in the same order (essential rows first).
        sample = pd.concat([
            essential_df.sample(n=essential_count, replace=True),
            non_essential_df.sample(n=non_essential_count, replace=True),
        ])

        parent_sample_median = np.abs(np.mean([sample['PARENTAL_MRNA_REP1_RAT'].median(), sample['PARENTAL_MRNA_REP2_RAT'].median()]))
        hybrid_sample_median = np.abs(np.mean([sample['HYBRID_MRNA_REP1_RAT'].median(), sample['HYBRID_MRNA_REP2_RAT'].median()]))

        if parent_sample_median > my_parent_median:
            parent_count += 1
        if hybrid_sample_median > my_hybrid_median:
            hybrid_count += 1

    parent_fraction = parent_count/n_iter
    hybrid_fraction = hybrid_count/n_iter
    print(go_term, parent_fraction, hybrid_fraction)
    return [parent_fraction, hybrid_fraction]

In [None]:
# Run the median-based resampling for GO:0005778. The function prints the GO
# term with the two fractions and returns them as [parent, hybrid], which is
# shown as the cell output.
resample_abs_median('GO:0005778')