In [None]:
#imports
import random
import pandas as pd
import numpy as np
import ast
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Seed both generators. pandas' DataFrame.sample (used by the resampling below)
# draws from NumPy's global random state when no random_state is passed, so
# seeding only the stdlib `random` module would leave those draws unseeded.
random.seed(22)
np.random.seed(22)

In [None]:
# Expression data source: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE38875
# Schraiber JG, Mostovoy Y, Hsu TY, Brem RB. Inferring evolutionary histories of pathway regulation
# from transcriptional profiling data. PLoS Comput Biol 2013;9(10):e1003255. PMID: 24130471


def _load_expression_table(csv_path):
    """Read an expression CSV and return it transposed and numeric.

    The file is read with its first line as the header, transposed, the first
    transposed row is promoted to column labels, and every remaining value is
    converted with pd.to_numeric (unparseable entries become NaN).
    """
    table = pd.read_csv(csv_path, header=0).transpose()
    table.columns = table.iloc[0]
    return table.iloc[1:].apply(pd.to_numeric, errors='coerce')


parents = _load_expression_table('/Users/clairedubin/spur/publishable_data/external_datasets/schraiber_homozygote_exp.csv')
hybrid = _load_expression_table('/Users/clairedubin/spur/publishable_data/external_datasets/schraiber_hybrid_exp.csv')

In [None]:
# Combine parent and hybrid measurements on their shared index labels (inner join);
# columns present in both tables are disambiguated with _parent / _hybrid suffixes.

merged = parents.join(hybrid, how='inner', lsuffix='_parent', rsuffix='_hybrid')
merged.head()

In [None]:
# Restrict every table to genes listed in the 'gene' column of the dxy results file
# (i.e. drop genes that were not tested in dxy, such as genes not mapped to GO terms).
dxy_genes = pd.read_csv('/Users/clairedubin/spur/peroxisomes/dxy/raw_data/dxy_EuropeanSpar_WineScer_110619.txt')['gene']

parents = parents.loc[parents.index.isin(dxy_genes)]
hybrid = hybrid.loc[hybrid.index.isin(dxy_genes)]
merged = merged.loc[merged.index.isin(dxy_genes)]

In [None]:
# Load GO annotations as go_terms (indexed by gene), leaving out three broad GO terms.
# source: http://geneontology.org/docs/download-go-annotations/

go_terms = (
    pd.read_csv('/Users/clairedubin/spur/publishable_data/external_datasets/go_terms.csv', header=None)
    .drop(columns=[0, 1, 3, 5, 6, 7, 8, 11, 12, 13, 14, 15])
    .rename(columns={2: 'sgd_name', 4: 'go_term', 9: 'gene_desc', 10: 'gene'})
)
# the 'gene' field is '|'-separated; keep only its first entry
go_terms['gene'] = [names[0] for names in go_terms['gene'].str.split('|')]
go_terms = go_terms.drop_duplicates()
go_terms = go_terms[~go_terms['go_term'].isin(['GO:0005575', 'GO:0008150', 'GO:0003674'])].set_index('gene')
go_terms.head()

In [None]:
# Build essential_genes, the list of essential gene names taken from column 1 of the file.
# source: http://www-sequence.stanford.edu/group/yeast_deletion_project/Essential_ORFs.txt

essential = pd.read_csv('/Users/clairedubin/spur/publishable_data/external_datasets/essential.csv', header=None)
# strip tabs first (stored back into the frame), then spaces, from both ends of each name
orf_names = essential[1].str.strip('\t')
essential[1] = orf_names
essential_genes = orf_names.str.strip(' ').tolist()

In [None]:
# Partition each expression table into essential / non-essential genes,
# based on whether the row's index label appears in essential_genes.

parents_is_essential = parents.index.isin(essential_genes)
essential_parents = parents[parents_is_essential]
non_essential_parents = parents[~parents_is_essential]

hybrid_is_essential = hybrid.index.isin(essential_genes)
essential_hybrid = hybrid[hybrid_is_essential]
non_essential_hybrid = hybrid[~hybrid_is_essential]

merged_is_essential = merged.index.isin(essential_genes)
essential_merged = merged[merged_is_essential]
non_essential_merged = merged[~merged_is_essential]

In [None]:
# Attach GO annotations to each expression table by matching index labels.
# Note that `merged` is rebound to its GO-annotated version here.

merged_parents = pd.merge(go_terms, parents, left_index=True, right_index=True)
merged_hybrid = pd.merge(go_terms, hybrid, left_index=True, right_index=True)
merged = pd.merge(go_terms, merged, left_index=True, right_index=True)

In [None]:
# Summarise expression per GO term.
groups_by_term = merged.groupby('go_term')

# number of non-null entries per column for each GO term
go_term_groups_merged = groups_by_term.count()

# numeric_only=True restricts the aggregation to the numeric columns explicitly.
# `merged` also carries text annotation columns (sgd_name, gene_desc); older pandas
# silently dropped those, while pandas 2.0+ raises a TypeError without this flag.
medians_merged = groups_by_term.median(numeric_only=True).add_suffix('_median')

# Build the groupby without the annotation columns once and reuse it for both summaries.
expression_by_term = (
    merged.set_index('go_term')
    .drop(columns=['sgd_name', 'gene_desc'])
    .groupby('go_term')
)
# NOTE: these are the absolute value OF the per-term median / mean
# (|median(x)|, |mean(x)|), not the median / mean of absolute values.
medians_merged_abs = expression_by_term.median(numeric_only=True).abs().add_suffix('_abs_median')
medians_merged_abs_mean = expression_by_term.mean(numeric_only=True).abs().add_suffix('_abs_mean')

# one wide table per GO term: *_median, *_abs_median and *_abs_mean columns
medians_merged = (
    medians_merged
    .merge(medians_merged_abs, right_index=True, left_index=True)
    .merge(medians_merged_abs_mean, right_index=True, left_index=True)
)

medians_merged.head()

In [None]:
# show the summary row for GO:0005778 (empty frame if the term is absent)
medians_merged.loc[medians_merged.index == 'GO:0005778']

In [None]:
# Hard-coded gene list used below as the GO:0005778 gene set (33 systematic gene names).
go5778 = [
    'YPL112C', 'YHR150W', 'YPR165W', 'YGR028W', 'YNL214W', 'YPR128C',
    'YML075C', 'YGR004W', 'YGR239C', 'YLR324W', 'YGL153W', 'YHR160C',
    'YMR163C', 'YDR265W', 'YLR450W', 'YDR244W', 'YMR026C', 'YKL188C',
    'YDR329C', 'YLR191W', 'YDR479C', 'YOL147C', 'YOR193W', 'YBR168W',
    'YPL147W', 'YBR222C', 'YNL329C', 'YDL065C', 'YOL044W', 'YAL055W',
    'YJL210W', 'YKL197C', 'YMR018W',
]

In [None]:
#resample function

def resample_abs_median(go_term, gene_list=None, n_iter=10000, seed=None):
    """Resampling test for the |median| expression of a gene set.

    For each of the six expression columns (three species x parent/hybrid),
    compares the observed ``<column>_abs_median`` value stored in
    ``medians_merged`` for ``go_term`` with the absolute median of random gene
    sets. Each random set is drawn with replacement and contains as many
    essential and non-essential genes as ``gene_list`` does.

    Parameters
    ----------
    go_term : str
        Index label of the row in ``medians_merged`` holding the observed values.
    gene_list : sequence of str, optional
        Genes making up the set under test; only its essential / non-essential
        composition is used. Defaults to the module-level ``go5778`` list,
        which is what the function previously always used.
    n_iter : int, optional
        Number of random gene sets to draw (default 10000).
    seed : int, optional
        If given, draws come from a private ``np.random.RandomState(seed)``.
        If None, pandas falls back to NumPy's global random state.

    Returns
    -------
    list of float
        Fraction of random sets whose |median| is >= the observed value, in the
        order: S_paradoxus, S_mikatae, S_bayanus parent, then the same three
        species for hybrid.

    Raises
    ------
    ValueError
        If ``n_iter`` is not positive or ``go_term`` has no row in ``medians_merged``.
    """
    columns = ['S_paradoxus_parent', 'S_mikatae_parent', 'S_bayanus_parent',
               'S_paradoxus_hybrid', 'S_mikatae_hybrid', 'S_bayanus_hybrid']

    if n_iter < 1:
        raise ValueError('n_iter must be a positive integer')

    if gene_list is None:
        gene_list = go5778
    gene_list = list(gene_list)

    # Select by boolean mask (as before) but fail loudly when the term is absent,
    # then reduce to plain floats so the comparisons below are scalar-vs-scalar.
    observed_rows = medians_merged[medians_merged.index == go_term]
    if len(observed_rows) == 0:
        raise ValueError('GO term %r not found in medians_merged' % (go_term,))
    observed = (
        observed_rows.iloc[0][[name + '_abs_median' for name in columns]]
        .astype(float)
        .values
    )

    # essential / non-essential composition of the tested gene set
    essential_count = int(essential_merged.index.isin(gene_list).sum())
    non_essential_count = len(gene_list) - essential_count

    # observed S. paradoxus values (parent, hybrid), kept as a quick sanity print
    print(observed[0], observed[3])

    # Only these six columns are ever summarised; slice them once outside the loop.
    essential_pool = essential_merged[columns]
    non_essential_pool = non_essential_merged[columns]

    rng = np.random.RandomState(seed) if seed is not None else None
    exceed_counts = np.zeros(len(columns), dtype=int)

    for _ in range(n_iter):
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        # Draw order (essential first, then non-essential) is unchanged.
        sample = pd.concat([
            essential_pool.sample(n=essential_count, replace=True, random_state=rng),
            non_essential_pool.sample(n=non_essential_count, replace=True, random_state=rng),
        ])
        exceed_counts += sample.median().abs().values >= observed

    fractions = (exceed_counts / float(n_iter)).tolist()

    print(go_term, *fractions)
    return fractions

In [None]:
# run the resampling test for GO:0005778
resample_abs_median(go_term='GO:0005778')

In [None]:
def flip_ratio(x):
    """Invert a log2 ratio: given log2(a/b), return log2(b/a).

    log2(1 / 2**x) is exactly -x, so negate directly. This avoids the
    round-off of the exponentiate/divide/log round trip and the overflow
    that 2**x hits for large x.
    """
    return -x

# Gene list for GO:0005778. Use the tested-gene list stored in
# go_term_groups_75_merged when that table exists in the session. No cell above
# defines it, so on a fresh kernel fall back to the hard-coded go5778 list
# rather than failing with a NameError.
# NOTE(review): confirm go5778 matches the intended 'tested_gene_list'.
try:
    go5778_gene_list = ast.literal_eval(go_term_groups_75_merged.loc['GO:0005778']['tested_gene_list'])
except NameError:
    go5778_gene_list = go5778

# (column prefix in `merged`, short label used in the new Scer:<label>_* columns)
species_labels = [('S_paradoxus', 'Spar'), ('S_mikatae', 'Smik'), ('S_bayanus', 'Suva')]
cross_types = ('parent', 'hybrid')
source_columns = [species + '_' + cross for cross in cross_types for species, _ in species_labels]

# Single .loc selection plus .copy(): an independent frame, so adding columns below
# does not trigger SettingWithCopyWarning.
go5778_exp = merged.loc[merged.index.isin(go5778_gene_list), source_columns].copy()

# Add sign-flipped copies of each ratio (via flip_ratio), named Scer:<species>_<cross>.
for cross in cross_types:
    for species, short_label in species_labels:
        go5778_exp['Scer:' + short_label + '_' + cross] = go5778_exp[species + '_' + cross].apply(flip_ratio)


go5778_exp.head()

In [None]:
# Relabel rows with the 'sgd_name' field (PEX names) instead of the gene index.
# go_terms is re-read here, this time also dropping column 4 (named 'go_term' in the
# earlier load), and this overwrites the earlier go_terms table.

go_terms = (
    pd.read_csv('/Users/clairedubin/spur/publishable_data/external_datasets/go_terms.csv', header=None)
    .drop(columns=[0, 1, 3, 4, 5, 6, 7, 8, 11, 12, 13, 14, 15])
    .rename(columns={2: 'sgd_name', 9: 'gene_desc', 10: 'gene'})
)
# the 'gene' field is '|'-separated; keep only its first entry
go_terms['gene'] = [names[0] for names in go_terms['gene'].str.split('|')]
go_terms = go_terms.drop_duplicates().set_index('gene')

# join names onto the expression rows, index by sgd_name, then drop repeated rows
go5778_exp = (
    go_terms.merge(go5778_exp, right_index=True, left_index=True)
    .set_index('sgd_name')
    .drop_duplicates()
)
go5778_exp.head()

In [None]:
# Heat map of the Scer:<species> ratios for all three species, hybrid ('cis')
# next to parent ('total'), rows ordered by their mean across the six columns.

# A single figure/axes pair; the heat map is drawn on `ax` explicitly. (The extra
# plt.figure(...) call that used to precede this only created an empty figure.)
fig, ax = plt.subplots(1, 1, figsize = (15, 7))

mean_columns = ['Scer:Spar_parent', 'Scer:Spar_hybrid',
                'Scer:Smik_parent', 'Scer:Smik_hybrid',
                'Scer:Suva_parent', 'Scer:Suva_hybrid']
go5778_exp['means'] = go5778_exp[mean_columns].mean(axis=1)
go5778_exp = go5778_exp.sort_values('means', ascending=False)

# plotting order: hybrid before parent within each species
heatmap_columns = ['Scer:Spar_hybrid', 'Scer:Spar_parent',
                   'Scer:Smik_hybrid', 'Scer:Smik_parent',
                   'Scer:Suva_hybrid', 'Scer:Suva_parent']

sns.heatmap(go5778_exp[heatmap_columns], center=0, ax=ax,
            cmap="RdBu_r", xticklabels=[r'$\dfrac{S. cerevisiae}{S. paradoxus}$ cis',
                                        r'$\dfrac{S. cerevisiae}{S. paradoxus}$ total',
                                        r'$\dfrac{S. cerevisiae}{S. mikatae}$ cis',
                                        r'$\dfrac{S. cerevisiae}{S. mikatae}$ total',
                                        r'$\dfrac{S. cerevisiae}{S. uvarum}$ cis',
                                        r'$\dfrac{S. cerevisiae}{S. uvarum}$ total'])


ax.set_xlabel('')
ax.set_ylabel('')
ax.tick_params(axis="x", labelsize=11)

fig.savefig('/Users/clairedubin/spur/publishable_data/figures/all_sp_heatmap.eps', format='eps')

In [None]:
# S. cerevisiae : S. paradoxus heat map, hybrid ('cis') beside parent ('total'),
# with rows ordered by the hybrid ratio.

go5778_exp = go5778_exp.sort_values(by='Scer:Spar_hybrid', ascending=False)

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 9))

# 1000-step diverging palette, passed to the heat map as its colour map
pal = sns.color_palette("RdBu_r", 1000)
sns.heatmap(
    go5778_exp[['Scer:Spar_hybrid', 'Scer:Spar_parent']],
    center=0,
    cmap=pal,
    cbar=True,
    robust=True,
    xticklabels=['cis', 'total'],
    ax=ax,
)
ax.set_ylabel('')
ax.set_xlabel('')
ax.tick_params(axis="x", labelsize=13)


fig.savefig('/Users/clairedubin/spur/publishable_data/figures/sc_sp_heatmap.eps', format='eps' )

In [None]:
# KDE curves of the signed Scer:Spar ratios in go5778_exp: hybrid ('cis') vs parent ('total').
# NOTE(review): sns.distplot is deprecated since seaborn 0.11; switch to kdeplot when upgrading.
fig, ax = plt.subplots(1, 1, figsize = (7, 4))

# draw on `ax` explicitly rather than relying on it being the current axes
sns.distplot(go5778_exp['Scer:Spar_hybrid'], hist=False, label='cis', ax=ax)
sns.distplot(go5778_exp['Scer:Spar_parent'], hist=False, label='total', ax=ax)
# label previously repeated the words "Fold Change"
ax.set_xlabel('Log${_2}$ Fold Change in Expression (S. cerevisiae : S. paradoxus)')
ax.set_ylabel('Frequency')
ax.legend()


In [None]:
# KDE curves of the absolute Scer:Spar ratios in go5778_exp: hybrid ('cis') vs parent ('total').
# NOTE(review): sns.distplot is deprecated since seaborn 0.11; switch to kdeplot when upgrading.
fig, ax = plt.subplots(1, 1, figsize = (7, 4))

# draw on `ax` explicitly rather than relying on it being the current axes
sns.distplot(go5778_exp['Scer:Spar_hybrid'].abs(), hist=False, label='cis', ax=ax)
sns.distplot(go5778_exp['Scer:Spar_parent'].abs(), hist=False, label='total', ax=ax)
# label previously repeated "Fold Change" and did not say the values are absolute
ax.set_xlabel('Absolute Log${_2}$ Fold Change in Expression (S. cerevisiae : S. paradoxus)')
ax.set_ylabel('Frequency')
ax.legend()

In [None]:
# KDE curves of the signed Scer:Spar ratios in go5778_exp: hybrid ('cis') vs parent ('total').
# This repeats the signed-ratio comparison drawn two cells earlier.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11; switch to kdeplot when upgrading.
fig, ax = plt.subplots(1, 1, figsize = (7, 4))

# draw on `ax` explicitly rather than relying on it being the current axes
sns.distplot(go5778_exp['Scer:Spar_hybrid'], hist=False, label='cis', ax=ax)
sns.distplot(go5778_exp['Scer:Spar_parent'], hist=False, label='total', ax=ax)
# label previously repeated the words "Fold Change"
ax.set_xlabel('Log${_2}$ Fold Change in Expression (S. cerevisiae : S. paradoxus)')
ax.set_ylabel('Frequency')
ax.legend()


In [None]:
# Distribution of the hybrid table's S_paradoxus column, x-axis limited to [-3, 3].
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(7, 4))

sns.distplot(hybrid['S_paradoxus'], ax=ax)
ax.set_xlim(-3, 3)

In [None]:
# median of the hybrid S_mikatae column
hybrid.loc[:, 'S_mikatae'].median()

In [None]:
# median of the hybrid S_bayanus column
hybrid.loc[:, 'S_bayanus'].median()