# Obtain list of Complex members and Proteins in Osmotic stress pathway
Pathway members from http://www.nature.com/articles/npjsba201518

# Import files and libraries

In [1]:
import os.path
import matplotlib.pyplot as plt
import pandas as pd

# Input/output locations, relative to the directory this notebook runs from.
DATA_FILEPATH = os.path.join('..', '..', 'data')
OSMOTIC_FILEPATH = os.path.join(DATA_FILEPATH, 'evaluation')

# Tab-delimited SGD feature table, read further down to map gene symbols to ORFs.
sgd_mapping_file = os.path.join(DATA_FILEPATH, 'SGD_features.tab')

# NOTE: `data` holds the *path* to the spreadsheet, not its contents.
data = os.path.join(OSMOTIC_FILEPATH, 'Kitano2016_data.xls')
# header=1: column names are taken from the second row of the sheet.
df = pd.read_excel(data, header=1)
# Use the call form of print: it behaves the same under Python 2 for a single
# argument, is valid under Python 3, and matches the print(...) calls used
# elsewhere in this notebook.
print(df)


                class                     name            compartment  \
0             COMPLEX                Bcy1/Zds1              Cytoplasm   
1             COMPLEX                Bcy1/Tpk3              Cytoplasm   
2             COMPLEX                Bcy1/Tpk2              Cytoplasm   
3             COMPLEX                Bcy1/Tpk1              Cytoplasm   
4             COMPLEX                Bmh2/Msn4              Cytoplasm   
5             COMPLEX                Bmh2/Msn2              Cytoplasm   
6             COMPLEX                 GDP Rho1              Cytoplasm   
7             COMPLEX                 GTP Rho1              Cytoplasm   
8             COMPLEX         SBF complex/Slt2                Nucleus   
9             COMPLEX                Cyc8/Tup1                Nucleus   
10            COMPLEX                Rom2/Slg1              Cytoplasm   
11            COMPLEX              SBF complex                Nucleus   
12            COMPLEX                Glc7/Reg1     

# Drop unnecessary columns

In [2]:
# Columns that are not needed for the osmotic stress gene list.
# The labels must match the spreadsheet headers exactly; the unusual spelling
# 'Nutrient Adapatation' is kept on purpose (presumably it is how the header is
# spelled in the source file, since the drop succeeds with it).
unneeded_columns = [
    'compartment',
    'Heat Shock',
    'Ion Homeostasis',
    'Nutrient Adapatation',
    'Oxidative Stress',
    'Pheromone Response',
    'number of occurrences',
]

# Drop everything in one call. `axis` is passed by keyword because the
# positional form (`df.drop(label, 1)`) is rejected by pandas >= 2.0, while the
# keyword form works on old and new pandas alike.
df = df.drop(unneeded_columns, axis=1)
print(df)

                class                     name Osmotic Stress
0             COMPLEX                Bcy1/Zds1              +
1             COMPLEX                Bcy1/Tpk3              +
2             COMPLEX                Bcy1/Tpk2              +
3             COMPLEX                Bcy1/Tpk1              +
4             COMPLEX                Bmh2/Msn4              +
5             COMPLEX                Bmh2/Msn2              +
6             COMPLEX                 GDP Rho1              +
7             COMPLEX                 GTP Rho1              +
8             COMPLEX         SBF complex/Slt2              +
9             COMPLEX                Cyc8/Tup1              +
10            COMPLEX                Rom2/Slg1              +
11            COMPLEX              SBF complex              +
12            COMPLEX                Glc7/Reg1              +
13            COMPLEX               Glc7/Bud14              +
14            COMPLEX    PP2A(Pph22/Tpd3/Rts1)              +
15      

# Obtain list of only proteins and complexes in osmotic stress

In [3]:
# Keep rows that are not marked '-' in the 'Osmotic Stress' column ...
in_osmotic_stress = df['Osmotic Stress'] != '-'
# ... and whose class is either a complex or a protein.
is_complex_or_protein = df['class'].isin(['COMPLEX', 'PROTEIN'])
df = df[in_osmotic_stress & is_complex_or_protein]

# The class column has served its purpose as a filter.
# `axis` is passed by keyword; the positional form is rejected by pandas >= 2.0.
df = df.drop('class', axis=1)

# Call form of print: valid on Python 2 and Python 3.
print(df)

                        name Osmotic Stress
0                  Bcy1/Zds1              +
1                  Bcy1/Tpk3              +
2                  Bcy1/Tpk2              +
3                  Bcy1/Tpk1              +
4                  Bmh2/Msn4              +
5                  Bmh2/Msn2              +
6                   GDP Rho1              +
7                   GTP Rho1              +
8           SBF complex/Slt2              +
9                  Cyc8/Tup1              +
10                 Rom2/Slg1              +
11               SBF complex              +
12                 Glc7/Reg1              +
13                Glc7/Bud14              +
14     PP2A(Pph22/Tpd3/Rts1)              +
15     PP2A(Pph21/Tpd3/Rts1)              +
16    PP2A(Pph22/Tpd3/Cdc55)              +
17    PP2A(Pph21/Tpd3/Cdc55)              +
18                 GTP Cdc42              +
21                 Fks1/Rho1              +
22                 Zeo1/Mid2              +
23                 Rom2/Mid2    

In [4]:
# Sanity check on the number of complexes/proteins kept by the filter above.
assert len(df) == 221
# Only the 'name' column is used from here on.
# `axis` is passed by keyword; the positional form is rejected by pandas >= 2.0.
df = df.drop('Osmotic Stress', axis=1)

# Extract unique symbols
Split complex names on `(`, `/`, and `)`, but not on spaces, so that multi-word names such as `GTP Rho1` are kept as a single entry.

In [5]:
gene_symbols = set()
for entry_name in df['name'].values:
    # Upper-case the name and turn both parentheses into '/', so that a single
    # split on '/' separates every member of a complex.
    normalized = entry_name.strip().upper().replace('(', '/').replace(')', '/')
    # str() converts unicode to a Python 2 string; each piece is then stripped
    # of surrounding whitespace before being collected.
    gene_symbols.update(str(piece).strip() for piece in normalized.split('/'))

# Adjacent or trailing delimiters leave empty pieces behind; discard them.
gene_symbols.discard('')

assert len(gene_symbols) == 161

# Output gene symbols to file

In [6]:
# Destination for the gene symbol list: one symbol per line, sorted.
out_file = os.path.join(OSMOTIC_FILEPATH, 'Kitano2016_Genes.txt')

with open(out_file, 'w') as out_f:
    out_f.writelines(symbol + '\n' for symbol in sorted(gene_symbols))

# Output ORFs to file
Only complexes or complex members are unmappable

In [7]:
# Expected number of SGD rows that carry a gene symbol.
UNIQUE_GENES = 5440

# The file has no header row, so columns are addressed by position.
# As used below: 1 = id/feature type, 3 = ORF name, 4 = gene symbol.
sgd_df = pd.read_csv(sgd_mapping_file, sep='\t', header=None, usecols=[0, 1, 2, 3, 4, 5])
print('Loaded {} genes'.format(len(sgd_df)))
assert len(sgd_df) == 16454

# remove NaN gene symbols
# .copy() makes the filtered frame independent of the original, so the column
# assignments below do not write into a slice (avoids SettingWithCopyWarning).
sgd_df = sgd_df[sgd_df[4].notnull()].copy()
print('Loaded {} non-null gene symbols'.format(len(sgd_df)))
assert len(sgd_df) == UNIQUE_GENES

print('{} unique id types'.format(len(sgd_df[1].unique())))
print('{} unique gene sybmols'.format(len(sgd_df[4].unique())))
print('{} unique ORFs'.format(len(sgd_df[3].unique())))

# Upper-case ORFs and gene symbols so lookups against the upper-cased pathway
# symbols are case-insensitive. The vectorised .str accessor replaces
# apply(str.upper) and gives the same result for string values.
sgd_df[3] = sgd_df[3].str.upper()
sgd_df[4] = sgd_df[4].str.upper()

sgd_df.head()

Loaded 16454 genes
Loaded 5440 non-null gene symbols
17 unique id types
5440 unique gene sybmols
5440 unique ORFs


Unnamed: 0,0,1,2,3,4,5
9,S000002142,ORF,Verified,YAL068C,PAU8,seripauperin PAU8
14,S000000062,ORF,Verified,YAL067C,SEO1,putative permease SEO1
22,S000002140,ORF,Uncharacterized,YAL064C-A,TDA8,YAL065C-A
29,S000000059,ORF,Verified,YAL063C,FLO9,flocculin FLO9
33,S000000058,ORF,Verified,YAL062W,GDH3,glutamate dehydrogenase (NADP(+)) GDH3|FUN51


In [8]:
# map gene symbols to ORF ids
# The length check ensures no two rows share the same (upper-cased) gene
# symbol, i.e. no dictionary key was silently overwritten while building the map.
gene_map = dict(zip(sgd_df[4], sgd_df[3]))
assert len(gene_map) == UNIQUE_GENES

# Build the set of known ORFs once, instead of scanning gene_map.values()
# again for every pathway symbol.
known_orfs = set(gene_map.values())

# convert and write the mappable gene symbols, print the rest
unmappable = set()
mapped_orfs = set()
for gene in gene_symbols:
    if gene in gene_map:
        mapped_orfs.add(gene_map[gene])
    # check if it is already an ORF
    elif gene in known_orfs:
        mapped_orfs.add(gene)
    else:
        unmappable.add(gene)

# Every symbol must land in exactly one of the two sets; the sum falls short
# if two different symbols resolve to the same ORF.
assert len(mapped_orfs) + len(unmappable) == len(gene_symbols)

print('Mapped {} gold standard genes\nFailed to map {} gold standard genes:'.format(len(mapped_orfs), len(unmappable)))
print('\n'.join(sorted(unmappable)))

# One ORF per line, sorted.
out_file = os.path.join(OSMOTIC_FILEPATH, 'Kitano2016_ORFs.txt')

with open(out_file, 'w') as out_f:
    for orf in sorted(mapped_orfs):
        out_f.write(orf + '\n')

Mapped 151 gold standard genes
Failed to map 10 gold standard genes:
CCR4-NOT COMPLEX
GDP RHO1
GTP CDC42
GTP RHO1
KDX2
PFK COMPLEX
PP2A
SBF COMPLEX
SNF1 COMPLEX
TORC1
