In [1]:
import numpy as np
import pandas as pd
import json
import pickle

### Read in the scraped database file

In [2]:

# Load the pickled scrape results from disk (binary mode is required by pickle).
# NOTE: unpickling can execute arbitrary code -- only open files you produced yourself.
with open('./orf.data', 'rb') as scrape_file:
    yaamScrape = pickle.load(scrape_file)

### Collect the sequence and PTMs for each ORF

In [3]:
# Parallel result lists: entry k of each list describes the same ORF.
annotations = []
seqs = []
orfs = []

# (record index, error text) for every record that could not be processed, so
# dropped ORFs can be inspected afterwards instead of vanishing silently.
skipped_orfs = []

b = yaamScrape  # short alias kept from the original cell

# Number of scraped records to visit.
# NOTE(review): presumably the size of the scrape -- confirm against yaamScrape.
N_ORFS = 4898

# PTM types -> integer id
ptm_dict = {'Ca': 1,
 'Methylation': 2,
 'NtAcetylation': 3,
 'Ubiquitination': 4,
 'Disulfide': 5,
 'Phosphorylation': 6,
 'Succinylation': 7,
 'Biotinylation': 8,
 'Oxidation': 9,
 'Nitration': 10,
 'Metal': 11,
 'Lipidation': 12,
 'ActiveSite': 13,
 'Glycosylation': 14,
 'Acetylation': 15,
 'Sumoylation': 16}

# Row of the annotation matrix occupied by each PTM id (rows follow ptm_dict order).
ptm_row = {ptm_id: row for row, ptm_id in enumerate(ptm_dict.values())}


def _modification_names(raw_modification):
    """Split one 'modification' field into its individual PTM names.

    A field without a comma is returned verbatim as a single name. A
    comma-separated field is split and each part is stripped, because the
    lists carry stray whitespace around the names.
    """
    if ',' not in raw_modification:
        return [raw_modification]
    return [name.strip() for name in raw_modification.split(',')]


def _build_ptm_matrix(seq_length, mod_entries):
    """Return the 0/1 annotation matrix for one ORF.

    Parameters:
        seq_length: number of columns (length of the ORF sequence).
        mod_entries: iterable of dicts with 'modification' and 'position' keys.

    Returns:
        Float array of shape (number of PTM types, seq_length); a 1 marks a
        position at which that PTM type is present.

    Raises:
        KeyError: a modification name is not in ptm_dict.
        IndexError: a position does not fit into the sequence.
    """
    matrix = np.zeros((len(ptm_row), seq_length))
    for entry in mod_entries:
        for name in _modification_names(entry['modification']):
            # NOTE(review): 'position' is used directly as a 0-based column
            # index. If the source reports 1-based residue numbers, every mark
            # is shifted by one and a PTM on the last residue raises
            # IndexError (dropping the ORF) -- confirm.
            matrix[ptm_row[ptm_dict[name]], entry['position']] = 1
    return matrix


for record_index in range(N_ORFS):
    try:
        record = b[record_index]
        orf_seq = json.loads(str(record['seq']))['data'][0]['sequence']
        mod_entries = json.loads(str(record['mod']))['data']
        token_matrix = _build_ptm_matrix(len(orf_seq), mod_entries)
        orf_name = record['orf']
    except Exception as err:
        # Best effort: a malformed record is skipped, but it is recorded, and
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        skipped_orfs.append((record_index, repr(err)))
        continue

    # Append only after every value was computed so the three lists can never
    # get out of step with each other.
    annotations.append(token_matrix)
    seqs.append(orf_seq)
    orfs.append(orf_name)

In [4]:
# Sanity check: the three parallel lists are expected to have the same length.
print(len(orfs), len(seqs), len(annotations), sep='\n')

4820
4820
4820


### Post-process data (tokenization) and put in a dataframe

In [5]:
# Column name -> list of per-ORF values; the three lists are index-aligned.
prep_data = dict(orf=orfs, seqs=seqs, annotations=annotations)

In [6]:
# One row per ORF; the columns are the keys of prep_data.
# pandas is already imported as `pd` in the first cell, so it is not re-imported here.
YAAM = pd.DataFrame(prep_data)

In [7]:
# Build the residue -> integer token table.
#
# Tokens start at 1 and are handed out in order of first appearance. Every
# sequence is scanned (rather than stopping once 20 symbols were seen), so a
# residue that first shows up in a later sequence still receives a token
# instead of causing a KeyError during tokenization. Only upper-case
# characters are registered, because the tokenization cell drops everything
# else before looking residues up.
aa_dict = {}

for s in YAAM['seqs']:
    # Cheap pre-check: skip the per-character walk when nothing new can appear.
    if set(s) <= aa_dict.keys():
        continue
    for aa in s:
        if aa.isupper() and aa not in aa_dict:
            aa_dict[aa] = len(aa_dict) + 1

# Residues in token order (token k corresponds to aa_list[k - 1]).
aa_list = list(aa_dict)
aa_dict

{'M': 1,
 'L': 2,
 'S': 3,
 'R': 4,
 'A': 5,
 'T': 6,
 'K': 7,
 'V': 8,
 'N': 9,
 'F': 10,
 'H': 11,
 'Q': 12,
 'E': 13,
 'G': 14,
 'I': 15,
 'Y': 16,
 'P': 17,
 'D': 18,
 'C': 19,
 'W': 20}

In [8]:
# Convert every sequence into a list of integer tokens via aa_dict.
# Characters that are not upper-case are dropped before the lookup.
# NOTE(review): dropping characters makes the token list shorter than the raw
# sequence, while the annotation matrices were sized from the raw sequence
# length -- confirm that no sequence actually contains such characters.
tokenized_proteins = [
    [aa_dict[residue] for residue in protein if residue.isupper()]
    for protein in YAAM['seqs']
]
YAAM['token_proteins'] = tokenized_proteins

In [9]:
# Display the assembled frame: orf, seqs, annotations and token_proteins columns.
YAAM

Unnamed: 0,orf,seqs,annotations,token_proteins
0,YDR148C,MLSRATRTAAAKSLVKSKVARNVMAASFVKRHASTSLFKQANKVES...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[1, 2, 3, 4, 5, 6, 4, 6, 5, 5, 5, 7, 3, 2, 8, ..."
1,YIL037C,MNNVHIIKPLSLPQRFFSCIFHPLLLIFFTSVILTIWGSFSVIDIT...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[1, 9, 9, 8, 11, 15, 15, 7, 17, 2, 3, 2, 17, 1..."
2,YPL195W,MTSLYAPGAEDIRQRLRPFGFFFEKSLKDLIKGIRSHNETPEKLDQ...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[1, 6, 3, 2, 16, 5, 17, 14, 5, 13, 18, 15, 4, ..."
3,YDL194W,MDPNSNSSSETLRQEKQGFLDKALQRVKGIALRRNNSNKDHTTDDT...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[1, 18, 17, 9, 3, 9, 3, 3, 3, 13, 6, 2, 4, 12,..."
4,YNL331C,MTDLFKPLPEPPTELGRLRVLSKTAGIRVSPLILGGASIGDAWSGF...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[1, 6, 18, 2, 10, 7, 17, 2, 17, 13, 17, 17, 6,..."
...,...,...,...,...
4815,YNL309W,MSQPQMSPEKEQELASKILHRAELAQMTRQLKLGLSNVPSTKRKQD...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[1, 3, 12, 17, 12, 1, 3, 17, 13, 7, 13, 12, 13..."
4816,YLR356W,MSVCLAITKGIAVSSIGLYSGLLASASLITSTTPLEVLTGSLTPTL...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[1, 3, 8, 19, 2, 5, 15, 6, 7, 14, 15, 5, 8, 3,..."
4817,YDL006W,MSNHSEILERPETPYDITYRVGVAENKNSKFRRTMEDVHTYVKNFA...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[1, 3, 9, 11, 3, 13, 15, 2, 13, 4, 17, 13, 6, ..."
4818,YIL109C,MSHHKKRVYPQAQLQYGQNATPLQQPAQFMPPQDPAAAGMSYGQMG...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[1, 3, 11, 11, 7, 7, 4, 8, 16, 17, 12, 5, 12, ..."


### Count the total number of observed PTMs per type

In [10]:
# Total number of marked positions per PTM type, summed over all ORFs.
# Each annotation matrix has one row per PTM type (in ptm_dict order), so a
# row-wise sum yields that ORF's count for every type in one vectorized call
# instead of walking single elements with the builtin sum().
ptm_totals = np.zeros(len(ptm_dict))

for ptms in YAAM['annotations']:
    ptm_totals += ptms.sum(axis=1)

# Row index -> total (same name and layout as before).
ptm_counter = dict(enumerate(ptm_totals))

# PTM name -> total, in ptm_dict order.
ptm_counts = dict(zip(ptm_dict.keys(), ptm_totals))
ptm_counts

{'Ca': 23.0,
 'Methylation': 229.0,
 'NtAcetylation': 746.0,
 'Ubiquitination': 8535.0,
 'Disulfide': 257.0,
 'Phosphorylation': 33967.0,
 'Succinylation': 1701.0,
 'Biotinylation': 1.0,
 'Oxidation': 830.0,
 'Nitration': 14.0,
 'Metal': 1782.0,
 'Lipidation': 141.0,
 'ActiveSite': 1085.0,
 'Glycosylation': 1959.0,
 'Acetylation': 6773.0,
 'Sumoylation': 86.0}

### Save the dataframe in CSV file for later use

In [11]:
# Human-readable export (unchanged).
YAAM.to_csv('YAAM_tokens.csv')

# The 'annotations' cells are 2-D NumPy arrays. CSV stores each one as its
# str() text, which NumPy abbreviates with '...' once an array holds more than
# 1000 elements, so the matrices cannot be rebuilt from the CSV alone. Keep a
# lossless copy next to it; reload with pd.read_pickle('YAAM_tokens.pkl').
YAAM.to_pickle('YAAM_tokens.pkl')