### This notebook scrapes the YAAM database (which does not have an API). It takes some time so it is done in parallel. The results are stored in a pickled file. The next notebook preprocesses the raw queries from the database.

In [None]:
!pip install selenium
!pip install bs4

import numpy as np
import pandas as pd
import os
from selenium import webdriver
from bs4 import BeautifulSoup as bs


### Read in the text file containing a snapshot of the database

In [None]:
# Load the database snapshot as a list of raw lines; each entry keeps its
# trailing newline, exactly as the file iterator yields it.
with open('yaam_dec15_21.txt') as snapshot_file:
    lines = list(snapshot_file)

# Modification categories searched for in the snapshot text. The order matters:
# when a line mentions several names, the scan below keeps the one listed last.
PTM_types = [
    'Phosphorylation',
    'Methylation',
    'Acetylation',
    'Ubiquitination',
    'Succinylation',
    'Oxidation',
    'Nitration',
    'NtAcetylation',
    'Glycosylation',
    'Disulfide',
    'Lipidation',
    'Sumoylation',
    'Metal',
]

# Leftover note listing six of the categories above. Nothing reads it;
# NOTE(review): its purpose is not stated here - confirm before deleting.
#   Phosphorylation
#   Acetylation
#   Ubiquitination
#   Succinylation
#   Glycosylation
#   Lipidation


# One empty bucket per modification category, in PTM_types order.
ptm_dict = {ptm_name: [] for ptm_name in PTM_types}

# Alias of the full line list (same object, not a copy).
minilines = lines

# Walk the snapshot from top to bottom. A line that contains a category name
# (plain substring match) switches the active bucket; if several names occur in
# the same line, the one that comes last in PTM_types wins. Every line seen
# while a bucket is active - including the line that caused the switch - is
# appended to that bucket. Lines before the first match are dropped.
current_type = None
for line in minilines:
  mentioned = [ptm_name for ptm_name in PTM_types if ptm_name in line]
  if mentioned:
    current_type = mentioned[-1]
  if current_type:
    ptm_dict[current_type].append(line)




# Report how many snapshot lines were collected under each modification type,
# followed by a blank separator line.
for KEY, collected in ptm_dict.items():
  print(f'Modification: {KEY}\nTotal Hits: {len(collected)}\n')


# Denominator for the coverage estimate printed below (approximate count of
# yeast proteins, as used by the original message).
APPROX_YEAST_PROTEIN_COUNT = 6000

# Collect the second tab-separated field of every collected line, skipping the
# first two entries of each bucket.
# NOTE(review): entries 0 and 1 are presumably the section title and the column
# header of the snapshot - confirm against the text file.
ORF_ids = []
for hits in ptm_dict.values():
  for hit in hits[2:]:
    fields = hit.split('\t')
    # Lines with no second field carry no ORF id; skip them explicitly instead
    # of hiding every possible error behind a bare except.
    if len(fields) > 1:
      ORF_ids.append(fields[1])

yeast_ORFS = set(ORF_ids)

# Round on the percent scale: round(x, 2) * 100 produces float artefacts such
# as 56.99999999999999.
coverage_pct = round(100 * len(yeast_ORFS) / APPROX_YEAST_PROTEIN_COUNT)
print('-----------------')
print(f"Approximately {coverage_pct:.1f}% of yeast proteins contain PTM's")


### Loop over each ORF id to collect the data. Use multithreading to speed up queries.

In [None]:

def _fetch_soup(url, max_attempts=8, timeout=30, base_delay=0.5, max_delay=30.0):
    """Download ``url`` and return its body parsed with ``'html.parser'``.

    An attempt fails when ``requests.get`` raises ``requests.RequestException``
    (timeouts included) or the response status is not 200. Failed attempts are
    retried after a pause that doubles each time, capped at ``max_delay``
    seconds, so the server is not hit in a tight loop.

    ``requests`` and ``BeautifulSoup`` are looked up in the notebook namespace
    when the function is called, so they must be imported before the first call.

    Raises:
        requests.RequestException: if the last of ``max_attempts`` attempts
            failed at the transport level.
        RuntimeError: if the last attempt returned a non-200 status.
    """
    import time  # function-scope import: `time` is not imported at the top of the notebook

    max_attempts = max(1, max_attempts)  # always try at least once
    last_error = None
    for attempt in range(max_attempts):
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            last_error = exc
        else:
            if response.status_code == 200:
                return BeautifulSoup(response.text, 'html.parser')
            last_error = RuntimeError(f'GET {url} returned HTTP {response.status_code}')
        # Back off before the next attempt; no pause after the final one.
        if attempt + 1 < max_attempts:
            time.sleep(min(max_delay, base_delay * 2 ** attempt))
    raise last_error


def do_loop(ORF):
    """Fetch both YAAM search pages for a single ORF id.

    Args:
        ORF: identifier inserted verbatim into the two query URLs.

    Returns:
        dict with three keys: ``'orf'`` (the id passed in), ``'seq'`` (parsed
        page of the ``busqueda1`` query) and ``'mod'`` (parsed page of the
        ``busqueda2`` query).
        NOTE(review): judging by the key names these are presumably the
        sequence page and the modifications page - confirm against the site.

    Raises:
        Whatever ``_fetch_soup`` raises once its retries are exhausted.
    """
    base_url = 'http://yaam.ifc.unam.mx/Proteinas/search.php'
    return {
        'orf': ORF,
        'seq': _fetch_soup(f'{base_url}?busqueda1={ORF}'),
        'mod': _fetch_soup(f'{base_url}?busqueda2={ORF}'),
    }
!pip install selenium
!pip install bs4

import numpy as np
import pandas as pd
import os
from selenium import webdriver
from bs4 import BeautifulSoup as bs

# install chromium, its driver, and selenium
!apt-get update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
!pip install selenium
# set options to be headless, ..
from selenium import webdriver
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# open it, go to a website, and get results
wd = webdriver.Chrome('chromedriver',options=options)


import time
from multiprocessing.pool import ThreadPool

import requests
from bs4 import BeautifulSoup

# Number of worker threads issuing requests concurrently.
N_WORKERS = 8

biglist = []
missing = 0  # never updated in this cell; kept in case a later cell reads it

# A bare `%time` line only times an empty statement, so the run is timed explicitly.
scrape_start = time.perf_counter()
with ThreadPool(N_WORKERS) as pool:
    # imap yields results in input order as they become available, so results
    # gathered before a failing task stay in `biglist` instead of being lost.
    for result in pool.imap(do_loop, list(yeast_ORFS)):
        biglist.append(result)
print(f'Scraped {len(biglist)} ORFs in {time.perf_counter() - scrape_start:.1f} s')

### Store results

In [None]:

import pickle  # not imported anywhere else in this notebook; without it this cell raises NameError

# Persist the scraped results (`biglist`) to 'orf.data'.
# Each entry holds BeautifulSoup objects, so bs4 must be importable when the
# file is loaded. Only unpickle files you produced yourself.
with open('orf.data', 'wb') as filehandle:
    # store the data as binary data stream
    pickle.dump(biglist, filehandle)