In [49]:
#Python 3.10.6
import gzip
import json
import os
import time
from urllib.parse import quote

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm

# Base URL of the UniProtKB search endpoint; the search expression is appended
# directly after 'query=' by fetch_uniprot_data.
URL_UNIPROT = 'https://rest.uniprot.org/uniprotkb/search?query='

In [63]:
def fetch_uniprot_data(session, name, timeout=30):
    '''Fetch UniProt data of reviewed entries, searching by name.

    Parameters
    ----------
    session : object
        A ``requests.Session`` (or anything exposing a compatible
        ``get(url=..., headers=..., timeout=...)`` method).
    name : str
        Raw search text. It is percent-encoded here, so callers pass it unencoded.
    timeout : float or tuple, optional
        Seconds to wait for the server; forwarded to ``session.get`` so a
        stalled connection cannot block the caller forever.

    Returns
    -------
    list or str
        The ``'results'`` list of the JSON payload when the status code is 200,
        otherwise the string ``'Error'``.
    '''
    # Encode the term so characters such as '&' or '#' cannot cut the query
    # short. '+' is deliberately left untouched, which keeps what is sent for
    # names like 'NAD+' identical to the previous behaviour.
    url = URL_UNIPROT + '(reviewed:true)%20AND%20' + quote(name, safe='+')
    headers = {'Accept-Encoding': 'gzip'}
    response = session.get(url=url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return 'Error'

    payload = response.content
    # The body may arrive still gzip-compressed or already decoded by the HTTP
    # client; only decompress when the gzip magic number is present.
    if payload[:2] == b'\x1f\x8b':
        payload = gzip.decompress(payload)
    return json.loads(payload.decode('utf-8'))['results']

In [149]:
# Characters replaced by a space before a name is sent as a UniProt query.
_QUERY_SANITIZE_TABLE = str.maketrans({char: ' ' for char in '[]():?,><'})

# Column order of the frame returned by filter_kinetoplastids_targets.
_RESULT_COLUMNS = ['Search', 'Target', 'Organism', 'EC Number',
                   'Biological Process', 'Uniprot ID', 'TriTrypDB ID']


def _build_target_record(entry, search_name):
    '''Turn one UniProt entry into an output row, or return None when its lineage lacks 'Kinetoplastea'.'''
    organism = entry.get('organism', {})
    if 'Kinetoplastea' not in organism.get('lineage', []):
        return None

    # Every field gets its own default for this entry, so a missing key can
    # never inherit a value left over from the previously processed entry.
    recommended = entry.get('proteinDescription', {}).get('recommendedName', {})
    target = recommended.get('fullName', {}).get('value', ' - ')
    ec_numbers = recommended.get('ecNumbers') or []
    ecnum = ec_numbers[0]['value'] if ec_numbers else ' - '

    bioproc = [keyword['name'] for keyword in entry.get('keywords', [])
               if 'Biological' in keyword.get('category', '')]
    tri_ids = [ref['id'] for ref in entry.get('uniProtKBCrossReferences', [])
               if 'TriTryp' in ref.get('id', '')]

    return {'Search': search_name,
            'Target': target,
            'Organism': organism.get('scientificName', ' - '),
            'EC Number': ecnum,
            'Biological Process': bioproc,
            'Uniprot ID': entry.get('primaryAccession', ' - '),
            'TriTrypDB ID': tri_ids}


def filter_kinetoplastids_targets(data_path):
    '''Filter the targets listed in a CSV file down to Kinetoplastea proteins using UniProt.

    Parameters
    ----------
    data_path : str or path-like
        CSV file to read. When the path contains 'PHARM' the first row is
        skipped and the 'Name' column is used; otherwise the 'name' column is used.

    Returns
    -------
    pandas.DataFrame
        One row per matching UniProt entry with the columns 'Search' (the
        sanitised query text), 'Target', 'Organism', 'EC Number',
        'Biological Process', 'Uniprot ID' and 'TriTrypDB ID'. Fields absent
        from an entry are reported as ' - ' (or an empty list).

    Notes
    -----
    One request is issued per name. A name whose lookup fails is reported with
    print() and skipped; the remaining names are still processed.
    '''
    if 'PHARM' in str(data_path):
        original_df = pd.read_csv(data_path, skiprows=1)
        names = original_df['Name']
    else:
        original_df = pd.read_csv(data_path)
        names = original_df['name']

    # Rows are collected in a list and turned into a frame once at the end,
    # instead of re-copying a growing DataFrame on every match.
    records = []
    with requests.Session() as session:
        for raw_name in tqdm(names):
            name = raw_name
            try:
                name = raw_name.translate(_QUERY_SANITIZE_TABLE)
                data = fetch_uniprot_data(session, name)
                if isinstance(data, str):
                    # fetch_uniprot_data signals a failed request with the string 'Error'.
                    print('Error in: ', name)
                    continue
                for entry in data:
                    record = _build_target_record(entry, name)
                    if record is not None:
                        records.append(record)
            except Exception as exc:
                # Best effort per name: report the cause and keep going.
                # KeyboardInterrupt is not caught, so the run can be stopped.
                print('Error in: ', name, '-', repr(exc))
    print('Done')
    return pd.DataFrame(records, columns=_RESULT_COLUMNS)

In [155]:
# Run the UniProt filter on the PharmMapper export. 'PHARM' in the file name
# selects the branch that skips the first CSV row and reads the 'Name' column.
# One request is made per name, so this cell is slow (the recorded run below
# took about 5.5 minutes for 499 names).
result = filter_kinetoplastids_targets('./nqh-500-conformNO-PHARMMAPPER.csv')
result

  0%|          | 0/499 [00:00<?, ?it/s]

100%|██████████| 499/499 [05:32<00:00,  1.50it/s]

Done





Unnamed: 0,Search,Target,Organism,EC Number,Biological Process,Uniprot ID,TriTrypDB ID
0,S-adenosylmethionine decarboxylase proenzyme,S-adenosylmethionine decarboxylase proenzyme,Trypanosoma brucei brucei,4.1.1.50,"[Polyamine biosynthesis, Spermidine biosynthesis]",P50244,[]
1,S-adenosylmethionine decarboxylase proenzyme,S-adenosylmethionine decarboxylase proenzyme,Trypanosoma cruzi,4.1.1.50,"[Polyamine biosynthesis, Spermidine biosynthesis]",O76240,"[TriTrypDB:BCY84_01143, TriTrypDB:C3747_28g21,..."
2,S-adenosylmethionine decarboxylase proenzyme,Inactive S-adenosylmethionine decarboxylase pr...,Trypanosoma brucei brucei,-,"[Polyamine biosynthesis, Spermidine biosynthesis]",A5HNV6,[]
3,Major envelope glycoprotein,Leishmanolysin,Leishmania major,3.4.24.36,[Cell adhesion],P08148,"[TriTrypDB:LmjF.10.0480, TriTrypDB:LMJFC_10001..."
4,RNA ligase 2,"RNA-editing ligase 1, mitochondrial",Trypanosoma brucei brucei,6.5.1.3,[mRNA processing],P86926,[]
5,RNA ligase 2,"RNA-editing ligase 1, mitochondrial",Trypanosoma brucei brucei (strain 927/4 GUTat1...,6.5.1.3,[mRNA processing],P86927,[TriTrypDB:Tb927.9.4360]
6,RNA ligase 2,"RNA-editing ligase 2, mitochondrial",Trypanosoma brucei brucei,6.5.1.3,[],P86924,[]
7,Glycerol-3-phosphate dehydrogenase NAD+ gly...,"Glycerol-3-phosphate dehydrogenase [NAD(+)], g...",Leishmania mexicana,1.1.1.8,[],P90551,[TriTrypDB:LmxM.10.0510]
8,Glycerol-3-phosphate dehydrogenase NAD+ gly...,"Glycerol-3-phosphate dehydrogenase [NAD(+)], g...",Trypanosoma brucei rhodesiense,1.1.1.8,[],Q26756,[]
9,Glycerol-3-phosphate dehydrogenase NAD+ gly...,"Glycerol-3-phosphate dehydrogenase [NAD(+)], g...",Trypanosoma brucei brucei,1.1.1.8,[],P90593,[]


In [156]:
# Create the output folder first so the export also works on a fresh checkout;
# to_csv itself fails when the target directory does not exist.
os.makedirs("./ProcessedData_v2", exist_ok=True)
# The file name (no extension) is kept as-is so anything reading it later still finds it.
result.to_csv("./ProcessedData_v2/PHMPR-nqh")