In [13]:
#Python 3.10.6
import gzip
import json
import os
import time
from urllib.parse import quote

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm

# Base URL of the UniProtKB REST search endpoint. It ends with 'query=' so that
# the query expression can be appended directly to it.
URL_UNIPROT = 'https://rest.uniprot.org/uniprotkb/search?query='

In [14]:
def fetch_uniprot_data(session, name, base_url=None, timeout=30):
    '''Fetch UniProt data of reviewed entries, searching by name.

    Parameters
    ----------
    session : requests.Session (or any object with a compatible ``get``)
        Used to issue the HTTP GET request.
    name : str
        Search term. It is percent-encoded before being placed in the URL.
    base_url : str, optional
        Search endpoint ending in ``query=``. Defaults to ``URL_UNIPROT``.
    timeout : float, optional
        Seconds passed to ``session.get`` as ``timeout`` so that a stalled
        connection cannot block forever.

    Returns
    -------
    list or str
        The ``'results'`` list of the JSON answer when the status code is 200,
        otherwise the string ``'Error'``.

    Raises
    ------
    Whatever ``session.get`` raises, plus decoding errors (invalid UTF-8 or
    JSON) and ``KeyError`` if the answer has no ``'results'`` key.
    '''
    if base_url is None:
        base_url = URL_UNIPROT
    # Encode the term so characters such as '&', '#', '+' or '%' cannot change
    # the structure of the URL (they would otherwise truncate or split the query).
    url = base_url + '(reviewed:true)%20AND%20' + quote(name, safe='')
    headers = {'Accept-Encoding': 'gzip'}
    response = session.get(url=url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return 'Error'
    body = response.content
    # requests normally decodes gzip Content-Encoding itself, so the body is
    # usually plain JSON already; only decompress when the gzip magic number
    # shows that the payload is still compressed.
    if body[:2] == b'\x1f\x8b':
        body = gzip.decompress(body)
    return json.loads(body.decode('utf-8'))['results']

In [15]:
# Placeholder written to a column when UniProt does not provide the value.
_MISSING = ' - '

# Column order of the frame returned by filter_kinetoplastids_targets.
_RESULT_COLUMNS = ['Search', 'Target', 'Organism', 'EC Number',
                   'Biological Process', 'Uniprot ID', 'TriTrypDB ID']


def _sanitize_query_name(name):
    '''Replace the characters [ ] ( ) : ? , > < in a search term with spaces.'''
    for char in ['[', ']', '(', ')', ':', '?', ',', '>', '<']:
        name = name.replace(char, ' ')
    return name


def _kinetoplastid_record(entry, name):
    '''Build one result row from a UniProt entry, or return None when the
    entry's organism lineage does not contain 'Kinetoplastea'.'''
    organism_info = entry.get('organism') or {}
    if 'Kinetoplastea' not in (organism_info.get('lineage') or []):
        return None

    # Every field is resolved from this entry alone, so a missing value becomes
    # the placeholder instead of silently reusing the previous entry's value.
    recommended = (entry.get('proteinDescription') or {}).get('recommendedName') or {}
    ec_numbers = recommended.get('ecNumbers') or []

    return {
        'Search': name,
        'Target': (recommended.get('fullName') or {}).get('value', _MISSING),
        'Organism': organism_info.get('scientificName', _MISSING),
        'EC Number': ec_numbers[0].get('value', _MISSING) if ec_numbers else _MISSING,
        'Biological Process': [kw['name'] for kw in entry.get('keywords') or []
                               if 'Biological' in kw.get('category', '')],
        'Uniprot ID': entry.get('primaryAccession', _MISSING),
        'TriTrypDB ID': [db['id'] for db in entry.get('uniProtKBCrossReferences') or []
                         if 'TriTryp' in db.get('id', '')],
    }


def filter_kinetoplastids_targets(data_path):
    '''Search UniProt for every name in a CSV file and keep the Kinetoplastea hits.

    The CSV at ``data_path`` is read with pandas. When the path contains
    'PHARM' the first row is skipped and the 'Name' column is used; otherwise
    the 'name' column is used. Each name is cleaned of the characters
    [ ] ( ) : ? , > < and sent to ``fetch_uniprot_data``; every returned entry
    whose organism lineage contains 'Kinetoplastea' becomes one output row.

    Parameters
    ----------
    data_path : str
        Path of the CSV file to process.

    Returns
    -------
    pandas.DataFrame
        One row per matching entry with the columns 'Search', 'Target',
        'Organism', 'EC Number', 'Biological Process', 'Uniprot ID' and
        'TriTrypDB ID'. Values UniProt does not provide are ' - ' (scalar
        columns) or an empty list (list columns). The frame is empty, with
        those columns, when nothing matches.

    Notes
    -----
    A name that cannot be processed is reported with an 'Error in: ' line and
    skipped; it does not stop the run.
    '''
    if 'PHARM' in data_path:
        original_df = pd.read_csv(data_path, skiprows=1)
        names = original_df['Name']
    else:
        original_df = pd.read_csv(data_path)
        names = original_df['name']

    # Rows are collected in a list and turned into a frame once at the end;
    # concatenating a frame per row is quadratic in the number of rows.
    records = []
    # The context manager closes the session's connections when the loop ends.
    with requests.Session() as session:
        for name in tqdm(names):
            try:
                name = _sanitize_query_name(name)
                data = fetch_uniprot_data(session, name)
                if isinstance(data, str):
                    # fetch_uniprot_data returns the string 'Error' on a
                    # non-200 answer; report it instead of iterating over it.
                    print('Error in: ', name, '- UniProt request failed')
                    continue
                for entry in data:
                    record = _kinetoplastid_record(entry, name)
                    if record is not None:
                        records.append(record)
            except Exception as exc:
                # Exception (not a bare except) keeps Ctrl-C able to stop a long run.
                print('Error in: ', name, '-', repr(exc))
    print('Done')
    return pd.DataFrame(records, columns=_RESULT_COLUMNS)

In [16]:
# result = filter_kinetoplastids_targets('./nqh-SHAFTS.csv')
# result

In [17]:
# result.to_csv("./ProcessedData_v2/CHMPR-nqh")