In [1]:
#Python 3.10.6
import gzip
import json
import os
from urllib.parse import quote

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm

URL_UNIPROT = 'https://rest.uniprot.org/uniprotkb/search?query='

In [2]:
def fetch_uniprot_data(session, name, timeout=30):
    '''Search UniProtKB for reviewed entries matching `name`.

    Parameters
    ----------
    session : object exposing ``get(url=..., headers=..., timeout=...)``
        Typically a ``requests.Session``; reused across calls by the caller.
    name : str
        Free-text search term. It is percent-encoded before being appended to
        the query so characters such as ``&``, ``#`` or ``+`` cannot break or
        truncate the query string.
    timeout : float, optional
        Seconds to wait for the server before the HTTP client gives up.

    Returns
    -------
    list or str
        The ``results`` list of the JSON answer (empty list if the key is
        absent) on HTTP 200, otherwise the sentinel string ``'Error'``.
    '''
    url = URL_UNIPROT + '(reviewed:true)%20AND%20' + quote(name)
    headers = {'Accept-Encoding': 'gzip'}
    response = session.get(url=url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return 'Error'
    payload = response.content
    # The HTTP client may already have decoded a gzip-encoded body. Only
    # decompress when the payload still starts with the gzip magic number,
    # otherwise gzip.decompress would raise on plain JSON.
    if payload[:2] == b'\x1f\x8b':
        payload = gzip.decompress(payload)
    return json.loads(payload.decode('utf-8')).get('results', [])

In [3]:
# Characters blanked out of a search term before it is sent to UniProt.
_QUERY_SPECIAL_CHARS = ('[', ']', '(', ')', ':', '?', ',', '>', '<')

# Column order of the frame returned by filter_kinetoplastids_targets.
_RESULT_COLUMNS = ['Search', 'Target', 'Organism', 'EC Number',
                   'Biological Process', 'Uniprot ID', 'TriTrypDB ID']


def _ec_number(entry):
    '''Return the EC number of a UniProt entry, or ' - ' when none is found.

    The first EC number of the recommended name is preferred; otherwise the
    ids of all BRENDA cross-references are joined with single spaces.
    '''
    recommended = entry.get('proteinDescription', {}).get('recommendedName', {})
    ec_numbers = recommended.get('ecNumbers')
    if ec_numbers:
        return ec_numbers[0]['value']
    brenda_ids = [ref['id'] for ref in entry.get('uniProtKBCrossReferences', [])
                  if 'BRENDA' in ref['database']]
    return ' '.join(brenda_ids) if brenda_ids else ' - '


def _kinetoplastid_record(entry, search):
    '''Build one result row for `entry`, or None if it is not a Kinetoplastea entry.'''
    organism = entry.get('organism', {})
    if 'Kinetoplastea' not in organism.get('lineage', ()):
        return None
    recommended = entry.get('proteinDescription', {}).get('recommendedName', {})
    cross_refs = entry.get('uniProtKBCrossReferences', [])
    return {
        'Search': search,
        # Every field gets an explicit placeholder so a value can never leak
        # over from a previously processed entry.
        'Target': recommended.get('fullName', {}).get('value', ' - '),
        'Organism': organism.get('scientificName', ' - '),
        'EC Number': _ec_number(entry),
        'Biological Process': [kw['name'] for kw in entry.get('keywords', [])
                               if 'Biological' in kw['category']],
        'Uniprot ID': entry.get('primaryAccession', ' - '),
        'TriTrypDB ID': [ref['id'] for ref in cross_refs if 'TriTryp' in ref['id']],
    }


def filter_kinetoplastids_targets(data_path):
    '''Query UniProt for every name in a CSV file and keep the Kinetoplastea hits.

    Parameters
    ----------
    data_path : str
        Path to a CSV file. When the path contains 'PHARM' the first line is
        skipped and the names are read from the 'Name' column; otherwise the
        names are read from the 'name' column.

    Returns
    -------
    pandas.DataFrame
        One row per matching UniProt entry with the columns 'Search',
        'Target', 'Organism', 'EC Number', 'Biological Process', 'Uniprot ID'
        and 'TriTrypDB ID'. The columns are present even when nothing matched.

    Notes
    -----
    A failure while processing one name is reported with 'Error in: <name>'
    and the remaining names are still processed.
    '''
    if 'PHARM' in data_path:
        names = pd.read_csv(data_path, skiprows=1)['Name']
    else:
        names = pd.read_csv(data_path)['name']

    # Rows are collected in a list and turned into a frame once at the end;
    # concatenating a frame per hit is quadratic in the number of hits.
    records = []
    with requests.Session() as session:
        for name in tqdm(names):
            try:
                for char in _QUERY_SPECIAL_CHARS:
                    name = name.replace(char, ' ')
                data = fetch_uniprot_data(session, name)
                if not isinstance(data, list):
                    # fetch_uniprot_data signals a failed request with a
                    # non-list sentinel; report it instead of iterating it.
                    print('Error in: ', name)
                    continue
                for entry in data:
                    record = _kinetoplastid_record(entry, name)
                    if record is not None:
                        records.append(record)
            except Exception as err:
                # Best effort: report the failing name and move on, but let
                # KeyboardInterrupt / SystemExit propagate.
                print('Error in: ', name, '-', repr(err))
    print('Done')
    return pd.DataFrame(records, columns=_RESULT_COLUMNS)

In [4]:
# One (ligand, PharmMapper run tag) pair per ligand. Each ligand contributes a
# PharmMapper file followed by a SHAFTS file, and the matching output names
# 'PHMPR-<ligand>' and 'CHMPR-<ligand>' in the same order.
_LIGAND_RUNS = [
    ('nqb', '500-conformNO'),
    ('nqh', '500-conformNO'),
    ('f6', '300-conformNO'),
    ('f9', '300-conform1'),
    ('t8', '500-conformNO'),
    ('t11', '300-conformNO'),
]

input_list = []
output_list = []
for _ligand, _run_tag in _LIGAND_RUNS:
    input_list += [f'./{_ligand}-{_run_tag}-PHARMMAPPER.csv', f'./{_ligand}-SHAFTS.csv']
    output_list += [f'PHMPR-{_ligand}', f'CHMPR-{_ligand}']

In [5]:
# Run the UniProt filter over every search-result file and save one CSV per run.
results_dir = "./ProcessedData_v2/Results/"
# to_csv does not create missing directories; make sure the target exists
# before spending minutes on network queries whose result could not be saved.
os.makedirs(results_dir, exist_ok=True)
for input_path, output_name in zip(input_list, output_list):
    try:
        print('Running ' + output_name)
        result = filter_kinetoplastids_targets(input_path)
        result.to_csv(results_dir + output_name)
    except Exception as err:
        # Keep going with the remaining files, but report which input failed.
        print(err)
        print(f'In {input_path} execution')

Running PHMPR-nqb


100%|██████████| 499/499 [08:24<00:00,  1.01s/it]


Done
Running CHMPR-nqb


100%|██████████| 381/381 [05:50<00:00,  1.09it/s]


Done
Running PHMPR-nqh


100%|██████████| 499/499 [05:45<00:00,  1.45it/s]


Done
Running CHMPR-nqh


100%|██████████| 402/402 [04:29<00:00,  1.49it/s]


Done
Running PHMPR-f6


100%|██████████| 299/299 [02:57<00:00,  1.68it/s]


Done
Running PHMPR-f9


100%|██████████| 299/299 [02:23<00:00,  2.09it/s]


Done
Running CHMPR-f9


100%|██████████| 280/280 [02:35<00:00,  1.80it/s]


Done
Running PHMPR-t8


100%|██████████| 498/498 [05:34<00:00,  1.49it/s]


Done
Running CHMPR-t8


100%|██████████| 279/279 [02:49<00:00,  1.64it/s]


Done
Running PHMPR-t11


100%|██████████| 299/299 [02:21<00:00,  2.11it/s]

Done





In [5]:
# Re-run two searches individually: index 5 is the f6 SHAFTS list (CHMPR-f6)
# and index 11 is the t11 SHAFTS list (CHMPR-t11).
for rerun_index in (5, 11):
    result = filter_kinetoplastids_targets(input_list[rerun_index])
    result.to_csv("./ProcessedData_v2/Results/" + output_list[rerun_index])

100%|██████████| 323/323 [04:42<00:00,  1.14it/s]


Done


100%|██████████| 295/295 [03:42<00:00,  1.33it/s]

Done





In [6]:
# Per-search lists without duplicated results (one row per UniProt ID).
files = output_list
for file in files:
    results_path = './ProcessedData_v2/Results/' + file
    deduplicated_path = './ProcessedData_v2/NoDuplicates/' + file
    search_df = (
        pd.read_csv(results_path, index_col=0)
        .drop_duplicates(subset=['Uniprot ID'], ignore_index=True)
    )
    search_df.to_csv(deduplicated_path)

In [7]:
# Single unified list: one row per UniProt ID, with a 'Found In' column naming
# every search (space-separated) whose de-duplicated list contains that ID.
files = output_list
seed_file = 'CHMPR-f9'
# The seed file is read first so its rows lead the output; it is removed from
# the rest of the sequence so its label is not recorded twice for its own rows.
ordered_files = [seed_file] + [file for file in files if file != seed_file]

labelled_frames = []
for file in ordered_files:
    aux_df = pd.read_csv('./ProcessedData_v2/NoDuplicates/' + file, index_col=0)
    aux_df['Found In'] = file
    labelled_frames.append(aux_df)
all_hits_df = pd.concat(labelled_frames, ignore_index=True)

# sort=False keeps IDs in first-appearance order; within an ID the labels keep
# the order in which the files were read.
found_in_by_id = all_hits_df.groupby('Uniprot ID', sort=False)['Found In'].agg(' '.join)

unified_df = all_hits_df.drop_duplicates(subset=['Uniprot ID'], ignore_index=True)
unified_df = unified_df.assign(**{'Found In': unified_df['Uniprot ID'].map(found_in_by_id)})
unified_df.to_csv('./ProcessedData_v2/AllTargets')