## Accessing the UniProt API
### A simple script/utility to look up gene aliases and UniProt IDs in UniProt

https://www.uniprot.org/help/api_queries

In [None]:
import re
from pathlib import Path
from urllib.parse import quote

import requests
import yaml

In [None]:
# Input: text file listing the genes whose alternative names and UniProt IDs we want to look up.
# NOTE: this is an absolute path on one particular machine; adjust it before running elsewhere.
local_path = Path("/Users/choderalab/asapdiscovery/notebook/")
file_path = local_path.joinpath('genes_of_interest.txt')

In [None]:
# Homo sapiens organism ID (9606), written as a UniProt query filter term.
# NOTE(review): no other cell visible in this notebook reads `human`;
# gen_url_gene builds its own organism filter.
human = 'organism_id:9606'

In [None]:
def gen_url_gene(gene, format, organism_id="9606"):
    """Build the UniProtKB search URL for a gene query.

    The query is restricted to reviewed entries of a single organism.

    Parameters
    ----------
    gene : str
        Gene name (or UniProt query term) to search for. Surrounding
        whitespace is dropped and characters that would corrupt the query
        string (spaces, '&', '#', ...) are percent-encoded.
    format : str
        Complete format parameter, e.g. ``"format=fasta"``. The name shadows
        the builtin but is kept so existing callers keep working.
    organism_id : str or int, optional
        Organism ID used in the ``organism_id:`` filter. Defaults to
        ``"9606"`` (Homo sapiens).

    Returns
    -------
    str
        The full search URL.
    """
    # '+' and ':' stay literal so pre-built query terms such as
    # "gene:ABCB1+OR+gene:MDR1" are passed through unchanged.
    query = quote(str(gene).strip(), safe="+:")
    return (
        "https://rest.uniprot.org/uniprotkb/search?query=" + query
        + "+AND+reviewed:true+AND+organism_id:" + str(organism_id)
        + "&" + format
    )


In [None]:
def get_uniprot_id(fasta_match, gene):
    """Extract the UniProt ID from a FASTA entry.

    The ID is the first run of word characters enclosed between two '|'
    characters, e.g. ``P08183`` in ``>sp|P08183|MDR1_HUMAN ...``.

    Parameters
    ----------
    fasta_match : str
        Text of one FASTA entry (header line plus sequence).
    gene : str
        The gene that was queried; only used in the "not found" message.

    Returns
    -------
    str or None
        The ID, or None (after printing a message) when the text contains
        no ``|...|`` field.
    """
    match = re.search(r'\|(\w+)\|', fasta_match)
    if match is None:
        print("Uniprot id not found for " + gene)
        return None
    return match.group(1)
    

In [None]:
def get_names(fasta_match, gene):
    """Extract the name text of a FASTA header.

    Returns whatever sits between the second '|' of the header and the
    ``OS=`` field, e.g. ``"MDR1_HUMAN ATP-dependent translocase ABCB1"`` for
    ``>sp|P08183|MDR1_HUMAN ATP-dependent translocase ABCB1 OS=Homo sapiens``.

    Parameters
    ----------
    fasta_match : str
        Text of one FASTA entry (header line plus sequence).
    gene : str
        The gene that was queried; only used in the "not found" message.

    Returns
    -------
    str or None
        The name text with surrounding whitespace removed, or None (after
        printing a message) when the header pattern is not present.
    """
    match = re.search(r'\|[^|]*\|([^|]*)OS=', fasta_match)
    if match is None:
        print("Gene names not found " + gene)
        return None
    # The capture includes the blank that separates the names from "OS=";
    # strip it so it does not end up as trailing whitespace in output files.
    return match.group(1).strip()

In [None]:
def parse_file(file_path):
    """Read gene entries from a comma-separated text file.

    Every line is split on commas. Whitespace around each entry is removed,
    and empty entries (blank lines, trailing commas, doubled commas) are
    skipped so they are never sent as queries.

    Parameters
    ----------
    file_path : str or path-like
        File to read.

    Returns
    -------
    list of str
        The entries in file order.
    """
    with open(file_path, 'r') as handle:
        return [
            field.strip()
            for line in handle
            for field in line.split(',')
            if field.strip()
        ]


In [None]:
# Read the genes of interest from the input file and print them as a quick check.
gene_list = parse_file(file_path)
print(gene_list)

In [None]:
def split_names(gene_names):
    """Split an entry such as ``"NAME (ALIAS)"`` into its separate names.

    When the string contains both '(' and ')', it is split around every
    non-empty parenthesised group and the blank pieces are dropped; the text
    inside the parentheses is kept as its own element. Otherwise the string
    is returned unchanged as a one-element list.
    """
    has_parentheses = '(' in gene_names and ')' in gene_names
    if not has_parentheses:
        return [gene_names]
    # The capture group makes re.split keep the text found inside the parentheses.
    pieces = re.split(r'\s*\(([^)]+)\)\s*', gene_names)
    return list(filter(lambda piece: piece.strip(), pieces))

# Make a file with only the best matched entry

In [None]:
# Best-match lookup: for every gene of interest, query UniProt and keep the first FASTA hit.
# uniprot_dict maps the queried entry (exactly as read from the input file) to
# [entry, names parsed from the FASTA header, UniProt id]; the last two are None
# when nothing could be parsed from the response.
uniprot_dict = dict()
# Every FASTA entry returned for each gene, one list per gene, in gene_list order.
fastas_list = []
# NOTE: `format` shadows the builtin, but the later "all matches" cell reads this
# name, so it is kept as is.
format = "format=fasta"
for gene in gene_list:
    # Entries may carry alternative names in parentheses, e.g. "NAME (ALIAS)".
    # Search with the first name no matter how many aliases follow; fall back to
    # the raw entry if splitting leaves nothing usable.
    # TODO: retry with the remaining aliases when the first name gives no hit.
    name_list = split_names(gene)
    gene1 = name_list[0] if name_list else gene

    url = gen_url_gene(gene1, format)
    try:
        # A timeout keeps one stalled request from hanging the whole notebook.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        all_fastas = response.text
    except requests.RequestException as err:
        # Report the failure and carry on; the gene is recorded as "not found".
        print("UniProt request failed for " + gene + ": " + str(err))
        all_fastas = ""

    # One string per FASTA entry: split at every newline that is followed by '>'.
    fasta_list = re.split(r'\n(?=>)', all_fastas)
    fastas_list.append(fasta_list)
    # The first returned entry is taken as the most probable match.
    probable_match = fasta_list[0]
    uniprot_id = get_uniprot_id(probable_match, gene)
    gene_names = get_names(probable_match, gene)
    uniprot_dict[gene] = [gene, gene_names, uniprot_id]

Target structure for each entry in the output:

```yaml
- name: ABCB1
  alternatives: P-gp, MDR1
  uniprot: P08183
```

The output should be written as a YAML file.

In [None]:
# Write the best-match table to adme.yml: a YAML list with one mapping per gene
# (keys: name, alternatives, uniprot), in uniprot_dict insertion order.
records = [
    {"name": name, "alternatives": alternatives, "uniprot": uniprot}
    for name, alternatives, uniprot in uniprot_dict.values()
]
with open('adme.yml', 'w') as file:
    # yaml.safe_dump quotes/escapes values where needed and writes missing (None)
    # values as null, which hand-formatted lines did not do.
    # sort_keys=False keeps the name/alternatives/uniprot order; the unlimited
    # width stops long names from being folded over several lines.
    yaml.safe_dump(
        records,
        file,
        default_flow_style=False,
        sort_keys=False,
        width=float("inf"),
    )

# Make a file with all the fasta entries that match

In [None]:
# All-matches lookup: for every gene of interest, keep every FASTA entry UniProt returns.
# all_uniprot_dict maps the queried entry to a list of
# [entry, names parsed from the FASTA header, UniProt id], one per returned FASTA entry.
# Relies on `format` ("format=fasta") defined in the best-match cell above.
all_uniprot_dict = dict()
for gene in gene_list:
    # Entries may carry alternative names in parentheses, e.g. "NAME (ALIAS)".
    # Search with the first name no matter how many aliases follow; fall back to
    # the raw entry if splitting leaves nothing usable.
    name_list = split_names(gene)
    gene1 = name_list[0] if name_list else gene

    url = gen_url_gene(gene1, format)
    try:
        # A timeout keeps one stalled request from hanging the whole notebook.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        all_fastas = response.text
    except requests.RequestException as err:
        # Report the failure and carry on; the gene is recorded as "not found".
        print("UniProt request failed for " + gene + ": " + str(err))
        all_fastas = ""

    # One string per FASTA entry: split at every newline that is followed by '>'.
    fasta_list = re.split(r'\n(?=>)', all_fastas)
    for fasta in fasta_list:
        uniprot_id = get_uniprot_id(fasta, gene)
        gene_names = get_names(fasta, gene)
        all_uniprot_dict.setdefault(gene, []).append([gene, gene_names, uniprot_id])

In [None]:
# Write every match to adme_all_found.yml: a YAML list with one mapping per
# returned FASTA entry (keys: name, alternatives, uniprot), grouped by gene in
# all_uniprot_dict insertion order.
all_records = [
    {"name": name, "alternatives": alternatives, "uniprot": uniprot}
    for matches in all_uniprot_dict.values()
    for name, alternatives, uniprot in matches
]
with open('adme_all_found.yml', 'w') as file:
    # yaml.safe_dump quotes/escapes values where needed and writes missing (None)
    # values as null, which hand-formatted lines did not do.
    # sort_keys=False keeps the name/alternatives/uniprot order; the unlimited
    # width stops long names from being folded over several lines.
    yaml.safe_dump(
        all_records,
        file,
        default_flow_style=False,
        sort_keys=False,
        width=float("inf"),
    )