## Wrangling UniProt site annotation
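Notebook that parses the UniProt site annotation columns (active site, binding site, DNA binding, site) and writes them out as a flat table with one row per annotated residue.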

In [178]:
from pathlib import Path
from math import isnan
import pandas as pd

In [179]:
# Load the tab-separated UniProt ID-mapping export whose annotation columns
# are parsed further down.
df = pd.read_csv(
    "../data/processed/uniprot_site_annotation/idmapping_2024_04_04.tsv",
    sep="\t",
)
# NOTE: this three-name list is reassigned with the final five-column schema
# before the output DataFrame is built.
columns = ["uniprot_id", "residue", "site_type"]


In [180]:
def get_range(residue_range):
    """Expand an inclusive ``"start..end"`` residue range into a list of ints.

    Args:
        residue_range: Text holding two integers separated by ``".."``.

    Returns:
        Every integer from start to end, both endpoints included.

    Raises:
        ValueError: If the text does not consist of exactly two integers
            separated by ``".."``.
    """
    first, last = (int(bound) for bound in residue_range.split(".."))
    return list(range(first, last + 1))

def parse_active_site(act_site_row):
    """Parse an "Active site" cell into ``(residue, description)`` pairs.

    The cell is treated as a sequence of entries introduced by the keyword
    ``ACT_SITE``, each shaped like ``<position>; /note="..."; /evidence="..."``.
    Only the first qualifier after the position is inspected: when it is
    ``/note`` its text becomes the description, otherwise the description
    is NaN.

    Args:
        act_site_row: Raw text of the annotation cell.

    Returns:
        A list of ``(residue_index, description)`` tuples in input order,
        where ``description`` is a ``str`` or ``float('nan')``.

    Raises:
        ValueError: If an entry's position is not a single integer.
    """
    act_site_residues = []
    for item in act_site_row.split("ACT_SITE"):
        if not item.strip():
            # Text before the first keyword (or stray whitespace) is not an entry.
            continue
        position, _, qualifiers = item.partition(";")
        residue_idx = int(position)
        # Split on the first "=" only, so "=" inside the value cannot break
        # the parse and a missing qualifier simply yields an empty header.
        header, _, value = qualifiers.partition("=")
        if header.strip() != "/note":
            description = float("nan")
        else:
            # A quoted note ends at the closing quote followed by ";", so a
            # ";" inside the note text is kept instead of truncating it.
            terminator = '";' if value.startswith('"') else ";"
            description = value.partition(terminator)[0].replace('"', "")
        act_site_residues.append((residue_idx, description))
    return act_site_residues

def parse_binding_site(binding_site_row):
    """Parse a "Binding site" cell into ``(residue, ligand)`` pairs.

    Entries are delimited by the keyword ``BINDING``. Each entry starts with
    a single position or a ``start..end`` range, followed by ``;``-separated
    qualifiers. Only the first qualifier is read: if it is ``/ligand`` its
    value (with double quotes removed) is the ligand, otherwise the ligand
    is NaN. A range yields one pair per residue, all sharing the ligand.

    Args:
        binding_site_row: Raw text of the annotation cell.

    Returns:
        A list of ``(residue_index, ligand)`` tuples in input order.
    """
    pairs = []
    for chunk in binding_site_row.split("BINDING"):
        if not chunk:
            continue
        fields = chunk.split(";")
        position = fields[0]
        # Normalise single positions and ranges to a list of residues.
        residues = get_range(position) if ".." in position else [int(position)]
        key, value = fields[1].split("=")
        ligand = value.replace('"', "") if key == " /ligand" else float("nan")
        pairs.extend((residue, ligand) for residue in residues)
    return pairs

def parse_dna_binding_site(dna_binding_row):
    """Parse a "DNA binding" cell into a flat list of residue indices.

    Entries are delimited by the keyword ``DNA_BIND``. Only the text before
    the first ``;`` of each entry is used: a ``start..end`` range contributes
    every residue in the range, a single position contributes itself.

    Args:
        dna_binding_row: Raw text of the annotation cell.

    Returns:
        A list of integer residue indices in input order.
    """
    residues = []
    for chunk in dna_binding_row.split("DNA_BIND"):
        if not chunk:
            continue
        position = chunk.split(";")[0]
        if ".." in position:
            residues.extend(get_range(position))
        else:
            residues.append(int(position))
    return residues

def parse_site(site_row):
    """Parse a "Site" cell into ``(residue, description)`` pairs.

    Entries are delimited by the keyword ``SITE`` followed by a space. Each
    entry starts with a single position or a ``start..end`` range, followed
    by qualifiers such as ``/note="..."``. Only the first qualifier is
    inspected: when it is ``/note`` its text becomes the description,
    otherwise the description is NaN. A range yields one pair per residue.

    Args:
        site_row: Raw text of the annotation cell.

    Returns:
        A list of ``(residue_index, description)`` tuples in input order,
        where ``description`` is a ``str`` or ``float('nan')``.

    Raises:
        ValueError: If an entry's position is neither an integer nor a
            ``start..end`` range.
    """
    site_residues = []
    # The trailing space in the delimiter is necessary: without it the
    # "SITE" inside "PROSITE" would also be treated as a delimiter.
    for item in site_row.split("SITE "):
        if not item.strip():
            # Text before the first keyword (or stray whitespace) is not an entry.
            continue
        position, _, qualifiers = item.partition(";")
        # Normalise single positions and ranges to a list of residues.
        residues = get_range(position) if ".." in position else [int(position)]
        # Split on the first "=" only, so "=" inside the value cannot break
        # the parse and a missing qualifier simply yields an empty header.
        header, _, value = qualifiers.partition("=")
        if header.strip() != "/note":
            description = float("nan")
        else:
            # A quoted note ends at the closing quote followed by ";", so a
            # ";" inside the note text is kept instead of truncating it.
            terminator = '";' if value.startswith('"') else ";"
            description = value.partition(terminator)[0].replace('"', "")
        site_residues.extend((res, description) for res in residues)
    return site_residues



In [181]:
# Flatten the per-protein annotation cells into one record per residue.
# Record layout follows `columns`; fields that do not apply to a site type
# are filled with NaN.
columns = ["uniprot_id", "site_type", "residue", "description", "ligand"]
unrolled_df = []
for _, entry in df.iterrows():
    accession = entry["Entry"]
    active_cell = entry["Active site"]
    binding_cell = entry["Binding site"]
    dna_cell = entry["DNA binding"]
    site_cell = entry["Site"]

    # Only string cells are parsed; any other value (presumably the NaN of
    # an empty cell) contributes no records.
    active_hits = parse_active_site(active_cell) if isinstance(active_cell, str) else []
    binding_hits = parse_binding_site(binding_cell) if isinstance(binding_cell, str) else []
    dna_hits = parse_dna_binding_site(dna_cell) if isinstance(dna_cell, str) else []
    site_hits = parse_site(site_cell) if isinstance(site_cell, str) else []

    # All four cells are parsed before anything is appended, so a parsing
    # error never leaves a partially recorded protein behind.
    unrolled_df.extend(
        [accession, "Active site", position, note, float("nan")]
        for position, note in active_hits
    )
    unrolled_df.extend(
        [accession, "Binding site", position, float("nan"), ligand]
        for position, ligand in binding_hits
    )
    unrolled_df.extend(
        [accession, "DNA binding site", position, float("nan"), float("nan")]
        for position in dna_hits
    )
    unrolled_df.extend(
        [accession, "Site", position, note, float("nan")]
        for position, note in site_hits
    )



In [183]:
# Turn the accumulated records into a DataFrame and persist them as CSV.
unrolled_df = pd.DataFrame(unrolled_df, columns=columns)
# index=False is the documented way to leave the row index out of the file;
# the previous index=None only worked because None is falsy.
unrolled_df.to_csv(
    "../data/processed/uniprot_site_annotation/uniprot_site_annotation.csv",
    index=False,
)

In [184]:
# Display the per-residue annotation table built above.
unrolled_df

Unnamed: 0,uniprot_id,site_type,residue,description,ligand
0,A0A0K8P8E7,Active site,225,Acyl-ester intermediate,
1,A0A0K8P8E7,Active site,492,Charge relay system,
2,A0A0K8P8E7,Active site,528,Charge relay system,
3,A0A0K8P8E7,Binding site,132,,4-[(2-hydroxyethoxy)carbonyl]benzoate
4,A0A0K8P8E7,Binding site,226,,4-[(2-hydroxyethoxy)carbonyl]benzoate
...,...,...,...,...,...
3120,Q9Y6E0,Binding site,114,,ATP
3121,Q9Y6E0,Binding site,161,,Mg(2+)
3122,Q9Y6E0,Binding site,174,,Mg(2+)
3123,Q9Y6E0,Site,325,Cleavage,


In [76]:
# Scratch check: split the first row's "Binding site" cell on the BINDING
# keyword, take the element at index 1 and show its ";"-separated fields.
df.iloc[0]['Binding site'].split("BINDING")[1].split(";")

[' 132',
 ' /ligand="4-[(2-hydroxyethoxy)carbonyl]benzoate"',
 ' /ligand_id="ChEBI:CHEBI:131704"',
 ' /evidence="ECO:0000305|PubMed:30979881"',
 ' ']

In [24]:
# Scratch check: show the ";"-separated fields of the element at index 1
# after splitting the first row's "Binding site" cell on the BINDING keyword.
df.iloc[0]['Binding site'].split("BINDING")[1].split(";")

[' 132',
 ' /ligand="4-[(2-hydroxyethoxy)carbonyl]benzoate"',
 ' /ligand_id="ChEBI:CHEBI:131704"',
 ' /evidence="ECO:0000305|PubMed:30979881"',
 ' ']

In [None]:
../data/processed/uniprot_site_annotation