## Calculate UniProtKB domain-gene coordinates

This script matches the positions of UniProtKB domains/regions within each protein, to the corresponding genomic coordinates of the gene the protein is encoded on.

The UniProtKB domain/region datasets were downloaded from:  
Mycobacterium bovis: https://www.uniprot.org/uniprotkb?query=%28taxonomy_id%3A233413%29  
Mycobacterium tuberculosis: https://www.uniprot.org/uniprotkb?query=%28taxonomy_id%3A83332%29


In [5]:
import pandas as pd
import gffpandas.gffpandas as gffpd

First, read in the corresponding UniProtKB domain/region dataset and gene assembly GFF3, depending on the _Mycobacterium_ species.

In [6]:
# For Mycobacterium bovis:
# Input/output paths for the M. bovis run, plus the text that is later placed
# after `##sequence-region` in the header of the generated GFF3.
# NOTE(review): the Mycobacterium tuberculosis cell that follows assigns these
# same four variables, so run only the cell for the species being processed
# (running both leaves the M. tuberculosis values in effect).
uniprotkb_file = '../data/Mycobacterium bovis/Mbovis_uniprotkb_organism_id_233413_2024_06_02.xlsx'
gene_assembly_file = '../data/Mycobacterium bovis/LT708304 gene assembly.gff3'
gff3_header = 'LT708304.1 1 4349904'
output_gff3_file = '../data/Mycobacterium bovis/LT708304 UniProtKB domain assembly.gff3'

In [7]:
# For Mycobacterium tuberculosis:
# Input/output paths for the M. tuberculosis run, plus the text that is later
# placed after `##sequence-region` in the header of the generated GFF3.
# NOTE(review): this cell reassigns the same four variables as the
# Mycobacterium bovis cell before it; whichever of the two cells ran last
# determines which species is processed.
uniprotkb_file = '../data/Mycobacterium tuberculosis/Mtuberculosis_domains_uniprotkb_organism_id_83332_2024_06_24.xlsx'
gene_assembly_file = '../data/Mycobacterium tuberculosis/NC_018143.1 gene assembly updated.gff3'
gff3_header = 'NC_018143.1 1 4411708'
output_gff3_file = '../data/Mycobacterium tuberculosis/NC_018143.1 UniProtKB domain assembly.gff3'

In [8]:
# Load the UniProtKB export (an Excel workbook) into a pandas dataframe
uniprot_df = pd.read_excel(uniprotkb_file)
# Parse the gene assembly GFF3 with gffpandas
gene_gff = gffpd.read_gff3(gene_assembly_file)

  warn("Workbook contains no default style, apply openpyxl's default")


Remove any rows without an ordered locus gene name, as these cannot be matched to the GFF3 gene assemblies. Also remove any genes that have no domains/regions.

In [9]:
# Delete rows without ordered locus gene names, as they cannot be matched to
# a gene in the GFF3 assembly
uniprot_df = uniprot_df[uniprot_df['Gene Names (ordered locus)'].notna()]

# Remove rows with no domain and no region information.
# Reassign instead of using inplace=True: the frame above is the result of a
# boolean filter, and modifying it in place can raise SettingWithCopyWarning.
uniprot_df = uniprot_df.dropna(subset=['Domain [FT]', 'Region'], how='all')

Generate a separate dictionary for the data associated with domains and for the data associated with regions.

In [10]:
# Create separate dictionaries mapping each gene locus name to the list of its
# domains and to the list of its regions.
gene_domain_dict = {}
gene_region_dict = {}

for _, row in uniprot_df.iterrows():
    # Extract gene locus, domain, and region information:
    gene = row['Gene Names (ordered locus)']
    domains = row['Domain [FT]']
    regions = row['Region']

    # If there are associated domains:
    if pd.notna(domains):
        # The cell holds every domain of the protein in one string, each
        # introduced by 'DOMAIN '. Split on that keyword, drop the empty
        # pieces, and re-prefix each entry with 'DOMAIN; /' so the feature
        # type becomes the first '; /'-separated field.
        gene_domain_dict[gene] = [
            'DOMAIN; /' + domain.strip()
            for domain in domains.split('DOMAIN ')
            if domain.strip()
        ]

    # If there are associated regions:
    if pd.notna(regions):
        # Same treatment for regions, which are introduced by 'REGION '.
        gene_region_dict[gene] = [
            'REGION; /' + region.strip()
            for region in regions.split('REGION ')
            if region.strip()
        ]

Convert the domain and region dictionaries into dataframes, and merge them into a single dataframe.

In [11]:
# Turn each dictionary into a two-column dataframe (locus tag, list of
# features), give every feature in the list its own row, and renumber the rows
# so that each one has a unique index.
domain_df = (
    pd.DataFrame(list(gene_domain_dict.items()), columns=['locus_tag', 'Feature'])
    .explode('Feature')
    .reset_index(drop=True)
)
region_df = (
    pd.DataFrame(list(gene_region_dict.items()), columns=['locus_tag', 'Feature'])
    .explode('Feature')
    .reset_index(drop=True)
)

In [12]:
# Combine into a single feature dataframe.
# Both inputs already hold one feature per row, so no further explode is
# needed; ignore_index=True renumbers the rows 0..n-1 so every row has a
# unique index.
feature_df = pd.concat([domain_df, region_df], ignore_index=True)

Modify the dataframe to have the amino acid coordinates in separate columns, and use regex to clean up the domain/region metadata.

In [13]:
# Split the feature column into separate columns.
# Each 'Feature' string has the form
#   <type>; /<amino loc>; /note="..."; /evidence="..."
# where the evidence part may be absent.
# n=3 caps the split at four fields, and reindex pads to exactly four columns,
# so the assignment also works when no feature in the dataset has evidence
# (the split would then return only three columns).
split_features = (
    feature_df['Feature']
    .str.split('; /', n=3, expand=True)
    .reindex(columns=range(4))
)

for position, column in enumerate(['feature', 'amino loc', 'note', 'evidence']):
    feature_df[column] = split_features[position]

In [14]:
# Split the amino locations column into start & end coords.
# str.partition treats '..' as a literal separator and returns three columns:
# the text before it, the separator itself, and the text after it.
amino_parts = feature_df['amino loc'].str.partition('..')

feature_df['amino start'] = amino_parts[0]
# A location without '..' (a single position) has no text after the
# separator; use the same position for both start and end in that case.
feature_df['amino end'] = amino_parts[2].where(amino_parts[1] == '..', amino_parts[0])

In [15]:
import re

# Patterns that capture the text inside note="..." / evidence="...",
# tolerating a trailing '";' or '"'. Compiled once rather than on every row.
note_pattern = re.compile(r'^note="(.+?)(?:";|")?$')
evidence_pattern = re.compile(r'^evidence="(.+?)(?:";|")?$')


def _unwrap(pattern, value):
    """Return the text captured by `pattern`; pass non-string (missing) values through unchanged."""
    if isinstance(value, str):
        return pattern.sub(r'\1', value)
    return value


# Clean up the note column using regex (missing notes are left as they are
# instead of raising a TypeError)
feature_df['note'] = [_unwrap(note_pattern, note) for note in feature_df['note']]

# Clean up the evidence column using regex (missing evidence is left as it is)
feature_df['evidence'] = [_unwrap(evidence_pattern, evidence) for evidence in feature_df['evidence']]

# Clean up the locus tag to remove the unnecessary ' RVBD_...' alias
feature_df['locus_tag'] = feature_df['locus_tag'].str.replace(r' RVBD_[\w]*', '', regex=True)

In [16]:
# The raw feature string and the unsplit amino acid location have both been
# broken out into their own columns, so discard them
feature_df = feature_df.drop(['Feature', 'amino loc'], axis='columns')

The information within the feature dataframe can now be matched with the gene information from the corresponding GFF3 to calculate domain genomic coordinates.

In [17]:
# Filter for just genes from the GFF3
gene_gff = gene_gff.filter_feature_of_type(['gene'])
# Keep a reference to the header of the filtered GFF3 object.
# NOTE(review): gene_gff_header is not read by any later cell shown in this
# notebook; the output header is built from gff3_header instead.
gene_gff_header = gene_gff.header

# Convert each individual attribute to its own column
# (the merge and attribute-building cells later read the 'locus_tag', 'Name'
# and 'gene' columns of this dataframe)
gene_df = gene_gff.attributes_to_columns()

In [18]:
# Merge the datasets using the locus_tag

# Put the locus tags of both dataframes into the same letter case (first
# character upper-case, the rest lower-case) so that equal tags compare equal
for tagged_df in (feature_df, gene_df):
    tagged_df['locus_tag'] = tagged_df['locus_tag'].str.capitalize()

# Default (inner) join: only features whose locus tag is present in the gene
# assembly are kept
merged_df = feature_df.merge(gene_df, on='locus_tag')

In [19]:
# Calculate the domain coordinates

# Ensure all coordinates are numeric
for coord_column in ('start', 'end', 'amino start', 'amino end'):
    merged_df[coord_column] = merged_df[coord_column].astype(int)

# Only stranded genes can be handled: the direction of translation decides
# which end of the gene amino acid 1 sits at.
on_plus_strand = merged_df['strand'] == '+'
on_minus_strand = merged_df['strand'] == '-'
if not (on_plus_strand | on_minus_strand).all():
    unexpected = merged_df.loc[~(on_plus_strand | on_minus_strand), 'strand'].unique().tolist()
    raise ValueError(f"Cannot calculate feature coordinates for strand value(s): {unexpected}")

# Nucleotide offsets of the feature measured from the first nucleotide of the
# coding sequence: amino acid k covers offsets 3*(k-1) .. 3*k-1.
first_nt_offset = (merged_df['amino start'] - 1) * 3
last_nt_offset = merged_df['amino end'] * 3 - 1

# Calculate genomic co-ordinates, taking into account if the strand is
# positive or negative:
#   '+' strand: offsets are counted upwards from the gene start
#   '-' strand: offsets are counted downwards from the gene end, so the first
#               amino acid of the feature gives the genomic *end* coordinate
merged_df['feature start'] = (merged_df['start'] + first_nt_offset).where(
    on_plus_strand, merged_df['end'] - last_nt_offset
)
merged_df['feature end'] = (merged_df['start'] + last_nt_offset).where(
    on_plus_strand, merged_df['end'] - first_nt_offset
)

# Ensure correct types
merged_df['feature start'] = merged_df['feature start'].astype(int)
merged_df['feature end'] = merged_df['feature end'].astype(int)

Convert the merged dataframe of domain & region genomic coordinates into a GFF3 dataframe and export as a GFF3 file.

In [20]:
# Generate an attributes column

# Characters with a reserved meaning in GFF3 column 9 (plus '%', tab and line
# breaks) must be percent-encoded inside attribute values, otherwise e.g. a
# ';' within a UniProtKB note would end the Name attribute early.
# Commas are deliberately left untouched.
# NOTE(review): GFF3 uses ',' to separate multiple values of one attribute;
# confirm whether commas in notes/evidence should be encoded as %2C as well.
_GFF3_ESCAPES = str.maketrans({
    '%': '%25',
    ';': '%3B',
    '=': '%3D',
    '&': '%26',
    '\t': '%09',
    '\n': '%0A',
    '\r': '%0D',
})


def _escape_gff3_value(value):
    """Return `value` as text with GFF3-reserved characters percent-encoded."""
    return str(value).translate(_GFF3_ESCAPES)


attribute_strings = []
for index, row in merged_df.iterrows():
    # Only the free text coming from UniProtKB (note, evidence) is escaped;
    # locus_tag, Name and gene are taken from the gene assembly's own
    # attributes and are passed through unchanged.
    attribute_parts = [
        f"ID=domain-{row['locus_tag']}_{index+1}",
        f"Name={_escape_gff3_value(row['note'])}",
        f"locus_tag={row['locus_tag']}",
        f"gene_identifier={row['Name']}",
    ]
    # Leave the evidence attribute out when the feature has none, instead of
    # writing a literal 'None'/'nan'
    if pd.notna(row['evidence']):
        attribute_parts.append(f"evidence={_escape_gff3_value(row['evidence'])}")
    # The gene name is optional; .get also covers an assembly that has no
    # 'gene' attribute at all
    gene_name = row.get('gene')
    if pd.notna(gene_name):
        attribute_parts.append(f"gene={gene_name}")
    attribute_strings.append(';'.join(attribute_parts))

# One string per row, in row order
merged_df['attributes'] = attribute_strings

In [21]:
# Keep only the nine GFF3 columns, taking the calculated feature coordinates
# and the UniProtKB feature kind in place of the gene's own start/end/type
col_names = ['seq_id', 'source', 'feature', 'feature start', 'feature end', 'score', 'strand', 'phase', 'attributes']
merged_df = merged_df[col_names].rename(
    columns={'feature': 'type', 'feature start': 'start', 'feature end': 'end'}
)

# Replace the UniProtKB feature keywords with the type names used in the
# output GFF3
for uniprot_keyword, gff3_type in (('DOMAIN', 'UniprotKB_Domain'), ('REGION', 'UniprotKB_Region')):
    merged_df.loc[merged_df['type'] == uniprot_keyword, 'type'] = gff3_type

In [22]:
# Convert the merged dataframe back to a GFF3 dataframe
# The header holds the GFF3 version pragma followed by a ##sequence-region
# line built from the species-specific gff3_header string set at the top of
# the notebook.
header = f"##gff-version 3\n##sequence-region {gff3_header}\n"
merged_gff_df = gffpd.Gff3DataFrame(input_df=merged_df, input_header=header)

In [23]:
# Save the GFF3 dataframe as a file
# (written to the species-specific output_gff3_file path set at the top of the
# notebook)
merged_gff_df.to_gff3(output_gff3_file)