## Combining all genetic features to GFF3

This script takes the GFF3 files of all the genetic features that have been found so far (__gene assembly, UniprotKB domains, Pfam CD-search domains__) and combines them into a single dataframe. It also finds which regions of each gene are _not_ covered by a domain, calculates the coordinates of these, and records them as __'unannotated regions'__. All genetic features are then combined into a single GFF3.

In [11]:
import pandas as pd
import gffpandas.gffpandas as gffpd

First, find the corresponding gene assembly GFF3, UniprotKB domain assembly, and Pfam CD-search domain assembly, depending on the _Mycobacterium_ species.

In [12]:
# Settings for Mycobacterium bovis (accession LT708304)

# Input GFF3 files: gene assembly, UniProtKB domains, Pfam CD-search domains
gene_gff, uniprot_gff, pfam_gff = (
    '../data/Mycobacterium bovis/LT708304 gene assembly.gff3',
    '../data/Mycobacterium bovis/LT708304 UniProtKB domain assembly.gff3',
    '../data/Mycobacterium bovis/LT708304 Pfam domain assembly.gff3',
)

# Text placed after '##sequence-region' in the output header, and the output path
gff3_header, gff3_output = (
    'LT708304.1 1 4349904',
    '../data/Mycobacterium bovis/LT708304 full genetic features assembly.gff3',
)

In [13]:
# For Mycobacterium tuberculosis
# These assignments reuse the same variable names as the M. bovis settings, so
# whichever settings cell is run last decides which species is processed.
gene_gff = '../data/Mycobacterium tuberculosis/NC_018143.1 gene assembly updated.gff3'
uniprot_gff = '../data/Mycobacterium tuberculosis/NC_018143.1 UniProtKB domain assembly.gff3'
pfam_gff = '../data/Mycobacterium tuberculosis/NC_018143.1 Pfam domain assembly.gff3'
# Text placed after '##sequence-region' in the header of the exported GFF3
gff3_header = 'NC_018143.1 1 4411708'
gff3_output = '../data/Mycobacterium tuberculosis/NC_018143.1 full genetic features assembly.gff3'

Convert to dataframes.

In [14]:
# Parse the three GFF3 files in order (gene, UniProtKB, Pfam). The path
# variables are rebound to the parsed gffpandas objects.
gene_gff, uniprot_gff, pfam_gff = (
    gffpd.read_gff3(gff3_path)
    for gff3_path in (gene_gff, uniprot_gff, pfam_gff)
)

# Keep the feature table exposed by each parsed object as `.df`
gene_df, uniprot_df, pfam_df = gene_gff.df, uniprot_gff.df, pfam_gff.df

Combine into a single dataframe and remove unnecessary columns and rows with no locus tags.

In [15]:
# Stack the gene, UniProtKB and Pfam feature tables into one DataFrame that
# carries a fresh 0..n-1 index
features_df = pd.concat(
    (gene_df, uniprot_df, pfam_df),
    ignore_index=True,
).reset_index(drop=True)

In [16]:
# Extract the 'Name' and 'locus_tag' values from each row's attribute string
# (a ';'-separated list of key=value pairs) into their own columns.
# Rows without those attributes get an empty string.
names = []
locus_tags = []

for attributes in features_df['attributes']:
    name = ""
    locus_tag = ""

    # Only rows that carry a 'Name=' attribute are parsed. A missing or
    # non-string attribute value keeps the empty defaults instead of raising.
    if isinstance(attributes, str) and 'Name=' in attributes:
        for part in attributes.split(';'):
            # partition() splits on the first '=' only, so a value that itself
            # contains '=' is kept whole, and a part with no '=' is skipped.
            key, separator, value = part.partition('=')
            if not separator:
                continue
            if key == 'Name':
                name = value
            elif key == 'locus_tag':
                locus_tag = value

    names.append(name)
    locus_tags.append(locus_tag)

# Assign each column once rather than writing cell by cell inside the loop
features_df['Name'] = names
features_df['locus_tag'] = locus_tags

In [17]:
# Keep every row except those whose locus_tag is the literal string "nan"
# (some Pfam CD-search domains)
features_df = features_df.loc[features_df['locus_tag'] != "nan"]

# Order the features by locus tag and then by start coordinate, renumbering the rows
sorted_features_df = (
    features_df
    .sort_values(by=['locus_tag', 'start'])
    .reset_index(drop=True)
)

# Title-case every locus tag so that the same tag written with different
# capitalisation compares equal
sorted_features_df['locus_tag'] = sorted_features_df['locus_tag'].str.title()

Next, calculate which regions of each gene are __not__ occupied by a domain. The first and last 10% of each gene are excluded from the 'unannotated region' areas.

In [18]:
# Collect the distinct locus tags present in the sorted feature table...
locus_set = set(sorted_features_df['locus_tag'])

# ...and keep them as an alphabetically ordered list
locus_list = sorted(locus_set)

In [19]:
# Columns for the trimmed gene bounds; they stay None for every non-gene feature
for trimmed_column in ('gene start 10per', 'gene end 10per'):
    sorted_features_df[trimmed_column] = None

# For each gene, move the start forward and the end backward by 10% of the
# gene length (rounded down) to drop the non-informative first & last 10%
gene_row_labels = sorted_features_df.index[sorted_features_df['type'] == 'gene']

for label in gene_row_labels:
    gene_start = int(sorted_features_df.at[label, 'start'])
    gene_end = int(sorted_features_df.at[label, 'end'])
    trim = int(int(gene_end - gene_start) * 0.1)

    sorted_features_df.at[label, 'gene start 10per'] = gene_start + trim
    sorted_features_df.at[label, 'gene end 10per'] = gene_end - trim

In [20]:
# Find the 'unannotated region' co-ordinates within the domain-containing genes.
#
# For every locus tag, all non-gene features are treated as domains and merged
# into non-overlapping intervals. Every stretch of the trimmed gene
# ('gene start 10per' .. 'gene end 10per') that no merged interval covers is
# appended to all_unannotated_coords as (locus_tag, start, end). Regions are
# clamped to the trimmed gene bounds, so a domain lying entirely inside the
# excluded first or last 10% cannot stretch a region beyond those bounds.

all_unannotated_coords = []

# For each locus tag (aka each gene)...
for locus in locus_list:
    # Mini dataframe of just the genetic features associated with that locus_tag,
    # ordered by starting coordinate
    locus_df = sorted_features_df[sorted_features_df['locus_tag'] == locus]
    locus_df = locus_df.sort_values(by=['start'])

    is_gene = locus_df['type'] == 'gene'
    gene_rows = locus_df[is_gene]
    domain_rows = locus_df[~is_gene]

    # A locus needs both a gene row and at least one other feature; anything
    # else is skipped (a locus without a gene row has no bounds to work with).
    if gene_rows.empty or domain_rows.empty:
        continue

    # Trimmed gene bounds, taken from the first gene row of the locus
    gene_start = int(gene_rows['gene start 10per'].iloc[0])
    gene_end = int(gene_rows['gene end 10per'].iloc[0])

    # All domain co-ordinates, ordered by start
    domain_coords = sorted(
        (int(domain_start), int(domain_end))
        for domain_start, domain_end in zip(domain_rows['start'], domain_rows['end'])
    )

    # Merge the overlapping or directly adjacent domains
    merged_domains = []
    for domain_start, domain_end in domain_coords:
        if merged_domains and merged_domains[-1][1] >= domain_start - 1:
            previous_start, previous_end = merged_domains[-1]
            merged_domains[-1] = (previous_start, max(previous_end, domain_end))
        else:
            merged_domains.append((domain_start, domain_end))

    # next_free_bp is the first position (never before the trimmed gene start)
    # not yet covered by a domain seen so far
    next_free_bp = gene_start

    for position, (domain_start, domain_end) in enumerate(merged_domains):
        if position == 0 and domain_start <= gene_start:
            # No gap can precede a first domain that starts at or before the gene
            print("First domain starts at gene start, skipping unannotated calculation")
        else:
            # The gap ends 1 nucleotide before this domain begins, but never
            # beyond the trimmed gene end
            gap_end = min(domain_start - 1, gene_end)
            if next_free_bp <= gap_end:
                all_unannotated_coords.append((locus, next_free_bp, gap_end))
                print("Unannotated region found")
        next_free_bp = max(next_free_bp, domain_end + 1)

    # Region between the final domain and the trimmed gene end, if any
    if merged_domains[-1][1] < gene_end:
        all_unannotated_coords.append((locus, next_free_bp, gene_end))
    else:
        print("Last domain overlapped with end of gene")


Unannotated region found
Unannotated region found
Unannotated region found
Last domain overlapped with end of gene
First domain starts at gene start, skipping unannotated calculation
Unannotated region found
Unannotated region found
Last domain overlapped with end of gene
First domain starts at gene start, skipping unannotated calculation
First domain starts at gene start, skipping unannotated calculation
Unannotated region found
Last domain overlapped with end of gene
Unannotated region found
Unannotated region found
Unannotated region found
Last domain overlapped with end of gene
First domain starts at gene start, skipping unannotated calculation
Last domain overlapped with end of gene
First domain starts at gene start, skipping unannotated calculation
Unannotated region found
Last domain overlapped with end of gene
First domain starts at gene start, skipping unannotated calculation
Last domain overlapped with end of gene
First domain starts at gene start, skipping unannotated calcul

Convert the unannotated region coordinates into a dataframe.

In [21]:
# One row per (locus_tag, start, end) tuple collected for the unannotated regions
unannotated_region_df = pd.DataFrame(
    data=all_unannotated_coords,
    columns=['locus_tag', 'start', 'end'],
)

Prepare dataframe for GFF3 format:

In [22]:
# Fill in the remaining GFF3 columns for the unannotated regions.
# The sequence ID is the first whitespace-separated field of gff3_header
# ("<seq_id> <start> <end>"), so it follows whichever species was configured
# instead of being fixed to a single accession.
unannotated_region_df['seq_id'] = gff3_header.split()[0]
unannotated_region_df['source'] = "HMM states"
unannotated_region_df['type'] = 'Unannotated region'
# Score, strand and phase are written as the "." placeholder
unannotated_region_df['score'] = "."
unannotated_region_df['strand'] = "."
unannotated_region_df['phase'] = "."
# Left empty here; the attributes string is set separately
unannotated_region_df['attributes'] = ""

Combine dataframes of all genetic features, including unannotated regions.

In [23]:
# Combine the DataFrames using pd.concat (ignore_index gives a fresh 0..n-1 index)
all_features_df = pd.concat([features_df, unannotated_region_df], ignore_index=True)

# Rows that came from a frame without a 'Name' column hold NaN after the
# concat; use an empty name so the attributes do not read 'Name=nan'.
all_features_df['Name'] = all_features_df['Name'].fillna("")

# Rebuild the attributes of every row as ID / Name / Description, in one pass
all_features_df['attributes'] = [
    f"ID={locus_tag}:{start}-{end};Name={name};Description={feature_type}"
    for locus_tag, start, end, name, feature_type in zip(
        all_features_df['locus_tag'],
        all_features_df['start'],
        all_features_df['end'],
        all_features_df['Name'],
        all_features_df['type'],
    )
]

# Keep only the nine GFF3 columns, in GFF3 order, so that helper columns such
# as 'Name' and 'locus_tag' are not carried into the exported file.
# NOTE(review): assumes the GFF3 writer emits every column of the frame - confirm.
gff3_columns = [
    'seq_id', 'source', 'type', 'start', 'end',
    'score', 'strand', 'phase', 'attributes',
]
all_features_df = all_features_df[gff3_columns].reset_index(drop=True)

Convert into a GFF3 dataframe and export GFF3.

In [24]:
# Assemble the two header lines: the GFF version pragma and the
# sequence-region line for the configured species
header = "".join([
    "##gff-version 3\n",
    f"##sequence-region {gff3_header}\n",
])

# Wrap the merged feature table and the header in a gffpandas GFF3 dataframe
all_features_gff3 = gffpd.Gff3DataFrame(input_header=header, input_df=all_features_df)

In [25]:
# Save the GFF3 dataframe as a file at the species-specific output path
# (gff3_output) chosen in the settings cell
all_features_gff3.to_gff3(gff3_output)