## Creating Pfam CD-search domain-gene coordinates

This script takes the output files from the NCBI Pfam CD-search and matches them with the correct genomic coordinates and associated gene names.

In [25]:
import os
import pandas as pd
import gffpandas.gffpandas as gffpd

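First, set the locations of the NCBI Pfam CD-search output files and of the gene assembly GFF3 for the _Mycobacterium_ species being processed. The two cells below assign the same variables, so run only the cell for the species of interest (if both are run, the second one takes effect).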

In [26]:
# Configuration for Mycobacterium bovis.
# NOTE: the Mycobacterium tuberculosis cell assigns the same variable names,
# so whichever of the two cells is executed last decides what gets processed.

# Sequence identifier written to the GFF3 seq_id column, and the values that
# follow "##sequence-region" in the exported GFF3 header
species_id = 'LT708304.1'
gff3_header = 'LT708304.1 1 4349904'

# Inputs: folder of CD-search text outputs and the gene assembly GFF3
CDsearch_folder = '../data/Mycobacterium bovis/text files from NCBI CD-search'
gene_assembly_file = '../data/Mycobacterium bovis/LT708304 gene assembly.gff3'

# Output: path of the Pfam domain GFF3 that is written at the end
gff3_output = '../data/Mycobacterium bovis/LT708304 Pfam domain assembly.gff3'

In [27]:
# Configuration for Mycobacterium tuberculosis.
# NOTE: this cell assigns the same variable names as the Mycobacterium bovis
# cell, so whichever of the two cells is executed last decides what gets processed.

# Inputs: folder of CD-search text outputs and the gene assembly GFF3
CDsearch_folder = '../data/Mycobacterium tuberculosis/text files from NCBI CD-search'
gene_assembly_file = '../data/Mycobacterium tuberculosis/NC_018143.1 gene assembly updated.gff3'

# Sequence identifier written to the GFF3 seq_id column
species_id = 'NC_018143.1'

# Values that follow "##sequence-region" in the exported GFF3 header.
# The export cell reads the name `gff3_header`, so it must be spelled exactly
# like this; otherwise a header left over from another species would be used.
gff3_header = 'NC_018143.1 1 4411708'

# Output: path of the Pfam domain GFF3 that is written at the end
gff3_output = '../data/Mycobacterium tuberculosis/NC_018143.1 Pfam domain assembly.gff3'

List the text files output by CD-search, which contain the predicted domain locations, and collect their file paths.

In [28]:
# Create an empty dataframe containing the columns found in the CDsearch output
records_df = pd.DataFrame(columns = ['Name', 'Accession', 'Description', 'Interval', 'E-value', 'seq start'])

# Collect the path of every CD-search output file.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the record order (and the numbering derived from it) the same on every run.
# Hidden entries (e.g. '.DS_Store', '.ipynb_checkpoints') and sub-directories are
# skipped because they are not CD-search outputs and cannot be parsed as such.
filenames = []
for name in sorted(os.listdir(CDsearch_folder)):
    filename = os.path.join(CDsearch_folder, name)
    if name.startswith('.') or not os.path.isfile(filename):
        continue
    filenames.append(filename)

Within each file, each entry (one row of the dataframe) is split across three lines. These three lines need to be combined into a single record, which is then split into the columns of the dataframe.

In [29]:
def parse_cdsearch_file(path):
    """Parse one CD-search text output into a list of record rows.

    The start coordinate of the sequence segment is read from the file name,
    which is expected to look like '<prefix> - <start>:<rest>'. The first line
    of the file is skipped and every following group of three lines is merged
    into one record whose fields are separated by tabs. The first merged
    record is then discarded (presumably the column header, which would also
    span three lines - TODO confirm against an input file).

    Returns:
        A list of lists: the tab-separated fields of each record followed by
        the segment start coordinate (int).

    Raises:
        ValueError: if no start coordinate can be read from the file name.
    """
    # Only look at the file name itself, so that a ' - ' somewhere in the
    # directory part of the path cannot be mistaken for the separator
    base_name = os.path.basename(path)
    try:
        start_coordinate = int(base_name.split(' - ')[1].split(':')[0])
    except (IndexError, ValueError) as err:
        raise ValueError(
            f"Cannot read the segment start coordinate from file name: {path!r}"
        ) from err

    with open(path, 'r') as f:
        data_lines = f.readlines()[1:]

    # Merge each complete set of 3 lines into a single record; a trailing
    # incomplete group (fewer than 3 lines) is ignored
    rows = []
    for first_line in range(0, len(data_lines) - 2, 3):
        fields = []
        for line in data_lines[first_line:first_line + 3]:
            fields.extend(line.strip().split('\t'))
        fields.append(start_coordinate)
        rows.append(fields)

    return rows[1:]


# Gather the records of every file first and build the dataframe once, instead
# of concatenating inside the loop. This also makes the cell safe to re-run:
# records_df is rebuilt from scratch rather than appended to.
column_names = ['Name', 'Accession', 'Description', 'Interval', 'E-value', 'seq start']
all_rows = []
for filename in filenames:
    all_rows.extend(parse_cdsearch_file(filename))

records_df = pd.DataFrame(all_rows, columns = column_names)

# Convert numeric columns to appropriate types
records_df['seq start'] = pd.to_numeric(records_df['seq start'])

Calculate the start & end genomic co-ordinates for each domain.

In [30]:
# Split the 'Interval' column ('<start>-<end>') to get the start and end positions
# of each domain within its sequence segment. This is done column-wise rather than
# row by row; the resulting columns are 64-bit integers, so no further numeric
# conversion is needed for them.
interval_parts = records_df['Interval'].str.split('-')
records_df['domain start position'] = interval_parts.str[0].astype('int64')
records_df['domain end position'] = interval_parts.str[1].astype('int64')

# Make sure the segment start is numeric before it is used in arithmetic
# (a no-op when the column is already numeric)
records_df['seq start'] = pd.to_numeric(records_df['seq start'])

In [31]:
# Translate the positions of each domain within its sequence segment into
# genomic coordinates. A position of 1 inside the segment corresponds to the
# segment's own start coordinate, hence the "- 1".
segment_start = records_df['seq start']

records_df['domain start coord'] = segment_start + records_df['domain start position'] - 1
records_df['domain end coord'] = segment_start + records_df['domain end position'] - 1

Read in gene assembly GFF3 file.

In [32]:
# Read the gene assembly GFF3 (path set in the species configuration cell)
gene_gff = gffpd.read_gff3(gene_assembly_file)

# Convert each individual attribute to its own column.
# The following cells read the 'start', 'end', 'locus_tag' and 'Name' columns of gene_df.
gene_df = gene_gff.attributes_to_columns()

Find the genomic coordinates of all genes.

In [33]:
# One (start, end) tuple per row of the gene assembly
gene_coords = list(zip(gene_df['start'], gene_df['end']))

# The same coordinates together with the locus tag and name of each row
gene_info = list(zip(gene_df['start'], gene_df['end'], gene_df['locus_tag'], gene_df['Name']))

Find the associated gene name and locus tag for each domain identified using NCBI Pfam CD-search.

In [34]:
def is_within_gene(position, gene_coords):
    """Return True if `position` lies inside any (start, end) pair of `gene_coords`.

    Both ends of each pair are inclusive. Returns False when no pair contains
    the position, including when `gene_coords` is empty.
    """
    return any(start <= position <= end for start, end in gene_coords)

def find_gene(position, gene_info):
    """Return (locus_tag, name) of the first entry of `gene_info` containing `position`.

    `gene_info` holds (start, end, locus_tag, name) tuples; both ends are
    inclusive. When several entries contain the position the first one in the
    list wins. Returns None when no entry contains it.
    """
    matches = (
        (locus_tag, name)
        for start, end, locus_tag, name in gene_info
        if start <= position <= end
    )
    return next(matches, None)

In [35]:
# Create both result columns up front so that they exist even when no domain
# can be matched to a gene (the attributes cell reads both of them).
# 'associated_locus_tag' is filled with NaN in an object column: this is what
# unmatched rows held when the column was only created by the first match.
records_df['associated_gene'] = None
records_df['associated_locus_tag'] = pd.Series(float('nan'), index=records_df.index, dtype=object)

# Domains whose midpoint does not fall inside any gene:
# (domain name, start coordinate, midpoint, end coordinate)
outliers = []

for index, row in records_df.iterrows():
    # Find the midpoint coordinate of the domain
    start_coord = int(row['domain start coord'])
    end_coord = int(row['domain end coord'])
    mid_coord = (start_coord + end_coord)/2

    # Find the gene that the midpoint coordinate falls into. gene_info holds
    # the same (start, end) pairs as gene_coords, so one lookup answers both
    # "is it within a gene?" and "which gene?".
    gene = find_gene(mid_coord, gene_info)
    if gene is not None:
        # Store the associated locus tag and gene name
        locus_tag, gene_name = gene
        records_df.at[index, 'associated_locus_tag'] = locus_tag
        records_df.at[index, 'associated_gene'] = gene_name
    else:
        outliers.append((row['Name'], start_coord, mid_coord, end_coord))

Convert the dataframe containing the domain coordinates and associated genes into a GFF3 dataframe and export it as a GFF3 file.

In [36]:
# Characters that have a reserved meaning in the GFF3 attributes column (or that
# would break the tab-separated line) must be percent-encoded inside values.
# Without this, a ';', '=', '&' or ',' in a free-text description would be read
# as the start of a new attribute or as a value separator.
gff3_reserved = str.maketrans({
    '%': '%25', ';': '%3B', '=': '%3D', '&': '%26', ',': '%2C',
    '\t': '%09', '\n': '%0A', '\r': '%0D',
})


def escape_gff3_value(value):
    """Return `value` as text with GFF3-reserved characters percent-encoded."""
    return str(value).translate(gff3_reserved)


# Create the attributes column. The strings are collected in a list and assigned
# in one go, so the column is created even when records_df has no rows.
attributes = []
for index, row in records_df.iterrows():
    locus_tag = escape_gff3_value(row['associated_locus_tag'])
    attribute_str = (
        f"ID=pfamdomain-{locus_tag}_{index+1}"
        f";Name={escape_gff3_value(row['Name'])}"
        f";accession={escape_gff3_value(row['Accession'])}"
        f";description={escape_gff3_value(row['Description'])}"
        f";locus_tag={locus_tag}"
        f";gene_name={escape_gff3_value(row['associated_gene'])}"
    )
    attributes.append(attribute_str)

records_df['attributes'] = attributes

In [37]:
# The 9 columns of a standard GFF3, still under the names used in this notebook
# for the start and end coordinates
col_names = ['seq_id', 'source', 'type', 'domain start coord', 'domain end coord', 'score', 'strand', 'phase', 'attributes']

records_df = (
    records_df
    # Add the constant fields required for a GFF3
    .assign(
        seq_id=species_id,
        source="Pfam CD-search",
        type="Pfam_Domain/Family",
        score=".",
        strand=".",
        phase=".",
    )
    # Keep only the GFF3 columns, in GFF3 order
    .loc[:, col_names]
    # Rename the start and end columns to match GFF3 requirements
    .rename(columns = {'domain start coord':'start', 'domain end coord':'end'})
)

In [38]:
# Convert the merged dataframe back to a GFF3 dataframe.
# The header consists of the GFF3 version line and a sequence-region line built
# from gff3_header, which is set in the species configuration cell.
header = f"##gff-version 3\n##sequence-region {gff3_header}\n"
records_gff_df = gffpd.Gff3DataFrame(input_df=records_df, input_header=header)

In [39]:
# Save the GFF3 dataframe as a file at the output path set in the species configuration cell
records_gff_df.to_gff3(gff3_output)