<a href="https://colab.research.google.com/github/edgeemer/genome-assembly/blob/main/gbff_gbk_parser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GBFF/GBK file analysis for CDS with Gene/Product filtering

In [96]:
!pip install biopython pandas



In [97]:
import pandas as pd
from Bio import SeqIO
from Bio import SeqFeature

import plotly.express as px

In [98]:
# Input GenBank flat file (GBFF/GBK) to analyse.
# This is a relative path, so the file must be in the notebook's working directory.
gbff_file = "genomic.gbff"

In [99]:
def parse_gbff(file_path, feature_type="CDS"):
    """Collect gene and product annotations from a GenBank flat file.

    Parameters
    ----------
    file_path : str or path-like
        Path to the GBFF/GBK file; it is opened in text mode and read with
        Biopython's 'gb' parser.
    feature_type : str, optional
        Feature type to keep. Defaults to "CDS".

    Returns
    -------
    list of dict
        One dict per matching feature, with the keys 'Type', 'GeneID' and
        'Product'. A feature lacking a 'gene' or 'product' qualifier gets an
        empty string for that key.
    """
    rows = []
    with open(file_path) as handle:
        for record in SeqIO.parse(handle, 'gb'):
            for feature in record.features:
                if feature.type != feature_type:
                    continue
                qualifiers = feature.qualifiers
                rows.append({
                    'Type': feature.type,
                    # Qualifier values are lists; only the first entry is kept.
                    'GeneID': qualifiers.get('gene', [''])[0],
                    'Product': qualifiers.get('product', [''])[0],
                })
    return rows


# Parse the GBFF/GBK file. Rebinding gbff_data (instead of appending to a
# global from inside the function) keeps re-runs of this cell idempotent.
gbff_data = parse_gbff(gbff_file)

# Create the DataFrame with explicit columns, so a file without any matching
# feature still yields the 'GeneID'/'Product' columns the later cells index.
df = pd.DataFrame(gbff_data, columns=['Type', 'GeneID', 'Product'])

In [100]:
# Preview the first rows; a bare last expression uses the notebook's rich table display
df.head()

  Type GeneID                                       Product
0  CDS                 glycoside hydrolase family 3 protein
1  CDS         UTP--glucose-1-phosphate uridylyltransferase
2  CDS                                   phosphoglucomutase
3  CDS   trpS                       tryptophan--tRNA ligase
4  CDS                             AI-2E family transporter


In [101]:
# Number of CDS rows in the table
total_cds = len(df)

# Row-wise boolean masks, built once and combined below
has_gene = df['GeneID'] != ''
is_hypothetical = df['Product'].str.contains('hypothetical protein')

# Gene name present, product contains 'hypothetical protein'
cds_with_gene_hypothetical = int((has_gene & is_hypothetical).sum())
# Gene name present, product does not contain 'hypothetical protein'
cds_with_gene_non_hypothetical = int((has_gene & ~is_hypothetical).sum())
# Gene name absent, product contains 'hypothetical protein'
cds_without_gene_hypothetical = int((~has_gene & is_hypothetical).sum())
# Gene name absent, product does not contain 'hypothetical protein'
cds_without_gene_non_hypothetical = int((~has_gene & ~is_hypothetical).sum())

In [102]:
# Report the total and the four category counts, one per line
summary_lines = [
    f"Total CDS: {total_cds}",
    f"CDS with Genes and Hypothetical Protein: {cds_with_gene_hypothetical}",
    f"CDS with Genes and Non-Hypothetical Protein: {cds_with_gene_non_hypothetical}",
    f"CDS without Genes and with Hypothetical Protein: {cds_without_gene_hypothetical}",
    f"CDS without Genes and with Non-Hypothetical Protein: {cds_without_gene_non_hypothetical}",
]
print("\n".join(summary_lines))

Total CDS: 1495
CDS with Genes and Hypothetical Protein: 0
CDS with Genes and Non-Hypothetical Protein: 369
CDS without Genes and with Hypothetical Protein: 254
CDS without Genes and with Non-Hypothetical Protein: 872


In [103]:
# Data for the donut chart: one slice per (gene present?, hypothetical product?) category.
# labels, values and colors are parallel lists and must stay in the same order.
labels = ['CDS with Genes (Non-Hypothetical Protein)', 'CDS with Genes (Hypothetical Protein)',
          'CDS without Genes (Non-Hypothetical Protein)', 'CDS without Genes (Hypothetical Protein)']
values = [cds_with_gene_non_hypothetical, cds_with_gene_hypothetical,
          cds_without_gene_non_hypothetical, cds_without_gene_hypothetical]
colors = ['blue', 'green', 'red', 'purple']

# Create the chart; hole=0.4 leaves room in the centre for the total
fig = px.pie(names=labels, values=values, title='CDS Categories', hole=0.4, color_discrete_sequence=colors)

# Add the total CDS count in the centre.
# Plotly renders annotation text as pseudo-HTML, so the line break must be
# written as <br>; a '\n' character is not rendered as a new line.
fig.update_layout(
    annotations=[dict(text=f'Total CDS:<br>{total_cds}', x=0.5, y=0.5, font_size=20, showarrow=False)]
)

# Show the chart
fig.show()