# 23S rRNA Discovery

The purpose of this experiment is to extract all 23S rRNA related data from NCBI/Genbank files. The 23S rRNA is another conserved region in bacteria we can attempt to use for designing scientific assays to detect the specific region as it is thought of as having conserved and unique regions across all bacterial species; allowing us to detect and discriminate between different bacterias within a sample.

Biopython will be used to interact with the Entrez API to extract a list of predetermined bacterial species we want. The dataset used for this experiment will be for the vaginal microbiome, approximately 900+ species have been identified residing in the vaginal microbiome according to past studies such as ones done for the analysis of the causitive mechanism of bacterial vaginosis.

### Data Exploration

Let's read in the data

In [None]:
import os

# SETTINGS
""" Directory Structure

../downloads/<date of data>

"""
DIRECTORY_MAIN = 'downloads'
DIRECTORY_DATE = 'test'
DIRECTORY_PATH = os.path.join(DIRECTORY_MAIN, DIRECTORY_DATE)

In [None]:
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

def parse_record(genbankfile):
    """ Reads the Genbank Files 
    
    Args:
        genbankfile - *.gbk file
    Returns:
        seq_location - gene of interest
    """
    for record in SeqIO.parse(genbankfile, 'gb'):
        print("Searching record ", record.description[:10])
        parse_features(record)

def parse_features(genbankrecord):
    """ Reads in the entire seq feature
    and extracts only the ones we want; in this case
    the 23S rRNA feature.
    
    Args:
        genbankrecord - individual genbank record
    """
    INTERESTED_FEATURE = 'rRNA'
    FEATURE_PRESENT = False
    print(genbankrecord.annotations)
    for features in genbankrecord.features:
        print(features)
        if features.type == 'rRNA':
            try:
                if(features.qualifiers['product'][0].lower().find('23s')>-1):
                    print(gene.qualifiers['product'][0])
                    print(gene.location)
                    return [1]
                else:
                    return []
            except KeyError:
                print("ERROR>> ", features)
                return []
                
        else:
            print("none found")
            return []


In [11]:
def run(directory):
    """ Read all of the files within the directory.
    
    Args:
        directory - the directory containing the genbank files
    """
    
    for files in os.listdir(directory):
        file_dir = os.path.join(DIRECTORY_PATH, files)
        #print(file_dir)
        parse_record(file_dir)
        break
        
run(DIRECTORY_PATH)

Searching record  Fusobacter
['__add__', '__bool__', '__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__nonzero__', '__radd__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_per_letter_annotations', '_seq', '_set_per_letter_annotations', '_set_seq', 'annotations', 'dbxrefs', 'description', 'features', 'format', 'id', 'letter_annotations', 'lower', 'name', 'reverse_complement', 'seq', 'upper']
type: source
location: [0:1451](+)
qualifiers:
    Key: culture_collection, Value: ['ATCC:25533']
    Key: db_xref, Value: ['taxon:854']
    Key: mol_type, Value: ['rRNA']
    Key: organism, Value: ['Fusobacterium russii']
    Key: strain, Value: ['ATCC 25533']

none found
Searching record  Fusobacter
['__add__', '__b