In [1]:
from Bio import SeqIO
import pandas as pd
from numpy import float64, int32

file_list = ["data/Genomes/Escherichia_coli_str._K-12_substr._MG1655/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.gbff", "data/Genomes/Agrobacterium_tumefaciens_LBA4213_(Ach5)/GCF_000576515.1_ASM57651v1/GCF_000576515.1_ASM57651v1_genomic.gbff"]

df_genomes = pd.DataFrame(columns = ['genome_id', 'name', 'tax_id', 'domain', 'num_replicons', 'num_genes', 'size_bp', 'assembly'])
df_genomes = df_genomes.astype({'genome_id':int32, 'name':object, 'tax_id':object, 'domain':object, 'num_replicons':int32, 'num_genes':int32, 'size_bp':int32, 'assembly':object})

df_replicons = pd.DataFrame(columns=['replicon_id', 'genome_id', 'name', 'type', 'shape', 'num_genes', 'size_bp', 'accession', 'release_date'])
df_replicons = df_replicons.astype({'replicon_id':int32, 'genome_id':int32, 'name':object, 'type':object, 'shape':object, 'num_genes':int32, 'size_bp':int32, 'accession':object, 'release_date':object})

df_genes = pd.DataFrame(columns=['gene_id', 'genome_id', 'replicon_id', 'locus_tag', 'protein_id', 'name', 'strand', 'num_exons', 'length', 'product'])
df_genes = df_genes.astype({'gene_id':int32, 'genome_id':int32, 'replicon_id':int32, 'locus_tag':object, 'protein_id':object, 'name':object, 'strand':int32, 'num_exons':int32, 'length':int32, 'product':object})

df_xrefs = pd.DataFrame(columns=['gene_id', 'xdb', 'xid'])
df_xrefs = df_xrefs.astype({'gene_id':int32, 'xdb':object, 'xid':object})

df_exons = pd.DataFrame(columns=['gene_id', 'exon', 'l_pos', 'r_pos', 'length'])
df_exons = df_exons.astype({'gene_id':int32, 'exon':int32, 'l_pos':int32, 'r_pos':int32, 'length':int32})


In [2]:
genome_counter = 0
replicon_counter = 0
gene_counter = 0

for file in file_list:
    genome_counter += 1
    genome_id = genome_counter
    genome_size = 0
    g_num_genes = 0
    g_num_replicons = 0
    
    for record_count, seq_record in enumerate(SeqIO.parse(file, "genbank")):
        g_num_replicons += 1
        # Genome Fields
        genome_assembly = seq_record.dbxrefs[-1].split(':')[-1]
        genome_name = seq_record.annotations['source']
        domain = seq_record.annotations['taxonomy'][0]
        num_replicons = record_count+1
        tax_id = seq_record.features[0].qualifiers['db_xref'][0].split(':')[-1]
        genome_size += len(seq_record)
        
        # Replicon Fields
        replicon_counter += 1
        replicon_id = replicon_counter
        rep_name = seq_record.description.split(',')[0] #seq_record.id #seq_record.dbxrefs[-1].split(':')[-1]
        if 'COMPLETE GENOMES' in seq_record.description.upper():
            r_type = 'Chromosome'
        else:
            r_type = 'Plasmid'
        r_shape = seq_record.annotations['topology']
        r_num_genes = 0
        rep_size = len(seq_record)
        r_accession = seq_record.id
        release_date = seq_record.annotations['date']

        

        for gene in seq_record.features:
            if not gene.type == 'CDS':
                continue
            if 'protein_id' not in gene.qualifiers.keys():
                protein_id = 'psuedo'
                protein_name = 'pseudo'
            else:
                protein_id = str(gene.qualifiers['protein_id']).replace("'", '').replace('[', '').replace(']', '').split('.')[0]
                protein_name = str(gene.qualifiers['product']).replace("'", '').replace('[', '').replace(']', '')

            r_num_genes += 1
            g_num_genes += 1
            gene_counter += 1
            # ['gene_id', 'genome_id', 'replicon_id', 'locus_tag', 'protein_id', 'name', 'strand', 'num_exons', 'length', 'product']
            
            gene_id = gene_counter
            #gene_id = int([s for s in gene.qualifiers['db_xref'] if 'GeneID' in s][0].split(':')[-1])
            locus_tag = str(gene.qualifiers['locus_tag']).replace("'", '').replace('[', '').replace(']', '')
            protein_id = protein_id # defined above
            gene_name = locus_tag
            try:
                gene_name = str(gene.qualifiers['gene']).replace("'", '').replace('[', '').replace(']', '')
            except: KeyError
            
            strand = gene.location.strand
            num_exons = len(gene.location.parts)
            protein_name = protein_name
            coordinates_start = gene.location.start
            coordinates_end = gene.location.end
            gene_length = abs(coordinates_end - coordinates_start) + 1
            if 'db_xref' in gene.qualifiers.keys():
                for xref in gene.qualifiers['db_xref']:
                    source, entry = xref.split(':')
                    df_xrefs.loc[len(df_xrefs)] = [gene_id, source, entry]
                
            exon_count = 0
            for part in gene.location.parts:
                exon_count+=1
                l_pos = part.start
                r_pos = part.end
                exon_length = r_pos - l_pos + 1
                df_exons.loc[len(df_exons)] = [gene_id, exon_count, l_pos, r_pos, exon_length]
            
            # Populate genes dataframe
            df_genes.loc[len(df_genes)] = [gene_id, genome_id, replicon_id, locus_tag, protein_id, gene_name, strand, num_exons, gene_length, protein_name]
        
        # Populate replicons dataframe
        df_replicons.loc[len(df_replicons)] = [replicon_id, genome_id, rep_name, r_type, r_shape, r_num_genes, rep_size, r_accession, release_date ]

    df_genomes.loc[len(df_genomes)] = [genome_id, genome_name, tax_id, domain, g_num_replicons, g_num_genes, genome_size, genome_assembly]



        

In [3]:
df_genes

Unnamed: 0,gene_id,genome_id,replicon_id,locus_tag,protein_id,name,strand,num_exons,length,product
0,1,1,1,b0001,NP_414542,thrL,1,1,67,thr operon leader peptide
1,2,1,1,b0002,NP_414543,thrA,1,1,2464,Bifunctional aspartokinase/homoserine dehydrog...
2,3,1,1,b0003,NP_414544,thrB,1,1,934,homoserine kinase
3,4,1,1,b0004,NP_414545,thrC,1,1,1288,L-threonine synthase
4,5,1,1,b0005,NP_414546,yaaX,1,1,298,DUF2502 family putative periplasmic protein
5,6,1,1,b0006,NP_414547,yaaA,-1,1,778,"peroxide resistance protein, lowers intracellu..."
6,7,1,1,b0007,NP_414548,yaaJ,-1,1,1432,putative transporter
7,8,1,1,b0008,NP_414549,talB,1,1,955,transaldolase B
8,9,1,1,b0009,NP_414550,mog,1,1,589,molybdochelatase incorporating molybdenum into...
9,10,1,1,b0010,NP_414551,satP,-1,1,568,succinate-acetate transporter


In [4]:
df_replicons

Unnamed: 0,replicon_id,genome_id,name,type,shape,num_genes,size_bp,accession,release_date
0,1,1,Escherichia coli str. K-12 substr. MG1655,Plasmid,circular,4319,4641652,NC_000913.3,08-AUG-2016
1,2,2,Agrobacterium tumefaciens LBA4213 (Ach5) circu...,Plasmid,circular,2640,2773134,NZ_CP007225.1,15-AUG-2015
2,3,2,Agrobacterium tumefaciens LBA4213 (Ach5) linea...,Plasmid,linear,1848,2095074,NZ_CP007226.1,15-AUG-2015
3,4,2,Agrobacterium tumefaciens LBA4213 (Ach5) plasm...,Plasmid,circular,537,556485,NZ_CP007227.1,15-AUG-2015
4,5,2,Agrobacterium tumefaciens LBA4213 (Ach5) plasm...,Plasmid,circular,190,205997,NZ_CP007228.1,15-AUG-2015


In [5]:
df_genomes

Unnamed: 0,genome_id,name,tax_id,domain,num_replicons,num_genes,size_bp,assembly
0,1,Escherichia coli str. K-12 substr. MG1655,511145,Bacteria,1,4319,4641652,GCF_000005845.2
1,2,Agrobacterium tumefaciens LBA4213 (Ach5),1435057,Bacteria,4,5215,5630690,GCF_000576515.1


In [6]:
df_xrefs

Unnamed: 0,gene_id,xdb,xid
0,1,GI,16127995
1,1,ASAP,ABE-0000006
2,1,UniProtKB/Swiss-Prot,P0AD86
3,1,EcoGene,EG11277
4,1,GeneID,944742
5,2,GI,16127996
6,2,ASAP,ABE-0000008
7,2,UniProtKB/Swiss-Prot,P00561
8,2,EcoGene,EG10998
9,2,GeneID,945803


In [7]:
df_exons

Unnamed: 0,gene_id,exon,l_pos,r_pos,length
0,1,1,189,255,67
1,2,1,336,2799,2464
2,3,1,2800,3733,934
3,4,1,3733,5020,1288
4,5,1,5233,5530,298
5,6,1,5682,6459,778
6,7,1,6528,7959,1432
7,8,1,8237,9191,955
8,9,1,9305,9893,589
9,10,1,9927,10494,568


In [1]:
df_genes.to_csv('OutputFiles/SQL/genes_table.txt', sep='\t', header=False, index=False)
df_genomes.to_csv('OutputFiles/SQL/genomes_table.txt', sep='\t', header=False, index=False)
df_replicons.to_csv('OutputFiles/SQL/replicons_table.txt', sep='\t', header=False, index=False)
df_xrefs.to_csv('OutputFiles/SQL/xrefs_table.txt', sep='\t', header=False, index=False)
df_exons.to_csv('OutputFiles/SQL/exons_table.txt', sep='\t', header=False, index=False)

NameError: name 'df_genes' is not defined