In [35]:
import json
import os
import pandas as pd
from tqdm import tqdm
import re

%config Completer.use_jedi = False

In [36]:
def file_parser(folder, pattern = '.json'):
    """
    Retrieve json file list from a specific folder
    Outputs two lists, a complete file path for each file, and the 
    name of the file (json)
    """
    path = os.getcwd()
    path_to_folder = os.path.join(path, folder)
    files = []
    jsons = []
    # r=root, d=directories, f = files
    for r, d, f in os.walk(path_to_folder):
        for file in f:
            if pattern in file:
                files.append(os.path.join(r, file))
                jsons.append(file)

    return((files,jsons))



def read_json(json_file):
    """
    Reads a json file and outputs its content
    """
    with open(json_file) as f:
        content = json.load(f)
    return(content)

def getList(dict):
    return dict.keys()

def get_item(element):
    try:
        cluster['qualifiers'][element]
    except:
        'NA'

In [37]:
root = read_json('antiSMASH/NT12013_22/NT12013_22.json')

In [38]:
root.keys()

dict_keys(['version', 'input_file', 'records', 'timings', 'taxon', 'schema'])

In [39]:
records = root['records']

In [40]:
# each record corresponds to each one of the contigs in the fasta file
len(records)

1

In [41]:
# records[0]

In [42]:
print(getList(records[0]))

dict_keys(['id', 'seq', 'features', 'name', 'description', 'dbxrefs', 'annotations', 'letter_annotations', 'areas', 'modules'])


In [43]:
# records[0]['features'][6]['qualifiers']['gene_functions']

In [44]:
# records[4]['features']

In [45]:
res = pd.DataFrame()
for record in records:
    contig = record['id']
    for feature in record['features']:
        feature_type = feature['type']
        if feature_type == 'region':
            cluster = feature
            cluster_idx = cluster['qualifiers']['region_number']
            location = cluster['location'].split(':')
            start = re.sub('\[', '', location[0])
            end = re.sub('\]', '', location[1])
            cluster_type = '|'.join(cluster['qualifiers']['product'])
            # append to results
            row = pd.DataFrame({'genome': 'genome',
                                            'cluster': cluster_idx,
                                            'type': cluster_type,
                                            'contig': contig,
                                            'start': start,
                                            'end': end }, index=[0])
            res = res.append(row)

In [46]:
res

Unnamed: 0,genome,cluster,type,contig,start,end
0,genome,1,NRPS,CP009273.1,589612,633494
0,genome,2,thiopeptide,CP009273.1,935795,962088


In [47]:
list(set(res['contig'].tolist()))

['CP009273.1']

In [48]:
# get contigs from previous dataset and loop them
contigs = list(set(res['contig'].tolist())) # remove duplicates

res_genes = pd.DataFrame()
for record in records:
    contig = record['id']
    if contig in contigs:
        for feature in record['features']:
            feature_type = feature['type']
            if feature_type == 'CDS':
                cluster = feature
                cluster_idx = cluster['qualifiers']['codon_start']
                location = cluster['location'].split(':')
                # gene
                try:
                    gene = cluster['qualifiers']['gene']
                except:
                    gene = 'NA'
                
                # gene_kind
                try:
                    gene_kind = cluster['qualifiers']['gene_kind']
                except:
                    gene_kind = 'NA'
                
                # EC_number
                
                try:
                    EC_number = cluster['qualifiers']['EC_number']
                except:
                    EC_number = 'NA'

                
                # gene_functions
                try:
                    gene_functions = cluster['qualifiers']['gene_functions']
                except:
                    gene_functions = 'NA'
                
                # product
                try:
                    product = cluster['qualifiers']['product']
                except:
                    product = 'NA'
                
                locus_tag = cluster['qualifiers']['locus_tag']

                # append to results
                row = pd.DataFrame({'genome': 'genome',
                                    'cluster': cluster_idx,
                                    'contig': contig,
                                    'gene': gene,
                                    'gene_kind': gene_kind,
                                    'EC_number':EC_number,
#                                     'gene_functions': gene_functions,
                                    'product': product,
                                    'locus_tag': locus_tag}, 
                                   index=[0])
                res_genes = res_genes.append(row)

In [62]:
pd.set_option('display.max_rows', 10)
res_genes

Unnamed: 0,genome,cluster,contig,gene,gene_kind,EC_number,product,locus_tag
0,genome,1,CP009273.1,thrA,,1.1.1.3,fused aspartate kinase/homoserine dehydrogenase 1,FNMEMAOE_00001
0,genome,1,CP009273.1,thrB,,2.7.1.39,homoserine kinase,FNMEMAOE_00002
0,genome,1,CP009273.1,thrC,,4.2.3.1,threonine synthase,FNMEMAOE_00003
0,genome,1,CP009273.1,,,,hypothetical protein,FNMEMAOE_00004
0,genome,1,CP009273.1,yaaA,,,peroxide stress resistance protein YaaA,FNMEMAOE_00005
...,...,...,...,...,...,...,...,...
0,genome,1,CP009273.1,creB,,,DNA-binding transcriptional regulator CreB,FNMEMAOE_04406
0,genome,1,CP009273.1,creC,,2.7.13.3,sensory histidine kinase CreC,FNMEMAOE_04407
0,genome,1,CP009273.1,creD,,,putative inner membrane protein CreD,FNMEMAOE_04408
0,genome,1,CP009273.1,arcA,,,DNA-binding transcriptional dual regulator ArcA,FNMEMAOE_04409


In [13]:
cosa = records[4]['modules']

In [52]:
# cosa

In [53]:
x = 'NT12705_21.json'
x[:-5]

'NT12705_21'

In [54]:
# records as contig containing info
# the '1' after by_region related to

# get the scores from region2region
reg_reg = records[0]['modules']['antismash.modules.cluster_compare']['db_results']['MIBiG']['by_region']['1']['RegionToRegion_RiQ']
# get scores in a pandas dataframe
reg_reg_df = pd.DataFrame.from_dict(reg_reg['scores_by_region'],
                       orient='index',
                       columns=['score'])

In [55]:
ref_reg = reg_reg['reference_regions']

In [56]:
ref_reg

{'BGC0002075.1: 27344-31238': {'accession': 'BGC0002075.1',
  'cdses': {'A8O26_RS14270': {'function': 'biosynthetic',
    'components': {'secmet': ['Condensation', 'AMP-binding', 'PP-binding'],
     'modules': [{'type': 'nrps',
       'domains': ['Condensation', 'AMP-binding', 'PCP', 'Thioesterase'],
       'complete': True}]},
    'location': '[27344:31238](+)'}},
  'cds_mapping': {'1': 'A8O26_RS14270'},
  'start': 27344,
  'end': 31238,
  'products': ['NRP', 'Alkaloid'],
  'organism': 'Pseudomonas fluorescens',
  'description': 'Pyreudione A, Pyreudione B, Pyreudione C, Pyreudione D, Pyreudione E',
  'protoclusters': []},
 'BGC0001615.1: 35-10914': {'accession': 'BGC0001615.1',
  'cdses': {'UCFS10_04341': {'function': 'other',
    'components': {'secmet': [], 'modules': []},
    'location': '[10422:10914](+)'},
   'mysB': {'function': 'other',
    'components': {'secmet': [], 'modules': []},
    'location': '[2763:3654](+)'},
   'UCFS10_04335': {'function': 'other',
    'components':

In [63]:
reg_reg_df

Unnamed: 0,score
BGC0002075.1: 27344-31238,0.658864
BGC0001615.1: 35-10914,0.657680
BGC0001185.1: 3021249-3032970,0.649197
BGC0000410.1: 162-16011,0.609956
BGC0000401.1: 0-12882,0.606361
...,...
BGC0000457.1: 511805-523079,0.523231
BGC0000257.1: 2579-43239,0.521342
BGC0001043.1: 950357-966540,0.520160
BGC0000454.1: 24-19884,0.516741


In [58]:
ref_reg['BGC0002075.1: 27344-31238']['organism']

'Pseudomonas fluorescens'

In [64]:
res_2 = pd.DataFrame()


for reference in ref_reg:
    ref = reference
    org = ref_reg[reference]['organism']
    bgc_type = ref_reg[reference]['description']
    product = ref_reg[reference]['products'][0]
    row = pd.DataFrame(
        {
            'reference':ref,
            'organism':org,
            'bgc_type':bgc_type,
            'product':product
        }, index=[0]
    )
    
    res_2 = res_2.append(row)
res_2 = res_2.set_index('reference')
pd.concat([res_2, reg_reg_df], axis=1)

Unnamed: 0,organism,bgc_type,product,score
BGC0002075.1: 27344-31238,Pseudomonas fluorescens,"Pyreudione A, Pyreudione B, Pyreudione C, Pyre...",NRP,0.658864
BGC0001615.1: 35-10914,Heteroscytonema crispum UCFS10,"hexose-palythine-serine, hexose-shinorine",NRP,0.657680
BGC0001185.1: 3021249-3032970,Bacillus velezensis FZB42,bacillibactin,NRP,0.649197
BGC0000410.1: 162-16011,Pseudomonas fluorescens,pseudomonine,NRP,0.609956
BGC0000401.1: 0-12882,Paenibacillus elgii B69,paenibactin,NRP,0.606361
...,...,...,...,...
BGC0000457.1: 511805-523079,Rhizobium etli CFN 42,vicibactin,NRP,0.523231
BGC0000257.1: 2579-43239,Streptomyces griseoviridis,prodigiosin,Polyketide,0.521342
BGC0001043.1: 950357-966540,Streptomyces griseus subsp. griseus NBRC 13350,SGR PTMs,NRP,0.520160
BGC0000454.1: 24-19884,Vibrio anguillarum RV22,vanchrobactin,NRP,0.516741


In [65]:
reg_reg

{'name': 'RegionToRegion_RiQRiQ',
 'scores_by_region': {'BGC0002075.1: 27344-31238': 0.6588635638908951,
  'BGC0001615.1: 35-10914': 0.6576798757346354,
  'BGC0001185.1: 3021249-3032970': 0.6491969160169269,
  'BGC0000410.1: 162-16011': 0.6099564617298631,
  'BGC0000401.1: 0-12882': 0.6063611439256317,
  'BGC0001132.1: 1902004-1914421': 0.582736321615584,
  'BGC0001302.1: 0-25686': 0.5744553937945509,
  'BGC0001131.1: 144016-173487': 0.5676798592842177,
  'BGC0001833.1: 14964-30015': 0.5666996143787307,
  'BGC0001844.1: 510845-531583': 0.5655481828754316,
  'BGC0001758.1: 23939-46916': 0.5612387946742029,
  'BGC0001128.1: 3865126-3880777': 0.5603675763724318,
  'BGC0001825.1: 2673981-2688831': 0.5587489634525925,
  'BGC0000375.1: 0-9457': 0.5507163791131043,
  'BGC0000427.1: 4798899-4805360': 0.5487507495093645,
  'BGC0001641.1: 3124570-3174107': 0.5461163304492854,
  'BGC0001343.1: 61-19132': 0.5443289946681192,
  'BGC0001479.1: 3055743-3062289': 0.5412776690620485,
  'BGC0001824.1: 3

In [66]:
# records as contig containing info
# the '1' after by_region related to

# get the scores from region2region
reg_reg = records[0]['modules']['antismash.modules.cluster_compare']['db_results']['MIBiG']['by_region']['1']['RegionToRegion_RiQ']
# get scores in a pandas dataframe
reg_reg_df = pd.DataFrame.from_dict(reg_reg['scores_by_region'],
                       orient='index',
                       columns=['score'])
# reference region
ref_reg = reg_reg['reference_regions']

res_2 = pd.DataFrame()


for reference in ref_reg:
    ref = reference
    org = ref_reg[reference]['organism']
    bgc_type = ref_reg[reference]['description']
    product = ref_reg[reference]['products'][0]
    row = pd.DataFrame(
        {
            'reference':ref,
            'organism':org,
            'bgc_type':bgc_type,
            'product':product
        }, index=[0]
    )
    
    res_2 = res_2.append(row)
res_2 = res_2.set_index('reference')
pd.concat([res_2, reg_reg_df], axis=1)

Unnamed: 0,organism,bgc_type,product,score
BGC0002075.1: 27344-31238,Pseudomonas fluorescens,"Pyreudione A, Pyreudione B, Pyreudione C, Pyre...",NRP,0.658864
BGC0001615.1: 35-10914,Heteroscytonema crispum UCFS10,"hexose-palythine-serine, hexose-shinorine",NRP,0.657680
BGC0001185.1: 3021249-3032970,Bacillus velezensis FZB42,bacillibactin,NRP,0.649197
BGC0000410.1: 162-16011,Pseudomonas fluorescens,pseudomonine,NRP,0.609956
BGC0000401.1: 0-12882,Paenibacillus elgii B69,paenibactin,NRP,0.606361
...,...,...,...,...
BGC0000457.1: 511805-523079,Rhizobium etli CFN 42,vicibactin,NRP,0.523231
BGC0000257.1: 2579-43239,Streptomyces griseoviridis,prodigiosin,Polyketide,0.521342
BGC0001043.1: 950357-966540,Streptomyces griseus subsp. griseus NBRC 13350,SGR PTMs,NRP,0.520160
BGC0000454.1: 24-19884,Vibrio anguillarum RV22,vanchrobactin,NRP,0.516741


In [79]:
res_2

Unnamed: 0_level_0,cluster,organism,bgc_type,product
reference,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BGC0000784.1: 2-17453,2,Pseudomonas aeruginosa,O-antigen,Saccharide
BGC0000703.1: 0-204967,2,Streptomyces kanamyceticus,kanamycin,Saccharide
BGC0001137.1: 0-18476,2,Marinactinospora thermotolerans,"marinacarboline A, marinacarboline B, marinaca...",Alkaloid
BGC0000781.1: 1-16589,2,Pseudomonas aeruginosa,O-antigen,Saccharide
BGC0000800.1: 32-15282,2,Xanthomonas campestris,xanthan,Saccharide
...,...,...,...,...
BGC0001283.1: 1081205-1113162,2,Streptomyces lividans 1326,arsono-polyketide,Polyketide
BGC0001048.1: 1197-80111,2,Streptoalloteichus hindustanus,tallysomycin A,NRP
BGC0000148.1: 1134-79984,2,Saccharopolyspora spinosa,A83543A,Polyketide
BGC0001814.1: 0-57737,2,Streptomyces sp. KCB13F003,ulleungmycin,NRP


In [81]:
# records as contig containing info
# the '1' after by_region related to


records[0]['modules']['antismash.modules.cluster_compare']['db_results']['MIBiG']['by_region']


# get the scores from region2region
reg_reg = records[0]['modules']['antismash.modules.cluster_compare']['db_results']['MIBiG']['by_region']['1']['RegionToRegion_RiQ']
# get scores in a pandas dataframe
reg_reg_df = pd.DataFrame.from_dict(reg_reg['scores_by_region'],
                       orient='index',
                       columns=['score'])
# reference region
ref_reg = reg_reg['reference_regions']

res_2 = pd.DataFrame()


for reference in ref_reg:
    ref = reference
    org = ref_reg[reference]['organism']
    bgc_type = ref_reg[reference]['description']
    product = ref_reg[reference]['products'][0]
    row = pd.DataFrame(
        {
            'reference':ref,
            'organism':org,
            'bgc_type':bgc_type,
            'product':product
        }, index=[0]
    )
    
    res_2 = res_2.append(row)
res_2 = res_2.set_index('reference')
pd.concat([res_2, reg_reg_df], axis=1)

Unnamed: 0,organism,bgc_type,product,score
BGC0002075.1: 27344-31238,Pseudomonas fluorescens,"Pyreudione A, Pyreudione B, Pyreudione C, Pyre...",NRP,0.658864
BGC0001615.1: 35-10914,Heteroscytonema crispum UCFS10,"hexose-palythine-serine, hexose-shinorine",NRP,0.657680
BGC0001185.1: 3021249-3032970,Bacillus velezensis FZB42,bacillibactin,NRP,0.649197
BGC0000410.1: 162-16011,Pseudomonas fluorescens,pseudomonine,NRP,0.609956
BGC0000401.1: 0-12882,Paenibacillus elgii B69,paenibactin,NRP,0.606361
...,...,...,...,...
BGC0000457.1: 511805-523079,Rhizobium etli CFN 42,vicibactin,NRP,0.523231
BGC0000257.1: 2579-43239,Streptomyces griseoviridis,prodigiosin,Polyketide,0.521342
BGC0001043.1: 950357-966540,Streptomyces griseus subsp. griseus NBRC 13350,SGR PTMs,NRP,0.520160
BGC0000454.1: 24-19884,Vibrio anguillarum RV22,vanchrobactin,NRP,0.516741


In [103]:
## THIS IS THE CHUNK THAT'S WORKING!

regions = len(records[0]['modules']['antismash.modules.cluster_compare']['db_results']['MIBiG']['by_region'])

reg_df = pd.DataFrame()

for i in range(1,regions+1):
    reg_reg = records[0]['modules']['antismash.modules.cluster_compare']['db_results']['MIBiG']['by_region'][str(i)]['RegionToRegion_RiQ']
    # get scores in a pandas dataframe
    reg_reg_df = pd.DataFrame.from_dict(reg_reg['scores_by_region'],
                       orient='index',
                       columns=['score'])
    reg_reg_df = reg_reg_df.head(10)
    
    # reference region
    ref_reg = reg_reg['reference_regions']

    # initiate the temp pandas df
    temp_res = pd.DataFrame()
    
    for reference in ref_reg:
        ref = reference
        org = ref_reg[reference]['organism']
        bgc_type = ref_reg[reference]['description']
        product = ref_reg[reference]['products'][0]
        row = pd.DataFrame(
            {
                'reference':ref,
                'cluster':i,
                'organism':org,
                'bgc_type':bgc_type,
                'product':product
            }, index=[0]
        )

        temp_res = temp_res.append(row)

    temp_res = temp_res.set_index('reference')
    temp_res = temp_res.head(10)

    temp_res = pd.concat([temp_res, reg_reg_df], axis=1)
#     print(temp_res)
    reg_df = pd.concat([reg_df, temp_res])

reg_df
    

Unnamed: 0,cluster,organism,bgc_type,product,score
BGC0002075.1: 27344-31238,1,Pseudomonas fluorescens,"Pyreudione A, Pyreudione B, Pyreudione C, Pyre...",NRP,0.658864
BGC0001615.1: 35-10914,1,Heteroscytonema crispum UCFS10,"hexose-palythine-serine, hexose-shinorine",NRP,0.657680
BGC0001185.1: 3021249-3032970,1,Bacillus velezensis FZB42,bacillibactin,NRP,0.649197
BGC0000410.1: 162-16011,1,Pseudomonas fluorescens,pseudomonine,NRP,0.609956
BGC0000401.1: 0-12882,1,Paenibacillus elgii B69,paenibactin,NRP,0.606361
...,...,...,...,...,...
BGC0001540.1: 633-38461,2,Streptomyces zelensis,CC-1065,Other,0.410657
BGC0000466.1: 0-35805,2,Streptomyces sp. TP-A2060,yatakemycin,NRP,0.409777
BGC0000255.1: 796-9440,2,Uncultured bacterium,pederin,Polyketide,0.397025
BGC0001285.1: 8-22479,2,Pseudomonas putida,"pseudopyronine A, pseudopyronine B",Other,0.392021


In [93]:
reg_reg = records[0]['modules']['antismash.modules.cluster_compare']['db_results']['MIBiG']['by_region']['1']['RegionToRegion_RiQ']
reg_reg_df = pd.DataFrame.from_dict(reg_reg['scores_by_region'],
                       orient='index',
                       columns=['score'])
reg_reg_df.head(10)

Unnamed: 0,score
BGC0002075.1: 27344-31238,0.658864
BGC0001615.1: 35-10914,0.65768
BGC0001185.1: 3021249-3032970,0.649197
BGC0000410.1: 162-16011,0.609956
BGC0000401.1: 0-12882,0.606361
BGC0001132.1: 1902004-1914421,0.582736
BGC0001302.1: 0-25686,0.574455
BGC0001131.1: 144016-173487,0.56768
BGC0001833.1: 14964-30015,0.5667
BGC0001844.1: 510845-531583,0.565548


# Let's try with a new file that has more contigs

In [104]:
root = read_json('antiSMASH/210/210.json')
records = root['records']

In [110]:
res = pd.DataFrame()
for record in records:
    contig = record['id']
    for feature in record['features']:
        feature_type = feature['type']
        if feature_type == 'region':
            cluster = feature
            cluster_idx = cluster['qualifiers']['region_number']
            location = cluster['location'].split(':')
            start = re.sub('\[', '', location[0])
            end = re.sub('\]', '', location[1])
            cluster_type = '|'.join(cluster['qualifiers']['product'])
            # append to results
            row = pd.DataFrame({'genome': 'genome',
                                            'cluster': cluster_idx,
                                            'type': cluster_type,
                                            'contig': contig,
                                            'start': start,
                                            'end': end }, index=[0])
            res = res.append(row)
res

Unnamed: 0,genome,cluster,type,contig,start,end
0,genome,1,thiopeptide,7,126823,153116
0,genome,1,NRPS,16,0,28687
0,genome,1,siderophore,54,0,13441


In [114]:
records[6]

{'id': '7',
 'seq': {'data': 'AAATGCCGCTTTATGGTGACTCACAAAAATTCACCACATCCCTCACAACCACCTTATTACGCCCGGTCTCTTTCGCCTCGTAAAGTGCGTTATCAGCCAGTCGATAGATCTCATTTGGGTTTAACGCATTTTCCTGAGTCACAACAGCGCCAATACTAATCGTCACTTTTTGTGGTATAGCGTATTCAGGATTATCGCCAGTTAAACGCTCAACATTTTCCCGAATCCTTTCCGCTAAAGCTTTCGCACGTTCAGTATCGATGTCCGTCAGCAAGACGCCAAACTCTTCGCCGCCGACGCGCGCTAAAAGGTCATCTGGTCGTATGCTTTTCCCAATGATATTCACCACTGTTTTTATCACCTGATCGCCAACCGGATGCCCCCAGGTGTCGTTGACTTTTTTGAAGTGGTCGATATCCATAATCATGACGCAATAAGGCGTTTTTTGGGCTGATGCTGATTGAACTGTCAGTTCATTAAAAAAATAATTTCTGTTAAATATATTGGTTAAGGGATCGCGATGTGCAATGTTCTTTGTTACTCGTAGCGCACTGAATATATGGCACATAAATATTACCATAACGGTTAACTTACTGACTACTTCAATAGTGCGACTAATATACCAAGTGGACACGCTATACCTGCTCATAAATAACAGAGAGATATTGTACAGTACGGCACAAAAAGCAATCACCGTCACTCCATTCCAGATATCATAGCGTAAACCATTAAAGTATAGATTAACCGAAAGCAAGGTTAACCACATGAAAATCATTATGGTAACATATGAGGCTTTCCAAACCACCTGACCATTTTCATTGGTTAACTCGGCAATGTGTAAATTGTAGCTCTCATAATGACTAGAAAGTATGTGAGCAACTATCGGCCCCCCAAAAACAAAAAACAAACTTATGCACAAAGTCATTTTTTTGGAAAATAAATTGCGCTGTTTTGTGTTTTCACTAACTTT

In [117]:
records[6]['modules']['antismash.modules.cluster_compare']['db_results']['MIBiG']['by_region']['1']['RegionToRegion_RiQ']

{'name': 'RegionToRegion_RiQRiQ',
 'scores_by_region': {'BGC0000784.1: 2-17453': 0.58544008463188,
  'BGC0000703.1: 0-204967': 0.4849593020239755,
  'BGC0001137.1: 0-18476': 0.47617230910600056,
  'BGC0000781.1: 1-16589': 0.46958996307963263,
  'BGC0000800.1: 32-15282': 0.4383317541925318,
  'BGC0001540.1: 633-38461': 0.41065661056490554,
  'BGC0000466.1: 0-35805': 0.40977702529955856,
  'BGC0000255.1: 796-9440': 0.39702518494126277,
  'BGC0001285.1: 8-22479': 0.3920212766593816,
  'BGC0001441.1: 0-27983': 0.3558594350965662,
  'BGC0001302.1: 0-25686': 0.31188443079340494,
  'BGC0000997.1: 236-35505': 0.29181808661788383,
  'BGC0001127.1: 0-45262': 0.2911216483556965,
  'BGC0002012.1: 8348-34051': 0.27824424415983684,
  'BGC0000070.1: 325-54575': 0.2739749179218612,
  'BGC0001283.1: 1081205-1113162': 0.25283543581282036,
  'BGC0001048.1: 1197-80111': 0.21292914200637605,
  'BGC0000148.1: 1134-79984': 0.2109001156140495,
  'BGC0001814.1: 0-57737': 0.20963593979512835,
  'BGC0000380.1: 4

In [141]:
## THIS IS THE CHUNK THAT'S WORKING!

regions = len(records[6]['modules']['antismash.modules.cluster_compare']['db_results']['MIBiG']['by_region'])

reg_df = pd.DataFrame()

for i in range(0,regions):
    reg_reg = records[6]['modules']['antismash.modules.cluster_compare']['db_results']['MIBiG']['by_region'][str(i+1)]['RegionToRegion_RiQ']
    # get scores in a pandas dataframe
    reg_reg_df = pd.DataFrame.from_dict(reg_reg['scores_by_region'],
                       orient='index',
                       columns=['score'])
    reg_reg_df = reg_reg_df.head(10)
    # reference region
    ref_reg = reg_reg['reference_regions']

    # initiate the temp pandas df
    temp_res = pd.DataFrame()
    
    for reference in ref_reg:
        ref = reference
        org = ref_reg[reference]['organism']
        bgc_type = ref_reg[reference]['description']
        product = ref_reg[reference]['products'][0]
        row = pd.DataFrame(
            {
                'reference':ref,
                'cluster':i,
                'organism':org,
                'bgc_type':bgc_type,
                'product':product
            }, index=[0]
        )

        temp_res = temp_res.append(row)

    temp_res = temp_res.set_index('reference')
    temp_res = temp_res.head(10)

    temp_res = pd.concat([temp_res, reg_reg_df], axis=1)
#     print(temp_res)
    reg_df = pd.concat([reg_df, temp_res])

reg_df

Unnamed: 0,cluster,organism,bgc_type,product,score
BGC0000784.1: 2-17453,0,Pseudomonas aeruginosa,O-antigen,Saccharide,0.58544
BGC0000703.1: 0-204967,0,Streptomyces kanamyceticus,kanamycin,Saccharide,0.484959
BGC0001137.1: 0-18476,0,Marinactinospora thermotolerans,"marinacarboline A, marinacarboline B, marinaca...",Alkaloid,0.476172
BGC0000781.1: 1-16589,0,Pseudomonas aeruginosa,O-antigen,Saccharide,0.46959
BGC0000800.1: 32-15282,0,Xanthomonas campestris,xanthan,Saccharide,0.438332
BGC0001540.1: 633-38461,0,Streptomyces zelensis,CC-1065,Other,0.410657
BGC0000466.1: 0-35805,0,Streptomyces sp. TP-A2060,yatakemycin,NRP,0.409777
BGC0000255.1: 796-9440,0,Uncultured bacterium,pederin,Polyketide,0.397025
BGC0001285.1: 8-22479,0,Pseudomonas putida,"pseudopyronine A, pseudopyronine B",Other,0.392021
BGC0001441.1: 0-27983,0,Streptomyces sp.,"belactosin A, belactosin C",Other,0.355859


# GENERALISATION FOR EVERY CONTIG WITHIN A GENOME

In [150]:
# GENERALISATION FOR EVERY CONTIG WITHIN A GENOME
contigs = res['contig'].tolist()

mibig_df = pd.DataFrame()

for contig in contigs:
    contig = int(contig) - 1
    
    # get region annotation
    regions = len(records[contig]['modules']['antismash.modules.cluster_compare']['db_results']['MIBiG']['by_region'])
    # initiate FIRST pandas df
    reg_df = pd.DataFrame()
    
    # there might be more than 1 
    for i in range(0,regions):
        reg_reg = records[6]['modules']['antismash.modules.cluster_compare']['db_results']['MIBiG']['by_region'][str(i+1)]['RegionToRegion_RiQ']
        # get scores in a pandas dataframe
        reg_reg_df = pd.DataFrame.from_dict(reg_reg['scores_by_region'],
                           orient='index',
                           columns=['score'])
        reg_reg_df = reg_reg_df.head(10)
        # reference region
        ref_reg = reg_reg['reference_regions']

        # initiate the temp pandas df
        temp_res = pd.DataFrame()

        for reference in ref_reg:
            ref = reference
            org = ref_reg[reference]['organism']
            bgc_type = ref_reg[reference]['description']
            product = ref_reg[reference]['products'][0]
            row = pd.DataFrame(
                {
                    'reference':ref,
                    'cluster':i,
                    'contig': contig+1,
                    'organism':org,
                    'bgc_type':bgc_type,
                    'product':product
                }, index=[0]
            )

            temp_res = temp_res.append(row)

        temp_res = temp_res.set_index('reference')
        temp_res = temp_res.head(10)

        temp_res = pd.concat([temp_res, reg_reg_df], axis=1)
    #     print(temp_res)
        reg_df = pd.concat([reg_df, temp_res])
    
    mibig_df = pd.concat([mibig_df, reg_df])
    



In [151]:
mibig_df

Unnamed: 0,cluster,contig,organism,bgc_type,product,score
BGC0000784.1: 2-17453,0,7,Pseudomonas aeruginosa,O-antigen,Saccharide,0.585440
BGC0000703.1: 0-204967,0,7,Streptomyces kanamyceticus,kanamycin,Saccharide,0.484959
BGC0001137.1: 0-18476,0,7,Marinactinospora thermotolerans,"marinacarboline A, marinacarboline B, marinaca...",Alkaloid,0.476172
BGC0000781.1: 1-16589,0,7,Pseudomonas aeruginosa,O-antigen,Saccharide,0.469590
BGC0000800.1: 32-15282,0,7,Xanthomonas campestris,xanthan,Saccharide,0.438332
...,...,...,...,...,...,...
BGC0001540.1: 633-38461,0,54,Streptomyces zelensis,CC-1065,Other,0.410657
BGC0000466.1: 0-35805,0,54,Streptomyces sp. TP-A2060,yatakemycin,NRP,0.409777
BGC0000255.1: 796-9440,0,54,Uncultured bacterium,pederin,Polyketide,0.397025
BGC0001285.1: 8-22479,0,54,Pseudomonas putida,"pseudopyronine A, pseudopyronine B",Other,0.392021
