In [None]:
# Install biopython
! pip install biopython

In [None]:
from Bio import Entrez, SeqIO

In [None]:
# Contact address set on the Entrez module before any request is issued.
Entrez.email = "fpichardom@yahoo.com"

In [None]:
# Search parameters
ORGN = 'Pereskia'  # organism name interpolated into the query term
RETMAX = 300       # handed to esearch as its `retmax` argument

In [None]:
# Query the `nucleotide` database with the term "<ORGN>[ORGN]", asking for
# accession-style identifiers (idtype="acc") and passing RETMAX as `retmax`.
search_handle = Entrez.esearch(
    db='nucleotide',
    term=f'{ORGN}[ORGN]',
    idtype="acc",
    retmax=RETMAX,
)

In [None]:
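# Debugging aid, intentionally left disabled: calling search_handle.read() here
# would consume the response, leaving nothing for Entrez.read() in the next cell.
#search_handle.read()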

In [None]:
# Parse the esearch response, then release the underlying connection.
# The handle is closed even when parsing fails so it is never leaked.
try:
    search_results = Entrez.read(search_handle)
finally:
    search_handle.close()

In [None]:
search_results

In [None]:
# Join the identifiers returned by the search into one comma-separated string;
# this string is what gets passed to efetch as `id`.
id_list_str = ",".join(search_results['IdList'])

In [None]:
# Request the records behind the collected identifiers, in XML form.
fetch_handle = Entrez.efetch(db='nucleotide', id=id_list_str, retmode='xml')

In [None]:
# Parse the efetch response, then release the underlying connection.
# The handle is closed even when parsing fails so it is never leaked.
try:
    fetch_results = Entrez.read(fetch_handle)
finally:
    fetch_handle.close()

In [None]:
len(fetch_results)

In [None]:
type(fetch_results)

In [None]:
fetch_results[0]

In [None]:
# Use the first fetched record as a sample while exploring the structure.
entry = fetch_results[0]

In [None]:
# List every top-level field of the sample record with the type of its value,
# printing a separator line after each field.
for field_name, field_value in entry.items():
    print(field_name, type(field_value))
    print("+==================================+")

In [None]:
# Trimmed-down view of the sample record: accession, organism name, the raw
# sequence, and an (initially empty) list that will hold extracted features.
gb_record = {
    "accession_id": entry['GBSeq_accession-version'],
    "taxonomy": {"taxon_name": entry['GBSeq_organism']},
    "features": [],
    "sequence": entry['GBSeq_sequence'],
}

In [None]:
# Feature table of the sample record; iterated in the next cell.
features = entry['GBSeq_feature-table']

In [None]:
# Collect every non-"source" feature of the sample record together with its
# first qualifier. The list is reset first so that re-running this cell does
# not append duplicate entries.
gb_record['features'] = []
for feature in features:
    if feature['GBFeature_key'] == "source":
        continue
    # Qualifiers are looked up defensively: a feature without qualifiers, or
    # whose first qualifier has no value, is skipped instead of raising KeyError.
    qualifiers = feature.get('GBFeature_quals') or [{}]
    qualifier_name = qualifiers[0].get('GBQualifier_name')
    qualifier_value = qualifiers[0].get('GBQualifier_value')
    if qualifier_name is None or qualifier_value is None:
        continue
    gb_record['features'].append({
        "feature_key": feature['GBFeature_key'],
        "location": feature['GBFeature_location'],
        "qualifier_name": qualifier_name,
        "qualifier_value": qualifier_value,
    })

In [None]:
gb_record

# Combine All

In [None]:
from Bio import Entrez, SeqIO

In [None]:
# Contact address set on the Entrez module before any request is issued.
Entrez.email = "fpichardom@yahoo.com"

In [None]:
# Search parameters
ORGN = 'Pereskia'  # organism name interpolated into the query term
RETMAX = 300       # handed to esearch as its `retmax` argument

In [None]:
# Query the `nucleotide` database with the term "<ORGN>[ORGN]", asking for
# accession-style identifiers (idtype="acc") and passing RETMAX as `retmax`.
search_handle = Entrez.esearch(
    db='nucleotide',
    term=f'{ORGN}[ORGN]',
    idtype="acc",
    retmax=RETMAX,
)

In [None]:
# Parse the esearch response, then release the underlying connection.
# The handle is closed even when parsing fails so it is never leaked.
try:
    search_results = Entrez.read(search_handle)
finally:
    search_handle.close()

In [None]:
# Join the identifiers returned by the search into one comma-separated string;
# this string is what gets passed to efetch as `id`.
id_list_str = ",".join(search_results['IdList'])

In [None]:
# Request the records behind the collected identifiers, in XML form.
fetch_handle = Entrez.efetch(db='nucleotide', id=id_list_str, retmode='xml')

In [None]:
# Parse the efetch response, then release the underlying connection.
# The handle is closed even when parsing fails so it is never leaked.
try:
    fetch_results = Entrez.read(fetch_handle)
finally:
    fetch_handle.close()

In [None]:
len(fetch_results)

In [None]:

def _first_qualifier(feature):
    """Return ``(name, value)`` of a feature's first qualifier.

    Either element is ``None`` when the feature has no qualifiers or when the
    first qualifier lacks that field, so callers can skip the feature instead
    of hitting a ``KeyError``.
    """
    qualifiers = feature.get('GBFeature_quals') or [{}]
    first = qualifiers[0]
    return first.get('GBQualifier_name'), first.get('GBQualifier_value')


# Build one trimmed-down dict per fetched record.
all_records = []
for entry in fetch_results:
    try:
        gb_record = {
            "accession_id": entry['GBSeq_accession-version'],
            "taxonomy": {
                "taxon_name": entry['GBSeq_organism']
            },
            "features": [],
            "sequence": entry['GBSeq_sequence']
        }

        for feature in entry['GBSeq_feature-table']:
            if feature['GBFeature_key'] == "source":
                continue
            # A feature without a usable first qualifier is skipped on its own;
            # it must not cause the whole record to be discarded.
            qualifier_name, qualifier_value = _first_qualifier(feature)
            if qualifier_name is None or qualifier_value is None:
                continue
            gb_record['features'].append({
                "feature_key": feature['GBFeature_key'],
                "location": feature['GBFeature_location'],
                "qualifier_name": qualifier_name,
                "qualifier_value": qualifier_value
            })
    except KeyError:
        # A field this notebook relies on is missing: report the record and
        # skip it. `.get` keeps the handler itself from raising KeyError when
        # the accession is the missing field.
        print(entry.get('GBSeq_accession-version', '<missing accession>'))
        continue

    all_records.append(gb_record)


In [None]:
len(all_records)

In [None]:
all_records[0:8]

In [None]:
# Number of records whose `features` list is NOT empty.
count = sum(1 for record in all_records if record['features'])

In [None]:
count

In [None]:
# Flat list of {"taxon_name", "marker"} rows; populated in the next cell.
unnatural = []

In [None]:
# One row per (record, feature) pair: the record's taxon name and the
# feature's qualifier value as the marker.
#
# A fresh dict is created for every feature. Sharing a single dict per record
# and mutating its "marker" made all of that record's rows alias the same
# object, so each of them ended up showing only the last feature's value.
# Rebuilding the list from scratch also keeps this cell idempotent on re-run.
unnatural = [
    {
        "taxon_name": record['taxonomy']['taxon_name'],
        "marker": feature['qualifier_value'],
    }
    for record in all_records
    for feature in record['features']
]

In [None]:
len(unnatural)

In [None]:
unnatural[0:2]

In [None]:
import pandas as pd

In [None]:
# Tabulate the taxon_name / marker rows so they can be grouped with pandas.
df = pd.DataFrame.from_records(unnatural)

In [None]:
df.head()

In [None]:
# Collapse the rows to one per taxon, keeping each distinct marker once.
# The de-duplicated values are sorted so the list order is stable: iterating
# a bare set of strings can give a different order in every kernel session,
# which would make the displayed table non-reproducible.
accum_markers = (
    df.groupby('taxon_name')
    .agg(lambda col: sorted(set(col)))
    .reset_index()
)

In [None]:
# Length of each taxon's marker list.
accum_markers['count'] = accum_markers['marker'].map(len)

In [None]:
accum_markers

In [None]:
import json

In [None]:
# Serialise the collected records with json.dump's default (compact) layout.
# NOTE: the following cell writes to the same path and replaces this output.
with open('gb_results.json', 'w') as json_file:
    json.dump(all_records, json_file)

In [None]:
# Human-readable variant: 4-space indentation and keys in sorted order.
with open('gb_results.json', 'w') as json_file:
    json.dump(all_records, json_file, indent=4, sort_keys=True)