In [ ]:
from pathlib import Path
import yaml
results_dir = Path('/home/ilya/tools/nerpa2/results/individual')

# Gather the matches reported for every entry of results_dir into one flat list.
all_matches = []
for bgc_id in results_dir.iterdir():
    # iterdir() yields paths that already include results_dir, so the file path
    # is built from the entry itself. Re-joining the entry onto results_dir only
    # worked because results_dir is absolute; with a relative results_dir the
    # prefix would have been doubled.
    matches_file = bgc_id / 'matches_details' / 'matches.yaml'
    if matches_file.exists():
        # Close the handle deterministically instead of leaking it.
        with matches_file.open() as matches_in:
            matches_bgc = yaml.safe_load(matches_in)
        # An empty YAML document loads as None; treat it as "no matches".
        all_matches.extend(matches_bgc or [])
    else:
        print('No output for', bgc_id)


def write_yaml(data, out_file: Path,
               compress: bool = False):
    """Dump ``data`` to ``out_file`` as YAML without type tags.

    Args:
        data: object passed to ``yaml.dump``.
        out_file: path of the file to (over)write.
        compress: when False (default), repeated objects are written out in
            full; when True, PyYAML's default alias handling is kept, so it
            may emit anchors/aliases for repeated objects.
    """
    # The tweaks live on a private Dumper subclass rather than being
    # monkey-patched onto yaml.Dumper / yaml.emitter.Emitter. Patching the
    # library classes altered every later dump in the same process and made
    # compress=True a no-op after any earlier compress=False call.
    class _PlainDumper(yaml.Dumper):
        def prepare_tag(self, tag):
            # erase information about types to make the output less verbose
            # https://github.com/yaml/pyyaml/issues/408
            return ''

        def ignore_aliases(self, data):
            # forbid yaml creating references unless compression is requested
            # https://stackoverflow.com/questions/13518819/avoid-references-in-pyyaml
            if not compress:
                return True
            return super().ignore_aliases(data)

    with open(out_file, 'w') as out:
        yaml.dump(data, out, Dumper=_PlainDumper,
                  default_flow_style=None, sort_keys=False)


# Persist the aggregated matches to 'all_matches.yaml' (a relative path, so it
# is resolved against the current working directory).
write_yaml(all_matches, Path('all_matches.yaml'))



In [1]:
# for local run
import yaml
# Read the aggregated matches; the context manager closes the file handle that
# a bare open() call would leave dangling.
with open('/home/ilianolhin/git/nerpa2/training/mibig_filtering/all_matches.yaml') as matches_in:
    all_matches = yaml.safe_load(matches_in)

In [2]:
# Order the matches in place from the highest to the lowest normalised score,
# then report how many there are and the two extreme scores.
all_matches.sort(key=lambda entry: entry['NormalisedScore'], reverse=True)
print('Total number of matches: ', len(all_matches), sep='\n')
for caption, position in (('Best score: ', 0), ('Worst score: ', -1)):
    print(caption)
    print(all_matches[position]['NormalisedScore'])

Total number of matches: 
2269
Best score: 
-0.2568675308800029
Worst score: 
-6.350681801219633


In [3]:
from prettytable import PrettyTable

def show_match(match: dict) -> str:
    """Render a match as a multi-line, human-readable text report.

    The report starts with one ``key: value`` line per header field, followed
    by an ``Alignment:`` line and one borderless, left-aligned table per entry
    of ``match['Alignments']``. Tables are preceded by a ``Fragment_<i>`` line
    only when the match has more than one alignment.
    """
    header_fields = ('Genome', 'BGC_variant_idx', 'NRP', 'NRP_variant_idx',
                     'NormalisedScore', 'Score')
    chunks = [f"{field}: {match[field]}\n" for field in header_fields]
    chunks.append("Alignment:\n")

    fragments = match['Alignments']
    label_fragments = len(fragments) > 1
    for idx, fragment in enumerate(fragments):
        if label_fragments:
            chunks.append(f'Fragment_{idx}\n')
        # Column names come from the first step of the fragment.
        table = PrettyTable(fragment[0].keys(), align='l', border=False)
        table.add_rows(step.values() for step in fragment)
        chunks.append(str(table) + '\n')

    return ''.join(chunks)

In [4]:
# Group matches by NRP
from collections import defaultdict
# Bucket every match under the value of its 'NRP' field.
nrp_matches = defaultdict(list)
for match in all_matches:
    nrp_matches[match['NRP']].append(match)

# An NRP with at least one multi-alignment match keeps all of its matches;
# every other NRP keeps only its single best-scoring match.
nrp_noniterative_matches = {}
nrp_iterative_matches = {}
for nrp, matches in nrp_matches.items():
    if all(len(candidate['Alignments']) <= 1 for candidate in matches):
        nrp_noniterative_matches[nrp] = max(matches,
                                            key=lambda candidate: candidate['NormalisedScore'])
    else:
        nrp_iterative_matches[nrp] = matches

# The BGC count is taken from the part of each 'NRP' value before the first '.'.
bgc_ids = {entry['NRP'].split('.')[0] for entry in all_matches}
print('Total number of BGCs', len(bgc_ids))
print('Total number of NRPs', len(nrp_matches.keys()), sep='\n')
print('Total number of non-iterative NRPs: ', len(nrp_noniterative_matches), sep='\n')
print('Total number of matches for iterative NRPs: ',
      sum(map(len, nrp_iterative_matches.values())), sep='\n')

Total number of BGCs 291
Total number of NRPs
514
Total number of non-iterative NRPs: 
461
Total number of matches for iterative NRPs: 
637


In [5]:
def num_skips(match):
    """Count the alignment steps, over all alignments of ``match``, whose
    'Alignment_step' is 'NRP_MONOMER_SKIP' or 'BGC_MODULE_SKIP'."""
    skip_kinds = ('NRP_MONOMER_SKIP', 'BGC_MODULE_SKIP')
    total = 0
    for fragment in match['Alignments']:
        for step in fragment:
            if step['Alignment_step'] in skip_kinds:
                total += 1
    return total

def num_monomers(match):
    """Count the alignment steps, over all alignments of ``match``, whose
    'Alignment_step' is 'NRP_MONOMER_SKIP' or 'MATCH'."""
    monomer_kinds = ('NRP_MONOMER_SKIP', 'MATCH')
    kinds = (step['Alignment_step']
             for fragment in match['Alignments']
             for step in fragment)
    return sum(1 for kind in kinds if kind in monomer_kinds)

def num_modules(match):
    """Count the alignment steps, over all alignments of ``match``, whose
    'Alignment_step' is 'MATCH' or 'BGC_MODULE_SKIP'."""
    module_kinds = ('MATCH', 'BGC_MODULE_SKIP')
    module_steps = [step
                    for fragment in match['Alignments']
                    for step in fragment
                    if step['Alignment_step'] in module_kinds]
    return len(module_steps)
    
# Keep a non-iterative match only when its normalised score lies strictly
# between -4 and -1, it has at most 4 skips, and it covers at least 3 monomers
# and at least 3 modules. All four checks are evaluated for every match.
filtered_noniterative_matches = []
for match in nrp_noniterative_matches.values():
    score_in_window = -4 < match['NormalisedScore'] < -1
    few_enough_skips = num_skips(match) <= 4
    enough_monomers = num_monomers(match) >= 3
    enough_modules = num_modules(match) >= 3
    if score_in_window and few_enough_skips and enough_monomers and enough_modules:
        filtered_noniterative_matches.append(match)

print('Total number of filtered non-iterative matches: ',
      len(filtered_noniterative_matches), sep='\n')

# Write one text report per filtered non-iterative match, each followed by two
# newlines.
with open('/home/ilianolhin/git/nerpa2/training/filtered_noniterative_matches.txt', 'w') as out:
    out.writelines(show_match(kept) + '\n\n'
                   for kept in filtered_noniterative_matches)

# Same report format for every match of every iterative NRP, in dict order.
with open('/home/ilianolhin/git/nerpa2/training/iterative_matches.txt', 'w') as out:
    out.writelines(show_match(candidate) + '\n\n'
                   for group in nrp_iterative_matches.values()
                   for candidate in group)


Total number of filtered non-iterative matches: 
273


In [6]:
from typing import Tuple, List
def get_monomers(match) -> List[Tuple[str, str]]:
    """Collect the NRP monomers that take part in a match.

    Walks every alignment in ``match['Alignments']`` in order and, for each
    step whose 'Alignment_step' is 'MATCH' or 'NRP_MONOMER_SKIP', records the
    pair ``(step['rBAN_name'], step['NRP_chirality'])``. Other steps are
    ignored and their remaining keys are never read.

    Returns:
        The list of ``(rBAN_name, NRP_chirality)`` pairs in alignment order.
        The return annotation previously declared 3-tuples although only two
        fields are ever stored.
    """
    monomer_steps = ('MATCH', 'NRP_MONOMER_SKIP')
    return [(step['rBAN_name'], step['NRP_chirality'])
            for alignment in match['Alignments']
            for step in alignment
            if step['Alignment_step'] in monomer_steps]


filtered_iterative_matches = []
for nrp, matches in nrp_iterative_matches.items():
    monomers_per_match = [get_monomers(candidate) for candidate in matches]
    # Per the original author's note, matches with the maximal monomer count
    # are permutations of fragments; only the best-scoring one may survive.
    max_len = max(len(sequence) for sequence in monomers_per_match)
    best_score_max_len = max(candidate['NormalisedScore']
                             for candidate, sequence in zip(matches, monomers_per_match)
                             if len(sequence) == max_len)
    seen_variants = []
    for match, monomers in zip(matches, monomers_per_match):
        score = match['NormalisedScore']
        # Every rejection criterion is evaluated before they are combined.
        score_too_low = score < -4
        too_many_skips = num_skips(match) >= min(5, num_modules(match), num_monomers(match))
        worse_permutation = len(monomers) == max_len and score < best_score_max_len
        duplicate_variant = monomers in seen_variants
        multi_fragment = len(match['Alignments']) > 1
        if (score_too_low or too_many_skips or worse_permutation
                or duplicate_variant or multi_fragment):
            continue
        seen_variants.append(monomers)
        filtered_iterative_matches.append(match)

print('Total number of filtered iterative matches: ',
      len(filtered_iterative_matches), sep='\n')

# Write one text report per filtered iterative match, each followed by two
# newlines.
with open('/home/ilianolhin/git/nerpa2/training/filtered_iterative_matches.txt', 'w') as out:
    out.writelines(show_match(kept) + '\n\n'
                   for kept in filtered_iterative_matches)

Total number of filtered iterative matches: 
72
