In [61]:
# Load the already-approved NRP variant ids (one id per line).
with open('approved_nrp_ids.txt', 'r') as f:
    # Strip surrounding whitespace and skip empty lines, so that a stray blank
    # line in the file does not end up as an '' entry in the id list.
    approved_nrp_ids = [line.strip() for line in f if line.strip()]


In [62]:
# Load the matches inspection table (tab-separated).
import pandas as pd
matches_table = pd.read_csv('matches_inspection_table.tsv', sep='\t')
# Normalise the 'NRP variant' column: keep only the part before the first '#'.
# The vectorised .str accessor keeps missing values as NaN, whereas calling
# x.split(...) inside .apply raises AttributeError on a NaN cell.
matches_table['NRP variant'] = matches_table['NRP variant'].str.split('#').str[0]


In [63]:
# Find the NRP variants that carry an accepting verdict in the matches table
# but are not yet listed among the approved ids.
accepted_verdicts = ['good', 'to be corrected', 'was corrected']
has_accepted_verdict = matches_table['Verdict'].isin(accepted_verdicts)
matches_table_good = matches_table[has_accepted_verdict]
nrp_variants_good = set(matches_table_good['NRP variant'].unique())
print(f'NRP variants with verdicts good, to be corrected, was corrected ({len(nrp_variants_good)} in total):')
nrp_variants_approved = set(approved_nrp_ids)
print(f'Approved NRP variants ({len(nrp_variants_approved)} in total):')
# Set difference, sorted so the listing is stable and easy to scan.
nrp_variants_to_be_approved = sorted(nrp_variants_good.difference(nrp_variants_approved))
print(f'NRP variants to be approved ({len(nrp_variants_to_be_approved)} in total):')
print(nrp_variants_to_be_approved)

NRP variants with verdicts good, to be corrected, was corrected (238 in total):
Approved NRP variants (127 in total):
NRP variants to be approved (111 in total):
['BGC0000289.0', 'BGC0000296.0', 'BGC0000307.0', 'BGC0000313.0', 'BGC0000314.0', 'BGC0000322.0', 'BGC0000349.0', 'BGC0000354.1', 'BGC0000354.3', 'BGC0000359.0', 'BGC0000359.1', 'BGC0000368.0', 'BGC0000374.4', 'BGC0000383.0', 'BGC0000385.0', 'BGC0000386.1', 'BGC0000386.3', 'BGC0000399.0', 'BGC0000399.1', 'BGC0000416.0', 'BGC0000416.1', 'BGC0000423.0', 'BGC0000424.1', 'BGC0000434.4', 'BGC0000439.0', 'BGC0000452.0', 'BGC0000464.1', 'BGC0000985.1', 'BGC0000985.4', 'BGC0001035.0', 'BGC0001050.0', 'BGC0001095.0', 'BGC0001127.0', 'BGC0001130.1', 'BGC0001131.0', 'BGC0001132.0', 'BGC0001133.0', 'BGC0001135.0', 'BGC0001189.0', 'BGC0001214.4', 'BGC0001230.3', 'BGC0001230.6', 'BGC0001230.7', 'BGC0001240.1', 'BGC0001290.1', 'BGC0001312.0', 'BGC0001330.6', 'BGC0001344.8', 'BGC0001346.0', 'BGC0001346.1', 'BGC0001346.2', 'BGC0001402.0', 'BGC0

In [64]:
# Disabled one-off export: flip the guard to True to (re)write one path per
# line, each path being mibig_results_dir followed by a BGC id.
if False:
    from pathlib import Path
    mibig_results_dir = '/home/ilianolhin/programs/antismash7/mibig_results/'
    # The part of a variant id before its first '.' is used as the BGC id;
    # the set removes duplicates when several variants share one BGC.
    to_be_approved_bgc_ids = sorted({variant_id.partition('.')[0]
                                     for variant_id in nrp_variants_to_be_approved})
    as_paths = [f'{mibig_results_dir}{bgc_id}' for bgc_id in to_be_approved_bgc_ids]
    Path('to_be_approved_as_paths.txt').write_text('\n'.join(as_paths))

In [65]:
# Disabled one-off export: flip the guard to True to bundle the JSON records
# of all not-yet-approved NRP variants into a single JSON file.
if False:
    import json
    # Imported locally so this cell is self-contained: it must not rely on
    # another (possibly disabled or not-yet-run) cell having imported Path.
    from pathlib import Path
    rban_records_dir = Path('/home/ilianolhin/git/nerpa2_old/test_data/rban_records/')
    rban_records = []
    for nrp_id in nrp_variants_to_be_approved:
        # One '<nrp_id>.json' file per variant is read from rban_records_dir.
        with open(rban_records_dir / f'{nrp_id}.json', 'r') as f:
            rban_records.append(json.load(f))

    with open('to_be_approved_rban_records.json', 'w') as f:
        json.dump(rban_records, f, indent=4)


In [66]:
from src.matching.match_type import Match
from typing import List
from pathlib import Path

def load_matches_from_txt(matches_txt: Path) -> List[Match]:
    """Parse all matches stored in a text file.

    The file content is split on blank lines (two consecutive newline
    characters); every chunk that is not whitespace-only is parsed with
    ``Match.from_str``.

    Args:
        matches_txt: path to the text file holding the serialized matches.

    Returns:
        The parsed matches, in the order they appear in the file.

    Raises:
        Exception: whatever ``Match.from_str`` raises for a chunk; the
            offending chunk and the error are printed before re-raising.
    """
    match_strs = [match_str
                  for match_str in matches_txt.read_text().split('\n\n')
                  if match_str.strip()]
    matches = []
    for match_str in match_strs:
        try:
            matches.append(Match.from_str(match_str))
        except Exception as e:
            print(f'Error while parsing match:\n{match_str}\n{e}')
            # Bare raise keeps the original traceback intact.
            raise
    # Return the list built above instead of parsing every chunk a second time.
    return matches

# Load the newly approved matches and report which of the variants still
# awaiting approval have no match among them.
new_approved_matches = load_matches_from_txt(Path('/home/ilianolhin/git/nerpa2/matches_inspection/new_approved_matches.txt'))
print(f'New approved matches ({len(new_approved_matches)} in total)')
# Collect the matched ids once, so each membership check below is a set lookup
# rather than a scan over all matches.
matched_nrp_ids = {match.nrp_variant_id.nrp_id for match in new_approved_matches}
missing_nrp_ids = [nrp_id
                   for nrp_id in nrp_variants_to_be_approved
                   if nrp_id not in matched_nrp_ids]
print(f'Missing NRP ids ({len(missing_nrp_ids)} in total):')
print('\n'.join(missing_nrp_ids))

New approved matches (203 in total)
Missing NRP ids (35 in total):
BGC0000386.1
BGC0000386.3
BGC0000416.0
BGC0000416.1
BGC0000985.1
BGC0001035.0
BGC0001050.0
BGC0001130.1
BGC0001132.0
BGC0001133.0
BGC0001135.0
BGC0001214.4
BGC0001290.1
BGC0001330.6
BGC0001344.8
BGC0001346.0
BGC0001346.1
BGC0001346.2
BGC0001402.0
BGC0001402.1
BGC0001402.2
BGC0001716.0
BGC0002172.2
BGC0002259.0
BGC0002259.1
BGC0002370.2
BGC0002408.0
BGC0002413.0
BGC0002484.0
BGC0002503.6
BGC0002532.0
BGC0002548.2
BGC0002569.2
BGC0002581.1
BGC0002581.4
