In [1]:
import os
import sys
import re
import csv
import collections
import shutil
import random
import multiprocessing
import json
import pandas as pd 
import numpy as np
from tqdm import tqdm

# CSpell Output Parsing

This script parses the outputs from CSpell run on the pre-processed version of the CSpell dataset.
The pre-processed dataset can be obtained by running `scripts/preprocessing_cspell.ipynb` and will be placed under `data/cspell/TrainSet_brat/examples/` and `data/cspell/TestSet_reconciled/examples/`.
After installing [CSpell](https://lsg3.nlm.nih.gov/LexSysGroup/Projects/cSpell/current/web/index.html) (version `cSpell.2018.1.1.0`), run CSpell on each example to get the detailed (debug) output with a command like this:  
```
$ CSpell -t -i:123.txt -o:123_out.txt -d > 123_debug.txt
```
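To run the command on a whole dataset split (421 train or 611 test examples), run a loop like the following from inside the corresponding `examples/` directory (the range below covers the train set; use `{0..610}` for the test set):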
```
$ for i in {0..420}; do CSpell -d -i:${i}.txt -t -o:${i}_out.txt > ${i}_debug.txt; done
```

In [2]:
# Root directory of the CSpell corpus (relative path).
dataset_dir = '../data/cspell/'

# One sub-directory per dataset split.
train_dir, test_dir = (
    os.path.join(dataset_dir, split_name)
    for split_name in ('TrainSet_brat', 'TestSet_reconciled')
)

# The per-example files of each split are read from its `examples/` folder.
train_example_dir, test_example_dir = (
    os.path.join(split_dir, 'examples')
    for split_dir in (train_dir, test_dir)
)

- Read dataset

In [3]:
# Load the index of training examples. Each entry is a list that unpacks
# below into (ex_id, note_id, start, end, typo, correction).
with open(os.path.join(train_example_dir, 'example_stat.json')) as fd:
    train_typo_examples = json.load(fd)

for example in train_typo_examples:
    ex_id, note_id, start, end, typo, correction = example
    # The text of an example is stored in a file named after its id.
    with open(os.path.join(train_example_dir, f'{ex_id}.txt')) as fd:
        text = fd.read()
    # Sanity check: the recorded [start, end) span must cover exactly the typo.
    assert text[start:end] == typo
    # Attach the full text as a 7th field (mutates the loaded entry in place).
    example.append(text)

print(f'Train {len(train_typo_examples)} examples')

Train 421 examples


In [4]:
# Load the index of test examples. Each entry is a list that unpacks
# below into (ex_id, note_id, start, end, typo, correction).
with open(os.path.join(test_example_dir, 'example_stat.json')) as fd:
    test_typo_examples = json.load(fd)

for example in test_typo_examples:
    ex_id, note_id, start, end, typo, correction = example
    # The text of an example is stored in a file named after its id.
    with open(os.path.join(test_example_dir, f'{ex_id}.txt')) as fd:
        text = fd.read()
    # Sanity check: the recorded [start, end) span must cover exactly the typo.
    assert text[start:end] == typo
    # Attach the full text as a 7th field (mutates the loaded entry in place).
    example.append(text)
    
print(f'Test {len(test_typo_examples)} examples')

Test 611 examples


- Check the CSpell inputs/outputs

In [5]:
# Check output files for train set: every example index needs its input
# text, the CSpell output file and the CSpell debug log.
for i in range(421):
    for suffix in ('.txt', '_out.txt', '_debug.txt'):
        assert os.path.exists(os.path.join(train_dir, 'examples', f'{i}{suffix}'))

In [6]:
# Check output files for test set: every example index needs its input
# text, the CSpell output file and the CSpell debug log.
for i in range(611):
    for suffix in ('.txt', '_out.txt', '_debug.txt'):
        assert os.path.exists(os.path.join(test_dir, 'examples', f'{i}{suffix}'))

## Parsing CSpell outputs

### Detection / correction parsing processes

- 1. NonDictionary Process
    - Detection & Correction: Performed at the same time (detection == correction)
- 2. NonWord-Merge Process
    - Detection: Non-dictionary & non-exception word (the word itself AND the rmEndPuncStr)
    - Correction: At the end of the process (skip Context, CScore, and FScore lines) 
- 3-4. NonWord-Split & 1To1 Process
    - Detection: Non-dictionary & non-exception word (the word itself AND the rmEndPuncStr)
    - Correction: After detection (skip Context and Score lines)
- 5. RealWord-Merge Process
    - Detection: Dictionary & non-exception (the word itself OR the rmEndPuncStr)
    - Correction: At the end of the process (skip Context, CScore, and FScore lines)
- 6. RealWord-Split Process
    - Detection: Dictionary & non-exception & word2vec exists & min frequency (the word itself OR the rmEndPuncStr)
    - Correction: After detection (skip Context and CScore lines)
- 7. RealWord-1To1 Process
    - Detection: Dictionary word & Non-exception word & word2vec exists & min frequency (the word itself OR the rmEndPuncStr)
    - Correction: After detection (skip Context and Score lines)

(rmEndPuncStr: the word with its ending punctuation removed)

Sample output (`404_debug.txt`)

```
- Dictionary Files: [data/Dictionary/check.dic].
--- Add Dictionary: [/usr1/juyongk/workspace/cSpell2018/data/Dictionary/check.dic].
- Dictionary Files: [data/Dictionary/check.dic].
--- Add Dictionary: [/usr1/juyongk/workspace/cSpell2018/data/Dictionary/check.dic].
- Dictionary Files: [data/Dictionary/split.dic].
--- Add Dictionary: [/usr1/juyongk/workspace/cSpell2018/data/Dictionary/split.dic].
====== SpellApi.Process( ), funcMode: 10, rankMode: 5 ======
====== 1. NonDictionary Process ======
--- inText: [ClinicalTrials.gov - Suggestion]
====== 2. NonWord-Merge Process ======
--- inText: [ClinicalTrials.gov - Suggestion]
- Detect: [ClinicalTrials.gov|false (true & false)]
- Detect: [-|false (true & false)]
- Detect: [Suggestion|false (false & true)]
====== 3-4. NonWord-Split & 1To1 Process ======
--- inText: [ClinicalTrials.gov - Suggestion]
- Detect: [ClinicalTrials.gov|false (true & false)]
- Detect: [-|false (false & false)]
- Detect: [Suggestion|false (false & true)]
====== 5. RealWord-Merge Process ======
--- inText: [ClinicalTrials.gov - Suggestion]
- Detect: [ClinicalTrials.gov|false (false & false)]
- Detect: [-|false (false & false)]
- Detect: [Suggestion|true (true & true)]
====== 6. RealWord-Split Process ======
--- inText: [ClinicalTrials.gov - Suggestion]
- Detect: [ClinicalTrials.gov|false (false & false & true & false & false)]
- Detect: [ClinicalTrials.gov|false (false & false & true & false & false)]
- Detect: [-|false (true & false & false & false & true)]
- Detect: [-|false (true & false & false & false & true)]
- Detect: [Suggestion|false (true & true & true & true & false)]
- Detect: [Suggestion|false (true & true & true & true & false)]
====== 7. RealWord-1To1 Process ======
--- inText: [ClinicalTrials.gov - Suggestion]
- Detect: [ClinicalTrials.gov|false (false & false & true & false & false)]
- Detect: [ClinicalTrials.gov|false (false & false & true & false & false)]
- Detect: [-|false (true & false & false & false & true)]
- Detect: [-|false (true & false & false & false & true)]
- Detect: [Suggestion|false (true & true & true & true & false)]
- Detect: [Suggestion|false (true & true & true & true & false)]
====== SpellApi.Process( ), funcMode: 10, rankMode: 5 ======
...
```

- The main parser method

In [7]:
# Parse debug outputs

# Banner that opens the debug block of each processed input line.
line_header = "====== SpellApi.Process( ), funcMode: 10, rankMode: 5 ======"
# Banners of the six correction steps, in the order read_debug_text expects them.
step_headers = [
    "====== 1. NonDictionary Process ======",
    "====== 2. NonWord-Merge Process ======",
    "====== 3-4. NonWord-Split & 1To1 Process ======",
    "====== 5. RealWord-Merge Process ======",
    "====== 6. RealWord-Split Process ======",
    "====== 7. RealWord-1To1 Process ======"
]
# Prefix of the line that echoes the text fed into a step.
intext_header = "--- inText: ["
 
def read_debug_text(lines):
    """Parse a complete CSpell debug log.

    Args:
        lines: The lines of the debug log. They are compared verbatim with
            the header constants, so they must not carry trailing newlines.

    Returns:
        A list with one entry per processed input line. Each entry is a list
        of ``(token, detected, correction, handler)`` tuples aligned with the
        tokens of the first step's input, holding the results of all six
        steps folded together by ``merge_step_outputs``.

    Raises:
        AssertionError: If an expected banner line is not found.
    """
    parsed_lines = []
    cursor = 6  # parsing starts at index 6; presumably skips the dictionary preamble
    while cursor < len(lines):  # one iteration per processed input line
        # Banner of the input line
        assert lines[cursor] == line_header
        cursor += 1

        # Parser of each step, in the same order as `step_headers`
        step_parsers = (
            parse_nondictionary_process,
            parse_nonword_merge_process,
            parse_nonword_split_1to1_process,
            parse_realword_merge_process,
            parse_realword_split_process,
            parse_realword_1to1_process,
        )

        per_step = []
        for step_no, parse_step in enumerate(step_parsers):
            # Banner of the step
            assert lines[cursor] == step_headers[step_no]
            cursor += 1

            # Each parser returns the next line index and its token results
            cursor, step_tokens = parse_step(lines, cursor)
            per_step.append(step_tokens)

        # Fold the results backwards so that they end up expressed in terms
        # of the tokens of the very first step.
        merged = per_step[-1]
        for earlier in reversed(range(len(per_step) - 1)):
            merged = merge_step_outputs(per_step[earlier], per_step[earlier + 1], merged)
        parsed_lines.append(merged)
    return parsed_lines

- Basic line parsing functions

In [8]:
# Split a line of input text as the CSpell processes (tokens + spaces)
def split_line_tokens(txt):
    """Tokenize `txt` into words and single whitespace characters.

    Every whitespace character becomes a token of its own; each maximal run
    of non-whitespace characters becomes one token. Concatenating the tokens
    gives back `txt`.
    """
    pieces = []
    word_start = None  # index where the current word began, None between words
    for pos, ch in enumerate(txt):
        if ch.isspace():
            if word_start is not None:
                pieces.append(txt[word_start:pos])
                word_start = None
            pieces.append(ch)
        elif word_start is None:
            word_start = pos
    if word_start is not None:
        pieces.append(txt[word_start:])
    return pieces

# - Detect: [beebe,|true (true & true)]
# - Detect: [[CONTACT]|false (false & true & true & false & false)]
def parse_detect_line(line):
    """Parse a ``- Detect: [token|flag (sub & sub & ...)]`` line.

    Args:
        line: A debug line starting with ``'- Detect: ['`` and ending with ``']'``.

    Returns:
        ``(in_token, flag, subflags)``: the token with surrounding whitespace
        stripped, the overall detection flag as a bool, and the list of the
        individual condition flags (bools) given in the parentheses.

    Raises:
        AssertionError: If the line does not start with ``'- Detect: ['``.
        ValueError: If the line has no ``'|'`` or no space after the flag.
    """
    assert line.startswith('- Detect: [')
    # Split on the LAST '|': the token itself may contain '|', the flags never do.
    in_token, detect_str = line[11:-1].rsplit('|', 1)
    sep = detect_str.index(' ')
    flag = detect_str[:sep] == 'true'
    # detect_str looks like 'true (true & false)': skip the flag, the space and
    # the '(' and drop the final ')'. The former slice `[:sep+2:-1]` walked the
    # string backwards, so every sub-flag came out False.
    subflags = [t == 'true' for t in detect_str[sep + 2:-1].split(' & ')]

    in_token = in_token.strip()
    return in_token, flag, subflags

# - Correct: [dianosed|diagnosed|NW|NonWordCorrector-1To1]
def parse_correct_line(line):
    """Split a ``- Correct: [typo|correction|type|handler]`` line into its four fields.

    Returns:
        ``(in_token, correction, typo_type, typo_handler)`` as strings.
    """
    prefix = '- Correct: ['
    assert line.startswith(prefix)
    payload = line[len(prefix):-1]  # text between the brackets
    in_token, correction, typo_type, typo_handler = payload.split('|')
    return in_token, correction, typo_type, typo_handler

# - Context: [is, in, there, is] 
def parse_context_line(line):
    """Return the comma-separated entries of a ``- Context: [...]`` line as a list."""
    prefix = '- Context: ['
    assert line.startswith(prefix)
    inner = line[len(prefix):-1]  # text between the brackets
    return inner.split(', ')

# - CScore: understand|-0.09475540
# - CScore: under stand|-0.10908410
def parse_cscore_line(line):
    """Parse a ``- CScore: text|score`` line into ``(text, score)`` with a float score."""
    prefix = '- CScore: '
    assert line.startswith(prefix)
    candidate, raw_score = line[len(prefix):].split('|')
    return candidate, float(raw_score)

# - FScore: nonformulary|0.00000000
# - FScore: non-formulary|0.00000000
def parse_fscore_line(line):
    """Parse a ``- FScore: text|score`` line into ``(text, score)`` with a float score."""
    prefix = '- FScore: '
    assert line.startswith(prefix)
    candidate, raw_score = line[len(prefix):].split('|')
    return candidate, float(raw_score)

- Parsing method for each misspelling correction process

In [9]:
# Type: ND
# Handler: InformalExpHandler / XmlHtmlHandler / LeadingPuncSplitter / EndingPuncSplitter / LeadingDigitSplitter / EndingDigitSplitter
# - Correct: [im|i'm|ND|InformalExpHandler]
# - Correct: [colitis,"|colitis, "|ND|EndingPuncSplitter]
# - Correct: [<[CONTACT]&gt;.|<[CONTACT]>.|ND|XmlHtmlHandler]

def parse_nondictionary_process(lines, line_idx, verbose=False):
    """Parse the '1. NonDictionary Process' section of a CSpell debug log.

    The section consists of the ``--- inText: [...]`` line followed by zero
    or more ``- Correct`` lines; in this step detection and correction are
    the same event.

    Args:
        lines: Debug log lines; ``lines[line_idx]`` must be the inText line.
        line_idx: Index of the inText line.
        verbose: Print a trace of the parsing.

    Returns:
        ``(line_idx, outputs)``: the index of the first line that is not a
        ``- Correct`` line, and one
        ``(token, detected, correction, (typo_type, typo_handler))`` tuple per
        input token (whitespace tokens included). Untouched tokens carry
        ``(token, False, None, None)``.
    """
    # The section opens by echoing its input: '--- inText: [~~~]'
    assert lines[line_idx].startswith("--- inText: [")
    input_tokens = split_line_tokens(lines[line_idx][13:-1])
    if verbose: print(input_tokens)
    line_idx += 1

    outputs = []
    next_unseen = 0  # first input token not yet copied into `outputs`
    while lines[line_idx].startswith('- Correct'):
        typo, correction, typo_type, typo_handler = parse_correct_line(lines[line_idx])
        line_idx += 1

        if outputs and (typo in split_line_tokens(outputs[-1][2])):
            # The typo is a token of the previous correction: that correction
            # is being corrected again, so rewrite it in place.
            token, detected, prev_correction, prev_handler = outputs[-1]
            outputs[-1] = (token, detected, prev_correction.replace(typo, correction, 1), prev_handler)
            if verbose: print(f'\tFound in-correction typo [{input_tokens[typo_token_idx]}] Mark [{correction}] and {(typo_type, typo_handler)}')
            continue

        # Otherwise the typo must be one of the input tokens not consumed yet.
        assert typo in input_tokens[next_unseen:]
        typo_token_idx = input_tokens.index(typo, next_unseen)
        outputs.extend((skipped, False, None, None) for skipped in input_tokens[next_unseen:typo_token_idx])
        outputs.append((input_tokens[typo_token_idx], True, correction, (typo_type, typo_handler)))
        if verbose: print(f'\tFound [{input_tokens[typo_token_idx]}] Mark [{correction}] and {(typo_type, typo_handler)}')
        next_unseen = typo_token_idx + 1

    # Everything after the last corrected token is unchanged.
    outputs.extend((rest, False, None, None) for rest in input_tokens[next_unseen:])

    return line_idx, outputs

In [10]:
# Type: NW_MERGE
# Handler: 'MergeCorrector ({word})'
# - Correct: [non formulary|nonformulary|NW_MERGE|MergeCorrector (non)]
# - Correct: [pub med/|pubmed/|NW_MERGE|MergeCorrector (pub)]

def parse_nonword_merge_process(lines, line_idx, verbose=False):
    """Parse the '2. NonWord-Merge Process' section of a CSpell debug log.

    A token is detected only when two consecutive ``- Detect`` lines are both
    true (the second one is only read when the first is true). A detected
    token and every following input token up to the token named by the next
    ``- Detect`` line are marked as detected, except trailing whitespace.
    ``- Correct`` lines are then matched against the tokens collected so far.

    Args:
        lines: Debug log lines; ``lines[line_idx]`` must be the inText line.
        line_idx: Index of the inText line.
        verbose: Print a trace of the parsing.

    Returns:
        ``(line_idx, outputs)``: the index of the next line starting with
        ``'======'``, and one
        ``(token, detected, correction, (typo_type, typo_handler))`` tuple per
        input token (whitespace tokens included).

    Raises:
        ValueError: If a ``- Correct`` line cannot be matched to detected
            tokens, or if a line is neither a Detect nor a Correct line.
    """
    # First line: '--- inText: [~~~]'
    assert lines[line_idx].startswith("--- inText: [")
    input_tokens = split_line_tokens(lines[line_idx][13:-1])
    if verbose: print(input_tokens)
    line_idx += 1

    outputs = []  # token, detected, correction, (typo_type, typo_handler)
    last_correct_output_idx = 0  # corrections are searched from here onwards
    token_idx = 0
    while token_idx < len(input_tokens) and input_tokens[token_idx].isspace():
        outputs.append((input_tokens[token_idx], False, None, None))
        token_idx += 1

    while not lines[line_idx].startswith('=' * 6):
        if verbose: print(f'Parsing line: {lines[line_idx]}')
        if lines[line_idx].startswith('- Detect: ['):  # All word detection
            # First detection
            in_token, nonWordMergeFlag, subflags = parse_detect_line(lines[line_idx])
            while input_tokens[token_idx] != in_token:  # Skipped tokens (maybe space) -> not detected
                outputs.append((input_tokens[token_idx], False, None, None))
                token_idx += 1

            nonWordMergeFlag2 = False
            if nonWordMergeFlag:
                if verbose: print("\tFirst detection")
                # Detected -> Test second detection (same word without punctuation)
                line_idx += 1
                in_token2, nonWordMergeFlag2, subflags2 = parse_detect_line(lines[line_idx])

            if nonWordMergeFlag2:
                # Detected as a NW_MERGE -> Mark as detected from this to the next detection
                outputs.append((in_token, True, None, None))
                if verbose: print(f'\tSecond detection\n\tDetected: [{in_token}]')
                token_idx += 1
                line_idx += 1
                while lines[line_idx].startswith(('- Context: [', '- CScore: ', '- FScore: ')):
                    if verbose: print(f'\tSkip line: {lines[line_idx]}')
                    line_idx += 1
                if lines[line_idx].startswith('- Detect: ['):
                    in_token_next, _, _ = parse_detect_line(lines[line_idx])
                else:
                    in_token_next = 'Not a valid token'  # Add all remaining tokens
                while token_idx < len(input_tokens) and input_tokens[token_idx] != in_token_next:
                    outputs.append((input_tokens[token_idx], True, None, None))
                    if verbose: print(f'\tDetected: [{input_tokens[token_idx]}]')
                    token_idx += 1
                for i in range(len(outputs) - 1, -1, -1):  # Mark not detected on the last spaces
                    if not outputs[i][0].isspace():
                        break
                    outputs[i] = (outputs[i][0], False, None, None)
                    if verbose: print(f'\tWithdraw detect: [{outputs[i][0]}]')
            else:
                # Not detected (at the first or at the second detection)
                outputs.append((in_token, False, None, None))
                token_idx += 1
                line_idx += 1
        elif lines[line_idx].startswith('- Correct: ['):  # Corrections at the last
            in_tokens, correction, typo_type, typo_handler = parse_correct_line(lines[line_idx])
            in_tokens = split_line_tokens(in_tokens)
            span = len(in_tokens)
            found = False
            # Only try start positions where the whole window fits inside
            # `outputs`; this avoids indexing past the end of the list.
            for i in range(last_correct_output_idx, len(outputs) - span + 1):
                window = outputs[i:i + span]
                same_tokens = all(out[0] == tok for out, tok in zip(window, in_tokens))
                # The window must also contain at least one detected token.
                if same_tokens and any(out[1] for out in window):
                    if verbose: print(f'\tFound {in_tokens} @ {i}. Mark [{correction}] and {(typo_type, typo_handler)}')
                    for j in range(span):
                        outputs[i + j] = (outputs[i + j][0], True, correction, (typo_type, typo_handler))
                    last_correct_output_idx = i + span
                    found = True
                    break
            if not found:
                raise ValueError(f'Not found {in_tokens}')
            line_idx += 1
        else:
            # Without this the loop would never advance past the line.
            raise ValueError(f'Unexpected line: {lines[line_idx]}')

    # Add remaining tokens (perhaps space tokens)
    for token in input_tokens[token_idx:]:
        outputs.append((token, False, None, None))

    return line_idx, outputs

In [11]:
# Type: NW
# Handler: 'NonWordCorrector-1To1', 'NonWordCorrector-Split'
# - Correct: [bevon|bevan|NW|NonWordCorrector-1To1]
# - Correct: [anovate.|an ovate.|NW|NonWordCorrector-Split]

def parse_nonword_split_1to1_process(lines, line_idx, verbose=False):
    """Parse the '3-4. NonWord-Split & 1To1 Process' section of a CSpell debug log.

    A token is detected only when two consecutive ``- Detect`` lines are both
    true (the second one is only read when the first is true). After a
    detection, ``- Context: [`` and ``- Score: `` lines are skipped and an
    optional ``- Correct`` line supplies the correction.

    Args:
        lines: Debug log lines; ``lines[line_idx]`` must be the inText line.
        line_idx: Index of the inText line.
        verbose: Print a trace of the parsing.

    Returns:
        ``(line_idx, outputs)``: the index of the next line starting with
        ``'======'``, and one
        ``(token, detected, correction, (typo_type, typo_handler))`` tuple per
        input token (whitespace tokens included).

    Raises:
        ValueError: If a line that should be a Detect line is not one.
    """
    # The section opens by echoing its input: '--- inText: [~~~]'
    assert lines[line_idx].startswith("--- inText: [")
    input_tokens = split_line_tokens(lines[line_idx][13:-1])
    if verbose: print(input_tokens)
    line_idx += 1

    outputs = []
    token_idx = 0
    # Leading whitespace never gets a detect line.
    while token_idx < len(input_tokens) and input_tokens[token_idx].isspace():
        outputs.append((input_tokens[token_idx], False, None, None))
        token_idx += 1

    while not lines[line_idx].startswith('=' * 6):
        if verbose: print(f'Parsing line: {lines[line_idx]}')
        if not lines[line_idx].startswith('- Detect: ['):
            raise ValueError(f'Not a detect line: {lines[line_idx]}')

        # First detection (the word itself)
        in_token, first_hit, _ = parse_detect_line(lines[line_idx])
        # Input tokens without a detect line (e.g. spaces) are not detected.
        while input_tokens[token_idx] != in_token:
            outputs.append((input_tokens[token_idx], False, None, None))
            token_idx += 1
        token_idx += 1  # the current token is emitted by exactly one branch below
        line_idx += 1

        if not first_hit:
            outputs.append((in_token, False, None, None))
            continue

        # Second detection (same word without ending punctuation)
        if verbose: print("\tFirst detection")
        _, second_hit, _ = parse_detect_line(lines[line_idx])
        line_idx += 1
        if not second_hit:
            outputs.append((in_token, False, None, None))
            continue

        # Detected as a NW: skip context and score lines, then look for the
        # correction line, which may be absent.
        if verbose: print(f'\tSecond detection\n\tDetected: [{in_token}]')
        while lines[line_idx].startswith(('- Context: [', '- Score: ')):
            if verbose: print(f'\tSkip line: {lines[line_idx]}')
            line_idx += 1
        if lines[line_idx].startswith('- Correct: ['):
            _, correction, typo_type, typo_handler = parse_correct_line(lines[line_idx])
            outputs.append((in_token, True, correction, (typo_type, typo_handler)))
            line_idx += 1
        else:
            # Detected but not corrected
            outputs.append((in_token, True, None, None))

    # Whatever is left (perhaps trailing spaces) is not detected.
    outputs.extend((rest, False, None, None) for rest in input_tokens[token_idx:])

    return line_idx, outputs

In [12]:
# Type: RW_MERGE
# Handler: 'MergeCorrector ({word})'
# - Correct: [multi vitamin|multivitamin|RW_MERGE|MergeCorrector (multi)]
# - Correct: [can not|cannot|RW_MERGE|MergeCorrector (can)]

def parse_realword_merge_process(lines, line_idx, verbose=False):
    """Parse the '5. RealWord-Merge Process' section of a CSpell debug log.

    Each token has a single ``- Detect`` line. A detected token and every
    following input token up to the token named by the next ``- Detect`` line
    are marked as detected, except trailing whitespace. ``- Correct`` lines
    are then matched against the tokens collected so far. Tokens that are
    detected but matched by no ``- Correct`` line stay marked as detected.

    Args:
        lines: Debug log lines; ``lines[line_idx]`` must be the inText line.
        line_idx: Index of the inText line.
        verbose: Print a trace of the parsing.

    Returns:
        ``(line_idx, outputs)``: the index of the next line starting with
        ``'======'``, and one
        ``(token, detected, correction, (typo_type, typo_handler))`` tuple per
        input token (whitespace tokens included).

    Raises:
        ValueError: If a ``- Correct`` line cannot be matched to detected
            tokens, or if a line is neither a Detect nor a Correct line.
    """
    # First line: '--- inText: [~~~]'
    assert lines[line_idx].startswith("--- inText: [")
    input_tokens = split_line_tokens(lines[line_idx][13:-1])
    if verbose: print(input_tokens)
    line_idx += 1

    outputs = []  # token, detected, correction, (typo_type, typo_handler)
    last_correct_output_idx = 0  # corrections are searched from here onwards
    token_idx = 0
    while token_idx < len(input_tokens) and input_tokens[token_idx].isspace():
        outputs.append((input_tokens[token_idx], False, None, None))
        token_idx += 1

    while not lines[line_idx].startswith('=' * 6):
        if verbose: print(f'Parsing line: {lines[line_idx]}')
        if lines[line_idx].startswith('- Detect: ['):  # All word detection
            # Detection
            in_token, realWordMergeFlag, subflags = parse_detect_line(lines[line_idx])
            while input_tokens[token_idx] != in_token:  # Skipped tokens (maybe space) -> not detected
                outputs.append((input_tokens[token_idx], False, None, None))
                token_idx += 1

            if realWordMergeFlag:
                # Detected as a RW_MERGE -> Mark as detected from this to the next detection
                outputs.append((in_token, True, None, None))
                if verbose: print(f"\tDetected: [{in_token}]")
                line_idx += 1
                token_idx += 1

                # Skip results
                while lines[line_idx].startswith(('- Context: [', '- CScore: ', '- FScore: ')):
                    if verbose: print(f'\tSkip line: {lines[line_idx]}')
                    line_idx += 1

                # To which token we need to include
                if lines[line_idx].startswith('- Detect: ['):
                    in_token_next, _, _ = parse_detect_line(lines[line_idx])
                else:
                    in_token_next = 'Not a valid token'  # Add all remaining tokens

                while token_idx < len(input_tokens) and input_tokens[token_idx] != in_token_next:
                    outputs.append((input_tokens[token_idx], True, None, None))
                    if verbose: print(f'\tDetected: [{input_tokens[token_idx]}]')
                    token_idx += 1
                for i in range(len(outputs) - 1, -1, -1):  # Mark not detected on the last spaces
                    if not outputs[i][0].isspace():
                        break
                    outputs[i] = (outputs[i][0], False, None, None)
                    if verbose: print(f'\tWithdraw detect: [{outputs[i][0]}]')
            else:
                # Not detected
                outputs.append((in_token, False, None, None))
                token_idx += 1
                line_idx += 1
        elif lines[line_idx].startswith('- Correct: ['):  # Corrections at the last
            in_tokens, correction, typo_type, typo_handler = parse_correct_line(lines[line_idx])
            in_tokens = split_line_tokens(in_tokens)
            span = len(in_tokens)
            found = False
            # Only try start positions where the whole window fits inside
            # `outputs`; this avoids indexing past the end of the list.
            for i in range(last_correct_output_idx, len(outputs) - span + 1):
                window = outputs[i:i + span]
                same_tokens = all(out[0] == tok for out, tok in zip(window, in_tokens))
                # The window must also contain at least one detected token.
                if same_tokens and any(out[1] for out in window):
                    if verbose: print(f'\tFound {in_tokens} @ {i}. Mark [{correction}] and {(typo_type, typo_handler)}')
                    for j in range(span):
                        outputs[i + j] = (outputs[i + j][0], True, correction, (typo_type, typo_handler))
                    last_correct_output_idx = i + span
                    found = True
                    break
            if not found:
                raise ValueError(f'Not found {in_tokens}')
            line_idx += 1
        else:
            # Without this the loop would never advance past the line.
            raise ValueError(f'Unexpected line: {lines[line_idx]}')

    # Add remaining tokens (perhaps space tokens)
    for token in input_tokens[token_idx:]:
        outputs.append((token, False, None, None))

    return line_idx, outputs

In [13]:
# Type: RW
# Handler: RealWordSplitCorrector
# - Correct: [without|with out|RW|RealWordSplitCorrector]
# - Correct: [anywhere|any where|RW|RealWordSplitCorrector]
# - Correct: [another.|an other.|RW|RealWordSplitCorrector]

def parse_realword_split_process(lines, line_idx, verbose=False):
    """Parse the '6. RealWord-Split Process' section of a CSpell debug log.

    A token is detected when its first ``- Detect`` line is true or,
    failing that, when the ``- Detect`` line right after it is true. After a
    detection, ``- Context: [`` and ``- CScore: `` lines are skipped and an
    optional ``- Correct`` line supplies the correction.

    Args:
        lines: Debug log lines; ``lines[line_idx]`` must be the inText line.
        line_idx: Index of the inText line.
        verbose: Print a trace of the parsing.

    Returns:
        ``(line_idx, outputs)``: the index of the next line starting with
        ``'======'``, and one
        ``(token, detected, correction, (typo_type, typo_handler))`` tuple per
        input token (whitespace tokens included).

    Raises:
        ValueError: If a line that should be a Detect line is not one.
    """
    # The section opens by echoing its input: '--- inText: [~~~]'
    assert lines[line_idx].startswith("--- inText: [")
    input_tokens = split_line_tokens(lines[line_idx][13:-1])
    if verbose: print(input_tokens)
    line_idx += 1

    outputs = []
    token_idx = 0
    # Leading whitespace never gets a detect line.
    while token_idx < len(input_tokens) and input_tokens[token_idx].isspace():
        outputs.append((input_tokens[token_idx], False, None, None))
        token_idx += 1

    while not lines[line_idx].startswith('=' * 6):
        if verbose: print(f'Parsing line: {lines[line_idx]}')
        if not lines[line_idx].startswith('- Detect: ['):
            raise ValueError(f'Not a detect line: {lines[line_idx]}')

        # First detection (the word itself)
        in_token, detected, _ = parse_detect_line(lines[line_idx])
        # Input tokens without a detect line (e.g. spaces) are not detected.
        while input_tokens[token_idx] != in_token:
            outputs.append((input_tokens[token_idx], False, None, None))
            token_idx += 1
        token_idx += 1  # the current token is emitted by exactly one branch below

        # Not detected yet: the following detect line gets a second say.
        if not detected:
            line_idx += 1
            _, detected, _ = parse_detect_line(lines[line_idx])
        line_idx += 1  # step past the last detect line that was read

        if not detected:
            outputs.append((in_token, False, None, None))
            continue

        if verbose: print(f'\tDetected: [{in_token}]')
        # Skip results
        while lines[line_idx].startswith(('- Context: [', '- CScore: ')):
            if verbose: print(f'\tSkip line: {lines[line_idx]}')
            line_idx += 1
        if lines[line_idx].startswith('- Correct: ['):
            _, correction, typo_type, typo_handler = parse_correct_line(lines[line_idx])
            outputs.append((in_token, True, correction, (typo_type, typo_handler)))
            line_idx += 1
        else:
            # Detected but not corrected
            outputs.append((in_token, True, None, None))

    # Whatever is left (perhaps trailing spaces) is not detected.
    outputs.extend((rest, False, None, None) for rest in input_tokens[token_idx:])

    return line_idx, outputs

In [14]:
# Type: RW
# Handler: RealWord1To1Corrector
# - Correct: [wait|what|RW|RealWord1To1Corrector]
# - Correct: [effect|affect|RW|RealWord1To1Corrector]

def parse_realword_1to1_process(lines, line_idx, verbose=False):
    """Parse the '7. RealWord-1To1 Process' section of a CSpell debug log.

    A token is detected when its first ``- Detect`` line is true or,
    failing that, when the ``- Detect`` line right after it is true. After a
    detection, ``- Context: [`` and ``- Score: `` lines are skipped and an
    optional ``- Correct`` line supplies the correction. Unlike the other
    step parsers, this one also stops cleanly at the end of `lines`.

    Args:
        lines: Debug log lines; ``lines[line_idx]`` must be the inText line.
        line_idx: Index of the inText line.
        verbose: Print a trace of the parsing.

    Returns:
        ``(line_idx, outputs)``: the index of the next line starting with
        ``'======'`` (or ``len(lines)`` when the log ends first), and one
        ``(token, detected, correction, (typo_type, typo_handler))`` tuple per
        input token (whitespace tokens included).

    Raises:
        ValueError: If a line that should be a Detect line is not one.
    """
    # The section opens by echoing its input: '--- inText: [~~~]'
    assert lines[line_idx].startswith("--- inText: [")
    input_tokens = split_line_tokens(lines[line_idx][13:-1])
    if verbose: print(input_tokens)
    line_idx += 1

    outputs = []
    token_idx = 0
    # Leading whitespace never gets a detect line.
    while token_idx < len(input_tokens) and input_tokens[token_idx].isspace():
        outputs.append((input_tokens[token_idx], False, None, None))
        token_idx += 1

    while line_idx < len(lines) and not lines[line_idx].startswith('=' * 6):
        if verbose: print(f'Parsing line: {lines[line_idx]}')
        if not lines[line_idx].startswith('- Detect: ['):
            raise ValueError(f'Not a detect line: {lines[line_idx]}')

        # First detection (the word itself)
        in_token, detected, _ = parse_detect_line(lines[line_idx])
        # Input tokens without a detect line (e.g. spaces) are not detected.
        while input_tokens[token_idx] != in_token:
            outputs.append((input_tokens[token_idx], False, None, None))
            token_idx += 1
        token_idx += 1  # the current token is emitted by exactly one branch below

        # Not detected yet: the following detect line gets a second say.
        if not detected:
            line_idx += 1
            _, detected, _ = parse_detect_line(lines[line_idx])
        line_idx += 1  # step past the last detect line that was read

        if not detected:
            outputs.append((in_token, False, None, None))
            continue

        if verbose: print(f'\tDetected: [{in_token}]')
        # Skip results
        while line_idx < len(lines) and lines[line_idx].startswith(('- Context: [', '- Score: ')):
            if verbose: print(f'\tSkip line: {lines[line_idx]}')
            line_idx += 1
        if line_idx < len(lines) and lines[line_idx].startswith('- Correct: ['):
            _, correction, typo_type, typo_handler = parse_correct_line(lines[line_idx])
            outputs.append((in_token, True, correction, (typo_type, typo_handler)))
            line_idx += 1
        else:
            # Detected but not corrected
            outputs.append((in_token, True, None, None))

    # Whatever is left (perhaps trailing spaces) is not detected.
    outputs.extend((rest, False, None, None) for rest in input_tokens[token_idx:])

    return line_idx, outputs


In [15]:
def validate_step_outputss(step_outputss):
    """Check that the outputs of consecutive steps chain together.

    For every pair of consecutive steps, the tokens of the later step must be
    the tokens of the earlier step with each correction applied: an
    uncorrected token must reappear unchanged, and a run of consecutive
    entries sharing the same (detected, correction, handler) must be followed
    by the tokens of that correction.

    Args:
        step_outputss: List of per-step output lists of
            ``(token, detected, correction, handler)`` tuples.

    Raises:
        AssertionError: If a token does not line up, or if one of the two
            lists is not fully consumed.
    """
    for outputs1, outputs2 in zip(step_outputss, step_outputss[1:]):
        pos1, pos2 = 0, 0
        len1, len2 = len(outputs1), len(outputs2)
        while pos1 < len1 and pos2 < len2:
            word1, detected1, fix1, handler1 = outputs1[pos1]
            pos1 += 1
            if not fix1:
                # No correction: the same token must come next in the later step
                word2, _, _, _ = outputs2[pos2]
                assert word1 == word2
                pos2 += 1
                continue

            # A merged correction is repeated on each token it covers: skip them all
            group_key = (detected1, fix1, handler1)
            while pos1 < len1 and outputs1[pos1][1:] == group_key:
                pos1 += 1

            # The later step must continue with the tokens of the correction
            for piece in split_line_tokens(fix1):
                word2, _, _, _ = outputs2[pos2]
                assert piece == word2
                pos2 += 1

        assert pos1 == len1 and pos2 == len2  # Validation did not reach end

In [16]:
def merge_step_outputs(outputs1, outputs2, outputs_ds):
    """Project results aligned with ``outputs2`` back onto the tokens of ``outputs1``.

    ``outputs1`` and ``outputs2`` are lists of
    ``(in_token, detected, correction, handler)`` tuples. They are walked in
    parallel: an uncorrected entry of ``outputs1`` maps to exactly one entry of
    ``outputs2``; a corrected entry (plus the directly following entries with
    the same detected flag, correction and handler) maps to as many entries of
    ``outputs2`` as ``split_line_tokens(correction)`` yields.

    ``outputs_ds`` is indexed with the same positions as ``outputs2``. For each
    entry of ``outputs1`` the result contains:

    * the original input token,
    * ``detected`` of the entry OR-ed with the detected flags of the mapped
      ``outputs_ds`` entries,
    * the first non-empty correction among the mapped ``outputs_ds`` entries,
      falling back to the entry's own correction,
    * the first non-empty handler among the mapped ``outputs_ds`` entries,
      falling back to the entry's own handler.

    Returns:
        list of ``(in_token, detected, correction, handler)`` tuples, one per
        entry of ``outputs1``.

    Raises:
        AssertionError: if the two token sequences do not line up or are not
            consumed completely.
    """
    outputs_result = []

    idx1, idx2 = 0, 0
    while idx1 < len(outputs1) and idx2 < len(outputs2):
        idx1_start, idx2_start = idx1, idx2

        # Advance idx1 / idx2 over one aligned group of entries
        inword1, detect1, correction1, handler1 = outputs1[idx1]
        if not correction1:  # No correction: one-to-one mapping
            inword2 = outputs2[idx2][0]
            assert inword1 == inword2, f'token mismatch: {inword1!r} != {inword2!r}'
            idx1 += 1
            idx2 += 1
        else:
            # Skip the following entries that belong to the same merged correction
            idx1 += 1
            while idx1 < len(outputs1):
                _, detect11, correction11, handler11 = outputs1[idx1]
                if detect11 != detect1 or correction1 != correction11 or handler11 != handler1:
                    break
                idx1 += 1

            # Split the correction & compare it with the tokens of outputs2
            for token1 in split_line_tokens(correction1):
                inword2 = outputs2[idx2][0]
                assert token1 == inword2, f'token mismatch: {token1!r} != {inword2!r}'
                idx2 += 1

        # The mapped entries are the same for every entry of the group,
        # so aggregate them once instead of once per entry.
        mapped = [outputs_ds[j] for j in range(idx2_start, idx2)]
        detected_ds = any(entry[1] for entry in mapped)
        corrections_ds = [entry[2] for entry in mapped if entry[2]]
        handlers_ds = [entry[3] for entry in mapped if entry[3]]

        for i in range(idx1_start, idx1):
            inword, detected, correction, handler = outputs1[i]
            outputs_result.append((
                inword,
                detected or detected_ds,
                corrections_ds[0] if corrections_ds else correction,
                handlers_ds[0] if handlers_ds else handler,
            ))
    assert idx1 == len(outputs1) and idx2 == len(outputs2), 'Merge did not reach end'
    return outputs_result

- Example text / CSpell output / parsed CSpell output

In [17]:
# Dataset example: [ex_id, note_id, start, end, typo, correction, text] of the second training example
train_typo_examples[1]

[1,
 '1',
 87,
 95,
 'dianosed',
 'diagnosed',
 "My mom is 82 years old suffering from anxiety and depression for the last 10 years was dianosed early onset dementia 3 years ago. Do y'all have a office in Greensboro NC? Can you recommend someone. she has serotonin syndrome and nothing helps her.\n\n"]

In [18]:
# The original text (1.txt)
# Use a context manager so the file handle is closed right after reading.
with open('../data/cspell/TrainSet_brat/1.txt') as fd:
    print(fd.read())

My mom is 82 years old suffering from anixity and depression for the last 10 years was dianosed early on set deminita 3 years ago. Do yall have a office in Greensboro NC? Can you recommend someone. she has seretona syndrome and nonething helps her.




In [19]:
# The original annotation (1.ann)
# Use a context manager so the file handle is closed right after reading.
with open('../data/cspell/TrainSet_brat/1.ann') as fd:
    print(fd.read())

A1	Important-Focus T2
A2	Important-Focus T6
#1	AnnotatorNotes T1	diagnosed
#2	AnnotatorNotes T2	anxiety
#3	AnnotatorNotes T3	onset
#4	AnnotatorNotes T4	dementia
#5	AnnotatorNotes T5	y'all
#6	AnnotatorNotes T6	serotonin
#7	AnnotatorNotes T7	nothing
T1	Misspelling 87 95	dianosed
T2	Misspelling 38 45	anixity
T3	ToMerge 102 108	on set
T4	Misspelling 109 117	deminita
T5	Punctuation 134 138	yall
T6	Misspelling 206 214	seretona
T7	Misspelling 228 237	nonething
T8	RealWord 102 108	on set



In [20]:
# Input to CSpell
# Debug files are sorted by their leading integer; [1] picks the second one.
fname = sorted(filter(lambda x: x.endswith('_debug.txt'), os.listdir(train_example_dir)),
               key=lambda x: int(x[:x.index('_')]))[1]
# Dropping '_debug' from the name gives the example text file that was fed to CSpell.
with open(os.path.join(train_example_dir, fname.replace('_debug', '')), 'r') as fd:
    print(fd.read())

My mom is 82 years old suffering from anxiety and depression for the last 10 years was dianosed early onset dementia 3 years ago. Do y'all have a office in Greensboro NC? Can you recommend someone. she has serotonin syndrome and nothing helps her.




In [21]:
# Parsed CSpell output
# Debug files are sorted by their leading integer; [1] picks the second one.
fname = sorted(filter(lambda x: x.endswith('_debug.txt'), os.listdir(train_example_dir)),
               key=lambda x: int(x[:x.index('_')]))[1]
# Read inside a context manager so the file handle is closed before parsing.
with open(os.path.join(train_example_dir, fname), 'r') as fd:
    debug_lines = fd.read().splitlines()
read_debug_text(debug_lines)

[[('My', True, None, None),
  (' ', False, None, None),
  ('mom', True, None, None),
  (' ', False, None, None),
  ('is', True, None, None),
  (' ', False, None, None),
  ('82', False, None, None),
  (' ', False, None, None),
  ('years', False, None, None),
  (' ', False, None, None),
  ('old', True, None, None),
  (' ', False, None, None),
  ('suffering', True, None, None),
  (' ', False, None, None),
  ('from', True, None, None),
  (' ', False, None, None),
  ('anxiety', True, None, None),
  (' ', False, None, None),
  ('and', True, None, None),
  (' ', False, None, None),
  ('depression', True, None, None),
  (' ', False, None, None),
  ('for', True, None, None),
  (' ', False, None, None),
  ('the', True, None, None),
  (' ', False, None, None),
  ('last', True, None, None),
  (' ', False, None, None),
  ('10', False, None, None),
  (' ', False, None, None),
  ('years', False, None, None),
  (' ', False, None, None),
  ('was', True, None, None),
  (' ', True, None, None),
  ('diano

## Compute the Accuracy of CSpell

In [22]:
# Placeholder spliced in place of the typo so that its line and token can be found
temp_token = '$@$@$@'

def parse_and_location(text, typo, start, end):
    """Locate the typo at ``text[start:end]`` in terms of line and token index.

    The typo span is replaced by ``temp_token``; the masked text is split on
    newlines and the first line containing the placeholder is tokenized with
    ``split_line_tokens``.

    Returns:
        (lines, line_idx, token_idx): ``lines`` is the *unmasked* text split on
        newlines, ``line_idx`` the index of the line holding the typo and
        ``token_idx`` the index of the first token of that line containing the
        placeholder.

    NOTE(review): presumably ``text`` never contains ``temp_token`` itself;
    otherwise an earlier occurrence would be picked up - confirm.
    """
    assert text[start:end] == typo
    masked_text = ''.join((text[:start], temp_token, text[end:]))

    for line_idx, masked_line in enumerate(masked_text.split('\n')):
        if temp_token in masked_line:
            hits = [pos for pos, tok in enumerate(split_line_tokens(masked_line)) if temp_token in tok]
            token_idx = hits[0]
            break

    return text.split('\n'), line_idx, token_idx

In [23]:
def cspell_results(typo_examples, dataset_root, verbose=False):
    """Look up what CSpell did with the typo token of every example.

    Args:
        typo_examples: list of
            ``[ex_id, note_id, start, end, typo, correction, text]`` entries.
        dataset_root: directory that holds the ``{ex_id}_debug.txt`` files.
        verbose: if true, print one summary line per example.

    Returns:
        list of ``(ex_id, before_token, detected, after_token, correct)``
        tuples. ``before_token`` is the token containing the typo,
        ``after_token`` is CSpell's correction for it (``None`` if there is
        none) and ``correct`` tells whether it equals, ignoring case,
        ``before_token`` with the typo replaced by the gold correction.

    Raises:
        AssertionError: if the parsed debug output does not have the expected
            token at the typo position.
    """
    results = []  # (ex_id, before_token, detected, after_token, correct)
    for example in typo_examples:
        ex_id, note_id, start, end, typo, correction, text = example

        # Debug files are named after the example id (like the `{ex_id}.txt`
        # inputs), not after the position in `typo_examples`, so subsets or
        # reordered lists still read the matching file.
        with open(os.path.join(dataset_root, f'{ex_id}_debug.txt'), 'r') as fd:
            debug_lines = [(line[:-1] if line.endswith('\n') else line) for line in fd.readlines()]
        debug_parse_results = read_debug_text(debug_lines)

        lines, line_idx, token_idx = parse_and_location(text, typo, start, end)
        before_token = split_line_tokens(lines[line_idx])[token_idx]
        # The token may carry attached punctuation, so only the typo part is swapped
        answer_token = before_token.replace(typo, correction)
        typo_output = debug_parse_results[line_idx][token_idx]

        assert typo_output[0] == before_token
        detected = typo_output[1]
        after_token = typo_output[2]
        correct = after_token is not None and answer_token.lower() == after_token.lower()

        results.append((ex_id, before_token, detected, after_token, correct))

        if verbose:
            # "A" marks examples whose typo and correction are purely alphabetic
            alpha_check = all(c.isalpha() for c in typo) and all(c.isalpha() for c in correction)
            print(f'example {ex_id:4d} ({"A" if alpha_check else " "}): {typo:13s} -> {correction:13s}/ {before_token:15s} ({"D"if detected else " "})-> {after_token if after_token else "(No correction)":15s} ({"O" if correct else "X"})')
    return results

In [24]:
def cspell_result_counts(typo_examples, cspell_results):
    """Tally the per-example CSpell results.

    Args:
        typo_examples: list of 7-field example entries.
        cspell_results: list of
            ``(ex_id, before_token, detected, after_token, correct)`` tuples,
            one per example.

    Returns:
        (total_cnt, detected_cnt, changed_cnt, correct_cnt): number of
        examples, of detected typos, of results with a correction
        (``after_token`` is not ``None``) and of correct corrections.
    """
    assert len(typo_examples) == len(cspell_results)

    detected_flags, changed_flags, correct_flags = [], [], []
    for example, result in zip(typo_examples, cspell_results):
        # Unpacking doubles as a shape check: 7 fields per example, 5 per result
        _ex_id, _note_id, _start, _end, _typo, _gold, _text = example
        _res_id, _before, was_detected, after_token, was_correct = result

        detected_flags.append(was_detected)
        changed_flags.append(after_token is not None)
        correct_flags.append(was_correct)

    return len(detected_flags), sum(detected_flags), sum(changed_flags), sum(correct_flags)

In [25]:
train_results = cspell_results(train_typo_examples, train_example_dir, verbose=True)

example    0 (A): anixity       -> anxiety      / anixity         (D)-> (No correction) (X)
example    1 (A): dianosed      -> diagnosed    / dianosed        (D)-> diagnosed       (O)
example    2 (A): deminita      -> dementia     / deminita        (D)-> dementia        (O)
example    3 (A): seretona      -> serotonin    / seretona        (D)-> serenoa         (X)
example    4 (A): nonething     -> nothing      / nonething       (D)-> none thing      (X)
example    5 (A): shown         -> showed       / shown           (D)-> (No correction) (X)
example    6 (A): required      -> require      / required        (D)-> (No correction) (X)
example    7 (A): knoledge      -> knowledge    / knoledge        (D)-> knowledge       (O)
example    8 (A): Truely        -> Truly        / Truely          (D)-> truly           (O)
example    9 (A): depress       -> depressed    / depress         (D)-> (No correction) (X)
example   10 (A): kg            -> of           / kg              ( )-> (No corr

example   99 (A): surgin        -> surgeon      / surgin          (D)-> surgeon         (O)
example  100 (A): diaretic      -> diuretic     / diaretic)       (D)-> diabetic)       (X)
example  101 (A): bearly        -> barely       / bearly          (D)-> early           (X)
example  102 (A): damagel       -> damage       / damagel         (D)-> damage          (O)
example  103 (A): exdruating    -> excruciating / exdruating,     (D)-> (No correction) (X)
example  104 (A): ajd           -> and          / ajd             (D)-> (No correction) (X)
example  105 (A): ccryln        -> cry          / ccryln          (D)-> (No correction) (X)
example  106 (A): lidokaine     -> lidocaine    / lidokaine       (D)-> lidocaine       (O)
example  107 (A): pls           -> please       / pls             (D)-> please          (O)
example  108 (A): intercurse    -> intercourse  / intercurse      (D)-> intercourse     (O)
example  109 (A): protombin     -> prothrombin  / protombin       (D)-> prothomb

example  210 (A): Medlin        -> Medline      / Medlin          (D)-> medline         (O)
example  211 (A): dianosed      -> diagnosed    / dianosed        (D)-> diagnosed       (O)
example  212 (A): u             -> you          / u               (D)-> you             (O)
example  213 (A): u             -> you          / u               (D)-> you             (O)
example  214 (A): u             -> you          / u               (D)-> you             (O)
example  215 (A): wiht          -> with         / wiht            (D)-> wit             (X)
example  216 (A): prolly        -> probably     / (prolly         (D)-> (prolia         (X)
example  217 (A): Cronic        -> Chronic      / Cronic          (D)-> chronic         (O)
example  218 (A): Lymphocytice  -> lymphocytic  / Lymphocytice    (D)-> lymphocytic     (O)
example  219 (A): MedlinkPlus   -> MedlinePlus  / MedlinkPlus     (D)-> medlineplus     (O)
example  220 (A): wich          -> which        / wich            (D)-> with    

example  350 (A): amd           -> and          / amd             (D)-> (No correction) (X)
example  351 (A): MedicinePlus  -> MedlinePlus  / MedicinePlus    (D)-> medicine plus   (X)
example  352 (A): heridity      -> heredity     / heridity        (D)-> heredity        (O)
example  353 (A): susseptable   -> susceptible  / susseptable.    (D)-> susceptible.    (O)
example  354 (A): vertbral      -> vertebral    / vertbral        (D)-> vertebral       (O)
example  355 (A): u             -> you          / u               (D)-> you             (O)
example  356 (A): plz           -> please       / her.plz         (D)-> her. please     (X)
example  357 (A): u             -> you          / u               (D)-> you             (O)
example  358 (A): woried        -> worried      / woried          (D)-> worried         (O)
example  359 (A): describeing   -> describing   / describeing     (D)-> describing      (O)
example  360 (A): espicaly      -> especially   / espicaly        (D)-> (No corr

In [26]:
# Train-set summary. All three ratios use the number of correct corrections as
# numerator; the denominators are all examples, the examples whose typo token
# was detected, and the examples for which CSpell produced a correction.
total_cnt, detected_cnt, changed_cnt, correct_cnt = cspell_result_counts(train_typo_examples, train_results)

print('Train set result')
print(f'Total accuracy   : {correct_cnt}/{total_cnt} = {correct_cnt/total_cnt:.6f}')
print(f'Detected accuracy: {correct_cnt}/{detected_cnt} = {correct_cnt/detected_cnt:.6f}')
print(f'Changed accuracy : {correct_cnt}/{changed_cnt} = {correct_cnt/changed_cnt:.6f}')

Train set result
Total accuracy   : 248/421 = 0.589074
Detected accuracy: 248/409 = 0.606357
Changed accuracy : 248/295 = 0.840678


In [27]:
test_results = cspell_results(test_typo_examples, test_example_dir, verbose=True)

example    0 (A): faty          -> fatty        / faty            (D)-> fatty           (O)
example    1 (A): leaver        -> liver        / leaver&high     (D)-> leaver & high   (X)
example    2 (A): presure       -> pressure     / presure.        (D)-> pressure.       (O)
example    3 (A): mam           -> madam        / &mam            (D)-> & mam           (X)
example    4 (A): presur        -> pressure     / presur          (D)-> (No correction) (X)
example    5 (A): presure       -> pressure     / presure..       (D)-> pressure..      (O)
example    6 (A): neurostymulation -> neurostimulation/ neurostymulation (D)-> neurostimulation (O)
example    7 (A): cist          -> cyst         / cist            (D)-> list            (X)
example    8 (A): cists         -> cysts        / cists           (D)-> cysts           (O)
example    9 (A): prpostectomy  -> prostatectomy/ prpostectomy    (D)-> (No correction) (X)
example   10 (A): prostectomy   -> prostatectomy/ prostectomy     (D)-> 

example   96 (A): ob            -> of           / ob              (D)-> (No correction) (X)
example   97 (A): losses        -> loses        / losses          (D)-> (No correction) (X)
example   98 (A): consiousness  -> consciousness/ consiousness    (D)-> consciousness   (O)
example   99 (A): yhink         -> think        / yhink           (D)-> think           (O)
example  100 (A): tabs          -> tablets      / tabs            (D)-> (No correction) (X)
example  101 (A): doe           -> for          / doe             (D)-> (No correction) (X)
example  102 (A): prier         -> prior        / prier           (D)-> prior           (O)
example  103 (A): knw           -> know         / knw             (D)-> know            (O)
example  104 (A): on            -> one          / on              (D)-> (No correction) (X)
example  105 (A): takeing       -> taking       / takeing         (D)-> taking          (O)
example  106 (A): tuberchlosis  -> tuberculosis / tuberchlosis    (D)-> tubercul

example  192 (A): u             -> you          / u               (D)-> you             (O)
example  193 (A): exspect       -> expect       / exspect         (D)-> expect          (O)
example  194 (A): comeing       -> coming       / comeing         (D)-> coming          (O)
example  195 (A): haveing       -> having       / haveing         (D)-> having          (O)
example  196 (A): therpies      -> therapies    / therpies        (D)-> therapies       (O)
example  197 (A): cant          -> cannot       / cant            (D)-> can't           (X)
example  198 (A): dignosis      -> diagnosis    / dignosis        (D)-> diagnosis       (O)
example  199 (A): Gamapentin    -> Gabapentin   / Gamapentin      (D)-> gabapentin      (O)
example  200 (A): Providone     -> Povidone     / Providone       (D)-> povidone        (O)
example  201 (A): Fırst         -> First        / Fırst           (D)-> first           (O)
example  202 (A): whıch         -> which        / whıch           (D)-> which   

example  305 (A): assistents    -> assistants   / assistents      (D)-> assistants      (O)
example  306 (A): mutismo       -> mutism       / mutismo         (D)-> mutism          (O)
example  307 (A): selectivo     -> selective    / selectivo?      (D)-> selective?      (O)
example  308 (A): Thrombastinia -> Thrombasthenia/ Thrombastinia   (D)-> thrombasthenia  (O)
example  309 (A): unnessary     -> unnecessary  / unnessary       (D)-> unnecessary     (O)
example  310 (A): paraplagia    -> paraplegia   / paraplagia      (D)-> paraplegia      (O)
example  311 (A): pl            -> please       / pl              (D)-> please          (O)
example  312 (A): dignoised     -> diagnosed    / dignoised       (D)-> diagnosed       (O)
example  313 (A): akmost        -> almost       / akmost          (D)-> almost          (O)
example  314 (A): insideous     -> insidious    / insideous       (D)-> insides         (X)
example  315 (A): aeons         -> eons         / aeons?          ( )-> (No cor

example  394 (A): filed         -> filled       / filed           (D)-> (No correction) (X)
example  395 (A): effecting     -> affecting    / effecting       (D)-> (No correction) (X)
example  396 (A): than          -> then         / than            (D)-> (No correction) (X)
example  397 (A): Gen           -> gene         / Gen.            ( )-> (No correction) (X)
example  398 (A): Sry           -> Sorry        / Sry             (D)-> (No correction) (X)
example  399 (A): to            -> too          / to              (D)-> (No correction) (X)
example  400 (A): to            -> too          / to              (D)-> (No correction) (X)
example  401 (A): smothie       -> smoothie     / smothie         (D)-> smother         (X)
example  402 (A): redi          -> readi        / redi            (D)-> redicat         (X)
example  403 (A): trichoepithilioma -> trichoepithelioma/ trichoepithilioma (D)-> trichoepithelioma (O)
example  404 (A): Giloma        -> Glioma       / Giloma          (D

example  488 (A): are           -> area         / are             (D)-> (No correction) (X)
example  489 (A): rthe          -> the          / rthe            (D)-> the             (O)
example  490 (A): succesfuly    -> successfully / succesfuly      (D)-> successful      (X)
example  491 (A): m             -> am           / m               ( )-> (No correction) (X)
example  492 (A): hasitate      -> hesitate     / hasitate        (D)-> hesitate        (O)
example  493 (A): m             -> am           / m               ( )-> (No correction) (X)
example  494 (A): hasitate      -> hesitate     / hasitate        (D)-> hesitate        (O)
example  495 (A): tabacoo       -> tobacco      / tabacoo.        (D)-> tabac.          (X)
example  496 (A): Plz           -> please       / Plz             (D)-> please          (O)
example  497 (A): tabacoo       -> tobacco      / tabacoo.        (D)-> tabac.          (X)
example  498 (A): actives       -> active       / actives         (D)-> (No corr

example  580 (A): mum           -> mother       / mum             ( )-> (No correction) (X)
example  581 (A): abnormaility  -> abnormality  / abnormaility.   (D)-> abnormality.    (O)
example  582 (A): mum           -> mother       / mum,            ( )-> (No correction) (X)
example  583 (A): mum           -> mother       / mum             ( )-> (No correction) (X)
example  584 (A): of            -> if           / of              (D)-> (No correction) (X)
example  585 (A): diease        -> disease      / diease          (D)-> disease         (O)
example  586 (A): consistant    -> consistent   / consistant      (D)-> consistent      (O)
example  587 (A): rresearch     -> research     / rresearch       (D)-> research        (O)
example  588 (A): m             -> am           / m               ( )-> (No correction) (X)
example  589 (A): decease       -> disease      / decease         (D)-> (No correction) (X)
example  590 (A): cant          -> cannot       / cant            (D)-> can't   

In [28]:
# Test-set summary. All three ratios use the number of correct corrections as
# numerator; the denominators are all examples, the examples whose typo token
# was detected, and the examples for which CSpell produced a correction.
total_cnt, detected_cnt, changed_cnt, correct_cnt = cspell_result_counts(test_typo_examples, test_results)

print('Test set result')
print(f'Total accuracy   : {correct_cnt}/{total_cnt} = {correct_cnt/total_cnt:.6f}')
print(f'Detected accuracy: {correct_cnt}/{detected_cnt} = {correct_cnt/detected_cnt:.6f}')
print(f'Changed accuracy : {correct_cnt}/{changed_cnt} = {correct_cnt/changed_cnt:.6f}')

Test set result
Total accuracy   : 314/611 = 0.513912
Detected accuracy: 314/574 = 0.547038
Changed accuracy : 314/421 = 0.745843


## Generate Detection Masks

In [29]:
# One entry per train example, in the order of train_results:
# 1 if the typo token was detected, 0 otherwise.
cspell_train_mask = [int(detected) for _, _, detected, _, _ in train_results]

print(f'{sum(cspell_train_mask)}/{len(cspell_train_mask)}')
print(cspell_train_mask)

409/421
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [30]:
# One entry per test example, in the order of test_results:
# 1 if the typo token was detected, 0 otherwise.
cspell_test_mask = [int(detected) for _, _, detected, _, _ in test_results]

print(f'{sum(cspell_test_mask)}/{len(cspell_test_mask)}')
print(cspell_test_mask)

574/611
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [31]:
# Store both detection masks as JSON lists next to the dataset
for mask_fname, mask in (('cspell_train_mask.json', cspell_train_mask),
                         ('cspell_test_mask.json', cspell_test_mask)):
    with open(os.path.join(dataset_dir, mask_fname), 'w') as fd:
        json.dump(mask, fd)