## Tokenize a corpus of Tibetan Texts and separate it into sentences
This code will tokenize a collection of Tibetan texts. 

All text is tokenized with the botok tokenizer. In the output files, every sentence will be in a separate line and within each sentence, individual tokens will be separated be spaces. 
The tokenizer will leave frequently occurring words as a single token, so not every syllable boundary will lead to a token-split.

Note: With a 6-core / 12-thread CPU the tokenization of a large corpus will take several hours. If run on Google Collab, the performance will be extremely bad because this is purely a CPU-based workload and CPU resources on Collab are very limited.

This code is adapted from an example in the Esukhia bonltk repository https://github.com/Esukhia/bonltk/blob/master/nbs/prepare_dataset/esukhia_dergey_kangyur_and%20tengyur.ipynb. 

The main differences compared to the original code are:
* Update from fastai 1 to fastai 2
* The code was adjusted for running not only on Google Collab but also locally
* Processing of files utilizes all available CPU cores, not just a single thread
* The output folder keeps the same hierarchy as the input folder and existing output files are not re-processed to allow incremental processing
* The original code removed various types of brackets and parentheses which are retained here as separate tokens

All code below is licensed under Apache License 2.0 

## Install the botok Tibetan tokenizer library from esukhia if it is not present 

In [1]:
if 'google.colab' in str(get_ipython()):
  from google.colab import drive
  drive.mount('/content/drive')
  folder_prefix = '/content/drive/MyDrive/Colab Notebooks/'
  google_collab = True
else:
  folder_prefix = ''
  google_collab = False


if google_collab:
    !pip install botok
    !pip install 'tqdm>=4.59.0' # make sure to use a sufficiently new version of tqdm that supports multithreading

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Import libraries

In [2]:
# Libraries:

# Tibetan tokenizer / sentence segmenter
from botok import WordTokenizer, sentence_tokenizer

# file access
import os
from glob import glob
from pathlib import Path 

# progress bar and multiprocessing
from tqdm.notebook import tqdm
from tqdm.contrib.concurrent import process_map

# helper stuff 
import itertools
import traceback
import sys
from typing import List

## Path configuration 

In [3]:
source_paths = 'Corpora/**/*.txt' 
output_path  = 'data/tokenized/'


## Sentence-split and tokenize the entire corpus with the botok library

In [4]:
tokenizer = WordTokenizer()

Loading Trie... (2s.)


In [8]:
# adjust some special characters
def post_process(text:str, 
                 steps=[(' ', '@'), ('_', ' '), ('@', '_'), ('*', ''), ('(', ' ( '), (')', ' ) '), ('[', ' [ '), (']', ' ] '), ('{', ' { '), ('}', ' } '), ('<', ' < '), ('>', ' > ')])->str:
    for f, t in steps:
        text = text.replace(f, t)
    return text 

# split a string in sentences and tokenize each sentence
def get_sentences(text:str):
    text = text.replace('\n', ' ')
    tokens = tokenizer.tokenize(text)
    sentences_tok = sentence_tokenizer(tokens)
    sentences = [' '.join([post_process(tok.text) for tok in sent[1]]).strip() for sent in sentences_tok]
    return sentences


# tokenize a single file, split it into one sentence per line and write it into the output folder
# note: input_file must be a relative path starting at the current location
def process_file(path_prefix:str, input_file:str, out_path:str)->List[str]:    
    path = Path(  path_prefix ).joinpath(input_file)
    out_path = Path( path_prefix ).joinpath(out_path).joinpath(input_file)
    print(str(path) + '->' + str(out_path))
    #print(str(out_path))
    os.makedirs(out_path.parent, exist_ok=True)
        
    if(out_path.is_file()):
        # -> tokenized output file already exists. Read and return it
        existing_file=out_path.read_text()
        return existing_file.split('\n')
    else:
        # -> output does not exist yet. Tokenize the input and store the result

        # read input file
        text = path.read_text()        

    
        # segment text into sentences and tokens
        sentences = get_sentences(text)

        # post-processing
        #result = post_process('\n'.join(sentences))

        # save the sentences
        out_path.write_text('\n'.join(sentences))
        print(out_path)

        return sentences

    
# call wrapper: tokenize a single file but take all arguments as a single n-tuple to allow easy multithreaded calling
def process_file_(params):
    try: 
        path_prefix, input_file, out_path = params
        return process_file(path_prefix, input_file, out_path)
    except Exception as e:
        # catch exceptions so that the program does not die if there is a problematic input file and print the error information
        print(f'Error when processing file: {input_file}')
        print(e) 
        traceback.print_exception(*sys.exc_info())

    
# Parse into sentences and write one sentence per line into the output file.
# Furthermore, each sentence is tokenized so that tokens are split into strings
def convert_to_sentence_per_line(single_threaded:bool, path_prefix:str, input_file_pattern:str, out_path:str):
    n_sents = 0

    out_path = Path( path_prefix ).joinpath(out_path)
    #os.makedirs(out_path, exist_ok=True)
    input_pattern_full = str( Path(path_prefix).joinpath(input_file_pattern) )

    file_list = glob(input_pattern_full, recursive=True)

    file_list = [file_name.replace(path_prefix,'') for file_name in file_list] 
    call_aguments = [ ( path_prefix, file_name, out_path ) for file_name in file_list ]     
    if single_threaded:
      # do single-threaded processing
      for params in call_aguments: #tqdm(call_aguments):
        process_file_(params)
    else:
      # do parallel processing to speed things up.
      # each file is processed by a separate thread
      results = process_map(process_file_, call_aguments, chunksize=1) 
   
    # merge lines from all result files back together again after they were processed in individual threads:
    results = list( itertools.chain.from_iterable(results) )
    
    return results


In [9]:
single_threading = False # = google_collab
# trigger actual processing: sentence-split and tokenize the entire corpus
results = convert_to_sentence_per_line(single_threading, folder_prefix, source_paths, output_path)

print(f'[INFO] Corpus contains {len(results)} sentences.')


/content/drive/MyDrive/Colab Notebooks/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9199/UT4CZ5369-I1KG9199-0000.txt->/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9199/UT4CZ5369-I1KG9199-0000.txt
/content/drive/MyDrive/Colab Notebooks/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9202/UT4CZ5369-I1KG9202-0000.txt->/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9202/UT4CZ5369-I1KG9202-0000.txt
/content/drive/MyDrive/Colab Notebooks/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9219/UT4CZ5369-I1KG9219-0000.txt->/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9219/UT4CZ5369-I1KG9219-0000.txt
/content/drive/MyDrive/Colab Notebooks/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9174/UT4CZ5369-I1KG9174-0000.txt->/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9174/UT4CZ5369-I1KG9174-0

  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '
  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '


/content/drive/MyDrive/Colab Notebooks/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9195/UT4CZ5369-I1KG9195-0000.txt->/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9195/UT4CZ5369-I1KG9195-0000.txt


  0%|          | 0/10742 [00:00<?, ?it/s]

/content/drive/MyDrive/Colab Notebooks/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9165/UT4CZ5369-I1KG9165-0000.txt->/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9165/UT4CZ5369-I1KG9165-0000.txt


  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '


/content/drive/MyDrive/Colab Notebooks/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9147/UT4CZ5369-I1KG9147-0000.txt->/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9147/UT4CZ5369-I1KG9147-0000.txt


  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '
  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '
  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '
  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '
  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '
  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '
  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '
  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '
  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '
  f'Beware of unexpected results: input string contains the non-expanded char "{se

/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9165/UT4CZ5369-I1KG9165-0000.txt
/content/drive/MyDrive/Colab Notebooks/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9176/UT4CZ5369-I1KG9176-0000.txt->/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9176/UT4CZ5369-I1KG9176-0000.txt
/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9174/UT4CZ5369-I1KG9174-0000.txt
/content/drive/MyDrive/Colab Notebooks/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9204/UT4CZ5369-I1KG9204-0000.txt->/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9204/UT4CZ5369-I1KG9204-0000.txt


  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '
  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '
  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '
  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '
  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '
  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '
  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '
  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '
  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '


/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9202/UT4CZ5369-I1KG9202-0000.txt
/content/drive/MyDrive/Colab Notebooks/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9205/UT4CZ5369-I1KG9205-0000.txt->/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9205/UT4CZ5369-I1KG9205-0000.txt


  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '


/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9195/UT4CZ5369-I1KG9195-0000.txt
/content/drive/MyDrive/Colab Notebooks/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9160/UT4CZ5369-I1KG9160-0000.txt->/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9160/UT4CZ5369-I1KG9160-0000.txt
/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9154/UT4CZ5369-I1KG9154-0000.txt
/content/drive/MyDrive/Colab Notebooks/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9159/UT4CZ5369-I1KG9159-0000.txt->/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9159/UT4CZ5369-I1KG9159-0000.txt


  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '


/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9147/UT4CZ5369-I1KG9147-0000.txt
/content/drive/MyDrive/Colab Notebooks/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9184/UT4CZ5369-I1KG9184-0000.txt->/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9184/UT4CZ5369-I1KG9184-0000.txt


  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '
  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '
  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '
  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '
  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '
  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '
  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '


/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9176/UT4CZ5369-I1KG9176-0000.txt
/content/drive/MyDrive/Colab Notebooks/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9158/UT4CZ5369-I1KG9158-0000.txt->/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9158/UT4CZ5369-I1KG9158-0000.txt
/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9205/UT4CZ5369-I1KG9205-0000.txt
/content/drive/MyDrive/Colab Notebooks/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9140/UT4CZ5369-I1KG9140-0000.txt->/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9140/UT4CZ5369-I1KG9140-0000.txt
/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9204/UT4CZ5369-I1KG9204-0000.txt
/content/drive/MyDrive/Colab Notebooks/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9186/UT4CZ53

  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '


/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9158/UT4CZ5369-I1KG9158-0000.txt
/content/drive/MyDrive/Colab Notebooks/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9223/UT4CZ5369-I1KG9223-0000.txt->/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9223/UT4CZ5369-I1KG9223-0000.txt
/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9186/UT4CZ5369-I1KG9186-0000.txt
/content/drive/MyDrive/Colab Notebooks/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9132/UT4CZ5369-I1KG9132-0000.txt->/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9132/UT4CZ5369-I1KG9132-0000.txt
/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9169/UT4CZ5369-I1KG9169-0000.txt
/content/drive/MyDrive/Colab Notebooks/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9134/UT4CZ53

  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '


/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9135/UT4CZ5369-I1KG9135-0000.txt
/content/drive/MyDrive/Colab Notebooks/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9130/UT4CZ5369-I1KG9130-0000.txt->/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9130/UT4CZ5369-I1KG9130-0000.txt
/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9140/UT4CZ5369-I1KG9140-0000.txt
/content/drive/MyDrive/Colab Notebooks/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9131/UT4CZ5369-I1KG9131-0000.txt->/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9131/UT4CZ5369-I1KG9131-0000.txt
/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9191/UT4CZ5369-I1KG9191-0000.txt
/content/drive/MyDrive/Colab Notebooks/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9193/UT4CZ53

  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '
  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '
  f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", '


/content/drive/MyDrive/Colab Notebooks/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9133/UT4CZ5369-I1KG9133-0000.txt->/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9133/UT4CZ5369-I1KG9133-0000.txt
/content/drive/MyDrive/Colab Notebooks/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9156/UT4CZ5369-I1KG9156-0000.txt->/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9156/UT4CZ5369-I1KG9156-0000.txt
/content/drive/MyDrive/Colab Notebooks/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9188/UT4CZ5369-I1KG9188-0000.txt->/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9188/UT4CZ5369-I1KG9188-0000.txt
/content/drive/MyDrive/Colab Notebooks/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9213/UT4CZ5369-I1KG9213-0000.txt->/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9213/UT4CZ5369-I1KG9213-0

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



/content/drive/MyDrive/Colab Notebooks/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9180/UT4CZ5369-I1KG9180-0000.txt->/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9180/UT4CZ5369-I1KG9180-0000.txt
/content/drive/MyDrive/Colab Notebooks/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9200/UT4CZ5369-I1KG9200-0000.txt->/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9200/UT4CZ5369-I1KG9200-0000.txt
/content/drive/MyDrive/Colab Notebooks/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9203/UT4CZ5369-I1KG9203-0000.txt->/content/drive/MyDrive/Colab Notebooks/data/tokenized/Corpora/BDRC/eKangyur/UT4CZ5369/UT4CZ5369-I1KG9203/UT4CZ5369-I1KG9203-0000.txt
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/tqdm/contrib/concurrent.py", line 76, in _executor_map
    return list(tqdm_class(ex.map(fn, *iterables, **map_args), **kwargs))
  File "/usr/local/lib/python3.7/

TypeError: ignored

Process ForkProcess-4:
Traceback (most recent call last):
  File "/usr/lib/python3.7/concurrent/futures/process.py", line 239, in _process_worker
    r = call_item.fn(*call_item.args, **call_item.kwargs)
  File "/usr/lib/python3.7/concurrent/futures/process.py", line 198, in _process_chunk
    return [fn(*args) for args in chunk]
  File "/usr/lib/python3.7/concurrent/futures/process.py", line 198, in <listcomp>
    return [fn(*args) for args in chunk]
  File "<ipython-input-8-2607d24a84b4>", line 54, in process_file_
    return process_file(path_prefix, input_file, out_path)
  File "<ipython-input-8-2607d24a84b4>", line 38, in process_file
    sentences = get_sentences(text)
  File "<ipython-input-8-2607d24a84b4>", line 11, in get_sentences
    tokens = tokenizer.tokenize(text)
  File "/usr/local/lib/python3.7/dist-packages/botok/tokenizers/wordtokenizer.py", line 76, in tokenize
    string, ignore_chars=self.ignore_chars, space_as_punct=spaces_as_punct
  File "/usr/local/lib/python3.7