#### In this version, all of the punctuation predicted by the model is restored
#### but the hyphens and colons are replaced with commas
##### https://pypi.org/project/deepmultilingualpunctuation/

In [1]:
# Notebook cell: installs the deepmultilingualpunctuation package from PyPI.
# NOTE(review): a bare `pip install ...` line is not Python syntax; it presumably
# relies on the notebook's automagic handling. Use `%pip install ...` (or run it in a
# shell) if this cell fails.
pip install deepmultilingualpunctuation




##### load the necessary libraries
##### you need to install transformers for the model to work
##### see e.g. https://huggingface.co/docs/transformers/installation

In [2]:
import glob, os
from deepmultilingualpunctuation import PunctuationModel

# Build the punctuation model once, up front; the file loop further down
# reuses this single instance for every file it processes.
model = PunctuationModel()



##### For every .txt file in the current folder or any of its subfolders,
##### all punctuation predicted by the model will be restored (the rebuilt text is joined with single spaces).
##### This script overwrites the files in place, so make a backup first.

In [7]:
# Maps each label predicted by the model to the punctuation mark that is
# actually written after the word. '0' means "no punctuation"; colons and
# hyphens are deliberately replaced with commas.
PUNCTUATION_FOR_LABEL = {
    '0': '',
    '.': '.',
    '?': '?',
    ',': ',',
    ':': ',',
    '-': ',',
}


def rebuild_text(labeled_words):
    """Join the model's per-word predictions back into one string.

    Args:
        labeled_words: iterable of sequences whose first item is the word and
            whose second item is the predicted label ('0' for no
            punctuation); any further items (e.g. the probability) are
            ignored.

    Returns:
        The words separated by single spaces, each followed directly by its
        punctuation mark (if any). An empty input gives an empty string.
    """
    pieces = []
    for item in labeled_words:
        word, label = item[0], item[1]
        # A label that is not in the table keeps the bare word instead of
        # silently dropping it from the output.
        pieces.append(word + PUNCTUATION_FOR_LABEL.get(label, ''))
    return ' '.join(pieces)


for file in glob.iglob('**', recursive=True):
    # only regular .txt files are rewritten (skips directories, .py files, etc.)
    if not (os.path.isfile(file) and file.endswith('.txt')):
        continue

    # Explicit UTF-8 so non-ASCII text does not depend on the platform's
    # default encoding. The file is closed again before the model runs.
    with open(file, 'r', encoding='utf-8') as source:
        text = source.read()

    clean_text = model.preprocess(text)
    if not clean_text:
        # Nothing to punctuate (empty or whitespace-only file): leave it as is.
        # NOTE(review): predict() appears to fail on an empty input - confirm.
        continue

    # this generates a list of lists ['word', 'punctuation or 0 for no punctuation', 'probability']
    labeled_words = model.predict(clean_text)

    # Write to a sibling temp file and swap it in with os.replace. A per-file
    # name (rather than a fixed 'temp.txt' in the working directory) avoids
    # clobbering an unrelated file, and '.tmp' is not picked up by the
    # '.txt' filter above.
    temp_path = file + '.tmp'
    with open(temp_path, 'w', encoding='utf-8') as output:
        output.write(rebuild_text(labeled_words))
    os.replace(temp_path, file)