In [1]:
import re
import pandas as pd

class VoynichTextProcessor:
    """Processes Voynich Manuscript text to return a cleaned DataFrame.

    The input is expected to be a plain-text transliteration in which each
    text line starts with a locus tag such as ``<f1r.1,...>``, markup is
    enclosed in ``<...>``, and alternative readings are written as
    ``[first:second]``.
    """

    # Locus tag of a text line: folio number, optional side (r/v), optional
    # panel number (fold-out pages such as ``<f85r1.1``), then a period.
    # Page-header tags like ``<f1r>`` contain no period and do not match.
    _FOLIO_RE = re.compile(r'<f(\d+)([rv])?\d*\.')
    _ANNOTATION_RE = re.compile(r'@\d+')
    _COMMENT_RE = re.compile(r'<![^>]*>')
    _MARKUP_RE = re.compile(r'<[^>]*>')
    _UNCERTAIN_RE = re.compile(r'\[([^\]]+)\]')
    _DISALLOWED_RE = re.compile(r'[^a-zA-Z\s,.]')
    _WHITESPACE_RE = re.compile(r'\s+')

    def __init__(self):
        # Full contents of the most recently loaded file, or None.
        self.raw_text = None

    def load_raw_text(self, filepath: str) -> bool:
        """Load raw text from file.

        Args:
            filepath: Path of a UTF-8 encoded text file.

        Returns:
            True if the file was read into ``self.raw_text``; False if the
            file does not exist. Other I/O or decoding errors propagate.
        """
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                self.raw_text = f.read()
        except FileNotFoundError:
            return False
        return True

    @classmethod
    def _clean_line(cls, line: str, treat_commas_as_spaces: bool) -> str:
        """Strip markup from one line and return lowercase, space-separated words."""
        cleaned = cls._ANNOTATION_RE.sub('', line)
        # Comments are removed before generic markup, as two separate passes.
        cleaned = cls._COMMENT_RE.sub('', cleaned)
        cleaned = cls._MARKUP_RE.sub('', cleaned)
        # Uncertain readings "[a:b]" collapse to their first alternative.
        cleaned = cls._UNCERTAIN_RE.sub(
            lambda m: m.group(1).split(':')[0], cleaned
        )
        # Keep only letters, whitespace, commas and periods. This also drops
        # braces, question marks and any leftover brackets or digits.
        cleaned = cls._DISALLOWED_RE.sub('', cleaned)
        # Periods always separate words; commas either separate or join.
        comma_replacement = ' ' if treat_commas_as_spaces else ''
        cleaned = cleaned.replace('.', ' ').replace(',', comma_replacement)
        return cls._WHITESPACE_RE.sub(' ', cleaned).strip().lower()

    def clean_voynich_text(self, filepath: str, treat_commas_as_spaces: bool = True, min_word_length: int = 2) -> pd.DataFrame:
        """Clean Voynich Manuscript text and return a DataFrame.

        Args:
            filepath: Path of the transliteration file to read.
            treat_commas_as_spaces: If True, a comma separates two words;
                if False, the comma is deleted and its neighbours are joined.
            min_word_length: Words shorter than this are discarded.

        Returns:
            A DataFrame with columns ``folio`` (folio number followed by
            ``r`` or ``v``; ``r`` is assumed when the tag has no side, and a
            fold-out panel number is ignored) and ``text`` (the cleaned words
            of one source line joined by single spaces). Lines that end up
            with no words, and lines before the first locus tag, produce no
            row. The frame is empty if the file does not exist.
        """
        if not self.load_raw_text(filepath):
            return pd.DataFrame(columns=['folio', 'text'])

        data = []
        current_folio = None

        for line in self.raw_text.strip().split('\n'):
            line = line.strip()
            if not line or line.startswith('#'):
                continue

            # The folio is read from the raw line because cleaning removes
            # the tag. A line without a tag inherits the previous folio.
            folio_match = self._FOLIO_RE.search(line)
            if folio_match:
                current_folio = f"{folio_match.group(1)}{folio_match.group(2) or 'r'}"
            if not current_folio:
                continue

            cleaned_line = self._clean_line(line, treat_commas_as_spaces)
            # After cleaning, every token consists solely of a-z, so only the
            # length filter is needed.
            clean_words = [w for w in cleaned_line.split() if len(w) >= min_word_length]
            if clean_words:
                data.append({'folio': current_folio, 'text': ' '.join(clean_words)})

        return pd.DataFrame(data, columns=['folio', 'text'])

# Script / notebook entry point: clean the transliteration file and keep the
# result in the module-level name `text`.
if __name__ == "__main__":
    processor = VoynichTextProcessor()
    # Path is relative to the current working directory; if the file is
    # missing, clean_voynich_text returns an empty DataFrame.
    filepath = "transliteration_zl.txt"
    text = processor.clean_voynich_text(filepath)
    
# Bare expression: in a notebook cell this displays the DataFrame.
# NOTE: `text` is only bound when the guard above ran, so importing this file
# as a module would raise NameError here.
text

Unnamed: 0,folio,text
0,1r,fachys ykal ar ataiin shol shory cthres kor sh...
1,1r,sory ckhar or kair chtaiin shar ase cthar ctha...
2,1r,syaiir sheky or ykaiin shod cthoary cthes dara...
3,1r,soiin oteey oteos roloty cthiar daiin okaiin o...
4,1r,sair chear cthaiin cphar cfhaiin
...,...,...
5302,116r,osain shky qorain chckhey qokey lkechy okeey o...
5303,116r,sykar ain olkeey dainchey qokar chey dain otan...
5304,116r,sysor shey qokey okeolan chey qol or cheey qor...
5305,116r,sodal ch al chcthy chckhy qol ain ary
