### Phonotactic Constraints Analysis

Attempt at morphological decipherment (via phonotactic constraints), based on the Naibbe cipher hypothesis: https://youtu.be/ByARtG-GUPo?t=5559

In [1]:
import re
import pandas as pd

class VoynichTextProcessor:
    """Turn a Voynich transliteration file into a tidy ``folio``/``text`` DataFrame.

    The most recently loaded file contents are kept on ``self.raw_text``.
    """

    def __init__(self):
        # Holds the raw file contents after a successful load_raw_text() call.
        self.raw_text = None

    def load_raw_text(self, filepath: str) -> bool:
        """Read ``filepath`` as UTF-8 into ``self.raw_text``.

        Returns:
            True when the file was read, False when it does not exist.
            Any other I/O or decoding error propagates to the caller.
        """
        try:
            with open(filepath, 'r', encoding='utf-8') as handle:
                contents = handle.read()
        except FileNotFoundError:
            return False
        self.raw_text = contents
        return True

    def clean_voynich_text(self, filepath: str, treat_commas_as_spaces: bool = True, min_word_length: int = 2) -> pd.DataFrame:
        """Strip transliteration markup and return one row per non-empty text line.

        Args:
            filepath: Path of the transliteration file to read.
            treat_commas_as_spaces: When True a comma separates words; when
                False the comma is deleted, joining its neighbours.
                Periods always separate words.
            min_word_length: Words shorter than this are discarded.

        Returns:
            DataFrame with columns ``folio`` (e.g. ``'1r'``) and ``text``
            (space-joined lower-case words). The frame is empty when the file
            is missing. Lines seen before the first folio locator are skipped.
        """
        columns = ['folio', 'text']
        if not self.load_raw_text(filepath):
            return pd.DataFrame(columns=columns)

        folio_pattern = r'<f(\d+)([rv])?\.'

        # Substitutions applied to every line, in this exact order.
        cleanup_rules = (
            (r'@\d+', ''),              # numeric @ annotations
            (r'<![^>]*>', ''),          # inline comments
            (r'<[^>]*>', ''),           # any remaining markup tag
            (r'\[([^\]]+)\]', lambda m: m.group(1).split(':')[0]),  # uncertain reading: keep first option
            (r'[{}]', ''),              # braces
            (r'\?+', ''),               # question marks
            (r'[^a-zA-Z\s,.]', ''),     # anything but letters, whitespace, commas, periods
        )
        comma_replacement = ' ' if treat_commas_as_spaces else ''

        records = []
        active_folio = None

        for raw_line in self.raw_text.strip().split('\n'):
            stripped = raw_line.strip()
            if stripped == '' or stripped.startswith('#'):
                continue

            body = stripped
            for pattern, replacement in cleanup_rules:
                body = re.sub(pattern, replacement, body)
            body = body.replace('.', ' ').replace(',', comma_replacement)
            body = re.sub(r'\s+', ' ', body).strip().lower()

            # The folio locator is read from the untouched line, since the
            # cleanup above has already removed every <...> tag from `body`.
            locator = re.search(folio_pattern, stripped)
            if locator is not None:
                number, side = locator.groups()
                active_folio = f"{number}{side or 'r'}"

            if not body or not active_folio:
                continue

            kept_words = [
                token for token in body.split()
                if len(token) >= min_word_length and re.match(r'^[a-z]+$', token)
            ]
            if kept_words:
                records.append({'folio': active_folio, 'text': ' '.join(kept_words)})

        return pd.DataFrame(records, columns=columns)

if __name__ == "__main__":
    # Parse the transliteration file; `text` is the cleaned frame that the
    # following cells consume.
    filepath = "transliteration_zl.txt"
    processor = VoynichTextProcessor()
    text = processor.clean_voynich_text(filepath)

# Last expression of the cell: render the cleaned DataFrame.
text

Unnamed: 0,folio,text
0,1r,fachys ykal ar ataiin shol shory cthres kor sh...
1,1r,sory ckhar or kair chtaiin shar ase cthar ctha...
2,1r,syaiir sheky or ykaiin shod cthoary cthes dara...
3,1r,soiin oteey oteos roloty cthiar daiin okaiin o...
4,1r,sair chear cthaiin cphar cfhaiin
...,...,...
5302,116r,osain shky qorain chckhey qokey lkechy okeey o...
5303,116r,sykar ain olkeey dainchey qokar chey dain otan...
5304,116r,sysor shey qokey okeolan chey qol or cheey qor...
5305,116r,sodal ch al chcthy chckhy qol ain ary


In [2]:
import re
import pandas as pd
from collections import Counter

def get_folio_number(folio):
    """Return the leading integer of a folio label (``'1r'`` -> ``1``).

    Labels that do not start with a digit yield ``0``.
    """
    leading_digits = re.match(r'(\d+)', folio)
    if leading_digits is None:
        return 0
    return int(leading_digits.group(1))

def generate_ngrams(words, n):
    """Return every run of ``n`` consecutive words, each joined by one space.

    The result is in order of appearance and is empty when ``words`` holds
    fewer than ``n`` items.
    """
    window_count = len(words) - n + 1
    ngrams = []
    for start in range(window_count):
        ngrams.append(' '.join(words[start:start + n]))
    return ngrams

def find_ngram_frequencies(text_df, max_n=10, max_folio=55):
    """Return repeated word n-grams that are not part of a longer repeated n-gram.

    All rows whose folio number is at most ``max_folio`` are joined into one
    word stream, so an n-gram may span line and folio boundaries. Every
    n-gram of length 2..``max_n`` that occurs more than once is counted, and
    an n-gram is then dropped when it appears as a run of whole words inside
    a longer repeated n-gram.

    Args:
        text_df: DataFrame with ``folio`` and ``text`` columns.
        max_n: Longest n-gram length (in words) to consider.
        max_folio: Highest folio number to include. The default of 55 is the
            cutoff this function has always applied.

    Returns:
        DataFrame with columns ``n``, ``sequence`` and ``count``, sorted by
        ``n`` descending and then ``count`` ascending. It is empty when no
        n-gram repeats.
    """
    filtered_text = text_df[text_df['folio'].apply(get_folio_number) <= max_folio]

    # One continuous stream of words across all selected rows.
    all_words = ' '.join(filtered_text['text'].fillna('').tolist()).split()

    ngram_data = []
    for n in range(2, max_n + 1):
        ngram_counts = Counter(generate_ngrams(all_words, n))
        ngram_data.extend(
            {'n': n, 'sequence': seq, 'count': count}
            for seq, count in ngram_counts.items()
            if count > 1
        )

    df = pd.DataFrame(ngram_data, columns=['n', 'sequence', 'count'])
    if df.empty:
        return df

    # Every proper word-level window of a repeated n-gram is "covered" by it.
    # Working on word lists (not raw substrings) prevents "ol daiin" from
    # being treated as part of "chol daiin cthy", and the set lookup replaces
    # a row-by-row scan over all longer n-grams.
    covered = set()
    for seq in df['sequence']:
        seq_words = seq.split()
        for size in range(2, len(seq_words)):
            covered.update(generate_ngrams(seq_words, size))

    result_df = df[~df['sequence'].isin(covered)]
    return result_df.sort_values(by=['n', 'count'], ascending=[False, True])

# Collect the repeated word n-grams (2 to 10 words long) from the cleaned text
# produced by the first cell, then render the resulting frame.
ngram_df = find_ngram_frequencies(text, max_n=10)
ngram_df

Unnamed: 0,n,sequence,count
348,3,chol chol kor,2
349,3,chol daiin cthy,2
350,3,daiin cthy schey,2
351,3,chol chy chaiin,2
352,3,cthor chol chor,2
...,...,...,...
84,2,shol daiin,7
88,2,otol chol,7
102,2,chor daiin,8
161,2,daiin dain,9


In [3]:
import pandas as pd
from collections import Counter, defaultdict
import re

def map_valid_sequences(text_df, transitions_df, max_n=10, max_folio=60):
    """Find word sequences whose every adjacent pair is an allowed transition.

    Each text line is scanned on its own (sequences never span lines). A
    window of 2..``max_n`` consecutive words is valid when, for every adjacent
    pair ``(a, b)`` in it, ``b`` is listed among the successors of ``a``.

    Args:
        text_df: DataFrame with ``folio`` and ``text`` columns.
        transitions_df: DataFrame with a ``symbol`` column and a
            ``successors`` column holding a ``', '``-separated string of
            allowed next words. Empty, missing (NaN) or non-string values
            mean "no successors".
        max_n: Longest sequence length (in words) to consider.
        max_folio: Highest folio number to include (default 60).

    Returns:
        DataFrame with columns ``folio`` (comma-joined folios in which the
        sequence occurs, ordered by folio number), ``sequence``, ``length``
        and ``frequency``, sorted by ``frequency`` then ``length``, both
        descending. The same four columns are present when nothing is found.
    """
    def get_folio_number(folio):
        match = re.match(r'(\d+)', folio)
        return int(match.group(1)) if match else 0

    filtered_text = text_df[text_df['folio'].apply(get_folio_number) <= max_folio]

    # symbol -> set of allowed next symbols.
    successor_dict = {}
    for _, row in transitions_df.iterrows():
        successors = row['successors']
        # NaN is truthy, so test for a real non-empty string before splitting.
        if isinstance(successors, str) and successors:
            successor_dict[row['symbol']] = set(successors.split(', '))
        else:
            successor_dict[row['symbol']] = set()

    sequence_folios = defaultdict(set)
    sequence_counts = Counter()

    for _, row in filtered_text.iterrows():
        folio = row['folio']
        words = row['text'].split()
        if len(words) < 2:
            continue
        # pair_ok[j] says whether words[j] -> words[j + 1] is allowed; computed
        # once per line rather than once per window.
        pair_ok = [
            nxt in successor_dict.get(cur, ())
            for cur, nxt in zip(words, words[1:])
        ]
        for n in range(2, min(max_n, len(words)) + 1):
            for i in range(len(words) - n + 1):
                if all(pair_ok[i:i + n - 1]):
                    seq_str = ' '.join(words[i:i + n])
                    sequence_counts[seq_str] += 1
                    sequence_folios[seq_str].add(folio)

    valid_sequences = [
        {
            # Order by folio number so that '2r' precedes '10v'.
            'folio': ', '.join(sorted(folios, key=lambda f: (get_folio_number(f), f))),
            'sequence': seq,
            'length': len(seq.split()),
            'frequency': sequence_counts[seq],
        }
        for seq, folios in sequence_folios.items()
    ]

    # Fixed columns keep an empty result usable by code that reads df['sequence'].
    df = pd.DataFrame(valid_sequences, columns=['folio', 'sequence', 'length', 'frequency'])
    if df.empty:
        return df
    return df.sort_values(by=['frequency', 'length'], ascending=[False, False])

# Run the function
# NOTE(review): `transitions_df` is not created by any earlier cell in this
# notebook, so this call raises NameError on a fresh kernel. It has to be built
# or loaded first, with the 'symbol' and 'successors' columns that
# map_valid_sequences reads.
valid_sequences_df = map_valid_sequences(text, transitions_df)
valid_sequences_df

NameError: name 'transitions_df' is not defined

In [None]:
from typing import Set, Tuple, List
import pandas as pd

def extract_symbol_bigrams(symbols: List[str]) -> Set[str]:
    """Return the distinct adjacent pairs of ``symbols`` as ``'first-second'`` strings."""
    return {f"{first}-{second}" for first, second in zip(symbols, symbols[1:])}

def get_absent_symbol_bigrams(symbols: List[str], observed_bigrams: Set[str]) -> Set[str]:
    """Return every ``'a-b'`` pairing of the distinct symbols that is not in ``observed_bigrams``.

    The candidate pairs are the full cross product of the distinct symbols,
    including a symbol paired with itself.
    """
    distinct = set(symbols)
    candidates = set()
    for left in distinct:
        candidates.update(f"{left}-{right}" for right in distinct)
    return candidates - observed_bigrams

def find_longest_symbol_bigram_chain(symbols: List[str]) -> int:
    """Return the length of the longest run of equal adjacent symbols after the first item.

    Returns 0 for fewer than two symbols and otherwise at least 1. The first
    pair (``symbols[0]`` against ``symbols[1]``) is never compared.

    NOTE(review): the function name suggests chaining bigrams end to start,
    but the comparison is between neighbouring symbols; confirm which
    behaviour is intended.
    """
    if len(symbols) < 2:
        return 0
    longest = streak = 1
    for left, right in zip(symbols[1:], symbols[2:]):
        streak = streak + 1 if left == right else 1
        longest = max(longest, streak)
    return longest

def generate_symbol_bigram_rule(symbols: List[str]) -> str:
    """Format the chain length of ``symbols`` as ``(L²){k}``.

    ``k`` is the value returned by ``find_longest_symbol_bigram_chain``; an
    empty string is returned when that value is 0.
    """
    chain_length = find_longest_symbol_bigram_chain(symbols)
    return f"(L²){{{chain_length}}}" if chain_length != 0 else ""

def process_voynich_symbol_sequences(valid_sequences_df: pd.DataFrame) -> Tuple[Set[str], Set[str], str]:
    """Derive observed bigrams, absent bigrams and a rule string from valid sequences.

    Args:
        valid_sequences_df: DataFrame whose ``sequence`` column holds
            space-separated symbol sequences (e.g. ``"chol daiin"``). A frame
            without that column, or with missing entries, is treated as
            contributing no sequences.

    Returns:
        ``(observed_bigrams, absent_bigrams, rule)`` where

        * ``observed_bigrams`` are the ``'a-b'`` pairs adjacent inside at
          least one sequence,
        * ``absent_bigrams`` are all pairings of the seen symbols that were
          not observed,
        * ``rule`` is the string built by ``generate_symbol_bigram_rule``
          from the concatenation of all sequences.
    """
    if "sequence" in valid_sequences_df.columns:
        sequences = valid_sequences_df["sequence"].dropna()
    else:
        sequences = []

    all_symbols: List[str] = []
    observed_bigrams: Set[str] = set()
    for seq in sequences:
        symbols = seq.split()  # "chol daiin" -> ["chol", "daiin"]
        all_symbols.extend(symbols)
        # Pair symbols within a single sequence only: pairing across the
        # concatenated list would invent a bigram from the last symbol of one
        # sequence and the first symbol of the next.
        observed_bigrams |= extract_symbol_bigrams(symbols)

    absent_bigrams = get_absent_symbol_bigrams(all_symbols, observed_bigrams)

    # The rule is still computed over the concatenated symbol list.
    rule = generate_symbol_bigram_rule(all_symbols)

    return observed_bigrams, absent_bigrams, rule

def main():
    """Print the observed bigrams, the absent bigrams and the generalized rule.

    Reads the module-level ``valid_sequences_df`` (the result of
    ``map_valid_sequences``) rather than taking it as an argument.
    """
    observed, absent, rule = process_voynich_symbol_sequences(valid_sequences_df)

    sections = (
        ("Possible symbol bigram combinations:", observed),
        ("\nAbsent symbol bigram combinations:", absent),
    )
    for heading, bigrams in sections:
        print(heading)
        for bigram in sorted(bigrams):
            print(bigram)

    print(f"\nGeneralized rule: {rule}")

# Run the bigram report when this cell (or the file as a script) is executed directly.
if __name__ == "__main__":
    main()