# Word and Glyph Displacement

This notebook processes the Voynich Manuscript text (folio 1r from the ZL transliteration) to analyze word similarities using Levenshtein distance, the goal is to find any displacement rules that unveils any pattern for word and/setence generator. It performs the following:

- **Section I**: Loads and tokenizes words, treating 'ch' and 'sh' as digraphs.
- **Section II**: Computes Levenshtein distances for all unique word pairs.
- **Section III**: Computes closest word pairs (excluding identicals) with detailed alignments and operation counts.
- **Section IV**: Calculates distances to each word's predecessor in the sequence.

In [1]:
import re
from typing import List, Dict, Tuple, Any
import pandas as pd
import math
import json

class VoynichTextProcessor:
    """Processes Voynich Manuscript text with folio-aware cleaning and tokenization."""
    def __init__(self):
        self.digraphs = {'ch', 'sh'}
        self.folio_data = {}
        self.raw_text = None

    def load_raw_text(self, filepath: str) -> bool:
        """Load raw text from file."""
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                self.raw_text = f.read()
            return True
        except FileNotFoundError:
            print(f"Error: File '{filepath}' not found.")
            return False

    def tokenize_word(self, word: str) -> List[str]:
        """Tokenize a word into characters or digraphs (e.g., 'ch', 'sh')."""
        tokens = []
        i = 0
        while i < len(word):
            if i < len(word) - 1 and word[i:i+2] in self.digraphs:
                tokens.append(word[i:i+2])
                i += 2
            else:
                tokens.append(word[i])
                i += 1
        return tokens

    def parse_all_folios(self, filepath: str, glyph_level: bool = False, treat_commas_as_spaces: bool = True, min_word_length: int = 2) -> Tuple[Dict, List[str], List[List[str]]]:
        """Parse Voynich text into folios and sentences."""
        if not self.load_raw_text(filepath):
            return {}, [], []

        lines = self.raw_text.strip().split('\n')
        current_folio = None
        current_folio_key = None
        folio_pattern = r'<f(\d+)([rv])?\.'
        all_words = []
        sentences = []
        current_sentence = []

        def replace_uncertain(match):
            options = match.group(1).split(':')
            return options[0] if options else ''

        for line in lines:
            line = line.strip()
            if not line or line.startswith('#'):
                if current_sentence:
                    sentences.append(current_sentence)
                    current_sentence = []
                continue

            cleaned_line = line
            cleaned_line = re.sub(r'@\d+', '', cleaned_line)
            cleaned_line = re.sub(r'<![^>]*>', '', cleaned_line)
            cleaned_line = re.sub(r'<[^>]*>', '', cleaned_line)
            cleaned_line = re.sub(r'\[([^\]]+)\]', replace_uncertain, cleaned_line)
            cleaned_line = re.sub(r'[{}]', '', cleaned_line)
            cleaned_line = re.sub(r'\?+', '', cleaned_line)
            cleaned_line = re.sub(r'[^a-zA-Z\s,.]', '', cleaned_line)
            if treat_commas_as_spaces:
                cleaned_line = cleaned_line.replace('.', ' ').replace(',', ' ')
            else:
                cleaned_line = cleaned_line.replace('.', ' ').replace(',', '')
            cleaned_line = re.sub(r'\s+', ' ', cleaned_line).strip().lower()

            folio_match = re.search(folio_pattern, line)
            if folio_match:
                current_folio = int(folio_match.group(1))
                folio_side = folio_match.group(2) or 'r'
                current_folio_key = f"{current_folio}{folio_side}"
                folio_text = cleaned_line
                if current_sentence:
                    sentences.append(current_sentence)
                    current_sentence = []
            else:
                folio_text = cleaned_line

            if current_folio and folio_text:
                if current_folio_key not in self.folio_data:
                    self.folio_data[current_folio_key] = {
                        'folio_num': current_folio,
                        'folio_side': folio_side,
                        'raw_text': [],
                        'clean_words': [],
                        'clean_glyphs': []
                    }
                self.folio_data[current_folio_key]['raw_text'].append(folio_text)
                if glyph_level:
                    glyphs = list(folio_text)
                    glyphs = [g for g in glyphs if g.isalpha()]
                    self.folio_data[current_folio_key]['clean_glyphs'].extend(glyphs)
                words = folio_text.split()
                clean_words = [w for w in words if re.match(r'^[a-z]+$', w) and len(w) >= min_word_length]
                self.folio_data[current_folio_key]['clean_words'].extend(clean_words)
                all_words.extend(clean_words)
                current_sentence.extend(clean_words)

        if current_sentence:
            sentences.append(current_sentence)

        sentences = [s for s in sentences if len(s) > 1]
        return self.folio_data, all_words, sentences

    def levenshtein_distance(self, seq1: List[str], seq2: List[str], return_alignment: bool = False) -> Tuple[int, Any]:
        """Compute Levenshtein distance and optional alignment between two sequences."""
        m, n = len(seq1), len(seq2)
        if m < n:
            dist, align = self.levenshtein_distance(seq2, seq1, return_alignment)
            if return_alignment:
                flipped_align = [(b, a, {'insert': 'delete', 'delete': 'insert'}.get(op, op)) for a, b, op in reversed(align)]
                return dist, flipped_align
            return dist, None

        if n == 0:
            if return_alignment:
                return m, [(c, '-', 'delete') for c in seq1]
            return m, None

        dp = [[0] * (n + 1) for _ in range(m + 1)]
        backpointer = [[None] * (n + 1) for _ in range(m + 1)] if return_alignment else None

        for i in range(m + 1):
            dp[i][0] = i
            if return_alignment:
                backpointer[i][0] = ('delete', i - 1, 0) if i > 0 else None

        for j in range(n + 1):
            dp[0][j] = j
            if return_alignment:
                backpointer[0][j] = ('insert', 0, j - 1) if j > 0 else None

        for i in range(1, m + 1):
            for j in range(1, n + 1):
                cost = 0 if seq1[i-1] == seq2[j-1] else 1
                deletion = dp[i-1][j] + 1
                insertion = dp[i][j-1] + 1
                substitution = dp[i-1][j-1] + cost
                min_val = min(deletion, insertion, substitution)
                dp[i][j] = min_val

                if return_alignment:
                    if min_val == substitution:
                        op = 'match' if cost == 0 else 'substitute'
                        backpointer[i][j] = (op, i-1, j-1)
                    elif min_val == insertion:
                        backpointer[i][j] = ('insert', i, j-1)
                    else:
                        backpointer[i][j] = ('delete', i-1, j)

        distance = dp[m][n]
        if not return_alignment:
            return distance, None

        alignment = []
        i, j = m, n
        while i > 0 or j > 0:
            if backpointer[i][j] is None:
                break
            op, prev_i, prev_j = backpointer[i][j]
            if op == 'insert':
                alignment.append(('-', seq2[j-1], 'insert'))
                i, j = prev_i, prev_j
            elif op == 'delete':
                alignment.append((seq1[i-1], '-', 'delete'))
                i, j = prev_i, prev_j
            else:
                alignment.append((seq1[i-1], seq2[j-1], op))
                i, j = prev_i, prev_j

        alignment.reverse()
        return distance, alignment

## Section I: Pre-preparation

Load and parse the ZL transliteration file, tokenize words in folio 1r, and display a DataFrame of words and their tokens (e.g., 'fachys' → 'f, a, ch, y, s').

In [2]:
processor = VoynichTextProcessor()
filepath = "transliteration_zl.txt"
folio_data, all_words, sentences = processor.parse_all_folios(filepath)

folio_key = '1r'
if folio_key not in folio_data or len(folio_data[folio_key]['clean_words']) < 2:
    print(f"Folio {folio_key} not found or insufficient words.")
else:
    words = folio_data[folio_key]['clean_words']
    tokenized_words = [processor.tokenize_word(word) for word in words]
    df_preparation = pd.DataFrame({
        'Word': words,
        'Tokenized': [', '.join(tokens) for tokens in tokenized_words]
    })
    print(f"Section I: Pre-preparation for folio {folio_key}:")
    print(df_preparation)

Section I: Pre-preparation for folio 1r:
        Word          Tokenized
0     fachys     f, a, ch, y, s
1       ykal         y, k, a, l
2         ar               a, r
3     ataiin   a, t, a, i, i, n
4       shol           sh, o, l
..       ...                ...
202     chol           ch, o, l
203     chok           ch, o, k
204    choty        ch, o, t, y
205   chotey     ch, o, t, e, y
206  dchaiin  d, ch, a, i, i, n

[207 rows x 2 columns]


## Section II: All Pairwise Distances

Compute Levenshtein distances for all unique word pairs in folio 1r and display the results with the average distance.

In [3]:
if folio_key not in folio_data or len(folio_data[folio_key]['clean_words']) < 2:
    print(f"Folio {folio_key} not found or insufficient words.")
else:
    pairs = []
    total_distance = 0
    count = 0
    for i in range(len(tokenized_words)):
        for j in range(i + 1, len(tokenized_words)):
            dist, _ = processor.levenshtein_distance(tokenized_words[i], tokenized_words[j])
            pairs.append({
                'Word1': words[i],
                'Word2': words[j],
                'Distance': dist
            })
            total_distance += dist
            count += 1

    if count > 0:
        df_all_pairs = pd.DataFrame(pairs)
        print(f"Section II: All pairwise distances for folio {folio_key}:")
        print(df_all_pairs)
        average_distance = total_distance / count
        print(f"Average Levenshtein distance (all pairs): {average_distance}")
    else:
        print("No pairs to compare.")

Section II: All pairwise distances for folio 1r:
        Word1    Word2  Distance
0      fachys     ykal         5
1      fachys       ar         4
2      fachys   ataiin         5
3      fachys     shol         5
4      fachys    shory         4
...       ...      ...       ...
21316    chok   chotey         3
21317    chok  dchaiin         5
21318   choty   chotey         1
21319   choty  dchaiin         5
21320  chotey  dchaiin         5

[21321 rows x 3 columns]
Average Levenshtein distance (all pairs): 4.314009661835748


## Section III: Closest Words with Alignment Details

For each word in folio 1r, find the closest non-identical word, compute Levenshtein alignments (match/substitute/insert/delete operations), and show operation counts and glyph differences. Filter for distances ≥1, sort by distance, and compute the average.

In [4]:
if folio_key not in folio_data or len(folio_data[folio_key]['clean_words']) < 2:
    print(f"Folio {folio_key} not found or insufficient words.")
    df_result = pd.DataFrame()
else:
    closest_pairs_data = []
    total_min_distance = 0
    for i in range(len(tokenized_words)):
        min_dist = math.inf
        closest_j = None
        closest_alignment = None
        for j in range(len(tokenized_words)):
            if i == j:
                continue
            dist, alignment = processor.levenshtein_distance(tokenized_words[i], tokenized_words[j], return_alignment=True)
            if dist < min_dist:
                min_dist = dist
                closest_j = j
                closest_alignment = alignment
        if closest_j is not None:
            ops = [op for _, _, op in closest_alignment]
            num_subs = ops.count('substitute')
            num_ins = ops.count('insert')
            num_del = ops.count('delete')
            num_match = ops.count('match')
            diff_glyphs = []
            for a, b, op in closest_alignment:
                if op != 'match':
                    if op == 'substitute':
                        diff_glyphs.append(f"{a}->{b}")
                    elif op == 'insert':
                        diff_glyphs.append(f"-ins->{b}")
                    elif op == 'delete':
                        diff_glyphs.append(f"{a}-del->-")
            closest_pairs_data.append({
                'Word': words[i],
                'MostSimilar': words[closest_j],
                'Distance': min_dist,
                'Alignment': json.dumps(closest_alignment),
                'Operations': json.dumps(ops),
                'Num_Substitutions': num_subs,
                'Num_Insertions': num_ins,
                'Num_Deletions': num_del,
                'Num_Matches': num_match,
                'Difference_Glyphs': '; '.join(diff_glyphs) if diff_glyphs else ''
            })
            total_min_distance += min_dist

    df_result = pd.DataFrame(closest_pairs_data)
    df_result = df_result[df_result['Distance'] >= 1].sort_values('Distance').reset_index(drop=True)
    if len(df_result) > 0:
        average_min_distance = df_result['Distance'].mean()
        print(f"Average minimum Levenshtein distance (excluding identical words): {average_min_distance}")
        print(f"Number of word pairs with distance >= 1: {len(df_result)}")
    else:
        print("No word pairs found with distance >= 1")

df_result

Average minimum Levenshtein distance (excluding identical words): 1.3214285714285714
Number of word pairs with distance >= 1: 140


Unnamed: 0,Word,MostSimilar,Distance,Alignment,Operations,Num_Substitutions,Num_Insertions,Num_Deletions,Num_Matches,Difference_Glyphs
0,oiin,soiin,1,"[[""n"", ""n"", ""match""], [""i"", ""i"", ""match""], [""i...","[""match"", ""match"", ""match"", ""match"", ""insert""]",0,1,0,4,-ins->s
1,dchar,char,1,"[[""d"", ""-"", ""delete""], [""ch"", ""ch"", ""match""], ...","[""delete"", ""match"", ""match"", ""match""]",0,0,1,3,d-del->-
2,ain,dain,1,"[[""n"", ""n"", ""match""], [""i"", ""i"", ""match""], [""a...","[""match"", ""match"", ""match"", ""insert""]",0,1,0,3,-ins->d
3,kodshey,koshey,1,"[[""k"", ""k"", ""match""], [""o"", ""o"", ""match""], [""d...","[""match"", ""match"", ""delete"", ""match"", ""match"",...",0,0,1,5,d-del->-
4,she,sh,1,"[[""sh"", ""sh"", ""match""], [""e"", ""-"", ""delete""]]","[""match"", ""delete""]",0,0,1,1,e-del->-
...,...,...,...,...,...,...,...,...,...,...
135,ckhyds,cthres,3,"[[""c"", ""c"", ""match""], [""k"", ""t"", ""substitute""]...","[""match"", ""substitute"", ""match"", ""substitute"",...",3,0,0,3,k->t; y->r; d->e
136,ydaraishy,daraiin,3,"[[""y"", ""-"", ""delete""], [""d"", ""d"", ""match""], [""...","[""delete"", ""match"", ""match"", ""match"", ""match"",...",2,0,1,5,y-del->-; sh->i; y->n
137,ctholdar,cthar,3,"[[""c"", ""c"", ""match""], [""t"", ""t"", ""match""], [""h...","[""match"", ""match"", ""match"", ""delete"", ""delete""...",0,0,3,5,o-del->-; l-del->-; d-del->-
138,fachys,chy,3,"[[""f"", ""-"", ""delete""], [""a"", ""-"", ""delete""], [...","[""delete"", ""delete"", ""match"", ""match"", ""delete""]",0,0,3,2,f-del->-; a-del->-; s-del->-


## Section IV: Distance to Predecessor

Compute the Levenshtein distance between each word and its predecessor in folio 1r's sequence, showing the results and average distance.

In [5]:
if folio_key not in folio_data or len(folio_data[folio_key]['clean_words']) < 2:
    print(f"Folio {folio_key} not found or insufficient words.")
else:
    pred_pairs = []
    total_pred_distance = 0
    for i in range(1, len(tokenized_words)):
        dist, _ = processor.levenshtein_distance(tokenized_words[i], tokenized_words[i-1])
        pred_pairs.append({
            'Index': i,
            'Word': words[i],
            'Predecessor': words[i-1],
            'Distance': dist
        })
        total_pred_distance += dist

    if pred_pairs:
        df_pred = pd.DataFrame(pred_pairs)
        print(f"Section IV: Distance to predecessor for each word in folio {folio_key}:")
        print(df_pred)
        average_pred_distance = total_pred_distance / len(pred_pairs)
        print(f"Average Levenshtein distance to predecessor: {average_pred_distance}")
    else:
        print("No predecessor comparisons possible.")

Section IV: Distance to predecessor for each word in folio 1r:
     Index     Word Predecessor  Distance
0        1     ykal      fachys         5
1        2       ar        ykal         3
2        3   ataiin          ar         5
3        4     shol      ataiin         6
4        5    shory        shol         2
..     ...      ...         ...       ...
201    202     chol          eo         2
202    203     chok        chol         1
203    204    choty        chok         2
204    205   chotey       choty         1
205    206  dchaiin      chotey         5

[206 rows x 4 columns]
Average Levenshtein distance to predecessor: 4.286407766990291


In [6]:
import pandas as pd
import math
import json

# Section VI: Chain of Word Creation Analysis
# Tests the hypothesis that words in folio 1r form a chain where each word is derived
# from the previous via minimal Levenshtein edits, starting from the first word.

if 'folio_data' not in locals() or 'words' not in locals() or 'tokenized_words' not in locals() or 'processor' not in locals():
    print("Required variables (folio_data, words, tokenized_words, processor) missing. Run Section I first.")
elif folio_key not in folio_data or len(folio_data[folio_key]['clean_words']) < 2:
    print(f"Folio {folio_key} not found or insufficient words.")
else:
    # Initialize chain starting with the first word
    chain = []
    visited_indices = set()
    current_idx = 0  # Start with first word
    total_distance = 0
    visited_indices.add(current_idx)

    while len(visited_indices) < len(words):
        current_word = words[current_idx]
        current_tokens = tokenized_words[current_idx]
        min_dist = math.inf
        next_idx = None
        next_alignment = None

        # Find unvisited word with minimum Levenshtein distance
        for j in range(len(tokenized_words)):
            if j in visited_indices:
                continue
            dist, alignment = processor.levenshtein_distance(current_tokens, tokenized_words[j], return_alignment=True)
            if dist < min_dist:
                min_dist = dist
                next_idx = j
                next_alignment = alignment

        if next_idx is None:
            break  # No more unvisited words

        # Extract operation details
        ops = [op for _, _, op in next_alignment]
        num_subs = ops.count('substitute')
        num_ins = ops.count('insert')
        num_del = ops.count('delete')
        num_match = ops.count('match')
        diff_glyphs = []
        for a, b, op in next_alignment:
            if op != 'match':
                if op == 'substitute':
                    diff_glyphs.append(f"{a}->{b}")
                elif op == 'insert':
                    diff_glyphs.append(f"-ins->{b}")
                elif op == 'delete':
                    diff_glyphs.append(f"{a}-del->-")

        chain.append({
            'Word': current_word,
            'MostSimilar': words[next_idx],
            'Distance': min_dist,
            'Alignment': json.dumps(next_alignment),
            'Operations': json.dumps(ops),
            'Num_Substitutions': num_subs,
            'Num_Insertions': num_ins,
            'Num_Deletions': num_del,
            'Num_Matches': num_match,
            'Difference_Glyphs': '; '.join(diff_glyphs) if diff_glyphs else ''
        })

        total_distance += min_dist
        visited_indices.add(next_idx)
        current_idx = next_idx

    # Create DataFrame
    df_chain = pd.DataFrame(chain)
    print(f"Chain of Word Creation for Folio {folio_key}:")
    print(df_chain)
    print(f"\nChain Coverage: {len(chain)}/{len(words)} words")
    if len(chain) > 0:
        avg_distance = total_distance / len(chain)
        print(f"Average Levenshtein Distance in Chain: {avg_distance:.2f}")
    else:
        print("No chain formed.")

    # Summary of operation types
    if not df_chain.empty:
        op_summary = df_chain[['Num_Substitutions', 'Num_Insertions', 'Num_Deletions', 'Num_Matches']].sum()
        print("\nOperation Summary in Chain:")
        print(op_summary.to_frame().T)
        print("\nInsight: High coverage with low distances and frequent ins/del suggest a systematic word derivation process.")

Chain of Word Creation for Folio 1r:
          Word MostSimilar  Distance  \
0       fachys         chy         3   
1          chy         chy         0   
2          chy          sy         1   
3           sy          cy         1   
4           cy          ar         2   
..         ...         ...       ...   
201    daictoy   ydaraishy         5   
202  ydaraishy   shcthaiin         6   
203  shcthaiin    ctholdar         6   
204   ctholdar   tshodeesy         7   
205  tshodeesy   shokcheey         4   

                                             Alignment  \
0    [["f", "-", "delete"], ["a", "-", "delete"], [...   
1         [["ch", "ch", "match"], ["y", "y", "match"]]   
2     [["ch", "s", "substitute"], ["y", "y", "match"]]   
3      [["s", "c", "substitute"], ["y", "y", "match"]]   
4    [["c", "a", "substitute"], ["y", "r", "substit...   
..                                                 ...   
201  [["y", "y", "match"], ["o", "sh", "substitute"...   
202  [["y", "sh", 

In [7]:
df_chain

Unnamed: 0,Word,MostSimilar,Distance,Alignment,Operations,Num_Substitutions,Num_Insertions,Num_Deletions,Num_Matches,Difference_Glyphs
0,fachys,chy,3,"[[""f"", ""-"", ""delete""], [""a"", ""-"", ""delete""], [...","[""delete"", ""delete"", ""match"", ""match"", ""delete""]",0,0,3,2,f-del->-; a-del->-; s-del->-
1,chy,chy,0,"[[""ch"", ""ch"", ""match""], [""y"", ""y"", ""match""]]","[""match"", ""match""]",0,0,0,2,
2,chy,sy,1,"[[""ch"", ""s"", ""substitute""], [""y"", ""y"", ""match""]]","[""substitute"", ""match""]",1,0,0,1,ch->s
3,sy,cy,1,"[[""s"", ""c"", ""substitute""], [""y"", ""y"", ""match""]]","[""substitute"", ""match""]",1,0,0,1,s->c
4,cy,ar,2,"[[""c"", ""a"", ""substitute""], [""y"", ""r"", ""substit...","[""substitute"", ""substitute""]",2,0,0,0,c->a; y->r
...,...,...,...,...,...,...,...,...,...,...
201,daictoy,ydaraishy,5,"[[""y"", ""y"", ""match""], [""o"", ""sh"", ""substitute""...","[""match"", ""substitute"", ""substitute"", ""substit...",4,1,0,3,o->sh; t->i; c->a; i->r; -ins->y
202,ydaraishy,shcthaiin,6,"[[""y"", ""sh"", ""substitute""], [""d"", ""c"", ""substi...","[""substitute"", ""substitute"", ""substitute"", ""su...",6,0,0,2,y->sh; d->c; a->t; r->h; sh->i; y->n
203,shcthaiin,ctholdar,6,"[[""sh"", ""-"", ""delete""], [""c"", ""c"", ""match""], [...","[""delete"", ""match"", ""match"", ""match"", ""insert""...",4,1,1,3,sh-del->-; -ins->o; a->l; i->d; i->a; n->r
204,ctholdar,tshodeesy,7,"[[""c"", ""-"", ""delete""], [""t"", ""t"", ""match""], [""...","[""delete"", ""match"", ""substitute"", ""match"", ""in...",5,1,1,2,c-del->-; h->sh; -ins->d; l->e; d->e; a->s; r->y
