In [3]:
# Copyright (c) 2025, Michael A. Greshko
# 
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software, datasets, and associated documentation files (the "Software
# and Datasets"), to deal in the Software and Datasets without restriction,
# including without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software and Datasets, and to
# permit persons to whom the Software is furnished to do so, subject to the
# following conditions:
# 
# - The above copyright notice and this permission notice shall be included
#   in all copies or substantial portions of the Software and Datasets.
# - Any publications making use of the Software and Datasets, or any substantial
#   portions thereof, shall cite the Software and Datasets's original publication:
# 
# > Greshko, Michael A. (2025). The Naibbe cipher: a substitution cipher that encrypts
# Latin and Italian as Voynich Manuscript-like ciphertext.
# Cryptologia. https://doi.org/10.1080/01611194.2025.2566408
#   
# THE SOFTWARE AND DATASETS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO
# EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE AND DATASETS.

import pandas as pd
import numpy as np
import math

# === USER INPUT ===
# Path of the CSV file to analyze. The file is loaded with pandas.read_csv and
# each of its columns is scored separately by the entropy functions below.
CSV_PATH = "data/voyb_reference.csv"  # Replace with the path to your CSV file

# === FUNCTIONS ===
def conditional_char_entropy_rowwise(series):
    total_pairs = 0
    pair_counts = {}
    char_counts = {}

    for text in series.dropna().astype(str):
        if len(text) < 2:
            continue
        for i in range(len(text) - 1):
            a, b = text[i], text[i + 1]
            pair = (a, b)
            pair_counts[pair] = pair_counts.get(pair, 0) + 1
            char_counts[a] = char_counts.get(a, 0) + 1
            total_pairs += 1

    entropy = 0.0
    for (a, b), count_ab in pair_counts.items():
        p_ab = count_ab / total_pairs
        p_b_given_a = count_ab / char_counts[a]
        entropy -= p_ab * math.log2(p_b_given_a)

    return entropy

def char_entropy_rowwise(series):
    """Return the Shannon entropy, in bits, of the characters in *series*.

    Null entries are dropped, the remaining entries are converted to
    strings, and the characters of all of them are pooled into a single
    frequency table. The result is ``-sum(p * log2(p))`` over that table.

    Returns 0.0 when the series holds no characters at all.
    """
    symbol_freq = {}

    for cell in series.dropna().astype(str):
        for symbol in cell:
            if symbol in symbol_freq:
                symbol_freq[symbol] += 1
            else:
                symbol_freq[symbol] = 1

    n_symbols = sum(symbol_freq.values())

    result = 0.0
    for n in symbol_freq.values():
        share = n / n_symbols
        result -= share * math.log2(share)

    return result

# === MAIN EXECUTION ===
# Load the input table; each column is scored independently below.
df = pd.read_csv(CSV_PATH)

# Character entropy per column (DataFrame.apply works column by column by default).
char_entropy_rowwise_vals = df.apply(char_entropy_rowwise)
char_entropy_rowwise_vals.name = "Char_Entropy_Rowwise"

# Conditional (given the preceding character) entropy per column.
conditional_entropy_rowwise_vals = df.apply(conditional_char_entropy_rowwise)
conditional_entropy_rowwise_vals.name = "Conditional_Char_Entropy_Rowwise"

# Put both measures side by side: one row per input column, one column per
# measure, labelled with the Series names assigned above.
entropy_df = pd.concat(
    [char_entropy_rowwise_vals, conditional_entropy_rowwise_vals],
    axis="columns",
)
print(entropy_df)

# Optional: persist the table next to the notebook.
entropy_df.to_csv("entropy_results_voyb.csv")


      Char_Entropy_Rowwise  Conditional_Char_Entropy_Rowwise
eva               3.860315                          1.954412
cuva              3.898963                          2.254932
