In [1]:
import math
import os
import random
import re
from collections import Counter

import numpy as np
import pandas as pd
from nltk.probability import FreqDist, MLEProbDist, ConditionalFreqDist, ConditionalProbDist

In [2]:
# Mount Google Drive in the Colab runtime. The data-loading cells further down
# read their input files from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
def segment_surprisal(lexicon, syllables, language, sample_size=None, random_seed=None):
  '''
  Takes a lexicon and breaks it down into segments (phonemes or graphemes).
  Returns an N x 6 DataFrame, where N is the total number of segments, sorted
  by word_id and then by position. The columns are:

    word_id: The index of the word the segment belongs to (starting at 1).
             When sampling, this is the index within the sample.
    position: The position of the segment within the word (starting at 1)
    segment: The segment itself (phoneme or grapheme)
    surprisal: The surprisal in bits, -log2 P(segment | preceding segments),
               where P is the maximum-likelihood estimate over the (sampled)
               lexicon. Every word also counts once as "ending" after its last
               segment, so a word that is a prefix of longer words lowers the
               probability of their next segment.
    lang: The language code of the lexicon.
    syllable: Binary marker for syllable breaks, 1 for the start of a new syllable, 0 otherwise.

  Params:
    lexicon (list): List of vocab terms (strings, one character per segment).
    syllables (list): List of syllable breaks: one list of 0/1 flags per word,
                      with one flag per segment of that word.
    language (str): Language code for lexicon.
    sample_size (int): Number of samples to take from lexicon (without replacement).
                       If None (or 0), will take entire lexicon.
    random_seed (int): Random seed to make sampling reproducible.
                       If None, sampling will be non-reproducible.
                       If sample_size is None, this parameter is ignored.
                       The seed feeds a private generator, so NumPy's global
                       random state is never modified.

  Raises:
    ValueError: If sample_size exceeds the lexicon size, if lexicon and
                syllables differ in length, or if a word and its syllable
                flags differ in length.
  '''
  columns = ['word_id', 'position', 'segment', 'surprisal', 'lang', 'syllable']

  # Work on positional lists so indexing below does not depend on the input type.
  lexicon = list(lexicon)
  syllables = list(syllables)
  if len(lexicon) != len(syllables):
    raise ValueError(f'lexicon has {len(lexicon)} words but syllables has {len(syllables)} entries')

  # If sample_size is given, draw that many words (and their syllable flags).
  if sample_size:
    if sample_size > len(lexicon):
      raise ValueError(f'Sample size of {sample_size} is larger than lexicon size of {len(lexicon)}')
    # A private RandomState yields the same draw as seeding the global RNG
    # would, without clobbering a seed the caller may have set.
    rng = np.random.RandomState(random_seed)
    rand_idx = rng.choice(len(lexicon), size=sample_size, replace=False)
    vocab_list = [lexicon[i] for i in rand_idx]
    syll_list = [syllables[i] for i in rand_idx]

  # Otherwise, use full lexicon.
  else:
    vocab_list = lexicon
    syll_list = syllables

  # Fail early, with a message naming the word, instead of failing late when
  # the syllable column is attached.
  for word, flags in zip(vocab_list, syll_list):
    if len(word) != len(flags):
      raise ValueError(f'Word {word!r} has {len(word)} segments but {len(flags)} syllable flags')

  # prefix_counts[s] = number of words starting with s (s itself included).
  # A word that ends right after s is counted too, which plays the role of an
  # explicit end-of-word marker without reserving any character for it, so a
  # literal ">" in a transcription is treated like any other segment.
  prefix_counts = Counter(word[:end] for word in vocab_list for end in range(len(word) + 1))

  rows = []
  for w_id, (word, flags) in enumerate(zip(vocab_list, syll_list), start=1):
    for pos, (seg, flag) in enumerate(zip(word, flags), start=1):
      # MLE of P(seg | prefix): words continuing with seg / words sharing the prefix.
      prob = prefix_counts[word[:pos]] / prefix_counts[word[:pos - 1]]
      # prob <= 1, so -log2(prob) >= 0; abs() only turns -0.0 into 0.0.
      rows.append((w_id, pos, seg, abs(-math.log(prob, 2)), language, flag))

  # Rows are generated word by word and position by position, so the frame is
  # already sorted by (word_id, position) with a clean 0..N-1 index.
  return pd.DataFrame(rows, columns=columns)

In [4]:
#@title Czech

# Read in preprocessed language dataframe
df = pd.read_csv('/content/drive/MyDrive/CompLing Projects/Voynich/Data and scripts/SunPoeppelDatabases/czech_database_updated.csv', delimiter=';')
# Extract phonetic transcriptions and remove the stress markers % and '
# (the syllable break marker "-" is kept for the next step)
_phones = [re.sub("[%']", '', word) for word in df['phone_full']]
# Split word into syllables using syllable break marker "-"
breaks = [phone.split('-') for phone in _phones]
# Convert syllables into binary, with "1" at the start of each syllable and "0" following.
# Empty syllables (from a leading, trailing or doubled "-") hold no segments and are
# skipped; otherwise each would add a stray 1 and leave more flags than segments.
sylls = [[flag for seg in word if seg for flag in [1] + [0] * (len(seg) - 1)]
         for word in breaks]
# Remove syllable markers from phonetic transcriptions
phones = [word.replace('-', '') for word in _phones]
# Calculate surprisals
czech_surprisal = segment_surprisal(phones, sylls, 'cs', sample_size=None, random_seed=None)

In [None]:
#@title Dutch

# Read in preprocessed language dataframe
df = pd.read_csv('/content/drive/MyDrive/CompLing Projects/Voynich/Data and scripts/SunPoeppelDatabases/dutch_celex_database_updated.txt', delimiter='\t')
# Extract phonetic transcriptions and remove the stress markers % and '
# (the syllable break marker "-" is kept for the next step)
_phones = [re.sub("[%']", '', word) for word in df['phone_full']]
# Split word into syllables using syllable break marker "-"
breaks = [phone.split('-') for phone in _phones]
# Convert syllables into binary, with "1" at the start of each syllable and "0" following.
# Empty syllables (from a leading, trailing or doubled "-") hold no segments and are
# skipped; otherwise each would add a stray 1 and leave more flags than segments.
sylls = [[flag for seg in word if seg for flag in [1] + [0] * (len(seg) - 1)]
         for word in breaks]
# Remove syllable markers from phonetic transcriptions
phones = [word.replace('-', '') for word in _phones]
# Calculate surprisals
dutch_surprisal = segment_surprisal(phones, sylls, 'nl', sample_size=None, random_seed=None)

In [None]:
#@title English

# Read in preprocessed language dataframe
df = pd.read_csv('/content/drive/MyDrive/CompLing Projects/Voynich/Data and scripts/SunPoeppelDatabases/english_celex_database_updated.txt', delimiter='\t')
# Extract phonetic transcriptions and remove the stress markers % and '
# (the syllable break marker "-" is kept for the next step)
_phones = [re.sub("[%']", '', word) for word in df['phone_full']]
# Split word into syllables using syllable break marker "-"
breaks = [phone.split('-') for phone in _phones]
# Convert syllables into binary, with "1" at the start of each syllable and "0" following.
# Empty syllables (from a leading, trailing or doubled "-") hold no segments and are
# skipped; otherwise each would add a stray 1 and leave more flags than segments.
sylls = [[flag for seg in word if seg for flag in [1] + [0] * (len(seg) - 1)]
         for word in breaks]
# Remove syllable markers from phonetic transcriptions
phones = [word.replace('-', '') for word in _phones]
# Calculate surprisals
english_surprisal = segment_surprisal(phones, sylls, 'en', sample_size=None, random_seed=None)

In [None]:
#@title French

# Read in preprocessed language dataframe
df = pd.read_csv('/content/drive/MyDrive/CompLing Projects/Voynich/Data and scripts/SunPoeppelDatabases/french_lexique_database_updated.txt', delimiter='\t')
# Extract phonetic transcriptions and remove the stress markers % and '
# (the syllable break marker "-" is kept for the next step)
_phones = [re.sub("[%']", '', word) for word in df['phone_full']]
# Split word into syllables using syllable break marker "-"
breaks = [phone.split('-') for phone in _phones]
# Convert syllables into binary, with "1" at the start of each syllable and "0" following.
# Empty syllables (from a leading, trailing or doubled "-") hold no segments and are
# skipped; otherwise each would add a stray 1 and leave more flags than segments.
sylls = [[flag for seg in word if seg for flag in [1] + [0] * (len(seg) - 1)]
         for word in breaks]
# Remove syllable markers from phonetic transcriptions
phones = [word.replace('-', '') for word in _phones]
# Calculate surprisals
french_surprisal = segment_surprisal(phones, sylls, 'fr', sample_size=None, random_seed=None)

In [None]:
#@title German

# Read in preprocessed language dataframe
df = pd.read_csv('/content/drive/MyDrive/CompLing Projects/Voynich/Data and scripts/SunPoeppelDatabases/german_celex_database_updated.txt', delimiter='\t')
# Extract phonetic transcriptions and remove the stress markers % and '
# (the syllable break marker "-" is kept for the next step)
_phones = [re.sub("[%']", '', word) for word in df['phone_full']]
# Split word into syllables using syllable break marker "-"
breaks = [phone.split('-') for phone in _phones]
# Convert syllables into binary, with "1" at the start of each syllable and "0" following.
# Empty syllables (from a leading, trailing or doubled "-") hold no segments and are
# skipped; otherwise each would add a stray 1 and leave more flags than segments.
sylls = [[flag for seg in word if seg for flag in [1] + [0] * (len(seg) - 1)]
         for word in breaks]
# Remove syllable markers from phonetic transcriptions
phones = [word.replace('-', '') for word in _phones]
# Calculate surprisals
german_surprisal = segment_surprisal(phones, sylls, 'de', sample_size=None, random_seed=None)

In [None]:
#@title Greek

# Read in preprocessed language dataframe
df = pd.read_csv('/content/drive/MyDrive/CompLing Projects/Voynich/Data and scripts/SunPoeppelDatabases/greek_lex_database_updated.csv', delimiter=';')
# Extract phonetic transcriptions and remove the stress markers % and '
# (the syllable break marker "-" is kept for the next step)
_phones = [re.sub("[%']", '', word) for word in df['phone_full']]
# Split word into syllables using syllable break marker "-"
breaks = [phone.split('-') for phone in _phones]
# Convert syllables into binary, with "1" at the start of each syllable and "0" following.
# Empty syllables (from a leading, trailing or doubled "-") hold no segments and are
# skipped; otherwise each would add a stray 1 and leave more flags than segments.
sylls = [[flag for seg in word if seg for flag in [1] + [0] * (len(seg) - 1)]
         for word in breaks]
# Remove syllable markers from phonetic transcriptions
phones = [word.replace('-', '') for word in _phones]
# Calculate surprisals
greek_surprisal = segment_surprisal(phones, sylls, 'el', sample_size=None, random_seed=None)

In [None]:
#@title Italian

# Read in preprocessed language dataframe
df = pd.read_csv('/content/drive/MyDrive/CompLing Projects/Voynich/Data and scripts/SunPoeppelDatabases/italian_phon_database_updated.txt', delimiter='\t')
# Extract phonetic transcriptions and remove the stress markers % and '
# (the syllable break marker "-" is kept for the next step)
_phones = [re.sub("[%']", '', word) for word in df['phone_full']]
# Split word into syllables using syllable break marker "-"
breaks = [phone.split('-') for phone in _phones]
# Convert syllables into binary, with "1" at the start of each syllable and "0" following.
# Empty syllables (from a leading, trailing or doubled "-") hold no segments and are
# skipped; otherwise each would add a stray 1 and leave more flags than segments.
sylls = [[flag for seg in word if seg for flag in [1] + [0] * (len(seg) - 1)]
         for word in breaks]
# Remove syllable markers from phonetic transcriptions
phones = [word.replace('-', '') for word in _phones]
# Calculate surprisals
italian_surprisal = segment_surprisal(phones, sylls, 'it', sample_size=None, random_seed=None)

In [None]:
#@title Korean

# Read in preprocessed language dataframe
df = pd.read_csv('/content/drive/MyDrive/CompLing Projects/Voynich/Data and scripts/SunPoeppelDatabases/korean_kspan_database_updated.txt', delimiter='\t')
# Extract phonetic transcriptions and remove the stress markers % and '
# (the syllable break marker "-" is kept for the next step)
_phones = [re.sub("[%']", '', word) for word in df['phone_full']]
# Split word into syllables using syllable break marker "-"
breaks = [phone.split('-') for phone in _phones]
# Convert syllables into binary, with "1" at the start of each syllable and "0" following.
# Empty syllables (from a leading, trailing or doubled "-") hold no segments and are
# skipped; otherwise each would add a stray 1 and leave more flags than segments.
sylls = [[flag for seg in word if seg for flag in [1] + [0] * (len(seg) - 1)]
         for word in breaks]
# Remove syllable markers from phonetic transcriptions
phones = [word.replace('-', '') for word in _phones]
# Calculate surprisals
korean_surprisal = segment_surprisal(phones, sylls, 'ko', sample_size=None, random_seed=None)

In [None]:
#@title Norwegian

# Read in preprocessed language dataframe
df = pd.read_csv('/content/drive/MyDrive/CompLing Projects/Voynich/Data and scripts/SunPoeppelDatabases/norwegian_nst_database_updated.txt', delimiter='\t')
# Extract phonetic transcriptions and remove the stress markers % and '
# (the syllable break marker "-" is kept for the next step)
_phones = [re.sub("[%']", '', word) for word in df['phone_full']]
# Split word into syllables using syllable break marker "-"
breaks = [phone.split('-') for phone in _phones]
# Convert syllables into binary, with "1" at the start of each syllable and "0" following.
# Empty syllables (from a leading, trailing or doubled "-") hold no segments and are
# skipped; otherwise each would add a stray 1 and leave more flags than segments.
sylls = [[flag for seg in word if seg for flag in [1] + [0] * (len(seg) - 1)]
         for word in breaks]
# Remove syllable markers from phonetic transcriptions
phones = [word.replace('-', '') for word in _phones]
# Calculate surprisals
norwegian_surprisal = segment_surprisal(phones, sylls, 'no', sample_size=None, random_seed=None)

In [None]:
#@title Spanish

# Read in preprocessed language dataframe
df = pd.read_csv('/content/drive/MyDrive/CompLing Projects/Voynich/Data and scripts/SunPoeppelDatabases/spanish_bpal_database_updated.txt', delimiter='\t')
# Extract phonetic transcriptions and remove the stress markers % and '
# (the syllable break marker "-" is kept for the next step)
_phones = [re.sub("[%']", '', word) for word in df['phone_full']]
# Split word into syllables using syllable break marker "-"
breaks = [phone.split('-') for phone in _phones]
# Convert syllables into binary, with "1" at the start of each syllable and "0" following.
# Empty syllables (from a leading, trailing or doubled "-") hold no segments and are
# skipped; otherwise each would add a stray 1 and leave more flags than segments.
sylls = [[flag for seg in word if seg for flag in [1] + [0] * (len(seg) - 1)]
         for word in breaks]
# Remove syllable markers from phonetic transcriptions
phones = [word.replace('-', '') for word in _phones]
# Calculate surprisals
spanish_surprisal = segment_surprisal(phones, sylls, 'es', sample_size=None, random_seed=None)

In [None]:
#@title Swedish

# Read in preprocessed language dataframe
df = pd.read_csv('/content/drive/MyDrive/CompLing Projects/Voynich/Data and scripts/SunPoeppelDatabases/swedish_nst_database_updated.txt', delimiter='\t')
# Extract phonetic transcriptions and remove the stress markers % and '
# (the syllable break marker "-" is kept for the next step)
_phones = [re.sub("[%']", '', word) for word in df['phone_full']]
# Split word into syllables using syllable break marker "-"
breaks = [phone.split('-') for phone in _phones]
# Convert syllables into binary, with "1" at the start of each syllable and "0" following.
# Empty syllables (from a leading, trailing or doubled "-") hold no segments and are
# skipped; otherwise each would add a stray 1 and leave more flags than segments.
sylls = [[flag for seg in word if seg for flag in [1] + [0] * (len(seg) - 1)]
         for word in breaks]
# Remove syllable markers from phonetic transcriptions
phones = [word.replace('-', '') for word in _phones]
# Calculate surprisals (the result is kept in memory only; nothing is written to disk here)
swedish_surprisal = segment_surprisal(phones, sylls, 'sv', sample_size=None, random_seed=None)

In [None]:
#@title Turkish

# Read in preprocessed language dataframe
df = pd.read_csv('/content/drive/MyDrive/CompLing Projects/Voynich/Data and scripts/SunPoeppelDatabases/turkish_tell_database_updated.csv', delimiter=';')
# Extract phonetic transcriptions and remove the stress markers % and '
# (the syllable break marker "-" is kept for the next step)
_phones = [re.sub("[%']", '', word) for word in df['phone_full']]
# Split word into syllables using syllable break marker "-"
breaks = [phone.split('-') for phone in _phones]
# Convert syllables into binary, with "1" at the start of each syllable and "0" following.
# Empty syllables (from a leading, trailing or doubled "-") hold no segments and are
# skipped; otherwise each would add a stray 1 and leave more flags than segments.
sylls = [[flag for seg in word if seg for flag in [1] + [0] * (len(seg) - 1)]
         for word in breaks]
# Remove syllable markers from phonetic transcriptions
phones = [word.replace('-', '') for word in _phones]
# Calculate surprisals (the result is kept in memory only; nothing is written to disk here)
turkish_surprisal = segment_surprisal(phones, sylls, 'tr', sample_size=None, random_seed=None)