In [None]:
import pandas as pd
import cologne_phonetics
from typing import Callable, List

In [None]:
# Location of the alliteration test set (CSV file). The path is relative, so it is
# resolved against the directory the notebook kernel is running in.
PATH_TO_TEST_DATASET = "../../data/alliteration_detection/alliteration_test_set.csv"

# Alliteration Detection

In [None]:
def has_at_least_one_alliteration_case(sentence: str) -> bool:
    """Return True if the sentence matches at least one of four alliteration cases.

    The cases are checked in this order and evaluation stops at the first match:
      case_1: two directly neighbouring words start with the same letter
              (compared in lowercase). Example: "alle achtung!"
      case_2: a word and the word two positions after it start with the same
              letter (compared in lowercase). Example: "alles ist abverkauft"
      case_3: same as case_1, but the first digit of the cologne phonetics
              encoding of each word is compared.
      case_4: same as case_2, but the first digit of the cologne phonetics
              encoding of each word is compared.

    Args:
        sentence: The sentence to inspect.

    Returns:
        True if any of the four cases applies, otherwise False.
    """
    # Textual comparison first, then the phonetic one; each with gap 1 and gap 2.
    preprocessors = (get_preprocessed_sentence_as_list, get_cologne_encoding_as_list)
    gaps = (1, 2)

    # any() short-circuits, so later cases are only computed when the earlier ones fail.
    return any(
        contains_alliteration(sentence, gap, preprocess)
        for preprocess in preprocessors
        for gap in gaps
    )


def contains_alliteration(sentence: str, gap: int,
                          sentence_preprocessing_func: Callable[[str], List[str]]) -> bool:
    """Check whether a sentence contains an alliteration at the given word distance.

    The sentence is first converted into a list of tokens by
    ``sentence_preprocessing_func``. Two functions are intended for this:
        get_preprocessed_sentence_as_list(str) -> List[str]   (textual comparison)
        get_cologne_encoding_as_list(str) -> List[str]        (phonetic comparison)

    Args:
        sentence: The sentence to inspect.
        gap: Distance between the two compared tokens (1 = direct neighbours,
            2 = one token in between, ...).
        sentence_preprocessing_func: Callable turning the sentence into a list of tokens.

    Returns:
        True if at least one pair of tokens that are ``gap`` positions apart
        starts with the same character, otherwise False.
    """
    tokens = sentence_preprocessing_func(sentence)

    # Stop at the first matching pair instead of collecting a result for every index.
    return any(
        has_word_with_same_letter_at_gap(index, gap, tokens)
        for index in range(len(tokens))
    )

def get_preprocessed_sentence_as_list(sentence: str) -> List[str]:
    """Split a sentence into its lowercase words.

    The sentence is lowercased and split on any run of whitespace, so repeated
    spaces, tabs or newlines do not produce empty tokens that would shift the
    distance between words.

    Args:
        sentence: The sentence to split. Non-string values (e.g. a NaN coming
            from a missing value in a DataFrame) are tolerated.

    Returns:
        The list of lowercase words. For non-string input, or a string without
        any words, the list ``[""]`` is returned, so the result is never empty.
    """
    if not isinstance(sentence, str):
        return [""]

    words = sentence.lower().split()
    # Keep the original contract of returning [""] when there is nothing to split.
    return words or [""]

def get_cologne_encoding_as_list(sentence: str) -> List[str]:
    """Return the cologne phonetics encoding of each word in the sentence.

    Args:
        sentence: The sentence to encode. Non-string values (e.g. a NaN coming
            from a missing value in a DataFrame) are tolerated.

    Returns:
        A list with one encoding string per (word, encoding) pair produced by
        ``cologne_phonetics.encode``. For non-string input ``[""]`` is returned,
        mirroring get_preprocessed_sentence_as_list.
    """
    # Guard against missing values so the phonetic cases do not crash on them.
    if not isinstance(sentence, str):
        return [""]

    # encode() yields (word, encoding) tuples; only the encoding is needed here.
    return [encoding for _, encoding in cologne_phonetics.encode(sentence)]

def has_word_with_same_letter_at_gap(starting_index: int, gap: int,
                                     preprocessed_sentence: List[str]) -> bool:
    """Compare the first character of two tokens that are ``gap`` positions apart.

    ``preprocessed_sentence`` is a list of tokens, produced either by
    get_cologne_encoding_as_list(str) -> List[str] or by
    get_preprocessed_sentence_as_list(str) -> List[str].

    Args:
        starting_index: Index of the first token.
        gap: Offset from the first token to the second token.
        preprocessed_sentence: The list of tokens.

    Returns:
        True if both tokens exist, are non-empty and begin with the same
        character; False otherwise.
    """
    partner_index = starting_index + gap

    # The second token would lie beyond the end of the sentence.
    if partner_index >= len(preprocessed_sentence):
        return False

    # An empty first token has no first character to compare.
    first_token = preprocessed_sentence[starting_index]
    if len(first_token) == 0:
        return False

    # Same for an empty second token.
    second_token = preprocessed_sentence[partner_index]
    if len(second_token) == 0:
        return False

    return first_token[0] == second_token[0]

A test dataset containing 605 alliterations is read into pandas. It is then used to test the functions developed above. The dataset was generated using the code in the `alliteration_scraper.ipynb` notebook.

In [None]:
# Load the test set from the CSV file into a DataFrame.
alliteration_df = pd.read_csv(PATH_TO_TEST_DATASET)
# Overwrite the column labels with fixed names.
# NOTE(review): assumes the file has exactly two columns, the label first and the
# sentence second — confirm against the CSV.
alliteration_df.columns = ["is_alliteration","sentence"]

In [None]:
# Sanity check of the loaded frame: column dtypes and non-null counts.
alliteration_df.info()

In [None]:
# Preview the first ten rows of the test set.
alliteration_df.head(10)

In [None]:
# Plain Python list of all sentences in the test set.
alliterations = alliteration_df["sentence"].tolist()

Currently, 5 of the 605 cases are not recognized; the cell below prints them. Further tests need to be executed.

In [None]:
# Print every sentence of the test set for which none of the four cases matches.
for elem in alliterations:
    if has_at_least_one_alliteration_case(elem):
        continue
    # Not recognized: show it for manual inspection.
    print(elem)