In [None]:
import pandas as pd
import cologne_phonetics
from statistics import mean
from num2words import num2words
from collections import Counter

https://www.mehner.info/html/alliteration.html

https://www.mehner.info/ladezone/sprache/alle-alliterationen-sammlung.html

In [None]:
# CSV read into `df_alliteration` below (the alliteration test set, judging by the file name)
PATH_TO_ALLITERATION_DATASET = "../../data/research_question_3/thesis/alliteration_test_set.csv"
# CSV read into `df_all` below; its "speech" column is what the detectors are applied to
PATH_TO_ALL_STATEMENTS = "../../data/protocol_obtainment/political_statements_thesis.csv"

### Example sentences

In [None]:
# German proverb ("The early bird catches the worm") used as the running example below
sentence = "Der frühe Vogel fängt den Wurm"

### General util functions

In [None]:
def replace_numbers_with_string(words):
    """Spell out purely numeric tokens as German number words.

    For example ``['40', 'Grad']`` is turned into ``['vierzig', 'Grad']``.

    Args:
        words: Iterable of string tokens.

    Returns:
        A new list with one entry per input token. Tokens that consist only
        of decimal digits are replaced by ``num2words(token, lang='de')``;
        every other token is kept unchanged.
    """
    # str.isdecimal() rather than str.isnumeric(): isnumeric() is also true
    # for characters such as '½', '²' or '四', which are not plain digit
    # strings and must not be handed to num2words as if they were numbers.
    return [
        num2words(word, lang='de') if word.isdecimal() else word
        for word in words
    ]

def get_sublists_of_size(input_list, size):
    """Return every contiguous window of length ``size`` from ``input_list``.

    Args:
        input_list: Sequence to slide the window over.
        size: Number of elements per window.

    Returns:
        A list of slices of ``input_list`` in left-to-right order. The list
        is empty when ``size`` is larger than the input.
    """
    total = len(input_list)

    # A window larger than the input cannot be placed anywhere.
    if size > total:
        return []

    return [input_list[start:start + size] for start in range(total - size + 1)]

def get_cologne_phonetics_result(word):
    """Return the leading character of the Cologne phonetics code of ``word``.

    Only the first entry returned by ``cologne_phonetics.encode`` is used.
    If the code in that entry is empty, the placeholder ``"-"`` is returned.
    """
    first_entry = cologne_phonetics.encode(word)[0]
    code = first_entry[1]

    if len(code) == 0:
        return "-"
    return code[0]

#### Example of the get_sublists_of_size function

In [None]:
# Lower-case the example sentence and split it on single spaces
words = sentence.lower().split(" ")
# Keep the first letter of every word that is longer than one character
letters = [word[0] for word in words if len(word) > 1]
# All windows of three consecutive first letters
subs = get_sublists_of_size(letters, 3)
subs

In [None]:
def check_for_step(sublists, step):
    """Look for equal symbols that are ``step`` positions apart inside each window.

    Args:
        sublists: List of equally long windows (lists of symbols).
        step: Distance between the two positions that are compared. Must be
            at least 1.

    Returns:
        A tuple ``(found, symbols)``. ``symbols`` holds, in order of
        discovery, the symbol of every pair ``window[i] == window[i + step]``;
        ``found`` is ``True`` when at least one such pair exists. The result
        is ``(False, [])`` when ``step`` is smaller than 1, when ``sublists``
        is empty, when the windows differ in length, or when ``step`` is
        larger than the window length.
    """
    matched_symbols = []

    # A step of 0 would compare every element with itself and a negative step
    # would wrap around via negative indexing; both yield false positives.
    if step < 1:
        return False, matched_symbols

    # All windows must have the same length (this also rejects an empty list).
    if len({len(window) for window in sublists}) != 1:
        return False, matched_symbols

    # The step must not be greater than the length of the windows.
    if step > len(sublists[0]):
        return False, matched_symbols

    for window in sublists:
        # zip stops at the shorter input, so only index pairs (i, i + step)
        # that lie inside the window are compared.
        for left, right in zip(window, window[step:]):
            if left == right:
                matched_symbols.append(left)

    return len(matched_symbols) > 0, matched_symbols

### Algorithm for finding alliterations based on size and steps

In [None]:
def get_most_common_as_tuple(lst):
    """Return the most frequent element of ``lst`` as an ``(element, count)`` tuple."""
    top_entries = Counter(lst).most_common(1)
    return top_entries[0]


def find_alliteration(sentence, size, steps=(), algo='std'):
    """Detect alliterations in ``sentence`` with a sliding window over word onsets.

    The sentence is lower-cased and split on single spaces, numeric tokens
    are spelled out via ``replace_numbers_with_string``, and every word
    longer than one character is reduced to one symbol: its first character
    (``algo='std'``) or the result of ``get_cologne_phonetics_result``
    (``algo='cologne'``). A window of ``size`` consecutive symbols counts as
    an alliteration when all its symbols are equal. In addition, each entry
    of ``steps`` is passed to ``check_for_step`` to look for equal symbols
    that are that many positions apart inside a window.

    Args:
        sentence: Text to analyse. Non-string values (e.g. ``None`` or the
            NaN pandas uses for missing text) yield ``(False, [])``.
        size: Window size in words; must be at least 1.
        steps: Iterable of distances for ``check_for_step``; ``None`` is
            treated like an empty iterable.
        algo: ``'std'`` or ``'cologne'``.

    Returns:
        A tuple ``(found, symbols)``: ``found`` tells whether any window or
        step check matched, ``symbols`` lists the matching symbols in order
        of discovery (duplicates are kept).

    Raises:
        ValueError: If ``algo`` is not supported or ``size`` is smaller than 1.
    """
    if algo not in ('std', 'cologne'):
        raise ValueError("Only standard (std) and cologne phonetics (cologne) is available as algorithm")
    if size < 1:
        raise ValueError("size must be at least 1")

    # Missing values in a DataFrame column arrive as float NaN / None.
    if not isinstance(sentence, str):
        return False, []

    words = replace_numbers_with_string(sentence.lower().split(" "))
    candidates = [word for word in words if len(word) > 1]

    if algo == 'std':
        letters = [word[0] for word in candidates]
    else:
        letters = [get_cologne_phonetics_result(word) for word in candidates]

    if len(letters) < 2:
        return False, []

    subsets = get_sublists_of_size(letters, size)

    if algo == 'cologne':
        # Ignore windows where '0' is the only code returned by cologne phonetics.
        subsets = [
            subset for subset in subsets
            if not all(symbol == '0' for symbol in subset)
        ]

    if len(subsets) < 1:
        return False, []

    found = False
    found_letters = []

    # Windows whose symbols are all identical.
    for subset in subsets:
        if all(symbol == subset[0] for symbol in subset):
            found = True
            found_letters.append(subset[0])

    # Equal symbols at a fixed distance inside the windows.
    for step in steps or ():
        step_found, step_letters = check_for_step(subsets, step)
        found = found or step_found
        found_letters.extend(step_letters)

    return found, found_letters

### Experiments

In [None]:
# Load the alliteration data set and the full statement table from the configured paths
df_alliteration = pd.read_csv(PATH_TO_ALLITERATION_DATASET)
df_all = pd.read_csv(PATH_TO_ALL_STATEMENTS)

In [None]:
# Display the loaded alliteration data set
df_alliteration

In [None]:
def has_at_least_one_alliteration_case(sentence):
    """Run four detector configurations on ``sentence`` and merge their results.

    ``find_alliteration`` is called with window size 2 (steps ``[1]``) and
    window size 3 (steps ``[1, 2]``), each once with the standard and once
    with the cologne algorithm.

    Returns:
        A tuple ``(found, symbols)`` where ``found`` is true when any of the
        four runs reported an alliteration and ``symbols`` is the set of all
        symbols reported by the four runs.
    """
    configurations = (
        (2, [1], 'std'),
        (3, [1, 2], 'std'),
        (2, [1], 'cologne'),
        (3, [1, 2], 'cologne'),
    )

    any_found = False
    all_letters = set()

    for window_size, window_steps, algorithm in configurations:
        outcome = find_alliteration(sentence, window_size, steps=window_steps, algo=algorithm)
        any_found = any_found or outcome[0]
        all_letters.update(outcome[1])

    return any_found, all_letters


def has_both_alliteration_cases(sentence, size):
    """Report which algorithm finds an alliteration of window size ``size``.

    Returns:
        A tuple ``(label, symbols)``. ``label`` is ``"Cologne"``,
        ``"Standard"``, ``"Both"`` or ``"Nothing"``; ``symbols`` is the string
        representation of the cologne symbols followed by the standard ones.
    """
    cologne_found, cologne_letters = find_alliteration(sentence, size, algo='cologne')
    standard_found, standard_letters = find_alliteration(sentence, size)

    letters_repr = str(cologne_letters + standard_letters)

    # Keyed by (cologne matched, standard matched); anything else is "Nothing".
    labels = {
        (True, False): "Cologne",
        (False, True): "Standard",
        (True, True): "Both",
    }
    return labels.get((cologne_found, standard_found), "Nothing"), letters_repr

In [None]:
# The sentences are read from the column labelled "0"
alliterations = list(df_alliteration["0"])

In [None]:
# Print every sentence for which none of the four detector configurations reports an alliteration
for elem in alliterations:
    if not has_at_least_one_alliteration_case(elem)[0]:
        print(elem)

In [None]:
# Total number of sentences, for comparison with the misses printed above
len(alliterations)

### Working with the dataframe that contains all the statements

In [None]:
# Window size 4 with the cologne algorithm; each cell holds the (found, symbols) tuple returned by find_alliteration
df_all["res_allit"] = df_all["speech"].apply(lambda sentence: find_alliteration(sentence,4,algo='cologne'))

In [None]:
# "res_allit" holds (found, symbols) tuples; the symbols list makes the tuples
# unhashable, so value_counts() on the raw column raises a TypeError.
# Count the boolean `found` flag instead.
df_all["res_allit"].map(lambda result: result[0]).value_counts()

In [None]:
# Each cell holds the (label, symbols) tuple returned by has_both_alliteration_cases for window size 4
df_all["res_both"] = df_all["speech"].apply(lambda sentence: has_both_alliteration_cases(sentence, 4))

In [None]:
# Frequency of each distinct (label, symbols) tuple
df_all["res_both"].value_counts()

In [None]:
# Same call as for the "res_both" column; presumably re-run after the detection rules were
# changed — TODO confirm whether both columns are still needed
df_all["res_both_updated"] = df_all["speech"].apply(lambda sentence: has_both_alliteration_cases(sentence, 4))

In [None]:
# Frequency of each distinct (label, symbols) tuple
df_all["res_both_updated"].value_counts()

In [None]:
# Wrapping the (label, symbols) tuple in a pd.Series spreads it over two columns
df_all[["res_both_last_version", "letters"]] = df_all["speech"].apply(lambda sentence: pd.Series(has_both_alliteration_cases(sentence, 4)))

In [None]:
# Number of statements per label ("Cologne", "Standard", "Both", "Nothing")
df_all["res_both_last_version"].value_counts()

In [None]:
# Frequency of each stringified symbol list
df_all["letters"].value_counts()

### Extract the data for the manual evaluation

In [None]:
#df_all[df_all["res_both_last_version"] == "Cologne"].sample(n=200).to_csv("cologne_200.csv",sep=";")

In [None]:
#df_all[df_all["res_both_last_version"] == "Both"].sample(n=200).to_csv("both_200.csv",sep=";")

In [None]:
#df_all[df_all["res_both_last_version"] == "Standard"].sample(n=200).to_csv("standard_200.csv",sep=";")

In [None]:
#df_all.to_csv("alliteration_final_results.csv",sep=";")

## Interesting examples

### Number has been found 

der ganz große [Dank, die 365 Tage im Jahr]

### Bad example as there are a lot of articles

der Theorie, dass die

### Example why the rules for number 3 are not that good

fangen wir wieder von vorne

vor Wahlen wieder Wahlzuckerl verteilen wollen

### Positive example of rules for number 3

für viele Frauen, für viele Familien

Fremdübernahmen von Firmen vollzogen

### Interesting example for g, c and k

keine Geschäfte, keine Cafés, keine Gasthöfe

### Negative example of the rules for number 8

sich zum Ziel setzt

## Letter examples

#### Standard contains alliterations with a, e, i, o, u

The reason for that is that we had to exclude 0 from cologne phonetics, as it included way too many different letters

### Examples where the same word is said over and over again (negative)

Impfen, impfen, impfen

Ja, ja, ja, ja

und, und, und, und

Aber, aber, aber aber

### Repetition (negative)

vollkommen verständlich, vollkommen verständlich

in Österreich zugelassen [sind, sind sicher, sind]

### Examples that showed up quite often

Unternehmerinnen und Unternehmern, und 

unserer Unternehmerinnen und Unternehmern

auch an alle anderen 

aber auch allen anderen

dass das dann die 

der dazu dient, dass das

dass derjenige, der das

### Negative because of different sound

nicht nur [sich selbst schützt, sondern]

### Negative because of special character in between

Was aber [wollen wir? Wir wollen]

### Negative because e and ei sound very different

es ein extrem erfreuliches,

Ich bringe auch einen entsprechenden Entschließungsantrag ein

### Ending sentence

einen Entschließungsantrag ein. Er lautet

### Contains another word that is only separated by a hyphen (negative)

aller AMA-Gütesiegelprodukte ausschließlich auf

ein eigenes EDV-System, eigene

### Edge Cases

-chöre, -orchester, -streichorchester, -blasmusikorchester 

"Lettland", "Finnland", "Schweden", "Norwegen"

Results can be reduced further by requiring all words to be in the same sentence and by also disallowing separators such as ":" between them

### Rule with g and k (negative)

keine Geschäfte, keine Cafés, keine Gasthöfe

### Th sounds like f, might be relevant for rule 3?

dass der Thinktank Think