In [None]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import numpy as np

### Preparation

In [None]:
# Folder prefix that every plotting cell prepends to the PDF file name it saves.
PDF_FOLDER_CHOSEN = 'plots/'
# Folder prefix that is prepended to the CSV dataset file names.
DATA_FOLDER_CHOSEN = 'data/'

In [None]:
# Paths of the three labelled CSV files, all located under DATA_FOLDER_CHOSEN.
PATH_TO_DATASET_BOTH = f"{DATA_FOLDER_CHOSEN}both_200_done.csv"
PATH_TO_DATASET_COLOGNE = f"{DATA_FOLDER_CHOSEN}cologne_200_done.csv"
PATH_TO_DATASET_STANDARD = f"{DATA_FOLDER_CHOSEN}standard_200_done.csv"

#### Colors

In [None]:
# Eight-colour palette as hex strings.
# NOTE(review): the values look like the Okabe-Ito colour-blind-friendly palette — confirm the source.
HEX_COLORS = [
    '#000000',
    '#E69F00',
    '#56B4E9',
    '#009E73',
    '#F0E442',
    '#0072B2',
    '#D55E00',
    '#CC79A7',
]
# The same palette converted with matplotlib's hex2color.
RGB_COLORS = list(map(mcolors.hex2color, HEX_COLORS))

# Default bar colour ('#56B4E9') and the contrasting second colour ('#E69F00').
BAR_CHART_COLOR = RGB_COLORS[2]
BAR_CHART_ORANGE = RGB_COLORS[1]

#### Read the datasets

In [None]:
# Load the three CSV files into DataFrames. Later cells read their
# "label", "letters" and "text" columns.
df_both = pd.read_csv(PATH_TO_DATASET_BOTH)
df_cologne = pd.read_csv(PATH_TO_DATASET_COLOGNE)
df_standard = pd.read_csv(PATH_TO_DATASET_STANDARD)

In [None]:
# Display the raw label column of the Cologne dataset; its values are normalized in the Preprocessing section.
df_cologne["label"]

#### Preprocessing

In [None]:
def preprocess_label_names(label):
    """Map the label "Has Alliteration" to "Alliteration"; return any other label unchanged."""
    return "Alliteration" if label == "Has Alliteration" else label

In [None]:
# Rename "Has Alliteration" to "Alliteration", the label name the plotting functions filter on.
df_cologne["label"] = df_cologne["label"].apply(preprocess_label_names)

#### Cologne Phonetics Dict

In [None]:
# Maps a Cologne phonetics code (as a string) to the letters that are shown for it
# when a code is expanded into its "full" letter representation.
# A letter may be listed under more than one code.
cologne_dict = {
    "1": ["b", "p"],
    "2": ["d", "t"],
    "3": ["f", "v", "w", "p"],
    "4": ["g", "k", "q", "c"],
    "48": ["x"],
    "5": ["l"],
    "6": ["m", "n"],
    "7": ["r"],
    "8": ["s", "z", "c", "d", "t", "x"],
}

### Building plots

#### Draw value counts plots

In [None]:
def draw_value_counts_bar_chart(df, color, title, destination, filename):
    """Plot how many rows carry each alliteration label and save the chart as a PDF.

    Args:
        df: DataFrame with a "label" column.
        color: Bar colour passed to matplotlib.
        title: Title of the chart.
        destination: Folder prefix (including the trailing separator) for the PDF.
        filename: File name without the ".pdf" extension.
    """
    desired_order = ['Alliteration', 'No Alliteration']

    # reindex keeps the fixed category order like .loc did, but reports a
    # category that does not occur in df as 0 instead of raising KeyError.
    value_counts = df["label"].value_counts().reindex(desired_order, fill_value=0)

    # A dedicated figure ensures that bars of a previously drawn plot can
    # never end up in this chart.
    fig, ax = plt.subplots()
    ax.bar(value_counts.index, value_counts.values, color=color)
    ax.set_xlabel('Categories')
    ax.set_ylabel('Amount')
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(destination + filename + '.pdf', format='pdf')

In [None]:
draw_value_counts_bar_chart(
    df=df_both,
    color=BAR_CHART_COLOR,
    title="Amount of alliterations (both)",
    destination=PDF_FOLDER_CHOSEN,
    filename="values_counts_both",
)

In [None]:
draw_value_counts_bar_chart(
    df=df_cologne,
    color=BAR_CHART_COLOR,
    title="Amount of alliterations (cologne)",
    destination=PDF_FOLDER_CHOSEN,
    filename="values_counts_cologne",
)

In [None]:
draw_value_counts_bar_chart(
    df=df_standard,
    color=BAR_CHART_COLOR,
    title="Amount of alliterations (standard)",
    destination=PDF_FOLDER_CHOSEN,
    filename="values_counts_standard",
)

#### Produce plot that shows the most common sets of numbers and letters

In [None]:
def transform_string_list_to_string_set(row):
    """Turn the text form of a list (e.g. "['a', 'b']") into a set of its items.

    Quotes, square brackets and spaces are removed before the text is split
    at commas. An input without any item ("[]") yields a set that contains
    the empty string.
    """
    cleaned = row
    for unwanted in ("'", "[", "]", " "):
        cleaned = cleaned.replace(unwanted, "")

    items = cleaned.split(",")
    items.sort()

    return set(items)

def transform_letters_column_to_full_representation(row):
    """Parse the text form of a list and expand Cologne codes into their letters.

    Every item that is a key of cologne_dict is replaced by all letters listed
    for that key; every other item is kept as it is. The result is returned
    as a set.
    """
    cleaned = row
    for unwanted in ("'", "[", "]", " "):
        cleaned = cleaned.replace(unwanted, "")

    expanded = []
    for item in cleaned.split(","):
        # Unknown items fall back to a one-element list holding the item itself.
        expanded += cologne_dict.get(item, [item])

    return set(expanded)

def draw_bar_of_letter_value_counts(df, column, color, title, destination, filename):
    """Plot the ten most frequent values of a column and save the chart as a PDF.

    Args:
        df: DataFrame that contains the column; it is not modified.
        column: Name of the column whose values are counted by their str() form.
        color: Bar colour passed to matplotlib.
        title: Title of the chart.
        destination: Folder prefix (including the trailing separator) for the PDF.
        filename: File name without the ".pdf" extension.
    """
    # Count the string form on a local Series. Writing it back into df would
    # replace the set objects that find_words_based_on_letters_set relies on.
    values_as_text = df[column].apply(str)
    top_counts = values_as_text.value_counts().head(10)

    # A dedicated figure keeps bars of earlier plots out of this chart.
    fig, ax = plt.subplots()
    ax.bar(top_counts.index, top_counts.values, color=color)
    ax.tick_params(axis='x', labelrotation=90)
    ax.set_xlabel('Letter combination')
    ax.set_ylabel('Amount')
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(destination + filename + '.pdf', format='pdf')
    
    
# additional methods for separation based on whether they are in an alliteration or not
def get_top_10_letter_sets_as_string(df, column):
    """Return the ten most frequent values of a column in their str() form.

    Args:
        df: DataFrame that contains the column; it is not modified.
        column: Name of the column to count.

    Returns:
        pandas Index with up to ten strings, most frequent first.
    """
    # The values are converted on a local Series only. Overwriting df[column]
    # would turn the stored sets into strings, and the `in` check in
    # find_words_based_on_letters_set would then become a substring match.
    values_as_text = df[column].apply(str)

    return values_as_text.value_counts().head(10).index

def get_counts_for_positive_and_negative_case(df, column, letter_set):
    """Count the rows of one letter set, split by the alliteration label.

    Args:
        df: DataFrame with the given column and a "label" column; it is not modified.
        column: Name of the column that is compared by its str() form.
        letter_set: String form of the letter set to look for.

    Returns:
        Tuple (positive, negative): the number of matching rows whose label is
        "Alliteration" and the number of matching rows with any other label.
        Both are 0 when no row matches.
    """
    # Compare on a local string Series so the objects stored in df stay untouched.
    values_as_text = df[column].apply(str)
    is_alliteration = df.loc[values_as_text == letter_set, "label"] == "Alliteration"

    # Summing the boolean mask needs no exception handling for the cases in
    # which only one outcome (or none at all) is present.
    positive = int(is_alliteration.sum())
    negative = len(is_alliteration) - positive

    return positive, negative

def get_counts_for_each_letter_set(df, column, letter_sets):
    """Collect the positive and negative counts of several letter sets.

    Returns:
        Two lists (positive counts, negative counts), each in the order of
        letter_sets.
    """
    count_pairs = [
        get_counts_for_positive_and_negative_case(df, column, letter_set)
        for letter_set in letter_sets
    ]

    positives = [positive for positive, _ in count_pairs]
    negatives = [negative for _, negative in count_pairs]

    return positives, negatives

def draw_letter_set_plot_with_true_false_sep(df, column, title, destination, filename):
    """Draw a grouped bar chart of the ten most frequent letter sets and save it as a PDF.

    Each letter set gets two bars: the rows labelled "Alliteration" and the
    remaining rows. The letter sets come from get_top_10_letter_sets_as_string,
    the counts from get_counts_for_each_letter_set.

    Args:
        df: DataFrame with the given column and a "label" column.
        column: Name of the column that holds the letter sets.
        title: Title of the chart.
        destination: Folder prefix (including the trailing separator) for the PDF.
        filename: File name without the ".pdf" extension.
    """
    letter_sets = get_top_10_letter_sets_as_string(df, column)
    positive_counts, negative_counts = get_counts_for_each_letter_set(df, column, letter_sets)

    # One slot per letter set; the two bars sit left and right of its centre.
    width = 0.35
    positions = np.arange(len(letter_sets))

    fig, ax = plt.subplots()
    ax.bar(positions - width/2, positive_counts, width, label='Alliteration', color=BAR_CHART_COLOR)
    ax.bar(positions + width/2, negative_counts, width, label='No Alliteration', color=BAR_CHART_ORANGE)

    ax.set_title(title)
    ax.set_xlabel('Letter combination')
    ax.set_ylabel('Amount')
    ax.set_xticks(positions, labels=letter_sets, rotation=90)
    ax.legend()

    fig.tight_layout()
    fig.savefig(destination + filename + '.pdf', format='pdf')

In [None]:
# Derive two set-valued columns from the raw "letters" text of every dataset:
# "letters_full_set" has the Cologne codes expanded into letters,
# "letters_set" keeps the items as they are.
for frame in (df_both, df_cologne, df_standard):
    frame["letters_full_set"] = frame["letters"].apply(transform_letters_column_to_full_representation)
    frame["letters_set"] = frame["letters"].apply(transform_string_list_to_string_set)


#### Letter set

In [None]:
draw_letter_set_plot_with_true_false_sep(
    df=df_both,
    column="letters_set",
    title="Most common combinations (both)",
    destination=PDF_FOLDER_CHOSEN,
    filename="letters_set_both",
)

In [None]:
draw_letter_set_plot_with_true_false_sep(
    df=df_cologne,
    column="letters_set",
    title="Most common combinations (cologne)",
    destination=PDF_FOLDER_CHOSEN,
    filename="letters_set_cologne",
)

In [None]:
draw_letter_set_plot_with_true_false_sep(
    df=df_standard,
    column="letters_set",
    title="Most common combinations (standard)",
    destination=PDF_FOLDER_CHOSEN,
    filename="letters_set_standard",
)

#### Full letter set

In [None]:
draw_letter_set_plot_with_true_false_sep(
    df=df_both,
    column="letters_full_set",
    title="Most common combinations transformed (both)",
    destination=PDF_FOLDER_CHOSEN,
    filename="full_letters_set_both",
)

In [None]:
draw_letter_set_plot_with_true_false_sep(
    df=df_cologne,
    column="letters_full_set",
    title="Most common combinations transformed (cologne)",
    destination=PDF_FOLDER_CHOSEN,
    filename="full_letters_set_cologne",
)

### Find the most common words

In [None]:
def find_words_based_on_letters_set(row, global_count_dict):
    """Count the words of a text whose first character belongs to the row's letter set.

    Args:
        row: Mapping-like row with the keys "text" (words separated by single
            spaces) and "letters_full_set" (a set of letters, or the str()
            form of such a set).
        global_count_dict: Dict from word to count; it is updated in place.
    """
    letters_set = row["letters_full_set"]

    # The plotting helpers may have replaced the sets in this column by their
    # str() form (e.g. "{'b', 'p'}"). `in` on that string would be a substring
    # match that also accepts quotes, commas and braces, so the quoted items
    # are parsed back into a real set first.
    if isinstance(letters_set, str):
        letters_set = set(re.findall(r"'([^']*)'", letters_set))

    for word in row["text"].split(" "):
        # Consecutive, leading or trailing spaces produce empty tokens,
        # which have no first letter.
        if not word:
            continue

        if word[0] in letters_set:
            global_count_dict[word] = global_count_dict.get(word, 0) + 1
            
def get_most_common_words(df, amount):
    """Return the most frequent words that start with a letter of their row's letter set.

    Args:
        df: DataFrame whose rows are passed to find_words_based_on_letters_set.
        amount: Maximum number of (word, count) pairs to return.

    Returns:
        List of (word, count) tuples, sorted by count in descending order.
    """
    tally = {}

    # The helper fills `tally` as a side effect; the result of apply is not needed.
    df.apply(lambda row: find_words_based_on_letters_set(row, tally), axis=1)

    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)

    return ranked[:amount]


### Store the count of each word that was found based on the alliteration letter

In [None]:
def plot_word_count_bar_chart(df, label, color, title, destination, filename, top_n=15):
    """Plot the most frequent alliteration-letter words of one label and save the chart as a PDF.

    Args:
        df: DataFrame with a "label" column plus the columns that
            get_most_common_words reads.
        label: Only rows whose "label" equals this value are considered.
        color: Bar colour passed to matplotlib.
        title: Title of the chart.
        destination: Folder prefix (including the trailing separator) for the PDF.
        filename: File name without the ".pdf" extension.
        top_n: Number of words to show; defaults to 15.
    """
    word_count_tuples = get_most_common_words(df[df["label"] == label], top_n)

    # zip(*[]) yields nothing to unpack, so a label without any matching
    # word results in an empty chart instead of a ValueError.
    if word_count_tuples:
        labels, values = zip(*word_count_tuples)
    else:
        labels, values = (), ()

    # A dedicated figure keeps bars of earlier plots out of this chart.
    fig, ax = plt.subplots()
    ax.bar(labels, values, color=color)
    ax.tick_params(axis='x', labelrotation=90)
    ax.set_xlabel('Words')
    ax.set_ylabel('Amount')
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(destination + filename + '.pdf', format='pdf')

In [None]:
plot_word_count_bar_chart(
    df=df_both,
    label="Alliteration",
    color=BAR_CHART_COLOR,
    title="Most common words in speech with alliteration (both)",
    destination=PDF_FOLDER_CHOSEN,
    filename="word_count_alliteration_both",
)

In [None]:
plot_word_count_bar_chart(
    df=df_both,
    label="No Alliteration",
    color=BAR_CHART_COLOR,
    title="Most common words in speech without alliteration (both)",
    destination=PDF_FOLDER_CHOSEN,
    filename="word_count_no_alliteration_both",
)

In [None]:
plot_word_count_bar_chart(
    df=df_cologne,
    label="Alliteration",
    color=BAR_CHART_COLOR,
    title="Most common words in speech with alliteration (cologne)",
    destination=PDF_FOLDER_CHOSEN,
    filename="word_count_alliteration_cologne",
)

In [None]:
plot_word_count_bar_chart(
    df=df_cologne,
    label="No Alliteration",
    color=BAR_CHART_COLOR,
    title="Most common words in speech without alliteration (cologne)",
    destination=PDF_FOLDER_CHOSEN,
    filename="word_count_no_alliteration_cologne",
)

In [None]:
plot_word_count_bar_chart(
    df=df_standard,
    label="Alliteration",
    color=BAR_CHART_COLOR,
    title="Most common words in speech with alliteration (standard)",
    destination=PDF_FOLDER_CHOSEN,
    filename="word_count_alliteration_standard",
)

In [None]:
plot_word_count_bar_chart(
    df=df_standard,
    label="No Alliteration",
    color=BAR_CHART_COLOR,
    title="Most common words in speech without alliteration (standard)",
    destination=PDF_FOLDER_CHOSEN,
    filename="word_count_no_alliteration_standard",
)

### Evaluation of the alliteration dataset

In [None]:
# Hard-coded counts, in the same order as `labels`.
# NOTE(review): presumably copied from a separate evaluation run — confirm the source.
values = [601, 4]
labels = ['Correctly detected', 'Not detected']

# Draw one bar per category on the current pyplot figure
plt.bar(labels, values, color=BAR_CHART_COLOR)
plt.ylabel('Amount')
plt.title('Amount of detected and undetected alliterations in alliteration dataset')

# Tighten the layout and save the figure as a PDF
plt.tight_layout()
plt.savefig(PDF_FOLDER_CHOSEN + "res_allit_dataset" + '.pdf', format='pdf')

### Plot for the overall evaluation

Nothing     61241
Cologne      1592
Both          792
Standard      284

In [None]:
# Hard-coded counts, in the same order as `labels`; the numbers match the
# table in the markdown cell directly above (its "Nothing" row is shown as 'None').
values = [61241, 1592, 792, 284]
labels = ['None', 'Cologne Phonetics', 'Both', 'Standard']

# Draw one bar per category on the current pyplot figure
plt.bar(labels, values, color=BAR_CHART_COLOR)
plt.ylabel('Amount')
plt.title('Amount of each different set detected by the algorithm')

# Tighten the layout and save the figure as a PDF
plt.tight_layout()
plt.savefig(PDF_FOLDER_CHOSEN + "res_political_dataset" + '.pdf', format='pdf')