## <span style='font-family:Georgia'> Objectives </span>
The purpose of this notebook is to begin building a baseline rule-based model, which will serve as a benchmark for the neural model developed in a later phase of the project.

In [5]:
import os
import re
import seaborn as sns
import numpy as np
import pandas as pd
from tqdm import tqdm
from IPython.display import display, Markdown
import matplotlib.pyplot as plt

from supportive_functions import rm_consecutive_spaces


# Notebook-wide seaborn/matplotlib defaults: figure size of 14 x 8.27,
# white axes background and a light-grey grid on every plot.
sns.set(
    rc={
        "figure.figsize": (14, 8.27),
        "axes.facecolor": "white",
        "axes.grid": True,
        "grid.color": ".9",
    }
)

In [6]:
def read_files(dir_path: str, sep: str = ",") -> pd.DataFrame:
    """
    Read the files with timestamps from a given catalog.

    Every line of a file is parsed as ``(start,stop) word``. The file name up
    to its first dot is stored in the ``index`` column so that rows can later
    be grouped per file. The last row of every file is discarded.

    Args:
        dir_path: Path to the catalog with the files
        sep: Separator used in pd.read_csv() function. Defaults to ",".
            With "," the timestamps land in two separate CSV columns; with any
            other separator the whole line is expected in the first column.

    Returns:
        pd.DataFrame: Data from all the files concatenated into one dataframe
        with the columns "index", "timestamp_start", "timestamp_stop" and
        "word" (timestamps are left as text; no numeric conversion is done).
        The row labels of the individual files are kept, so they repeat.
    """
    columns = ["index", "timestamp_start", "timestamp_stop", "word"]
    frames = []
    # Sorted so the row order does not depend on the OS directory listing.
    for file in tqdm(sorted(os.listdir(dir_path))):
        name = file.split(".")[0]
        # The context manager closes the handle even if parsing raises.
        with open(os.path.join(dir_path, file), encoding="utf-8", mode="r") as f:
            df = pd.read_csv(f, header=None, sep=sep, encoding="utf-8")
        df["index"] = name
        if sep != ",":
            # Whole line is in column 0: "(start,stop) word".
            raw = df.iloc[:, 0]
            df["timestamp_start"] = raw.str.split(",").str[0].str[1:]
            df["timestamp_stop"] = raw.str.split(",").str[1].str.split(r"\)").str[0]
            df["word"] = raw.str.split(r"\)+\s").str[1]
            df = df.drop([0], axis=1)
        else:
            # The comma already split the line: column 0 is "(start",
            # column 1 is "stop) word".
            rest = df.iloc[:, 1]
            df["timestamp_start"] = df.iloc[:, 0].str[1:]
            df["timestamp_stop"] = rest.str.split(r"\)+\s").str[0]
            df["word"] = rest.str.split(r"\)+\s").str[1]
            df = df.drop([0, 1], axis=1)
        # Discard the final row of the file
        # (presumably an end-of-file marker line - TODO confirm).
        frames.append(df.iloc[:-1])

    if not frames:
        return pd.DataFrame([], columns=columns)

    # Single concat instead of growing the frame inside the loop.
    data = pd.concat(frames)
    # Keep the four documented columns first; any unexpected extras follow.
    extra = [col for col in data.columns if col not in columns]
    return data[columns + extra]

In [3]:
def calculate_pauses(data: pd.DataFrame) -> pd.DataFrame:
    """
    Generate new features describing pauses between two timestamps.

    The input frame is not modified; a new frame with a fresh 0..n-1 row
    index is returned.

    Args:
        data: Dataframe with preprocessed timestamp data separated
        into 'timestamp_start' and 'timestamp_stop' columns. It must have
        exactly four columns, positionally: file id, timestamp_start,
        timestamp_stop, word. Rows of one file must be contiguous.

    Returns:
        pd.DataFrame: Input dataframe with two new features added,
        describing the pause before the word and after it:
            pause_before - start of the word minus stop of the previous word;
                for the first word of a file it is the word's own start.
            pause_after - start of the next word minus stop of the word;
                for the last word of a file it is 0.
        Both features are floats.

    Raises:
        ValueError: If the input does not have exactly four columns or a
        timestamp cannot be converted to int.
    """
    columns = ["index", "timestamp_start", "timestamp_stop", "word"]

    # reset_index returns a new frame, so the caller's data stays untouched,
    # and it removes the duplicated row labels left by concatenating files.
    result = data.reset_index(drop=True)
    result["timestamp_start"] = result["timestamp_start"].astype(int)
    result["timestamp_stop"] = result["timestamp_stop"].astype(int)
    # Columns are labelled by position, matching the original contract.
    result.columns = columns

    file_id = result["index"]
    # False on the first/last row of every file (shift() yields NaN at the
    # frame edges, which never compares equal).
    same_file_as_prev = file_id.eq(file_id.shift())
    same_file_as_next = file_id.eq(file_id.shift(-1))

    gap_before = result["timestamp_start"] - result["timestamp_stop"].shift()
    gap_after = result["timestamp_start"].shift(-1) - result["timestamp_stop"]

    # to discuss: boundary conventions (own start before the first word,
    # zero after the last word of a file).
    result["pause_before"] = gap_before.where(
        same_file_as_prev, result["timestamp_start"]
    )
    result["pause_after"] = gap_after.where(same_file_as_next, 0)

    return result


## <span style='font-family:Georgia'> Data preparation & pauses calculation </span>

### <span style='font-family:Georgia'> Train data </span>

In [7]:
# Load the train forced-alignment files (tab-separated), compute the pause
# features and keep only the file id, the word and the two pause columns.
data_train = read_files("./data/forced-alignment/train", sep="\t")

data_train_calc = calculate_pauses(data_train).drop(
    columns=["timestamp_start", "timestamp_stop"]
)

100%|██████████| 793/793 [00:24<00:00, 32.04it/s]
100%|██████████| 200/200 [00:04<00:00, 49.80it/s]


In [8]:
# Preview the first rows of the train pause features.
data_train_calc.head(3)

Unnamed: 0,index,word,pause_before,pause_after
0,wikinews178430,we,690.0,90.0
1,wikinews178430,wrocławiu,90.0,300.0
2,wikinews178430,walkę,300.0,210.0


In [10]:
### train symbols and noisy words
# Each list file holds one entry per line. The files are opened in `with`
# blocks so the handles are closed as soon as the content has been read.

# load list of symbols to replace
with open(
    "./data/out/eda/symbols_to_replace.txt", "r", encoding="utf-8"
) as symbols_to_replace_infile:
    symbols_to_replace = symbols_to_replace_infile.read().splitlines()

# load list of noisy words, i.e. words with letters from outside the Polish alphabet
with open(
    "./data/out/eda/noisy_words.txt", "r", encoding="utf-8"
) as noisy_words_infile:
    noisy_words = noisy_words_infile.read().splitlines()

# load list of letters from outside the Polish alphabet
with open(
    "./data/out/eda/non_polish_letters.txt", "r", encoding="utf-8"
) as non_polish_letters_infile:
    non_polish_letters = non_polish_letters_infile.read().splitlines()

# merge noisy data into one list (order matters for the removal loop:
# symbols first, then noisy words, then single letters)
symbols_to_replace.extend(noisy_words)
symbols_to_replace.extend(non_polish_letters)

In [13]:
# joining data into whole records:
# one row per file id, with all of its words concatenated by single spaces
data_train_calc_joined = (
    data_train_calc.groupby("index")["word"].agg(" ".join).reset_index()
)

In [15]:
# Remove every listed symbol / noisy word / letter from the joined text,
# then normalise the spacing with rm_consecutive_spaces.
train_text = data_train_calc_joined["word"]
for symb in symbols_to_replace:
    train_text = train_text.apply(lambda x: x.replace(symb, ""))

data_train_calc_joined["word"] = train_text.apply(rm_consecutive_spaces)

In [17]:
# COMPARISON - check whether base records in timestamps and tabular data are the same
# reading preprocessed text data for comparison (without punctuation),
# ordered by file id so it lines up with the joined alignment data
data_train_clean = (
    pd.read_csv("data/out/data_cleaning/train_with_ids_clean.csv")
    .sort_values("FileId")
    .reset_index(drop=True)
)
data_train_clean.head()

Unnamed: 0,FileId,FixedOutput
0,wikinews178430,we wrocławiu walkę ze szkodnikiem rozpoczyna z...
1,wikinews178747,do serii rozbojów doszło w środę wieczorem w l...
2,wikinews178788,trzech 19 latków zatrzymała wczoraj lubelska p...
3,wikinews178804,drugie zwycięstwo w tegorocznym giro ditalia o...
4,wikinews178814,kradzieże telefonów komórkowych to coraz częst...


In [18]:
# Number of records in the cleaned train reference data.
len(data_train_clean)

793

In [19]:
# merging for comparison: inner join of the reference text with the text
# rebuilt from the alignment files, matched on the file id
data_train_merged = pd.merge(
    data_train_clean,
    data_train_calc_joined,
    left_on="FileId",
    right_on="index",
)
data_train_merged.head()

Unnamed: 0,FileId,FixedOutput,index,word
0,wikinews178430,we wrocławiu walkę ze szkodnikiem rozpoczyna z...,wikinews178430,we wrocławiu walkę ze szkodnikiem rozpoczyna z...
1,wikinews178747,do serii rozbojów doszło w środę wieczorem w l...,wikinews178747,do serii rozbojów doszło w środę wieczorem w l...
2,wikinews178788,trzech 19 latków zatrzymała wczoraj lubelska p...,wikinews178788,trzech 19 latków zatrzymała wczoraj lubelska p...
3,wikinews178804,drugie zwycięstwo w tegorocznym giro ditalia o...,wikinews178804,drugie zwycięstwo w tegorocznym giro ditalia o...
4,wikinews178814,kradzieże telefonów komórkowych to coraz częst...,wikinews178814,kradzieże telefonów komórkowych to coraz częst...


In [20]:
# fixing errors (mainly spaces at the end)
# Pass 1: where the two texts differ, strip whitespace from the rebuilt text.
train_mismatch = ~(data_train_merged["FixedOutput"] == data_train_merged["word"])
data_train_merged.loc[train_mismatch, "word"] = data_train_merged.loc[
    train_mismatch, "word"
].str.strip()

# Pass 2: where they still differ, strip whitespace from the reference text.
train_mismatch = ~(data_train_merged["FixedOutput"] == data_train_merged["word"])
data_train_merged.loc[train_mismatch, "FixedOutput"] = data_train_merged.loc[
    train_mismatch, "FixedOutput"
].str.strip()

# Final verdict per record, stored alongside the data.
data_train_merged["compare"] = (
    data_train_merged["FixedOutput"] == data_train_merged["word"]
)
train_same_count = data_train_merged["compare"].sum()
print("Same records: ", train_same_count)
print("Error records: ", data_train_merged.shape[0] - train_same_count)
# Dump the comparison for manual inspection.
data_train_merged.to_csv("temp.csv", index=False)

Same records:  793
Error records:  0


### <span style='font-family:Georgia'> Test data </span>

In [None]:
# Load the validation forced-alignment files (tab-separated), compute the
# pause features and keep only the file id, the word and the two pause columns.
data_test = read_files("./data/forced-alignment/validation", sep="\t")

data_test_calc = calculate_pauses(data_test).drop(
    columns=["timestamp_start", "timestamp_stop"]
)

In [9]:
# Preview the first rows of the test pause features.
data_test_calc.head(3)

Unnamed: 0,index,word,pause_before,pause_after
0,wikinews179014,w,390.0,30.0
1,wikinews179014,gdańsku,30.0,60.0
2,wikinews179014,zgodnie,60.0,30.0


In [11]:
### test symbols and noisy words
# Each list file holds one entry per line. The files are opened in `with`
# blocks so the handles are closed as soon as the content has been read.

# load list of symbols to replace
# NOTE(review): this list is read from the eda/ directory while the two
# lists below come from test/ - confirm this is intended.
with open(
    "./data/out/eda/symbols_to_replace.txt", "r", encoding="utf-8"
) as symbols_to_replace_infile_test:
    symbols_to_replace_test = symbols_to_replace_infile_test.read().splitlines()

# load list of noisy words, i.e. words with letters from outside the Polish alphabet
with open(
    "./data/out/test/noisy_words.txt", "r", encoding="utf-8"
) as noisy_words_infile_test:
    noisy_words_test = noisy_words_infile_test.read().splitlines()

# load list of letters from outside the Polish alphabet
with open(
    "./data/out/test/non_polish_letters.txt", "r", encoding="utf-8"
) as non_polish_letters_infile_test:
    non_polish_letters_test = non_polish_letters_infile_test.read().splitlines()

# merge noisy data into one list (the symbols stay in their own list here
# and are removed in a separate pass)
noisy_words_test.extend(non_polish_letters_test)

In [12]:
# Display the symbols that will be removed from the test text.
symbols_to_replace_test

["'", '"', ';', '%', '(', ')', '[', ']', '²', '€', '³', '+', '·']

In [14]:
# joining data into whole records:
# one row per file id, with all of its words concatenated by single spaces
data_test_calc_joined = (
    data_test_calc.groupby("index")["word"].agg(" ".join).reset_index()
)

In [16]:
# Two cleaning passes over the joined test text: first the noisy words and
# non-Polish letters, then the symbols. The spacing is normalised with
# rm_consecutive_spaces after each pass.
test_text = data_test_calc_joined["word"]
for removal_list in (noisy_words_test, symbols_to_replace_test):
    for symb in removal_list:
        test_text = test_text.apply(lambda x: x.replace(symb, ""))
    test_text = test_text.apply(rm_consecutive_spaces)

data_test_calc_joined["word"] = test_text

In [13]:
# COMPARISON - check whether base records in timestamps and tabular data are the same
# reading preprocessed text data for comparison (without punctuation),
# ordered by file id so it lines up with the joined alignment data
data_test_clean = (
    pd.read_csv("./data/out/data_cleaning/test_with_ids_clean.csv")
    .sort_values("FileId")
    .reset_index(drop=True)
)
data_test_clean.head()

Unnamed: 0,FileId,ASROutput
0,wikinews179014,w gdańsku zgodnie już z coroczną czerwcową tra...
1,wikinews179354,prezydent usa george bush powiedział że odnowa...
2,wikinews179650,mamy najgorsze przedmieścia w europie nie woln...
3,wikinews179740,w sejmie trwała dziś debata nad sprawozdaniem ...
4,wikinews179784,aleksander łukaszenka który za 3 miesiące będz...


In [39]:
# Preview the test text rebuilt from the alignment files.
data_test_calc_joined.head()

Unnamed: 0,index,word
0,wikinews179014,w gdańsku zgodnie już z coroczną czerwcową tra...
1,wikinews179354,prezydent usa george bush powiedział że odnowa...
2,wikinews179650,mamy najgorsze przedmieścia w europie nie woln...
3,wikinews179740,w sejmie trwała dziś debata nad sprawozdaniem ...
4,wikinews179784,aleksander łukaszenka który za 3 miesiące będz...


In [23]:
# merging for comparison: inner join of the reference text with the text
# rebuilt from the alignment files, matched on the file id
data_test_merged = pd.merge(
    data_test_clean,
    data_test_calc_joined,
    left_on="FileId",
    right_on="index",
)
data_test_merged.head()

Unnamed: 0,FileId,ASROutput,index,word
0,wikinews179014,w gdańsku zgodnie już z coroczną czerwcową tra...,wikinews179014,w gdańsku zgodnie już z coroczną czerwcową tra...
1,wikinews179354,prezydent usa george bush powiedział że odnowa...,wikinews179354,prezydent usa george bush powiedział że odnowa...
2,wikinews179650,mamy najgorsze przedmieścia w europie nie woln...,wikinews179650,mamy najgorsze przedmieścia w europie nie woln...
3,wikinews179740,w sejmie trwała dziś debata nad sprawozdaniem ...,wikinews179740,w sejmie trwała dziś debata nad sprawozdaniem ...
4,wikinews179784,aleksander łukaszenka który za 3 miesiące będz...,wikinews179784,aleksander łukaszenka który za 3 miesiące będz...


In [24]:
# fixing errors (mainly spaces at the end)
# Pass 1: where the two texts differ, strip whitespace from the rebuilt text.
test_mismatch = ~(data_test_merged["ASROutput"] == data_test_merged["word"])
data_test_merged.loc[test_mismatch, "word"] = data_test_merged.loc[
    test_mismatch, "word"
].str.strip()

# Pass 2: where they still differ, strip whitespace from the reference text.
test_mismatch = ~(data_test_merged["ASROutput"] == data_test_merged["word"])
data_test_merged.loc[test_mismatch, "ASROutput"] = data_test_merged.loc[
    test_mismatch, "ASROutput"
].str.strip()

# Final verdict per record, stored alongside the data.
data_test_merged["compare"] = (
    data_test_merged["ASROutput"] == data_test_merged["word"]
)
test_same_count = data_test_merged["compare"].sum()
print("Same records: ", test_same_count)
print("Error records: ", data_test_merged.shape[0] - test_same_count)
# Dump the comparison for manual inspection.
# NOTE(review): this writes to the same "temp.csv" as the train comparison.
data_test_merged.to_csv("temp.csv", index=False)

Same records:  200
Error records:  0
