## <span style='font-family:Georgia'>Objectives</span>
The purpose of this notebook is to prepare the modeling data (including cleaning) and to train a named entity recognition model.  
*Author: Elżbieta Jowik*

In [2]:
import os
import glob
from tqdm.notebook import tqdm

from utils.parse_tsv import parse_tsv
from utils.parse_data import parse_data
from utils.convert_to_pandas import convert_to_pandas
from utils.split_long_examples import split_long_examples
from utils.train_model import train_model
from utils.test_model import test_model

In [4]:
# Input data cleaning functions

import os
import re
import json
from functools import reduce

# Data cleaning
def rm_consecutive_spaces(string):
    """Collapse every run of space characters in ``string`` into one space.

    Only the literal space character is matched; tabs and newlines are
    left untouched.
    """
    space_run = re.compile(' +')
    return space_run.sub(' ', string)

# Data cleaning
def generate_replacement_dict(eda_dir="../data/outputs/eda"):
    """Build the mapping used to strip noisy substrings from the input data.

    Reads three UTF-8 text files (one entry per line) from ``eda_dir``:
    ``symbols_to_replace.txt``, ``noisy_words.txt`` (words with letters from
    outside the Polish alphabet) and ``non_polish_letters.txt``. Every entry,
    plus the two ellipsis forms ``"..."`` and ``"…"``, becomes a key mapped
    to the empty string.

    Args:
        eda_dir: Directory holding the three list files. Defaults to the
            location this notebook has always read them from.

    Returns:
        dict: entry -> "" in reading order (symbols, noisy words, non-Polish
        letters, ellipses); a duplicated entry keeps its first position.

    Raises:
        OSError: If any of the list files cannot be opened.
    """
    def read_entries(filename):
        # `with` guarantees the handle is closed; the previous version left
        # all three files open.
        with open(f"{eda_dir}/{filename}", "r", encoding="utf-8") as infile:
            return infile.read().splitlines()

    # merge noisy data into one list
    entries = []
    for filename in ("symbols_to_replace.txt", "noisy_words.txt", "non_polish_letters.txt"):
        entries.extend(read_entries(filename))
    entries.extend(["...", "…"])

    # generate dictionary: every entry is replaced with the empty string
    return dict.fromkeys(entries, "")


def replace_with_dict(str_to_replace, replacement_dict):
    """Apply every ``old -> new`` pair of ``replacement_dict`` to the string.

    Substitutions run one after another in the dictionary's iteration order,
    so a later pair operates on the output of the earlier ones.
    """
    result = str_to_replace
    for old, new in replacement_dict.items():
        result = result.replace(old, new)
    return result


# *.tsv
def clean_tsv_file(in_path, out_path):
    """Clean one ``*.tsv`` file and write the result into ``out_path``.

    Each line is handled as ``name<TAB>text`` when it splits into exactly two
    tab-separated fields, otherwise the whole line is treated as text. The
    text has every key of the replacement dictionary substituted, runs of
    spaces collapsed, the space in front of each target punctuation mark
    removed and surrounding whitespace stripped. The cleaned file keeps the
    base name of ``in_path``.

    Args:
        in_path: Path of the UTF-8 encoded file to clean.
        out_path: Output directory; created if it does not exist.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    target_classes = ['.', ',', '?', '!', '-', ':']

    os.makedirs(out_path, mode=0o777, exist_ok=True)
    out_path = f"{out_path}/{os.path.basename(in_path)}"

    replacement_dict = generate_replacement_dict()

    def clean_line(line):
        try:
            name, text = line.split("\t")
        except ValueError:
            # Not exactly two tab-separated fields: the whole line is text.
            name, text = None, line
        text_cleaned = rm_consecutive_spaces(replace_with_dict(text, replacement_dict))
        for item in target_classes:
            text_cleaned = text_cleaned.replace(f" {item}", item)
        text_cleaned = text_cleaned.strip()
        if name:
            return f"{name}\t{text_cleaned}\n"
        return f"{text_cleaned}\n"

    # Read and clean everything BEFORE opening the output. Opening the output
    # first truncates it, which wipes the data when in_path and out_path
    # resolve to the same file (e.g. cleaning an already-cleaned file in place).
    # The whole file is held in memory for that reason.
    with open(in_path, encoding="utf-8", mode="r") as infile:
        cleaned_lines = [clean_line(line) for line in infile]

    with open(out_path, encoding="utf-8", mode="w") as outfile:
        outfile.writelines(cleaned_lines)
        

# *.clntmstmp
def clean_clmtmstmp_file(in_path, out_path):
    """Clean one ``*.clntmstmp`` file and write the result into ``out_path``.

    A line that splits on a single space into exactly two fields is handled
    as ``<interval> <word>``: the word has every key of the replacement
    dictionary substituted and is stripped; the line is dropped when nothing
    is left of the word. Any other line is copied unchanged. The cleaned
    file keeps the base name of ``in_path``.

    Args:
        in_path: Path of the UTF-8 encoded file to clean.
        out_path: Output directory; created if it does not exist.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    os.makedirs(out_path, mode=0o777, exist_ok=True)
    out_path = f"{out_path}/{os.path.basename(in_path)}"

    replacement_dict = generate_replacement_dict()

    # Collect the cleaned lines before the output is opened: opening it first
    # truncates it, which destroys the data when in_path and out_path resolve
    # to the same file.
    cleaned_lines = []
    with open(in_path, encoding="utf-8", mode="r") as infile:
        for line in infile:
            try:
                interval, word = line.split(" ")
            except ValueError:
                # Not a two-field line (EOF case): keep it verbatim.
                cleaned_lines.append(line)
                continue
            word_replaced = replace_with_dict(word, replacement_dict).strip()
            if word_replaced != "":
                cleaned_lines.append(f"{interval} {word_replaced}\n")

    with open(out_path, encoding="utf-8", mode="w") as outfile:
        outfile.writelines(cleaned_lines)

            
# *.json
def clean_json_file(in_path, out_path):
    """Clean one ``*.json`` file and write the result into ``out_path``.

    The input must be a JSON object with a ``title`` and a ``words`` list
    whose rows carry ``word``, ``punctuation`` and ``space_after``. ``word``
    and ``punctuation`` have every key of the replacement dictionary
    substituted and are stripped; ``space_after`` and ``title`` are copied
    as they are. The cleaned file keeps the base name of ``in_path``.

    Args:
        in_path: Path of the UTF-8 encoded JSON file to clean.
        out_path: Output directory; created if it does not exist.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        json.JSONDecodeError: If the input is not valid JSON.
        KeyError: If an expected key is missing.
    """
    os.makedirs(out_path, mode=0o777, exist_ok=True)
    out_path = f"{out_path}/{os.path.basename(in_path)}"

    replacement_dict = generate_replacement_dict()

    # Load and transform first; the output is only opened once the cleaned
    # document exists. Opening it up front left an empty file behind on a
    # parse/key error and truncated the input when both paths were the same.
    with open(in_path, encoding="utf-8", mode="r") as infile:
        data = json.load(infile)

    # TODO: constraints
    clean_rows = [
        {
            'word': replace_with_dict(row['word'], replacement_dict).strip(),
            'punctuation': replace_with_dict(row['punctuation'], replacement_dict).strip(),
            'space_after': row['space_after'],
        }
        for row in data['words']
    ]

    data_clean = {
        'title': data['title'],
        'words': clean_rows,
    }

    with open(out_path, encoding="utf-8", mode="w") as outfile:
        json.dump(data_clean, outfile)

In [13]:
# Input files paths
# Raw *.tsv inputs and their expected outputs for the train and test-A splits.
train_in_path = "../data/tests/source/train/in.tsv"
test_in_path = "../data/tests/source/test-A/in.tsv"

train_expected_path = "../data/tests/source/train/expected.tsv"
test_expected_path = "../data/tests/source/test-A/expected.tsv"

# Directories that the cleaning cell globs for *.clntmstmp files.
# NOTE(review): unlike every other input these live under ../data/source,
# not ../data/tests/source -- confirm this is intended.
train_clntmstmp_dir = "../data/source/poleval_fa.train/train"
test_clntmstmp_dir = "../data/source/poleval_fa.validation/validation"

# Directories that the cleaning cell globs for *.json files.
# NOTE(review): names and paths look crossed -- wikitalks_json_dir points at
# .../wikinews/... and wikinews_json_dir at .../wikitalks/.... The cleaning
# cell's output directories are crossed the same way, so each source still
# lands in the matching json-* folder; rename both places together if at all.
wikitalks_json_dir = "../data/tests/source/poleval_text.rest/wikinews/all/json"
wikinews_json_dir = "../data/tests/source/poleval_text.rest/wikitalks/all/json"

In [14]:
# Output files paths
# One output directory per pipeline step.
step1_out_dirpath = "../data/tests/outputs/step1"
step2_out_dirpath = "../data/tests/outputs/step2"
# Step 3 writes into the step 1 directory; its file names (train/rest/test.conll)
# differ from the step 1 ones (original_*.conll), so nothing is overwritten.
step3_out_dirpath = step1_out_dirpath
step4_out_dirpath = "../data/tests/outputs/step4"
step5_out_dirpath = "../data/tests/outputs/step5"
step6_out_dirpath = "../data/tests/outputs/step6"
step7_out_dirpath = "../data/tests/outputs/step7"

# Step 1: save paths passed to parse_tsv; read again by step 4.
step1_train_save_path = f"{step1_out_dirpath}/original_train.conll"
step1_test_save_path = f"{step1_out_dirpath}/original_test-A.conll"

# Step 2: files that step 3 reads as inputs. parse_data itself only receives
# step2_out_dirpath -- presumably it writes these file names; TODO confirm.
step2_train_in_save_path = f"{step2_out_dirpath}/train_in.tsv"
step2_rest_in_save_path = f"{step2_out_dirpath}/rest_in.tsv"
step2_test_in_save_path = f"{step2_out_dirpath}/test_in.tsv"

step2_train_expected_save_path = f"{step2_out_dirpath}/train_expected.tsv"
step2_rest_expected_save_path = f"{step2_out_dirpath}/rest_expected.tsv"
step2_test_expected_save_path = f"{step2_out_dirpath}/test_expected.tsv"

# Step 3: save paths passed to parse_tsv; only the "rest" file is read by step 4.
step3_train_save_path = f"{step3_out_dirpath}/train.conll"
step3_rest_save_path = f"{step3_out_dirpath}/rest.conll"
step3_test_save_path = f"{step3_out_dirpath}/test.conll"

# Step 4: convert_to_pandas outputs; inputs of step 5.
step4_train_save_path = f"{step4_out_dirpath}/original_train.tsv"
step4_test_save_path = f"{step4_out_dirpath}/original_test-A.tsv"
step4_rest_save_path = f"{step4_out_dirpath}/rest.tsv"

# Step 5: split_long_examples outputs; passed to train_model in step 6.
step5_train_save_path = f"{step5_out_dirpath}/original_train.tsv.s"
step5_test_save_path = f"{step5_out_dirpath}/original_test-A.tsv.s"
step5_rest_save_path = f"{step5_out_dirpath}/rest.tsv.s"

# Step 6: not referenced by the cells shown here (train_model is given
# step6_out_dirpath and returns the model directory itself).
step6_model_save_path = f"{step6_out_dirpath}/best_model"

# Step 7: where test_model is told to write the predictions.
step7_pred_save_path = f"{step7_out_dirpath}/test-A/out.tsv"

In [15]:
# *.tsv input files cleaning
# Output directories for the cleaned *.tsv files; in.tsv and expected.tsv of
# one split share a directory (clean_tsv_file keeps the input's base name).
_train_in_path = "../data/tests/preprocessed/train/"
_test_in_path = "../data/tests/preprocessed/test-A/"
_train_expected_path = "../data/tests/preprocessed/train/"
_test_expected_path = "../data/tests/preprocessed/test-A/"

clean_tsv_file(in_path=train_in_path, out_path=_train_in_path)
clean_tsv_file(in_path=test_in_path, out_path=_test_in_path)
clean_tsv_file(in_path=train_expected_path, out_path=_train_expected_path)
clean_tsv_file(in_path=test_expected_path, out_path=_test_expected_path)

# *.clntmstmp input files cleaning
# Every *.clntmstmp file found in the raw directories is cleaned into the
# matching preprocessed directory.
_train_clntmstmp_dir = "../data/tests/preprocessed/poleval_fa.train/train/"
_test_clntmstmp_dir = "../data/tests/preprocessed/poleval_fa.validation/validation"

for in_path in tqdm(glob.glob(f"{train_clntmstmp_dir}/*.clntmstmp")):
    clean_clmtmstmp_file(in_path=in_path, out_path=_train_clntmstmp_dir)

for in_path in tqdm(glob.glob(f"{test_clntmstmp_dir}/*.clntmstmp")):
    clean_clmtmstmp_file(in_path=in_path, out_path=_test_clntmstmp_dir)

# *.json input files cleaning
# NOTE(review): the variable names are crossed with the folder names here
# (_wikitalks_json_dir -> json-wikinews and vice versa), mirroring the crossed
# names of the raw wikitalks_json_dir / wikinews_json_dir variables.
_wikitalks_json_dir = "../data/tests/preprocessed/json-wikinews"
_wikinews_json_dir = "../data/tests/preprocessed/json-wikitalks"

for in_path in tqdm(glob.glob(f"{wikinews_json_dir}/*.json")):
    clean_json_file(in_path=in_path, out_path=_wikinews_json_dir)

for in_path in tqdm(glob.glob(f"{wikitalks_json_dir}/*.json")):
    clean_json_file(in_path=in_path, out_path=_wikitalks_json_dir)

# Overwriting raw data paths with cleaned data paths
# NOTE(review): this rebinding makes the cell non-idempotent. Re-running it
# without first re-running the input-paths cell hands every cleaner the same
# location as input and output, which is unsafe for any cleaner that opens its
# output before it has read its input.
# The *.tsv directories above end with "/", so these f-strings produce a
# double slash ("train//in.tsv").
train_in_path = f"{_train_in_path}/{os.path.basename(train_in_path)}"
test_in_path = f"{_test_in_path}/{os.path.basename(test_in_path)}"
train_expected_path = f"{_train_expected_path}/{os.path.basename(train_expected_path)}"
test_expected_path = f"{_test_expected_path}/{os.path.basename(test_expected_path)}"
train_clntmstmp_dir = _train_clntmstmp_dir
test_clntmstmp_dir = _test_clntmstmp_dir
wikitalks_json_dir = _wikitalks_json_dir
wikinews_json_dir = _wikinews_json_dir

  0%|          | 0/793 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

In [16]:
# step 1.
# Names handed to parse_tsv as `files_to_ignore` (going by the variable name,
# entries without a *.clntmstmp file -- TODO confirm).
clntmstmp_missing = [
    "wikitalks0013565",
    "wikitalks0015043",
    "wikitalks0016297",
    "wikitalks0016712",
    "wikitalks00415",
    "wikitalks005277",
    "wikitalks007429",
]

# Same conversion for the train split, then the test split.
for tsv_in, tsv_expected, conll_out, timestamps_dir in (
    (train_in_path, train_expected_path, step1_train_save_path, train_clntmstmp_dir),
    (test_in_path, test_expected_path, step1_test_save_path, test_clntmstmp_dir),
):
    parse_tsv(
        in_path=tsv_in,
        expected_path=tsv_expected,
        save_path=conll_out,
        clntmstmp_dir=timestamps_dir,
        files_to_ignore=clntmstmp_missing,
    )

In [17]:
# step 2.
# JSON directories passed to parse_data alongside the cleaned train/test inputs;
# results go to the step 2 output directory.
data = [wikinews_json_dir, wikitalks_json_dir]

parse_data(train_path=train_in_path, test_path=test_in_path, data=data, save_path=step2_out_dirpath)

In [18]:
# step 3.
# One parse_tsv run per step 2 split, in the order train, rest, test.
# The "rest" split is parsed without a clntmstmp_dir argument.
step3_jobs = [
    dict(
        in_path=step2_train_in_save_path,
        expected_path=step2_train_expected_save_path,
        save_path=step3_train_save_path,
        clntmstmp_dir=train_clntmstmp_dir,
    ),
    dict(
        in_path=step2_rest_in_save_path,
        expected_path=step2_rest_expected_save_path,
        save_path=step3_rest_save_path,
    ),
    dict(
        in_path=step2_test_in_save_path,
        expected_path=step2_test_expected_save_path,
        save_path=step3_test_save_path,
        clntmstmp_dir=test_clntmstmp_dir,
    ),
]

for job in step3_jobs:
    parse_tsv(files_to_ignore=clntmstmp_missing, **job)

In [19]:
# step 4.
# Train and test come from the step 1 *.conll files, "rest" from step 3.
for conll_path, tsv_path in (
    (step1_train_save_path, step4_train_save_path),
    (step1_test_save_path, step4_test_save_path),
    (step3_rest_save_path, step4_rest_save_path),
):
    convert_to_pandas(data_file=conll_path, out_file=tsv_path)

In [20]:
# step 5.
# Run split_long_examples on each step 4 file, in the order train, test, rest.
for step4_path, step5_path in (
    (step4_train_save_path, step5_train_save_path),
    (step4_test_save_path, step5_test_save_path),
    (step4_rest_save_path, step5_rest_save_path),
):
    split_long_examples(data_path=step4_path, out_file=step5_path)

In [21]:
# step 6.
# Which step 5 file plays which role.
# NOTE(review): the model is trained on the "rest" file and evaluated on the
# "train" file -- confirm this assignment is intended.
data_splits = dict(
    train_data_dir=step5_rest_save_path,
    test_data_dir=step5_test_save_path,
    eval_data_dir=step5_train_save_path,
)

# Model choice and training hyperparameters.
hyperparameters = dict(
    model_name="allegro/herbert-large-cased",
    learning_rate=2e-5,
    batch_size=12,
    grad_acc_steps=1,
    warmup_steps=0,
    eval_steps=200,
    eval_during_training=True,
    max_seq_len=256,
    seed=2,
)

# train_model returns the directory that step 7 loads the model from.
best_model_dir = train_model(**data_splits, save_dirpath=step6_out_dirpath, **hyperparameters)

print("Learning process succeeded!")

labels ['B', ':', ';', ',', '.', '-', '?', '!']


Some weights of the model checkpoint at allegro/herbert-large-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.sso.sso_relationship.weight', 'cls.sso.sso_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not in

  0%|          | 0/21 [00:00<?, ?it/s]



Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]



Running Epoch 1 of 5:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]



Running Epoch 2 of 5:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/11 [00:01<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]



Running Epoch 3 of 5:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/11 [00:01<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]



Running Epoch 4 of 5:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]



Learning process succeeded!


In [23]:
# step 7.
# Run the model returned by step 6 on the cleaned test input and write the
# predictions to the step 7 output path.
test_model(
    path_to_model=best_model_dir,
    path_to_test=test_in_path,
    path_to_out=step7_pred_save_path,
)