## <span style='font-family:Georgia'> Objectives
The purpose of this notebook is to test the data cleaning and pre-processing steps that precede training, using a small data subset.
    
*Author: Elżbieta Jowik*

In [1]:
import os
import glob
from tqdm.notebook import tqdm

from utils.clean_data import clean_tsv_file, clean_clmtmstmp_file, clean_json_file
from utils.parse_tsv import parse_tsv
from utils.parse_data import parse_data
from utils.convert_to_pandas import convert_to_pandas
from utils.split_long_examples import split_long_examples
from utils.train_model import train_model
from utils.test_model import test_model

### <span style='font-family:Georgia'> Parameters settings

In [2]:
# ---------------------------------------------------------------------------
# Raw inputs
# ---------------------------------------------------------------------------
# Tab-separated inputs and their expected outputs, per split.
train_in_path = "../data/tests/source/train/in.tsv"
train_expected_path = "../data/tests/source/train/expected.tsv"

test_in_path = "../data/tests/source/test-A/in.tsv"
test_expected_path = "../data/tests/source/test-A/expected.tsv"

# Directories scanned for *.clntmstmp files.
# NOTE(review): these are the only inputs located under ../data/source
# instead of ../data/tests/source -- confirm that this is intentional.
train_clntmstmp_dir = "../data/source/poleval_fa.train/train"
test_clntmstmp_dir = "../data/source/poleval_fa.validation/validation"

# Directories scanned for *.json files.
# NOTE(review): the variable names and folder names look crossed
# (wikitalks -> .../wikinews/..., wikinews -> .../wikitalks/...). The cleaning
# cell crosses its output folders the same way, so both places would have to
# be changed together -- TODO confirm before touching either.
wikitalks_json_dir = "../data/tests/source/poleval_text.rest/wikinews/all/json"
wikinews_json_dir = "../data/tests/source/poleval_text.rest/wikitalks/all/json"

# ---------------------------------------------------------------------------
# Outputs, grouped by preprocessing step
# ---------------------------------------------------------------------------
# Step 1
step1_out_dirpath = "../data/tests/outputs/step1"
step1_train_save_path = f"{step1_out_dirpath}/original_train.conll"
step1_test_save_path = f"{step1_out_dirpath}/original_test-A.conll"

# Step 2
step2_out_dirpath = "../data/tests/outputs/step2"
step2_train_in_save_path = f"{step2_out_dirpath}/train_in.tsv"
step2_train_expected_save_path = f"{step2_out_dirpath}/train_expected.tsv"
step2_rest_in_save_path = f"{step2_out_dirpath}/rest_in.tsv"
step2_rest_expected_save_path = f"{step2_out_dirpath}/rest_expected.tsv"
step2_test_in_save_path = f"{step2_out_dirpath}/test_in.tsv"
step2_test_expected_save_path = f"{step2_out_dirpath}/test_expected.tsv"

# Step 3 -- deliberately(?) shares the step 1 directory; the file names differ
# from the step 1 ones, so nothing is overwritten. TODO confirm the sharing.
step3_out_dirpath = step1_out_dirpath
step3_train_save_path = f"{step3_out_dirpath}/train.conll"
step3_rest_save_path = f"{step3_out_dirpath}/rest.conll"
step3_test_save_path = f"{step3_out_dirpath}/test.conll"

# Step 4
step4_out_dirpath = "../data/tests/outputs/step4"
step4_train_save_path = f"{step4_out_dirpath}/original_train.tsv"
step4_test_save_path = f"{step4_out_dirpath}/original_test-A.tsv"
step4_rest_save_path = f"{step4_out_dirpath}/rest.tsv"

# Step 5
step5_out_dirpath = "../data/tests/outputs/step5"
step5_train_save_path = f"{step5_out_dirpath}/original_train.tsv.s"
step5_test_save_path = f"{step5_out_dirpath}/original_test-A.tsv.s"
step5_rest_save_path = f"{step5_out_dirpath}/rest.tsv.s"

### <span style='font-family:Georgia'> Data cleaning (including `*.tsv`, `*.json` & `*.clntmstmp` files)

In [3]:
# Clean every raw input (*.tsv, *.clntmstmp, *.json) into the "preprocessed"
# tree, then re-point the shared path variables at the cleaned copies so that
# all later cells read cleaned data.
#
# WARNING: the last section of this cell rebinds the raw path variables set in
# the parameters cell. Re-running this cell without re-running the parameters
# cell first would therefore feed the already-cleaned files back in as input.

# *.tsv input files cleaning
_train_in_path = "../data/tests/preprocessed/train/"
_test_in_path = "../data/tests/preprocessed/test-A/"
_train_expected_path = "../data/tests/preprocessed/train/"
_test_expected_path = "../data/tests/preprocessed/test-A/"

clean_tsv_file(in_path=train_in_path, out_path=_train_in_path)
clean_tsv_file(in_path=test_in_path, out_path=_test_in_path)
clean_tsv_file(in_path=train_expected_path, out_path=_train_expected_path)
clean_tsv_file(in_path=test_expected_path, out_path=_test_expected_path)

# *.clntmstmp input files cleaning (one progress bar per split)
_train_clntmstmp_dir = "../data/tests/preprocessed/poleval_fa.train/train/"
_test_clntmstmp_dir = "../data/tests/preprocessed/poleval_fa.validation/validation"

for in_path in tqdm(glob.glob(f"{train_clntmstmp_dir}/*.clntmstmp")):
    clean_clmtmstmp_file(in_path=in_path, out_path=_train_clntmstmp_dir)

for in_path in tqdm(glob.glob(f"{test_clntmstmp_dir}/*.clntmstmp")):
    clean_clmtmstmp_file(in_path=in_path, out_path=_test_clntmstmp_dir)

# *.json input files cleaning
# NOTE(review): `_wikitalks_json_dir` targets a "json-wikinews" folder and
# `_wikinews_json_dir` a "json-wikitalks" folder. The raw directories in the
# parameters cell are crossed the same way, so each folder still receives the
# data its name suggests; only the variable names look swapped -- TODO confirm.
_wikitalks_json_dir = "../data/tests/preprocessed/json-wikinews"
_wikinews_json_dir = "../data/tests/preprocessed/json-wikitalks"

for in_path in tqdm(glob.glob(f"{wikinews_json_dir}/*.json")):
    clean_json_file(in_path=in_path, out_path=_wikinews_json_dir)

for in_path in tqdm(glob.glob(f"{wikitalks_json_dir}/*.json")):
    clean_json_file(in_path=in_path, out_path=_wikitalks_json_dir)

# Overwriting raw data paths with cleaned data paths.
# Assumes each cleaner writes `<out_path>/<basename of in_path>` -- TODO confirm
# against utils.clean_data. os.path.join is used because the *.tsv output
# directories already end with "/", and plain concatenation with another "/"
# produced paths such as ".../train//in.tsv".
train_in_path = os.path.join(_train_in_path, os.path.basename(train_in_path))
test_in_path = os.path.join(_test_in_path, os.path.basename(test_in_path))
train_expected_path = os.path.join(
    _train_expected_path, os.path.basename(train_expected_path)
)
test_expected_path = os.path.join(
    _test_expected_path, os.path.basename(test_expected_path)
)
train_clntmstmp_dir = _train_clntmstmp_dir
test_clntmstmp_dir = _test_clntmstmp_dir
wikitalks_json_dir = _wikitalks_json_dir
wikinews_json_dir = _wikinews_json_dir

  0%|          | 0/793 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

### <span style='font-family:Georgia'> Data preprocessing step 1.: 
Input & expected `*.tsv` files parsing including enrichment with information inferred from `*.clntmstmp` data

In [4]:
# Identifiers passed to every parse_tsv call as `files_to_ignore`.
# Presumably documents for which no *.clntmstmp file exists (going by the
# variable name) -- TODO confirm.
clntmstmp_missing = [
    "wikitalks0013565",
    "wikitalks0015043",
    "wikitalks0016297",
    "wikitalks0016712",
    "wikitalks00415",
    "wikitalks005277",
    "wikitalks007429",
]

# Parse the train split first, then the test split; each row is
# (input tsv, expected tsv, output path, clntmstmp directory).
for tsv_in, tsv_expected, conll_out, stamps_dir in (
    (train_in_path, train_expected_path, step1_train_save_path, train_clntmstmp_dir),
    (test_in_path, test_expected_path, step1_test_save_path, test_clntmstmp_dir),
):
    parse_tsv(
        in_path=tsv_in,
        expected_path=tsv_expected,
        save_path=conll_out,
        clntmstmp_dir=stamps_dir,
        files_to_ignore=clntmstmp_missing,
    )

### <span style='font-family:Georgia'> Data preprocessing step 2.: 
Wikitalks & Wikinews `*.json` files loading and writing to `*.tsv` format

In [5]:
# JSON directories handed to parse_data, in this order: wikinews, wikitalks.
data = [wikinews_json_dir, wikitalks_json_dir]

# Results go under the step 2 output directory. The next step reads the
# step2_*_save_path files from there, so parse_data presumably creates
# train/rest/test `*_in.tsv` and `*_expected.tsv` files -- TODO confirm.
parse_data(
    train_path=train_in_path, test_path=test_in_path, data=data, save_path=step2_out_dirpath
)

### <span style='font-family:Georgia'> Data preprocessing step 3.:
Wikitalks & Wikinews (`*.tsv`) data parsing, including enrichment with information inferred from `*.clntmstmp` data

In [6]:
# One keyword-argument set per parse_tsv run, executed in order: train, rest, test.
# The "rest" run intentionally carries no `clntmstmp_dir`, so parse_tsv falls
# back to whatever default it defines for that parameter.
step3_jobs = (
    dict(
        in_path=step2_train_in_save_path,
        expected_path=step2_train_expected_save_path,
        save_path=step3_train_save_path,
        clntmstmp_dir=train_clntmstmp_dir,
    ),
    dict(
        in_path=step2_rest_in_save_path,
        expected_path=step2_rest_expected_save_path,
        save_path=step3_rest_save_path,
    ),
    dict(
        in_path=step2_test_in_save_path,
        expected_path=step2_test_expected_save_path,
        save_path=step3_test_save_path,
        clntmstmp_dir=test_clntmstmp_dir,
    ),
)

for job_kwargs in step3_jobs:
    # The same ignore list is applied to every run.
    parse_tsv(files_to_ignore=clntmstmp_missing, **job_kwargs)

### <span style='font-family:Georgia'> Data preprocessing step 4.: 
Change the data format from `*.tsv` files to a Pandas frame

In [7]:
# Convert, in order: step 1 train output, step 1 test output, step 3 "rest" output.
for conll_path, frame_path in (
    (step1_train_save_path, step4_train_save_path),
    (step1_test_save_path, step4_test_save_path),
    (step3_rest_save_path, step4_rest_save_path),
):
    convert_to_pandas(data_file=conll_path, out_file=frame_path)

### <span style='font-family:Georgia'> Data preprocessing step 5.: 
Long examples splitting

In [8]:
# Split long examples for each step 4 output, in order: train, test, rest.
for step4_path, step5_path in (
    (step4_train_save_path, step5_train_save_path),
    (step4_test_save_path, step5_test_save_path),
    (step4_rest_save_path, step5_rest_save_path),
):
    split_long_examples(data_path=step4_path, out_file=step5_path)