In [1]:
from pathlib import Path
from parsing.structs import Pipeline
from parsing.tools import save_dfs, read_txts
import parsing.pipes as pipes
import pandas as pd

### Read data

In [2]:
# On-disk locations for the preprocessing stages, all below one shared base folder.
# stage2_dir is the input directory this notebook reads from; the stage 3, 5 and 6
# locations are the targets of the savepoint steps defined further down.
# stage4_dir is not referenced in the cells visible here.
preprocessing_dir = Path("..", "data", "preprocessing")
stage2_dir = preprocessing_dir / "stage 2 - prepared"
stage3_path = preprocessing_dir / "stage 3 - verses" / "verses_df.pkl"
stage4_dir = preprocessing_dir / "stage 4 - corpora"
stage5_dir = preprocessing_dir / "stage 5 - parallel corpora"
stage6_dir = preprocessing_dir / "stage 6 - split"

In [3]:
# Collect the stage-2 input files. Path.iterdir() yields entries in arbitrary
# order and also returns sub-directories, so keep regular files only and sort
# them so that every run feeds the pipeline the same documents in the same order.
txtfiles = sorted(path for path in stage2_dir.iterdir() if path.is_file())
texts = read_txts(txtfiles)

### Apply preprocessing

In [4]:
def stratify_wordcount(data, bins=4):
    """Assign every row of ``data`` to a length stratum.

    The length of each cell is measured as its number of space characters
    (for single-space-separated text that is the word count minus one). The
    per-row mean over all columns is then cut into ``bins`` equal-width
    intervals.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose cells all support ``.count(" ")`` (e.g. ``str``).
        Presumably one column per side of a verse pair -- confirm against
        the caller.
    bins : int, default 4
        Number of equal-width bins handed to ``pandas.cut``.

    Returns
    -------
    The integer bin index of each row, as produced by
    ``pandas.cut(..., labels=False)``.
    """
    # DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1);
    # older pandas has no DataFrame.map, so fall back to applymap there.
    elementwise = data.map if hasattr(data, "map") else data.applymap
    space_counts = elementwise(lambda text: text.count(" "))
    mean_count = space_counts.mean(axis=1)  # one value per row
    return pd.cut(mean_count, bins, labels=False)

In [5]:
# Savepoint steps: each wraps a callable that writes the data it is given to disk.
# NOTE(review): pd.to_pickle returns None; whether FunctionStep forwards its input
# or the callable's return value to the next step is not visible here -- confirm
# before re-enabling these steps in the pipeline.

# Stage 3: pickle the step's input to stage3_path.
stage3 = pipes.FunctionStep(lambda frame: pd.to_pickle(frame, stage3_path.as_posix()))

# Stage 5: save the single input frame under the name "verse-corpus" in stage5_dir.
stage5 = pipes.FunctionStep(
    lambda corpus: save_dfs(
        dfs=[corpus],
        folder=stage5_dir,
        filenames=["verse-corpus"],
    )
)

# Stage 6: save the input frames to stage6_dir with train/test/val file names.
# Presumably save_dfs pairs names with frames by position, so this order must
# match the splitter's output order -- TODO confirm.
stage6 = pipes.FunctionStep(
    lambda splits: save_dfs(
        splits,
        folder=stage6_dir,
        filenames=[f"verse-corpus-{part}" for part in ("train", "test", "val")],
    )
)

In [9]:
# Ordered (name, step) pairs handed to Pipeline.
# NOTE(review): presumably each step receives the previous step's output --
# confirm in parsing.structs.Pipeline.
# The savepoint steps (stage3, stage5, stage6) are currently disabled; uncomment
# the corresponding entry to persist that intermediate result.
pipeline_steps = [
    ("create documents", pipes.DocumentMatcher()),
    ("extract paragraphs", pipes.ParagraphMatcher()),
    ("extract verses", pipes.VerseMatcher()),
    ("language tag paragraphs", pipes.ParagraphLanguageTagger()),
    ("language tag verses", pipes.VerseLanguageTagger()),
    ("extract verses to DataFrame", pipes.DataframeTransformer()),
    ("replace annotation", pipes.CleanupStep()),
    # ("savepoint (pickle)", stage3),
    ("to parallel corpora", pipes.ParallelCorpusTransformer()),
    # ("savepoint (csv)", stage5),
    (
        "split dataset",
        pipes.TrainTestValSplitter(stratify_function=stratify_wordcount),
    ),
    # ("savepoint (csv)", stage6),
]
pipeline = Pipeline(pipeline_steps)

In [10]:
# Feed the texts loaded from stage2_dir to the pipeline and keep the result.
# NOTE(review): presumably the output of the final "split dataset" step -- confirm.
processed = pipeline.process(texts)