In [2]:
!pip install stanza
!pip install transformers

Collecting stanza
  Using cached stanza-1.5.0-py3-none-any.whl (802 kB)
Collecting tqdm
  Using cached tqdm-4.65.0-py3-none-any.whl (77 kB)
Collecting emoji
  Using cached emoji-2.2.0.tar.gz (240 kB)
Collecting protobuf
  Downloading protobuf-4.23.0-py3-none-any.whl (173 kB)
[K     |████████████████████████████████| 173 kB 3.5 MB/s eta 0:00:01
[?25hCollecting requests
  Using cached requests-2.30.0-py3-none-any.whl (62 kB)
Collecting six
  Using cached six-1.16.0-py2.py3-none-any.whl (11 kB)
Collecting numpy
  Using cached numpy-1.24.3-cp38-cp38-macosx_10_9_x86_64.whl (19.8 MB)
Collecting torch>=1.3.0
  Using cached torch-2.0.1-cp38-none-macosx_10_9_x86_64.whl (143.1 MB)
Collecting idna<4,>=2.5
  Using cached idna-3.4-py3-none-any.whl (61 kB)
Collecting certifi>=2017.4.17
  Using cached certifi-2023.5.7-py3-none-any.whl (156 kB)
Collecting urllib3<3,>=1.21.1
  Using cached urllib3-2.0.2-py3-none-any.whl (123 kB)
Collecting charset-normalizer<4,>=2
  Using cached charset_normalizer-3.

# Libraries

In [None]:
import stanza
# Download the default Stanza resources for every language handled in this
# notebook (English, Portuguese, Turkish), in that order.
for language_code in ("en", "pt", "tr"):
    stanza.download(language_code)

# Install CoreNLP through Stanza's helper.
stanza.install_corenlp()

In [22]:
# Set the CONSTITUENCY_BASE environment variable to the current directory.
# NOTE(review): presumably read by the stanza dataset/training utilities to
# locate constituency data — confirm against the stanza documentation.
%env CONSTITUENCY_BASE=./

env: CONSTITUENCY_BASE=./


### English

#### Data preparation

In [None]:
def split_treebank(treebank, train_size, dev_size):
    """
    Split a treebank deterministically into train, dev and test portions.

    The first ``train_size`` fraction of ``treebank`` becomes the train
    portion, the following ``dev_size`` fraction becomes the dev portion and
    whatever remains becomes the test portion.  The original order is kept;
    nothing is shuffled.

    Args:
        treebank: a sliceable sequence (e.g. a list of trees or tree strings).
        train_size: fraction of the items assigned to train, e.g. 0.8.
        dev_size: fraction of the items assigned to dev, e.g. 0.1.

    Returns:
        A tuple ``(train, dev, test)`` of slices of ``treebank``.
    """
    total = len(treebank)

    def boundary(fraction):
        # Sums such as 0.7 + 0.1 evaluate to 0.7999999999999999 in binary
        # floating point, so a bare int() can cut one item short of the
        # intended position.  Rounding the noise away first keeps the
        # truncating behaviour for genuinely fractional positions.
        return int(round(total * fraction, 9))

    train_end = boundary(train_size)
    dev_end = boundary(train_size + dev_size)
    return treebank[:train_end], treebank[train_end:dev_end], treebank[dev_end:]


import os

# Location of the raw Penn Treebank sample and of the processed splits.
PTB_DIR = "./datasets/english/penntreebank"
PROCESSED_DIR = "./datasets/english/processed"

# Read wsj_0001.mrg .. wsj_0198.mrg in order.
# NOTE(review): range(1, 199) stops at wsj_0198; if the local sample also
# contains wsj_0199.mrg it is silently left out — confirm the intended range.
documents = []
for doc in range(1, 199):
    doc_str = f"wsj_{str(doc).zfill(4)}"
    with open(f"{PTB_DIR}/{doc_str}.mrg", encoding="utf-8") as file:
        documents.append(file.read())
# Join once instead of growing a string inside the loop.
all_file = "".join(documents)

# Flatten every tree: drop double spaces and line breaks, then start a new
# line after each tree that ends with a sentence-final period.
# NOTE(review): trees that do not end in "(. .) ))" (e.g. questions or quoted
# sentences) stay glued to the following tree on the same line.
all_file = all_file.replace("  ", "")
all_file = all_file.replace("\n", "")
all_file = all_file.replace("(. .) ))", "(. .) ))\n")

# When the text ends with an inserted "\n", a plain split leaves an empty
# trailing entry that split_treebank would count as a tree; drop empties.
all_file = [line for line in all_file.split("\n") if line]
train, dev, test = split_treebank(all_file, 0.8, 0.1)

# All three files share one directory, so it only needs to be created once.
# NOTE(review): the training log recorded in this notebook shows the trainer
# reading data/constituency/en_wsj_train.mrg, not this directory — confirm how
# the processed files are made available to it.
os.makedirs(PROCESSED_DIR, exist_ok=True)

for split_name, split_trees in (("train", train), ("dev", dev), ("test", test)):
    with open(f"{PROCESSED_DIR}/en_wsj_{split_name}.mrg", "w", encoding="utf-8") as file:
        file.write("\n".join(split_trees))

#### Train

In [3]:
import time
start = time.time()

!python3 -m stanza.utils.training.run_constituency --train en_wsj --save_dir ./output_stanza_en --epochs 20

end = time.time()
print(end - start)

2023-04-16 21:39:34 INFO: Training program called with:
/Users/gonzalojaimovitch/Desktop/projects/constituency_parser/.env/lib/python3.8/site-packages/stanza/utils/training/run_constituency.py --train en_wsj --save_dir ./output_stanza_en --epochs 20
2023-04-16 21:39:34 INFO: Save file for en_wsj model: en_wsj_constituency.pt
2023-04-16 21:39:34 INFO: en_wsj: ./output_stanza_en/en_wsj_constituency.pt does not exist, training new model
2023-04-16 21:39:34 INFO: Using default pretrain for language, found in /Users/gonzalojaimovitch/stanza_resources/en/pretrain/combined.pt  To use a different pretrain, specify --wordvec_pretrain_file
2023-04-16 21:39:34 INFO: Using model /Users/gonzalojaimovitch/stanza_resources/en/forward_charlm/1billion.pt for forward charlm
2023-04-16 21:39:34 INFO: Using model /Users/gonzalojaimovitch/stanza_resources/en/backward_charlm/1billion.pt for backward charlm
2023-04-16 21:39:34 INFO: Running train step with args: ['--train_file', 'data/constituency/en_wsj_tra

### Portuguese

#### Data preparation

In [None]:
import xml.etree.ElementTree as ET

from stanza.models.constituency import tree_reader
from stanza.utils.datasets.constituency import utils

def read_xml_file(input_filename):
    """
    Convert the CINTIL xml file to id & text

    Every child of the ``corpus`` element must be a ``sentence`` containing
    exactly the tags ``id``, ``raw`` and ``tree`` (all in the
    ``http://www.iula.upf.edu`` namespace).

    Returns a list of tuples: (id, text), where text is the bracketed tree
    taken from the ``tree`` element.

    Raises ValueError if the corpus element is missing or has no children,
    if an unexpected tag is found, or if a sentence lacks one of the three
    required children.
    """
    ns = "{http://www.iula.upf.edu}"

    with open(input_filename, encoding="utf-8") as fin:
        dataset = ET.parse(fin)
    dataset = dataset.getroot()
    corpus = dataset.find(f"{ns}corpus")
    # find() returns None when the element is absent.  Test that explicitly
    # instead of relying on Element truthiness, which is deprecated and is
    # False for any element without children.  A childless corpus is still
    # rejected, exactly as before.
    if corpus is None or len(corpus) == 0:
        raise ValueError("Unexpected dataset structure : no 'corpus'")
    trees = []
    for sentence in corpus:
        if sentence.tag != f"{ns}sentence":
            raise ValueError("Unexpected sentence tag: {}".format(sentence.tag))
        id_node = None
        raw_node = None
        tree_node = None
        for node in sentence:
            if node.tag == f"{ns}id":
                id_node = node
            elif node.tag == f"{ns}raw":
                raw_node = node
            elif node.tag == f"{ns}tree":
                tree_node = node
            else:
                raise ValueError("Unexpected tag in sentence {}: {}".format(sentence, node.tag))
        # The raw text is not returned, but its presence is still required.
        if id_node is None or raw_node is None or tree_node is None:
            raise ValueError("Missing node in sentence {}".format(sentence))
        tree_id = "".join(id_node.itertext())
        tree_text = "".join(tree_node.itertext())
        trees.append((tree_id, tree_text))
    return trees

def convert_cintil_treebank(input_filename, train_size=0.8, dev_size=0.1):
    """
    Read the CINTIL xml treebank and split it into train / dev / test trees.

    Trees whose id starts with "aTSTS" are collected as synthetic trees and
    are all added to the front of the train portion; the remaining (natural)
    trees are split with utils.split_treebank.

    train_size and dev_size are the fractions of the natural trees used for
    train and dev; the rest becomes test.

    Returns a tuple (train_trees, dev_trees, test_trees).
    """
    synthetic_trees = []
    natural_trees = []
    for tree_id, tree_text in read_xml_file(input_filename):
        if " _" in tree_text:
            raise ValueError("Unexpected underscore")
        cleaned_text = tree_text.replace("_)", ")").replace("(A (", "(A' (")
        # the trees come without a ROOT node, so wrap each one in a ROOT label
        parsed = tree_reader.read_trees("(ROOT %s)" % cleaned_text)
        if len(parsed) != 1:
            raise ValueError("Unexpectedly found %d trees in %s" % (len(parsed), tree_id))
        single_tree = parsed[0]
        if tree_id.startswith("aTSTS"):
            synthetic_trees.append(single_tree)
            continue
        if "TSTS" in tree_id:
            raise ValueError("Unexpected TSTS")
        natural_trees.append(single_tree)

    print("Read %d synthetic trees" % len(synthetic_trees))
    print("Read %d natural trees" % len(natural_trees))

    natural_split = utils.split_treebank(natural_trees, train_size, dev_size)
    train_trees, dev_trees, test_trees = natural_split
    split_sizes = (len(natural_trees), len(train_trees), len(dev_trees), len(test_trees))
    print("Split %d trees into %d train %d dev %d test" % split_sizes)

    # synthetic trees are only ever used for training
    train_trees = synthetic_trees + train_trees
    total_sizes = (len(train_trees), len(dev_trees), len(test_trees))
    print("Total lengths %d train %d dev %d test" % total_sizes)
    return train_trees, dev_trees, test_trees


In [None]:
from stanza.utils.datasets.constituency.utils import write_dataset
# Convert the CINTIL xml treebank (default train/dev fractions) and write the
# resulting splits under the "pt_cintil" name.
cintil_xml_path = "./datasets/portuguese/CINTIL/CINTIL-Treebank.xml"
cintil_output_dir = "./datasets/portuguese/CINTIL"

datasets = convert_cintil_treebank(cintil_xml_path)
write_dataset(datasets, cintil_output_dir, "pt_cintil")

#### Train

In [5]:
import time
start = time.time()

!python3 -m stanza.utils.training.run_constituency --train pt_cintil --save_dir ./output_stanza_pt --epochs 20

end = time.time()
print(end - start)

2023-04-17 15:13:08 INFO: Training program called with:
/Users/gonzalojaimovitch/Desktop/projects/constituency_parser/.env/lib/python3.8/site-packages/stanza/utils/training/run_constituency.py --train pt_cintil --save_dir ./output_stanza_pt --epochs 20
2023-04-17 15:13:08 INFO: Save file for pt_cintil model: pt_cintil_constituency.pt
2023-04-17 15:13:08 INFO: pt_cintil: ./output_stanza_pt/pt_cintil_constituency.pt does not exist, training new model
2023-04-17 15:13:09 INFO: Using default pretrain for language, found in /Users/gonzalojaimovitch/stanza_resources/pt/pretrain/bosque.pt  To use a different pretrain, specify --wordvec_pretrain_file
2023-04-17 15:13:09 INFO: Running train step with args: ['--train_file', 'data/constituency/pt_cintil_train.mrg', '--eval_file', 'data/constituency/pt_cintil_dev.mrg', '--shorthand', 'pt_cintil', '--mode', 'train', '--retag_method', 'upos', '--wordvec_pretrain_file', '/Users/gonzalojaimovitch/stanza_resources/pt/pretrain/bosque.pt', '--bert_model'

### Turkish

In [23]:
# Run stanza's constituency dataset preparation for tr_starlang; per the
# recorded output it writes the train/dev/test .mrg files to data/constituency/.
!python3 -m stanza.utils.datasets.constituency.prepare_con_dataset tr_starlang

Reading 25329 total files
100%|███████████████████████████████████| 22975/22975 [00:19<00:00, 1177.58it/s]
100%|███████████████████████████████████████| 959/959 [00:00<00:00, 1477.74it/s]
100%|█████████████████████████████████████| 1395/1395 [00:00<00:00, 1795.46it/s]
Writing 22975 trees to data/constituency/tr_starlang_train.mrg
Writing 959 trees to data/constituency/tr_starlang_dev.mrg
Writing 1395 trees to data/constituency/tr_starlang_test.mrg


In [24]:
import time
start = time.time()

!python3 -m stanza.utils.training.run_constituency --train tr_starlang --save_dir ./output_stanza_tr --epochs 20

end = time.time()
print(end - start)

2023-05-14 01:47:18 INFO: Training program called with:
/Users/gonzalojaimovitch/Desktop/projects/master/constituency_parser/.env/lib/python3.8/site-packages/stanza/utils/training/run_constituency.py --train tr_starlang --save_dir ./output_stanza_tr --epochs 20
2023-05-14 01:47:18 INFO: Save file for tr_starlang model: tr_starlang_constituency.pt
2023-05-14 01:47:18 INFO: tr_starlang: ./output_stanza_tr/tr_starlang_constituency.pt does not exist, training new model
2023-05-14 01:47:18 INFO: Using default pretrain for language, found in /Users/gonzalojaimovitch/stanza_resources/tr/pretrain/imst.pt  To use a different pretrain, specify --wordvec_pretrain_file
2023-05-14 01:47:18 INFO: Using model /Users/gonzalojaimovitch/stanza_resources/tr/forward_charlm/conll17.pt for forward charlm
2023-05-14 01:47:18 INFO: Using model /Users/gonzalojaimovitch/stanza_resources/tr/backward_charlm/conll17.pt for backward charlm
2023-05-14 01:47:18 INFO: Running train step with args: ['--train_file', 'da