# Comparing lemmas of auxiliaries Italian treebanks
In this example, we consider the 10 Italian treebanks in the UD collection version 2.14 (we suppose that there are available locally in the folder).
We build a new file `aux_lemmas.tsv` with a comparison of the `AUX` usage in the treebanks:

In [1]:
from grewpy import Request, Corpus
import os

connected to port: 63703


In [2]:
# We get the locally accessible treebanks (i.e. local folders with a name staring with `UD_`).
treebanks = [s for s in os.listdir(".") if s.startswith("UD_") and os.path.isdir(s)]

In [3]:
# Run the request, with clustering of each treebanks
r = Request ("X [upos = AUX]")
observations = dict()
for treebank in treebanks:
	corpus = Corpus(treebank)
	observations[treebank] = corpus.count (r, clustering_keys=["X.lemma"])
	corpus.clean()  # helps to save memory

In [4]:
# Compute the list of lemmas in all treebanks
lemmas_set = set()
for tb in observations:
	lemmas_set.update(observations[tb].keys())
lemmas = list (lemmas_set)
print (lemmas)

['fare', 'potere', 'avere', 'sapere', 'volere', 'andare', 'stare', 'dovere', 'venire', 'essere']


In [6]:
# Save the TSV file with percentage
tab="\t"
with open ('aux_lemmas.tsv', 'w') as f:
	f.write (f'Treebanks{tab}{tab.join(lemmas)}\n')
	for tb in treebanks:
		obs = observations[tb]
		total = sum(obs.values())
		ratios = [str(obs.get(lemma, 0)/total) for lemma in lemmas]
		f.write (f'{tb}{tab}{tab.join(ratios)}\n')