In [2]:
from pathlib import Path
import csv
import re

## Pipeline

In [3]:
import stanza

  from .autonotebook import tqdm as notebook_tqdm


We create a pipeline for the Italian language. We want to tokenize, POS-tag, lemmatize and parse dependencies.
For the moment we use the default model ([which is a combined model](https://stanfordnlp.github.io/stanza/combined_models.html)), but we should also consider using models based on a specific treebank (which we can select with the `package` parameter).

Here you can see the
[Universal Dependencies Treebanks available in Stanza](https://stanfordnlp.github.io/stanza/performance.html#system-performance-on-ud-treebanks)

In [3]:
# Development aid: show the signature and docstring of stanza.Pipeline.
?stanza.Pipeline

[0;31mInit signature:[0m
[0mstanza[0m[0;34m.[0m[0mPipeline[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mlang[0m[0;34m=[0m[0;34m'en'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdir[0m[0;34m=[0m[0;34m'/home/fab/stanza_resources'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpackage[0m[0;34m=[0m[0;34m'default'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mprocessors[0m[0;34m=[0m[0;34m{[0m[0;34m}[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlogging_level[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mverbose[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0muse_gpu[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmodel_dir[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdownload_method[0m[0;34m=[0m[0;34m<[0m[0mDownloadMethod[0m[0;34m.[0m[0mDOWNLOAD_RESOURCES[0m[0;34m:[0m [0;36m3[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mresources_url[

In [4]:
# Italian pipeline with the processors needed downstream: tokenization,
# multi-word tokens, POS tagging, lemmatization and dependency parsing.
# No `package` is passed, so the default model package is used.
nlp = stanza.Pipeline(
    lang='it',
    processors='tokenize,mwt,pos,lemma,depparse',
    verbose=True,
)

2024-12-18 09:56:12 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 29.5MB/s]                    
2024-12-18 09:56:12 INFO: Downloaded file to /home/fab/stanza_resources/resources.json
2024-12-18 09:56:13 INFO: Loading these models for language: it (Italian):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |
| depparse  | combined_charlm   |

2024-12-18 09:56:13 INFO: Using device: cpu
2024-12-18 09:56:13 INFO: Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-12-18 09:56:15 INFO: Loading: mwt
  checkpoint = torch.load(filename, lambda sto

## Preparation of the dataset

### Load the dataset

In [5]:
data_dir = Path("../data")
training_sets_dir = "haspeede2_dev"
training_file = "haspeede2_dev_taskAB.tsv"

train_path = data_dir / training_sets_dir / training_file

# Read every row of the tab-separated file as a dict keyed by the header names.
# - encoding='utf-8' pins the decoding so accented Italian characters are read
#   the same way on every platform instead of depending on the locale default.
# - newline='' lets the csv module handle line endings itself, as its
#   documentation requires for file objects passed to a reader.
# NOTE(review): the reader uses the default quoting rules, so a tweet containing
# an unbalanced double quote could swallow the following rows — verify the
# number of rows read against the expected size of the dataset.
with open(train_path, 'r', encoding='utf-8', newline='') as file:
    train_set = list(csv.DictReader(file, delimiter='\t'))

### Preprocess

Remove some elements that can cause problems for the pipeline: mentions, URLs, and sequences of more than one full stop.

We keep the text of hashtags (since they are often syntactically integrated in the text) but remove the hash.

In [6]:
HASHTAG_RE = re.compile(r'#([\w]+)')
# The URL placeholder as a standalone token only, so that upper-case words that
# merely contain the letters (e.g. "URLARE") are left intact.
URL_PLACEHOLDER_RE = re.compile(r'\bURL\b')
MULTIPLE_DOTS_RE = re.compile(r'\.{2,}')
WHITESPACE_RE = re.compile(r'\s+')


def preprocess(text):
    """Clean a raw text before it is passed to the Stanza pipeline.

    The following steps are applied in order:

    1. every ``@user`` mention placeholder is replaced by a space;
    2. every run of two or more full stops is replaced by a space;
    3. every standalone ``URL`` placeholder is replaced by a space;
    4. the ``#`` of each hashtag is dropped while the hashtag text is kept;
    5. runs of whitespace are collapsed to a single space and the result is
       stripped.

    Placeholders are replaced by a space (not by the empty string) so that the
    words on either side of them are never glued together; the final
    whitespace normalisation removes the extra spaces this produces.

    Args:
        text: the raw text (``str``).

    Returns:
        The cleaned text (``str``); empty if nothing but removable material
        was present.
    """
    text = text.replace('@user', ' ')
    text = MULTIPLE_DOTS_RE.sub(' ', text)
    text = URL_PLACEHOLDER_RE.sub(' ', text)
    text = HASHTAG_RE.sub(r'\1', text)
    return WHITESPACE_RE.sub(' ', text).strip()

In [7]:
# Show one tweet before and after cleaning.
sample_doc = train_set[1]
sample_text = sample_doc['text ']  # this column name carries a trailing space
print(sample_text)
print(preprocess(sample_text))

@user @user infatti finché ci hanno guadagnato con i campi #rom tutto era ok con #Alemanno #Ipocriti 
infatti finché ci hanno guadagnato con i campi rom tutto era ok con Alemanno Ipocriti


In [8]:
# Clean the text of every row, keeping the order of train_set.
clean_documents = [preprocess(row['text ']) for row in train_set]

## Process Documents

In [9]:
# Annotate all the cleaned texts with the pipeline in a single call.
stanza_documents = nlp.bulk_process(clean_documents)

In [10]:
# Inspect the annotation produced for the first document.
stanza_documents[0]

[
  [
    {
      "id": 1,
      "text": "È",
      "lemma": "essere",
      "upos": "AUX",
      "xpos": "V",
      "feats": "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
      "head": 2,
      "deprel": "cop",
      "start_char": 0,
      "end_char": 1
    },
    {
      "id": 2,
      "text": "terrorismo",
      "lemma": "terrorismo",
      "upos": "NOUN",
      "xpos": "S",
      "feats": "Gender=Masc|Number=Sing",
      "head": 0,
      "deprel": "root",
      "start_char": 2,
      "end_char": 12
    },
    {
      "id": 3,
      "text": "anche",
      "lemma": "anche",
      "upos": "ADV",
      "xpos": "B",
      "head": 4,
      "deprel": "advmod",
      "start_char": 13,
      "end_char": 18
    },
    {
      "id": 4,
      "text": "questo",
      "lemma": "questo",
      "upos": "PRON",
      "xpos": "PD",
      "feats": "Gender=Masc|Number=Sing|PronType=Dem",
      "head": 2,
      "deprel": "nsubj",
      "start_char": 19,
      "end_char": 25,
      "misc": "S

In [11]:
# Check the type of the objects returned by bulk_process.
type(stanza_documents[0])

stanza.models.common.doc.Document

## Save processed Documents

Save each document as a dictionary with the following keys:
- `id`: document id (**string**)
- `raw_text`: document text as is, with no processing (**string**)
- `hs`: **int**
- `stereotype`: **int**
- `proc_text`: document text processed by the Stanza pipeline (**stanza Document**)

See documentation for stanza Document object: [https://stanfordnlp.github.io/stanza/data_objects.html#document](https://stanfordnlp.github.io/stanza/data_objects.html#document)

In [34]:
import pickle

In [13]:
# Pair each raw row with its Stanza annotation; the two label columns are
# converted from the strings read from the TSV file to int.
documents = [
    {
        'id': row['id'],
        'raw_text': row['text '],
        'hs': int(row['hs']),
        'stereotype': int(row['stereotype']),
        'proc_text': annotated,
    }
    for row, annotated in zip(train_set, stanza_documents)
]

### Pickle the documents

In [32]:
# Directory (relative to the notebook) where the pickled results are written.
results_dir = Path('../results')

In [15]:
outpath = results_dir / 'stanza_proc_train.pkl'

# Create the output directory when it is missing, so the cell also works on a
# fresh checkout instead of failing with FileNotFoundError.
outpath.parent.mkdir(parents=True, exist_ok=True)

# Serialise the whole list (including the Stanza Document objects) in one file.
with open(outpath, 'wb') as outfile:
    pickle.dump(documents, outfile)

### Load the documents and check

In [16]:
# Drop the in-memory list so that the checks below only use what is read back
# from disk. Re-running this cell alone raises NameError until `documents` is
# rebuilt.
del documents

In [17]:
# Read back the file written above. pickle must only be used on trusted files,
# since unpickling can execute arbitrary code.
with outpath.open('rb') as infile:
    loaded_docs = pickle.load(infile)

In [18]:
# Check that the annotation is still a Stanza Document after the round trip.
type(loaded_docs[0]['proc_text'])

stanza.models.common.doc.Document

In [19]:
# Number of documents read back from the pickle file.
len(loaded_docs)

6837

## Process scraped headlines for domain adaptation

In [22]:
# Build one record per line of the headlines file, with the same keys as the
# training documents. Headlines are unlabelled, so both labels are None.
# Ids are "H" followed by the 1-based line number.
headlines = []
headlines_clean = []
with open(data_dir / "headlines.txt", 'r', encoding='utf-8') as infile:
    for line_number, line in enumerate(infile, start=1):
        stripped_line = line.strip()
        headlines.append({
            'id': f"H{line_number}",
            'raw_text': stripped_line,
            'hs': None,
            'stereotype': None,
        })
        headlines_clean.append(preprocess(stripped_line))


In [24]:
# Annotate all the cleaned headlines with the pipeline in a single call.
stanza_headlines = nlp.bulk_process(headlines_clean)

In [27]:
# Attach each annotation to its headline record (the dicts are updated in place).
for record, annotated in zip(headlines, stanza_headlines):
    record['proc_text'] = annotated

In [35]:
headlines_path = results_dir / 'stanza_proc_train_headlines.pkl'

# Create the output directory when it is missing, so the cell also works on a
# fresh checkout instead of failing with FileNotFoundError.
headlines_path.parent.mkdir(parents=True, exist_ok=True)

# Serialise the annotated headline records in one file.
with open(headlines_path, 'wb') as outfile:
    pickle.dump(headlines, outfile)