## **PREPARING MANUAL ANNOTATION FOR TRAINING**

In [2]:
!pip install stanza

import stanza
stanza.download('fr')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting stanza
  Downloading stanza-1.4.2-py3-none-any.whl (691 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m691.3/691.3 KB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
Collecting emoji
  Downloading emoji-2.2.0.tar.gz (240 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.9/240.9 KB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-2.2.0-py3-none-any.whl size=234926 sha256=ee9bfbd540f6825ef5bd7d203c18f63515ae768a5735cee4ad6a2e4838bfa9eb
  Stored in directory: /root/.cache/pip/wheels/86/62/9e/a6b27a681abcde69970dbc0326ff51955f3beac72f15696984
Successfully built emoji
Installing collected packages: emoji, stanza
Successfully installed emoji-2.2.0 stan

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

INFO:stanza:Downloading default packages for language: fr (French) ...


Downloading https://huggingface.co/stanfordnlp/stanza-fr/resolve/v1.4.1/models/default.zip:   0%|          | 0…

INFO:stanza:Finished downloading models and saved to /root/stanza_resources.


In [3]:
!git clone https://github.com/cvbrandoe/coursTAL.git

Cloning into 'coursTAL'...
remote: Enumerating objects: 379, done.[K
remote: Counting objects: 100% (105/105), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 379 (delta 99), reused 104 (delta 99), pack-reused 274[K
Receiving objects: 100% (379/379), 22.91 MiB | 20.53 MiB/s, done.
Resolving deltas: 100% (174/174), done.


In [4]:
directory_tr = "coursTAL/2023/EltecFRTxtTrain"

In [None]:
import json
import requests, sys
import os
import glob

nb_p_training = 20

def read_text(file_name, nb_p_training):
  lines = []
  count = 0
  with open (file_name, "r", encoding="utf_8") as myfile:
    for line in myfile:
      if line and line != ' \n' and count < nb_p_training:
        lines.append(line.strip())
        count = count + 1
    #lines = list(line for line in (l.strip() ) if line)
    return '\n'.join([str(item) for item in lines])

def get_class_id(label):
  """
  Translates the spaCy label id into the tagtog entity type id
  - label: spaCy label id
  """
  choices = {'PER': 'e_1', 'LOC': 'e_2', 'ORG': 'e_3', 'MISC':'e_4'}
  return choices.get(label, None)

def get_entities(spans, pipeline):
  default_prob = 1
  default_part_id = 's1v1'
  default_state = 'pre-added'
  tagtog_entities = []
  for span in spans:
    class_id = get_class_id(span.type)
    if class_id is not None:
      tagtog_entities.append( {
        'classId': class_id,
        'part': default_part_id,
        'offsets':[{'start': span.start_char, 'text': span.text}],
        'confidence': {'state': default_state,'who': ['ml:' + pipeline],'prob': default_prob},
        'fields':{},
        # this is related to the kb_id (knowledge base ID) field from the Span spaCy object
        'normalizations': {}} )
  return tagtog_entities

# Set the credentials at tagtog and project name
MY_USERNAME = 'YOURUSER'
MY_PASSWORD = 'YOURPASS'
MY_PROJECT = 'YOURPROJ'

# API authentication
tagtogAPIUrl = "https://www.tagtog.com/-api/documents/v1"
auth = requests.auth.HTTPBasicAuth(username=MY_USERNAME, password=MY_PASSWORD)

nlp = stanza.Pipeline('fr', processors='tokenize,pos,ner')

for filename in glob.glob(directory_tr+'/*.txt'):
  filen = filename.split("/")[-1]
  print(filen)
  mytext = read_text(filename, nb_p_training)
  #print(mytext)
  doc = nlp(mytext)
  # Initialize ann.json (specification: https://docs.tagtog.com/anndoc.html#ann-json)
  annjson = {}
  # Set the document as not confirmed, an annotator will manually confirm whether the annotations are correct
  annjson['anncomplete'] = False
  annjson['metas'] = {}
  annjson['relations'] = []                      
  # Transform the spaCy entities into tagtog entities
  annjson['entities'] = get_entities(doc.ents, 'stanza')
  print(mytext)
  #save excerpt to file
  f = open(directory_tr+"/tr_"+filen, "w", encoding="utf_8")
  f.write(mytext)
  f.close()
  print(json.dumps(annjson))
  # Parameters for the API call 
  # see https://docs.tagtog.net/API_documents_v1.html#examples-import-pre-annotated-plain-text-file
  params = {'owner': MY_USERNAME, 'project': MY_PROJECT, 'output': 'null', 'format': 'default-plus-annjson'}
  # Pre-annotated document composed of the content and the annotations
  files=[(filen, mytext), (filen+'.ann.json', json.dumps(annjson))]
  # POST request to send the pre-annotated document

  response = requests.post(tagtogAPIUrl, params=params, auth=auth, files=files)
  print(response.text)






INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

INFO:stanza:Loading these models for language: fr (French):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | gsd     |
| pos       | gsd     |
| ner       | wikiner |

INFO:stanza:Use device: gpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


tr_FRA04801_Corday.txt
I
— On peut entrer ?... Ah ! Elle est encore couchée, la petite loche ... Bonjour, mon amour, bonjour ma vieille Lucette ...
Zonzon—un diminutif de Suzon—se penchait à la porte entr'ouverte. En longue chemise, la gorge épanouie crevant la dentelle, la face brillante parmi ses cheveux qui la coiffaient d'un gros bonnet de fourrure châtain, les pieds nus dans des sandales rouges, la jeune femme courut au lit de sa sœur.
Elle était royale et claire, la chambre de Lucette. Royale par ses dimensions, par ses lignes, par le style de ses meubles et de ses panneaux, d'un Louis XVI fleuri, laqué blanc. Claire de toutes ces neigeuses sculptures, des miroirs à biseaux, des tentures délicates et tendres, des bibelots de Saxe et d'argent, toute une fraîcheur scintillante qu'exagérait encore la folle lumière du matin de juin. Lucette, qui s'apercevait dans les glaces, semblait perdue, parmi ses cheveux noirs répandus sur l'oreiller, dans le vaste lit de milieu exhaussé de deux