In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/course/toy_language_annotations.json


In [2]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

nlp = spacy.blank("en") # load a new spacy model
db = DocBin() # create a DocBin object

In [3]:
import json
 
with open('../input/course/toy_language_annotations.json', 'r') as f:
    data = json.load(f)

In [4]:
import pprint
pp = pprint.PrettyPrinter(indent=4)

In [5]:
all_training_data = {'classes' : ['competence'], 'annotations' : []}
for example in data['examples']:
    temp_dict = {}
    temp_dict['text'] = example['content']
    temp_dict['entities'] = []
    for annotation in example['annotations']:
        start = annotation['start']
        end = annotation['end']
        label = annotation['tag'].upper()
        temp_dict['entities'].append((start, end, label))
        all_training_data['annotations'].append(temp_dict)

pp.pprint(all_training_data['annotations'][176])

{   'entities': [   (430, 444, 'COMPETENCE'),
                    (84, 106, 'COMPETENCE'),
                    (0, 8, 'COMPETENCE'),
                    (88, 107, 'COMPETENCE'),
                    (401, 423, 'COMPETENCE'),
                    (67, 106, 'COMPETENCE')],
    'text': 'Spanisch A1 Curso de repaso y profundización. Für Teilnehmende '
            'mit Grundkenntnissen der spanischen Sprache  und '
            'Wiedereinsteiger*innen geeignet.\n'
            '\n'
            'Dieser Kurs richtet sich an Teilnehmende, die das Niveau A1 '
            'auffrischen und vertiefen wollen, wobei auch Elemente des Niveaus '
            'A2 einfließen werden. Dabei stehen die Wiederholung, '
            'Reaktivierung und Festigung der bereits vorhandenen Kenntnisse, '
            'die leichte mündliche Konversation sowie Hörverständnisübungen im '
            'Mittelpunkt. Spiele und Lektüre sind ebenfalls Bestandteil des '
            'Auffrischungskurses.\n'
            'Das Arbeit

In [6]:
## select training data 
split = 30
training_data = {'classes' : ['competence'], 'annotations' : all_training_data['annotations'][: split]}

In [7]:
pp.pprint(training_data['annotations'][10])

{   'entities': [   (281, 289, 'COMPETENCE'),
                    (88, 96, 'COMPETENCE'),
                    (0, 8, 'COMPETENCE'),
                    (264, 273, 'COMPETENCE'),
                    (88, 106, 'COMPETENCE')],
    'text': 'Spanisch am Wochenende - Anfängerkurs . In diesem Intensivkurs '
            'erlernen Sie Ihre ersten Spanischkenntnisse in einer lockeren '
            'Atmosphäre mit Spaß, ohne Lerndruck. Sie bekommen erste Einblicke '
            'in die Sprache und lernen schnell einfache, alltägliche '
            'Redewendungen zu verstehen und zu sprechen. Dieser Kurs ist für '
            'Teilnehmende ohne Vorkenntnisse und Wiedereinsteigende '
            'geeignet.     '}


In [8]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

nlp = spacy.blank("en") # load a new spacy model
doc_bin = DocBin() # create a DocBin object

In [9]:
from spacy.util import filter_spans

for training_example  in tqdm(training_data['annotations']): 
    text = training_example['text']
    labels = training_example['entities']
    doc = nlp.make_doc(text) 
    ents = []
    for start, end, label in labels:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents 
    doc_bin.add(doc)

doc_bin.to_disk("./training_data.spacy") # save the docbin object

100%|██████████| 30/30 [00:00<00:00, 579.57it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity





In [10]:
!python -m spacy init config config.cfg --lang de --pipeline ner --optimize efficiency

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: de
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [11]:
!python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
[2022-06-23 17:51:54,787] [INFO] Set up nlp object from config
[2022-06-23 17:51:54,799] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-06-23 17:51:54,805] [INFO] Created vocabulary
[2022-06-23 17:51:54,806] [INFO] Finished initializing nlp object
[2022-06-23 17:51:55,068] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     47.00    6.82    3.69   44.12    0.07
  7     200        153.13   1031.07  100.00  100.00  100.00    1.00
 16     400          0.00      0.00  100.00  100.00  100.00    1.00
 26     600          0.00      0.00  100.00  100.00

In [12]:
nlp_ner = spacy.load("./model-best")

In [13]:
test_text = all_training_data['annotations'][176]['text']

doc = nlp_ner(test_text)

colors = {'COMPETENCE': "#C4DFE6"}
options = {"colors": colors} 

spacy.displacy.render(doc, style="ent", options= options, jupyter=True)