In [1]:
import sys

# Make the parent directory importable (presumably where `estnltk_patches`
# lives -- confirm). A plain ".." is used instead of "..\\" because the
# backslash is a path separator only on Windows; on other systems "..\\" is
# a literal entry name that does not resolve to the parent directory.
sys.path.append("..")

In [108]:
from estnltk_patches.label_studio.labelling_configurations.phrase_tagging_configuration import PhraseTaggingConfiguration
from estnltk_patches.label_studio.labelling_tasks.phrase_tagging_task import PhraseTaggingTask

from estnltk_patches.label_studio.labelling_configurations.phrase_classification_configuration import PhraseClassificationConfiguration
from estnltk_patches.label_studio.labelling_tasks.phrase_classification_task import PhraseClassificationTask

In [3]:
from estnltk import Text
from estnltk.taggers.system.rule_taggers.extraction_rules.ruleset import AmbiguousRuleset, Ruleset
from estnltk.taggers.system.rule_taggers.extraction_rules.static_extraction_rule import StaticExtractionRule
from estnltk.taggers.system.rule_taggers.taggers.substring_tagger import SubstringTagger
import sqlite3

In [4]:
# One static rule per target term; each match is annotated with the term
# itself as its 'label' attribute.
rules = Ruleset([
    StaticExtractionRule(term, {'label': term}) for term in ('kass', 'koer')
])

# Case-insensitive substring matcher that exposes only the 'label' attribute.
tagger = SubstringTagger(
    rules,
    output_attributes=['label'],
    ignore_case=True,
)

In [5]:
# Sample sentence containing both target terms ("Koer" and "kass").
text = Text('Koer tuletas omanikule meelde, et uus kass, mille peremees võtab, ei tohi olla ilusam kui vana')
# Run the substring tagger on the sentence (presumably adds the 'terms' layer
# used as input_layer further down -- confirm the tagger's default layer name).
tagger(text)
# A bare None as the last expression keeps the cell from displaying a result.
None

In [6]:
# Tagging configuration with two label classes. The labelling function
# returns the text of whatever object it is given, so each annotation is
# labelled with its own surface form.
conf = PhraseTaggingConfiguration(['Koer', 'kass'])
task = PhraseTaggingTask(
    conf,
    input_layer='terms',
    output_layer='terms',
    labelling_function=lambda span: span.text,
)

In [7]:
# Show the labelling interface definition (XML) generated for this task.
print(task.interface_file)

<View>
  <Labels name="terms" toName="text" >
    <Label value="Koer" background="green" />
    <Label value="kass" background="blue" />
  </Labels>
  <Text name="text" value="$text" granularity="word" />
</View>


In [8]:
# Export the tagged sentence as pretty-printed JSON (2-space indent).
print(task.export_data(text, indent=2))
# Sanity check: the export must have produced exactly the two expected labels.
assert task.exported_labels == {'Koer', 'kass'}

[
  {
    "data": {
      "text": "Koer tuletas omanikule meelde, et uus kass, mille peremees v\u00f5tab, ei tohi olla ilusam kui vana"
    },
    "annotations": [
      {
        "result": [
          {
            "value": {
              "start": 0,
              "end": 4,
              "labels": [
                "Koer"
              ]
            },
            "from_name": "terms",
            "to_name": "text",
            "type": "labels"
          },
          {
            "value": {
              "start": 38,
              "end": 42,
              "labels": [
                "kass"
              ]
            },
            "from_name": "terms",
            "to_name": "text",
            "type": "labels"
          }
        ]
      }
    ]
  }
]


### On more data

In [31]:
# Open the SQLite database of tagged noun phrases (path is relative to the
# notebook's working directory) and create a cursor for querying it.
con = sqlite3.connect("tagged_noun_phrases2.db")
cur = con.cursor()

In [32]:
# Fetch one row per distinct raw_lemmas value together with its occurrence
# count, most frequent first.
# NOTE(review): extraction_pattern, ner_pattern and raw_text are neither
# grouped nor aggregated; SQLite takes such bare columns from an arbitrary
# row of each group -- confirm this is the intended behaviour.
cur.execute("SELECT extraction_pattern, ner_pattern, raw_text, raw_lemmas, count(*) FROM tagged_phrases GROUP BY raw_lemmas ORDER BY count(*) DESC")
data = cur.fetchall()

In [33]:
# Peek at the first (most frequent) row:
# (extraction_pattern, ner_pattern, raw_text, raw_lemmas, count).
data[0]

('1 2 nmod,2 0 root,3 2 nmod,H-S-S',
 'LOC-OTHER-OTHER',
 'Euroopa tunnetamises koduna',
 'Euroopa tunnetamine kodu',
 33)

In [34]:
# All rows were already loaded into `data` with fetchall(), so the
# connection is no longer needed.
con.close()

In [35]:
# Build one extraction rule per database row. The rule matches the
# lemmatised phrase (row[3]) and carries the remaining columns as attributes.
rules_list = []

for row in data:
    # row[0] is "<syntax tree>,<POS pattern>": the text after the last comma
    # is the POS pattern, everything before it is the syntax tree.
    syntax_tree, pos_pattern = row[0].rpartition(',')[::2]
    rule = StaticExtractionRule(
        row[3],
        {
            'syntax_tree_pattern': syntax_tree,
            'POS_pattern': pos_pattern,
            'NER_pattern': row[1],
            'raw_text': row[2],
        },
    )
    rules_list.append(rule)

In [36]:
# Wrap the database-derived rules in an AmbiguousRuleset and build a
# case-insensitive substring tagger that outputs all four rule attributes.
rules = AmbiguousRuleset(rules_list)
tagger = SubstringTagger(
    rules,
    output_attributes=[
        'syntax_tree_pattern',
        'POS_pattern',
        'NER_pattern',
        'raw_text',
    ],
    ignore_case=True,
)

In [37]:
# Plain-text view of the first row, for reference before the rows are
# converted to lists.
print(data[0])

('1 2 nmod,2 0 root,3 2 nmod,H-S-S', 'LOC-OTHER-OTHER', 'Euroopa tunnetamises koduna', 'Euroopa tunnetamine kodu', 33)


In [38]:
# Turn every row into a list so that individual fields can be reassigned.
data = list(map(list, data))

In [39]:
# Wrap each lemmatised phrase (column 3) in its own Text object and tag it.
# The rows of `data` are deliberately left untouched: overwriting row[3] with
# a Text object made this cell non-re-runnable and broke earlier cells that
# read row[3] as a plain string.
texts = []

for row in data:
    phrase_text = Text(row[3])
    tagger.tag(phrase_text)
    texts.append(phrase_text)

In [40]:
# Inspect the 'terms' layer that the tagger added to the first phrase.
display(texts[0].terms)

layer name,attributes,parent,enveloping,ambiguous,span count
terms,"syntax_tree_pattern, POS_pattern, NER_pattern, raw_text",,,True,1

text,syntax_tree_pattern,POS_pattern,NER_pattern,raw_text
Euroopa tunnetamine kodu,"1 2 nmod,2 0 root,3 2 nmod",H-S-S,LOC-OTHER-OTHER,Euroopa tunnetamises koduna


In [103]:
# Tagging task whose annotation labels are read from the 'NER_pattern'
# attribute of the 'terms' layer.
# NOTE(review): the configured labels are attribute *names*, while the export
# labels spans with NER_pattern *values* (e.g. 'LOC-OTHER-OTHER') -- confirm
# that this label list is intended.
conf = PhraseTaggingConfiguration(
    ['syntax_tree_pattern', 'POS_pattern', 'NER_pattern']
)
task = PhraseTaggingTask(
    conf,
    input_layer='terms',
    output_layer='terms',
    label_attribute='NER_pattern',
)

In [104]:
# Show the labelling interface definition (XML) generated for this task.
print(task.interface_file)

<View>
  <Labels name="terms" toName="text" >
    <Label value="syntax_tree_pattern" background="#1b9e77" />
    <Label value="POS_pattern" background="#d95f02" />
    <Label value="NER_pattern" background="#7570b3" />
  </Labels>
  <Text name="text" value="$text" granularity="word" />
</View>


In [101]:
# Labels collected by the task's exports so far.
# NOTE(review): the recorded output comes from execution count 101, i.e.
# before the surrounding cells (103-105) were run, so the empty set shown may
# be stale -- re-run the notebook in order to verify.
print(task.exported_labels)

set()


In [105]:
# Export a single tagged phrase as pretty-printed JSON to check the format.
print(task.export_data(texts[0], indent=2))

[
  {
    "data": {
      "text": "Euroopa tunnetamine kodu"
    },
    "annotations": [
      {
        "result": [
          {
            "value": {
              "start": 0,
              "end": 24,
              "labels": [
                "LOC-OTHER-OTHER"
              ]
            },
            "from_name": "terms",
            "to_name": "text",
            "type": "labels"
          }
        ]
      }
    ]
  }
]


In [106]:
# Export every tagged phrase at once; the result is a JSON list with one
# entry per text.
# NOTE(review): this prints the export for the whole dataset -- consider
# printing only a small slice to keep the notebook output short.
print(task.export_data(texts, indent=2))

[
  {
    "data": {
      "text": "Euroopa tunnetamine kodu"
    },
    "annotations": [
      {
        "result": [
          {
            "value": {
              "start": 0,
              "end": 24,
              "labels": [
                "LOC-OTHER-OTHER"
              ]
            },
            "from_name": "terms",
            "to_name": "text",
            "type": "labels"
          }
        ]
      }
    ]
  },
  {
    "data": {
      "text": "Euroopa tunnetamine"
    },
    "annotations": [
      {
        "result": [
          {
            "value": {
              "start": 0,
              "end": 19,
              "labels": [
                "LOC-OTHER"
              ]
            },
            "from_name": "terms",
            "to_name": "text",
            "type": "labels"
          }
        ]
      }
    ]
  },
  {
    "data": {
      "text": "ca 5%"
    },
    "annotations": [
      {
        "result": [
          {
            "value": {
              "start": 0

In [107]:
# Write the full export to a JSON file instead of printing it.
# NOTE(review): presumably the 'label_studio_confs' directory must already
# exist -- confirm.
task.export_data(texts, file_path='label_studio_confs/export_str_3.json', indent=2)

In [110]:
# Classification task: phrases are pre-labelled from 'NER_pattern' and the
# annotator picks a class; the choices are displayed as 'Jah'/'Ei' with
# aliases 'True'/'False'.
conf2 = PhraseClassificationConfiguration(
    phrase_labels=['syntax_tree_pattern', 'POS_pattern', 'NER_pattern'],
    class_labels={'True': 'Jah', 'False': 'Ei'},
)
task2 = PhraseClassificationTask(
    conf2,
    input_layer='terms',
    output_layer='terms',
    label_attribute='NER_pattern',
)
# Show the generated labelling interface (XML).
print(task2.interface_file)

<View>
  <Labels name="phrase" toName="text" >
    <Label value="syntax_tree_pattern" background="#1b9e77" />
    <Label value="POS_pattern" background="#d95f02" />
    <Label value="NER_pattern" background="#7570b3" />
  </Labels>
  <Text name="text" value="$text" />
  <Choices name="phrase_class" toName="text" >
    <Choice value="Jah" alias="True" />
    <Choice value="Ei" alias="False" />
  </Choices>
</View>


In [111]:
# Export a single phrase for the classification task.
# NOTE(review): the recorded run warns about unexpected label classes --
# presumably because the exported label 'LOC-OTHER-OTHER' is not among the
# labels configured in conf2; confirm and update the configuration if needed.
print(task2.export_data(texts[0], indent=2))

[
  {
    "data": {
      "text": "Euroopa tunnetamine kodu"
    },
    "annotations": [
      {
        "result": [
          {
            "value": {
              "start": 0,
              "end": 24,
              "labels": [
                "LOC-OTHER-OTHER"
              ]
            },
            "from_name": "phrase",
            "to_name": "text",
            "type": "labels"
          }
        ]
      }
    ]
  }
]


Unexpected label classes occurred during the export.
Use the field exported_labels to see all class labels generated by the export
and update the labelling configuration by calling set_class_labels(...)


In [112]:
# Export every phrase for the classification task.
# NOTE(review): this prints the export for the whole dataset -- consider
# printing only a small slice to keep the notebook output short.
print(task2.export_data(texts, indent=2))

[
  {
    "data": {
      "text": "Euroopa tunnetamine kodu"
    },
    "annotations": [
      {
        "result": [
          {
            "value": {
              "start": 0,
              "end": 24,
              "labels": [
                "LOC-OTHER-OTHER"
              ]
            },
            "from_name": "phrase",
            "to_name": "text",
            "type": "labels"
          }
        ]
      }
    ]
  },
  {
    "data": {
      "text": "Euroopa tunnetamine"
    },
    "annotations": [
      {
        "result": [
          {
            "value": {
              "start": 0,
              "end": 19,
              "labels": [
                "LOC-OTHER"
              ]
            },
            "from_name": "phrase",
            "to_name": "text",
            "type": "labels"
          }
        ]
      }
    ]
  },
  {
    "data": {
      "text": "ca 5%"
    },
    "annotations": [
      {
        "result": [
          {
            "value": {
              "start":

In [113]:
# Write the classification-task export to a JSON file.
# NOTE(review): presumably the 'label_studio_confs' directory must already
# exist -- confirm.
task2.export_data(texts, file_path='label_studio_confs/export_str_4.json', indent=2)

In [114]:
# Variant of the classification task: the choices are shown as 'True'/'False'
# and three layer attributes are additionally passed as exported meta fields.
conf3 = PhraseClassificationConfiguration(
    phrase_labels=['syntax_tree_pattern', 'POS_pattern', 'NER_pattern'],
    class_labels={'True': 'True', 'False': 'False'},
)
task3 = PhraseClassificationTask(
    conf3,
    input_layer='terms',
    output_layer='terms',
    label_attribute='NER_pattern',
    exported_meta_fields=['syntax_tree_pattern', 'POS_pattern', 'NER_pattern'],
)
# Show the generated labelling interface (XML).
print(task3.interface_file)

<View>
  <Labels name="phrase" toName="text" >
    <Label value="syntax_tree_pattern" background="#1b9e77" />
    <Label value="POS_pattern" background="#d95f02" />
    <Label value="NER_pattern" background="#7570b3" />
  </Labels>
  <Text name="text" value="$text" />
  <Choices name="phrase_class" toName="text" >
    <Choice value="True" alias="True" />
    <Choice value="False" alias="False" />
  </Choices>
</View>


In [115]:
# Write the export of the third task to a JSON file.
# NOTE(review): presumably the 'label_studio_confs' directory must already
# exist -- confirm.
task3.export_data(texts, file_path='label_studio_confs/export_str_5.json', indent=2)