In [1]:
import sys
sys.path.append('../../')

In [23]:
import pandas as pd
import csv
from estnltk import Text
import time
import sqlite3
from estnltk.converters import text_to_json

from estnltk_neural.taggers import StanzaSyntaxTagger
from estnltk.taggers import NerTagger

from pattern_taggers.PhrasePatternTagger import PhrasePatternTagger
from pattern_taggers.PhrasePatternTagger2 import PhrasePatternTagger2
from pattern_taggers.PhrasePatternConsistencyTagger import PhrasePatternConsistencyTagger

### Reading data

In [4]:
rows = []
with open('data/100000_random_sentences.csv', encoding='UTF-8') as csv_file:
    rows = []
    reader = csv.DictReader(csv_file)
    for row in reader:
        rows.append(row)

In [5]:
df = pd.DataFrame(rows)

In [6]:
df

Unnamed: 0,sentence_id,document_id,sentence_start,sentence_end,subcorpus,text
0,135,1,820,905,aja_kr,Järgmise aasta jooksul nägime teineteist kuskil ehk paar korda - mingitel üritustel .
1,312,2,4807,4858,aja_kr,Aivar Mäed võiks väga hästi ka Mesikäpaks kutsuda .
2,621,9,7153,7242,aja_kr,"Tiina Mõis on aga sõnanud , et lapsed teavad , et haridus on tähtsam kui suur taskuraha ."
3,784,11,4721,4761,aja_kr,Ega selleks pole oma tüdrukut tarvis ! ”
4,839,12,1871,1973,aja_kr,"“ Me ikka ei teadvusta endale , kui õige on ütlemine , et igaüks on oma õnne sepp , ” rõhutab Kersti ."
...,...,...,...,...,...,...
99995,21212466,705319,200739,200759,jututoavestlus,gariby: ma ka ei tea
99996,21212538,705319,202890,202905,jututoavestlus,Shad: irw citty
99997,21212577,705319,203928,203976,jututoavestlus,nuux: 'a nagu käisin rootsis iga päev panin pidu
99998,21212644,705319,205639,205651,jututoavestlus,citty: on on


### Initializing taggers

In [3]:
stanza_tagger = StanzaSyntaxTagger(input_type='morph_analysis', input_morph_layer='morph_analysis',
                                   add_parent_and_children=True)

In [4]:
ner_tagger = NerTagger()

In [7]:
# tags phrases that match syntax tree, POS and ner patterns in rules file
pattern_tagger = PhrasePatternTagger(rules_file='data/indicator_patterns_ner_tree_pos_updated.csv')
pattern_tagger

name,output layer,output attributes,input layers
PhrasePatternTagger,phrase_patterns,"('extraction_pattern', 'ner_pattern', 'pattern_id', 'score', 'phrase_pattern_id', 'phrase_class')","('morph_analysis', 'words', 'stanza_syntax', 'ner')"

0,1
rules_file,data/indicator_patterns_ner_tree_pos_updated.csv
ruleset_map,"defaultdict(<class 'list'>, {'': [['int64', 'string', 'string']], '1 2 nmod,2 0 ..., type: <class 'collections.defaultdict'>, length: 7"


In [6]:
# tags phrases that match syntax tree and POS patterns in rules file, does not check ner pattern match
pattern_tagger2 = PhrasePatternTagger2(rules_file='data/indicator_patterns_ner_tree_pos_updated.csv')
pattern_tagger2

name,output layer,output attributes,input layers
PhrasePatternTagger2,phrase_patterns2,"('extraction_pattern', 'ner_pattern', 'pattern_id', 'score', 'phrase_pattern_id', 'phrase_class')","('morph_analysis', 'words', 'stanza_syntax', 'ner')"

0,1
rules_file,data/indicator_patterns_ner_tree_pos_updated.csv
ruleset_map,"defaultdict(<class 'list'>, {'': [['int64', 'string']], '1 2 nmod,2 0 *': [['1', ..., type: <class 'collections.defaultdict'>, length: 7"


In [7]:
# tags phrases according to rules file, adds info about mistakes, mistake locations and mistake types
consistency_tagger = PhrasePatternConsistencyTagger(rules_file='data/indicator_patterns_ner_tree_pos_updated.csv')
consistency_tagger

name,output layer,output attributes,input layers
PhrasePatternConsistencyTagger,pattern_consistency,"('syntax', 'pos', 'ner', 'is_correct', 'error_source', 'error_mask', 'correction')","('morph_analysis', 'words', 'stanza_syntax', 'ner')"

0,1
rules_file,data/indicator_patterns_ner_tree_pos_updated.csv
ruleset_map,"defaultdict(<class 'list'>, {'': [['string', 'string', 'string', 'string', 'stri ..., type: <class 'collections.defaultdict'>, length: 7"


### Tagging 100000 random sentences

In [15]:
text_objs = []

start = time.time()

for idx, row in df.iterrows():
    text = Text(row['text']).tag_layer('morph_analysis')
    text_objs.append(text)

print(f'Creating Text-objects and tagging morph_analysis layer on {len(text_objs)} sentences took {time.time()-start} seconds.')
df['tagged_text'] = text_objs

Creating Text-objects and tagging morph_analysis layer on 100000 sentences took 689.030485868454 seconds.


In [16]:
df.to_pickle(f"./data/100000_sentences_morph_analysis.pkl")

In [18]:
start = time.time()

for idx, row in df.iterrows():
    ner_tagger.tag(row['tagged_text'])

print(f'Tagging NER layer on {len(text_objs)} sentences took {time.time()-start} seconds.')

Tagging NER layer on 100000 sentences took 5658.154953241348 seconds.


In [19]:
df.to_pickle(f"./data/100000_sentences_ner.pkl")

In [23]:
start = time.time()

for idx, row in df.iterrows():
    stanza_tagger.tag(row['tagged_text'])

print(f'Tagging stanza syntax layer on {len(df)} sentences took {time.time()-start} seconds.')

Tagging stanza syntax layer on 100000 sentences took 5070.641754388809 seconds.


In [24]:
df.to_pickle(f"./data/100000_sentences_stanza.pkl")

In [16]:
start = time.time()

for idx, row in df.iterrows():
    pattern_tagger.tag(row['tagged_text'])

print(f'Tagging phrase pattern layer on {len(df)} sentences took {time.time()-start} seconds.')

Tagging phrase pattern layer on 100000 sentences took 5831.112302303314 seconds.


In [17]:
df.to_pickle(f"./data/100000_sentences_pattern.pkl")

In [8]:
start = time.time()

for idx, row in df.iterrows():
    pattern_tagger2.tag(row['tagged_text'])

print(f'Tagging phrase pattern layer (2) on {len(df)} sentences took {time.time()-start} seconds.')

Tagging phrase pattern layer (2) on 100000 sentences took 6025.065106868744 seconds.


In [9]:
df.to_pickle(f"./data/100000_sentences_pattern2.pkl")

In [20]:
start = time.time()

for idx, row in df.iterrows():
    consistency_tagger.tag(row['tagged_text'])

print(f'Tagging phrase pattern consistency layer on {len(df)} sentences took {time.time()-start} seconds.')

Tagging phrase pattern consistency layer on 100000 sentences took 5818.178214073181 seconds.


In [21]:
df.to_pickle(f"./data/100000_sentences_consistency.pkl")

In [31]:
df

Unnamed: 0,sentence_id,document_id,sentence_start,sentence_end,subcorpus,text,tagged_text
0,135,1,820,905,aja_kr,Järgmise aasta jooksul nägime teineteist kuskil ehk paar korda - mingitel üritustel .,Text(text='Järgmise aasta jooksul nägime teineteist kuskil ehk paar korda - mingitel üritustel .')
1,312,2,4807,4858,aja_kr,Aivar Mäed võiks väga hästi ka Mesikäpaks kutsuda .,Text(text='Aivar Mäed võiks väga hästi ka Mesikäpaks kutsuda .')
2,621,9,7153,7242,aja_kr,"Tiina Mõis on aga sõnanud , et lapsed teavad , et haridus on tähtsam kui suur taskuraha .","Text(text='Tiina Mõis on aga sõnanud , et lapsed teavad , et haridus on tähtsam kui suur taskuraha .')"
3,784,11,4721,4761,aja_kr,Ega selleks pole oma tüdrukut tarvis ! ”,Text(text='Ega selleks pole oma tüdrukut tarvis ! ”')
4,839,12,1871,1973,aja_kr,"“ Me ikka ei teadvusta endale , kui õige on ütlemine , et igaüks on oma õnne sepp , ” rõhutab Kersti .","Text(text='“ Me ikka ei teadvusta endale , kui õige on ütlemine , et igaüks on oma õnne sepp , ” rõhutab Kersti .')"
...,...,...,...,...,...,...,...
99995,21212466,705319,200739,200759,jututoavestlus,gariby: ma ka ei tea,Text(text='gariby: ma ka ei tea')
99996,21212538,705319,202890,202905,jututoavestlus,Shad: irw citty,Text(text='Shad: irw citty')
99997,21212577,705319,203928,203976,jututoavestlus,nuux: 'a nagu käisin rootsis iga päev panin pidu,"Text(text=""nuux: 'a nagu käisin rootsis iga päev panin pidu"")"
99998,21212644,705319,205639,205651,jututoavestlus,citty: on on,Text(text='citty: on on')


In [28]:
# layer of phrase patterns (with ner pattern check)
df['tagged_text'][0].phrase_patterns

layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,ner_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Järgmise', 'aasta']","1 2 amod,2 0 *,A-S",OTHER-OTHER,64,,,


In [29]:
# layer of phrase patterns (without ner pattern check)
df['tagged_text'][0].phrase_patterns2

layer name,attributes,parent,enveloping,ambiguous,span count
phrase_patterns2,"extraction_pattern, ner_pattern, pattern_id, score, phrase_pattern_id, phrase_class",,words,True,1

text,extraction_pattern,ner_pattern,pattern_id,score,phrase_pattern_id,phrase_class
"['Järgmise', 'aasta']","1 2 amod,2 0 *,A-S",OTHER-OTHER,64,,,
,"1 2 amod,2 0 *,A-S",OTHER-OTHER,65,,,
,"1 2 amod,2 0 *,A-S",OTHER-OTHER,66,,,
,"1 2 amod,2 0 *,A-S",OTHER-OTHER,67,,,


In [30]:
# layer of phrase pattern consistency
df['tagged_text'][0].pattern_consistency

layer name,attributes,parent,enveloping,ambiguous,span count
pattern_consistency,"syntax, pos, ner, is_correct, error_source, error_mask, correction",,words,True,1

text,syntax,pos,ner,is_correct,error_source,error_mask,correction
"['Järgmise', 'aasta']","1 2 amod,2 0 *",A-S,OTHER-OTHER,T,-,0-0,-


### Saving tagged sentences and phrases in database

In [26]:
con = sqlite3.connect("tagged_100000_sentences.db")
cur = con.cursor()
cur.execute('pragma encoding=UTF8')
cur.execute("CREATE TABLE tagged_phrases(ID INTEGER PRIMARY KEY, extraction_pattern TEXT, ner_pattern TEXT, actual_syntax_pattern TEXT, pattern_id INTEGER, raw_lemmas TEXT, raw_text TEXT, parent_sentence TEXT)")

<sqlite3.Cursor at 0x1f020497b40>

In [27]:
# the phrases will be taken from phrase_patterns2 layer because of absence of ner pattern rules in PhrasePatternTagger2
# which means that more phrases may have been tagged

start = time.time()

for idx, row in df.iterrows():
    sentence_json = text_to_json(row['tagged_text'])
    for phrase in row['tagged_text'].phrase_patterns2:
        p_lemmas = []
        p_words = []
        p_syntax_info = []
        for span in phrase:
            morph_word = row['tagged_text'].morph_analysis.get(span)
            # first lemma is always chosen
            p_lemmas.append(morph_word.lemma[0])
            p_words.append(span.text)
            # adding actual syntax info
            stanza_word = row['tagged_text'].stanza_syntax.get(span)
            p_syntax_info.append(f'{stanza_word.id} {stanza_word.head} {stanza_word.deprel}')
            
        raw_lemmas = ' '.join([l for l in p_lemmas])
        raw_text = ' '.join([w for w in p_words])
        cur.execute("""INSERT INTO tagged_phrases
                                (extraction_pattern, ner_pattern, actual_syntax_pattern, pattern_id, raw_lemmas, raw_text, parent_sentence)
                                VALUES (?, ?, ?, ?, ?, ?, ?);""", (phrase['extraction_pattern'][0], phrase['ner_pattern'][0], ','.join(p_syntax_info), phrase['pattern_id'][0], raw_lemmas, raw_text, sentence_json))
    
        con.commit()

con.close()

print(f'Saving {len(df)} tagged sentences and found phrases in sqlite3 database took {time.time()-start} seconds.')

Saving 100000 tagged sentences and found phrases in sqlite3 database took 1122.2807488441467 seconds.
