In [1]:
import os, xml, json, spacy
import xml.etree.ElementTree as ET

from tqdm import tqdm

In [2]:
nlp = spacy.load('hr_core_news_sm')

OSError: [E050] Can't find model 'hr_core_news_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [3]:
pauza_root = '../data/cropinion/pauza'

In [8]:
import re

class Lemmatizer():
    def __init__(self):
        self.lemmas = {}
        # __init__ should ensure the file is downloaded on import
        self.pattern = re.compile("\w+")
        with open("../data/molex/molex14_word2lemma.txt") as fin:
            for line in fin:
                word, lemma = line.split()
                self.lemmas[word] = lemma

    def lemmatize_word(self, word):
        lemma = self.lemmas.get(word.lower())
        if lemma:
            return lemma
        else:
            return word[:5]

    def lemmatize_string(self, string):
        lemmas = []
        for token in re.findall(self.pattern, string.lower()):
            lemmas.append(self.lemmatize_word(token))
        return " ".join(lemmas)

In [18]:
def parse_word(tagged_word):
    word = tagged_word.find('Word').text
    tag = tagged_word.find('POSTag').text
    lemma = tagged_word.find('Lemma').text
    stem = tagged_word.find('BasicStem').text

    return word, tag, lemma, stem

In [9]:
molex = Lemmatizer()

In [13]:
all_tags = set()
instance_files = os.listdir(pauza_root)
raw_documents = {}

pos_tag_words = {}

for instance_file in tqdm(instance_files, total=len(instance_files)):
    if not instance_file.endswith('xml'): continue
    # print(instance_file)
    docid = instance_file.replace(".xml", "")
    fp = f"{pauza_root}/{instance_file}"
    tree = ET.parse(fp)
    root = tree.getroot()
    # print(root.tag, root.attrib)
    text = root.find('Text').text
    rating = root.find('Rating').text

    doc = nlp(text)
    doc_tokens = []
    
    for token in doc:
        word = token.text
        pos = token.pos_.lower()
        lemma = molex.lemmatize_word(word.lower()) 
        # lemmatize
        doc_tokens.append((lemma, pos))
        
    raw_documents[docid] = {
        'text':doc_tokens,
        'rating':rating,
        'docid': docid,
    }

    #for child in root.findall('TaggedWords'):
    #    for word in child:
    #        w, t, l, s = parse_word(word)
    #        
    #        if t not in pos_tag_words:
    #            pos_tag_words[t] = {}
    #        if w not in pos_tag_words[t]:
    #            pos_tag_words[t][w] = 0
    #        pos_tag_words[t][w] += 1
        #print(" ".join(document))    

100%|█████████████████████████████████████████████████████| 3310/3310 [01:03<00:00, 52.18it/s]


In [39]:
for tag, tag_freqs in pos_tag_words.items():
    print(tag, sorted(tag_freqs.items(), key=lambda t:-t[1])[:20])

A [('sve', 716), (':', 687), ('...', 449), ('super', 426), ('odlična', 343), ('ukusna', 199), ('fino', 193), (';', 188), ('..', 179), ('odlično', 178), ('ukusno', 153), ('brza', 150), ('dobra', 138), ('jako', 127), ('+', 127), ('toplo', 125), ('fina', 122), ('odličan', 121), ('svemu', 120), ('ljubazan', 118)]
N [('dostava', 906), ('pizza', 689), ('hrana', 590), ('vrijeme', 464), ('dostavljač', 312), ('minuta', 283), ('vremena', 271), ('put', 263), ('mina', 249), ('pohvale', 243), ('palačinke', 242), ('sat', 240), ('quot', 230), (':', 211), ('porcija', 181), ('puta', 176), ('pizzu', 171), ('porcije', 169), ('čast', 162), ('pizze', 154)]
Z [(',', 6161), ('.', 4719), (')', 1418), ('!', 1396), ('-', 596), ('(', 585), ('*', 276), ('&', 247), ("'", 75), ('/', 74), ('%', 12)]
S [('u', 1276), ('na', 1087), ('za', 955), ('od', 749), ('s', 628), ('sa', 394), ('iz', 200), ('bez', 180), ('po', 180), ('nakon', 138), ('uz', 123), ('do', 118), ('umjesto', 88), ('zbog', 84), ('kod', 81), ('o', 70), ('

In [None]:
POS_TAGS = {'I': 'INTJ', 
            'P': 'DET', 
            'Y', 
            'N' : 'NOUN', 
            'M': 'NUM', 
            'V': 'VERB', 
            'S': 'ADP', 
            'Z' : 'PUNCT',
            'R', 'ADV',
            'Q': 'PART', 
            'C': 'CCONJ', 
            'A': 'ADJ',
}

In [14]:
out_name = '../data/cropinion/pauza_clean_tokens.jsonl'

with open(out_name, 'w') as outfile:
    for k, instance in raw_documents.items():
        outfile.write(json.dumps(instance)+"\n")

In [15]:
with open(out_name, 'r') as infile:
    for line in infile:
        print(json.loads(line))
        break

{'text': [['savršen', 'propn'], ['hrana', 'noun'], [',', 'punct'], ['dostava', 'verb'], ['točan', 'adj'], ['u', 'adp'], ['minuta', 'noun'], ['.', 'punct'], ['vrao', 'adv'], ['bogat', 'adj'], ['porcija', 'noun'], ['.', 'punct'], ['pristojan', 'noun'], ['i', 'cconj'], ['na', 'adp'], ['vriti', 'noun'], ['.', 'punct'], ['misliti', 'verb'], ['dati', 'sconj'], ['ne', 'part'], ['postojati', 'verb'], ['bolj', 'adv'], ['od', 'adp'], ['toga', 'det'], ['.', 'punct'], ['uz', 'adp'], ['sve', 'det'], ['to', 'det'], ['dolaziti', 'verb'], ['od', 'adp'], ['dostavljač', 'noun'], ['u', 'adp'], ['zalogajnica', 'noun'], ['.', 'punct'], ['tako', 'adv'], ['dati', 'sconj'], ['morati', 'verb'], ['pohvaliti', 'noun'], ['makar', 'adv'], ['ste', 'aux'], ['&', 'punct'], ['quot;', 'punct'], [';', 'punct'], ['!', 'punct']], 'rating': '6', 'docid': 'comment889'}
