In [17]:
import pandas as pd
import stanza
import re

In [18]:
# Fetch the Swedish stanza models and build the pipeline that parses the contexts:
# tokenisation, POS tagging, lemmatisation and dependency parsing.
SV_PROCESSORS = 'tokenize,pos,lemma,depparse'
stanza.download('sv', processors=SV_PROCESSORS)
nlp = stanza.Pipeline(lang='sv', processors=SV_PROCESSORS)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.2.2.json: 140kB [00:00, 4.57MB/s]                    
2022-02-21 13:31:28 INFO: Downloading these customized packages for language: sv (Swedish)...
| Processor | Package   |
-------------------------
| tokenize  | talbanken |
| pos       | talbanken |
| lemma     | talbanken |
| depparse  | talbanken |
| pretrain  | talbanken |

2022-02-21 13:31:28 INFO: File exists: /Users/filippakarrfelt/stanza_resources/sv/tokenize/talbanken.pt.
2022-02-21 13:31:28 INFO: File exists: /Users/filippakarrfelt/stanza_resources/sv/pos/talbanken.pt.
2022-02-21 13:31:28 INFO: File exists: /Users/filippakarrfelt/stanza_resources/sv/lemma/talbanken.pt.
2022-02-21 13:31:28 INFO: File exists: /Users/filippakarrfelt/stanza_resources/sv/depparse/talbanken.pt.
2022-02-21 13:31:28 INFO: File exists: /Users/filippakarrfelt/stanza_resources/sv/pretrain/talbanken.pt.
2022-02-21 13:31:28 INFO: Finished downloading models and sa

In [28]:
# Load the training set and start from an empty corpus.
df_train = pd.read_json(r'../data/training.json', orient='split')
context_corpus = set()
# The preview must be the cell's last expression; anywhere else its result is
# discarded and nothing is rendered.
df_train.head()

In [35]:
def add_words_to_corpus(doc, corpus=None):
    """Add lower-cased variants of every word in ``doc`` to a corpus set.

    For each word up to four candidates are considered:
      * the raw token with all whitespace removed,
      * the token reduced to letters (a-z, å, ä, ö), digits, ``_``, ``-``
        and whitespace,
      * the token reduced to letters and whitespace only,
      * the lemma, when one is present and differs from the surface text.
    Empty strings and purely numeric strings are never added.

    Args:
        doc: parsed document exposing ``sentences``; each sentence exposes
            ``words`` whose items carry ``text`` and ``lemma`` attributes.
        corpus: set to add to. Defaults to the module-level
            ``context_corpus`` when omitted.

    Returns:
        None. The target set is updated in place.
    """
    target = context_corpus if corpus is None else corpus
    for sentence in doc.sentences:
        for raw_word in sentence.words:
            text = raw_word.text
            candidates = [
                # Raw token minus whitespace. The previous pattern '[^\s]' did
                # the opposite (kept only whitespace) and fed '' into the corpus.
                re.sub(r'\s', '', text),
                # Broad definition of a word: letters, digits, '_' and '-'.
                re.sub(r'[^\sa-zåäöA-ZÅÄÖ0-9_-]', '', text),
                # Letters only.
                re.sub(r'[^\sa-zåäöA-ZÅÄÖ]', '', text),
            ]
            lemma = raw_word.lemma
            # A missing lemma must be skipped: str(None) would otherwise put
            # the bogus word "none" into the corpus.
            if lemma is not None and lemma != text:
                candidates.append(str(lemma))
            for candidate in candidates:
                if candidate and not candidate.isnumeric():
                    target.add(candidate.lower())


In [36]:
def add_context_words(df):
    """Parse every ``context`` value in ``df`` and collect its words.

    Each context is run through the module-level ``nlp`` pipeline and the
    parsed result is handed to ``add_words_to_corpus``.

    Args:
        df: DataFrame with a ``context`` column.

    Returns:
        The module-level ``context_corpus`` set itself (not a copy).
    """
    print('adding context words to corpus..')
    for _, record in df.iterrows():
        add_words_to_corpus(nlp(record['context']))
    return context_corpus

In [37]:
# Populate the corpus from the training contexts. Words are added to the
# existing context_corpus set, so re-running this cell does not reset it.
context_corpus = add_context_words(df_train)
# Snapshot the set as a list so it can be sorted and written out.
context_corpus_list = [*context_corpus]

adding context words to corpus..


In [38]:
# save context corpus to file
def save_context_corpus(filename, list):
    """Write the corpus words to ``filename``, one per line, in sorted order.

    Args:
        filename: path of the text file to create or overwrite.
        list: list of word strings. It is sorted in place, so the caller's
            list is in sorted order afterwards. The name shadows the builtin
            but is kept so existing keyword callers keep working.

    Returns:
        None.
    """
    list.sort()
    # Explicit UTF-8: without it the platform default codec is used, which can
    # mangle or reject å/ä/ö on systems whose default is not UTF-8.
    with open(filename, 'w', encoding='utf-8') as out:
        out.writelines(word + '\n' for word in list)

# Write the corpus to a text file one directory above the notebook; the call also
# sorts context_corpus_list in place.
save_context_corpus('../context-corpus.txt', context_corpus_list)