In [71]:
import json
import requests

## Functions for Skrutten APIs

### Stava

In [72]:
def spell_check(words, coding='json', timeout=120):
    """Send words to the Skrutten Stava (spell checking) endpoint.

    Args:
        words: A single string, or a list of strings. A list is joined with
            newline characters before being sent.
        coding: Value sent as the ``coding`` form field (default ``'json'``).
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a stalled connection would block the notebook
            indefinitely. Pass ``None`` to wait forever.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: Propagated from ``requests``
            on connection failure or timeout.
    """
    url = 'https://skrutten.csc.kth.se/granskaapi/spell/'

    # The API receives all words in one form field, one word per line.
    if isinstance(words, list):
        words = '\n'.join(words)

    params = {'coding': coding, 'words': words}

    response = requests.post(url, data=params, timeout=timeout)

    # Callers rely on None to signal any non-200 answer.
    if response.status_code != 200:
        return None
    return response.json()

### Taggstava

In [73]:
def word_class_tagging(words, coding='json', timeout=120):
    """Send words to the Skrutten Taggstava (word-class tagging) endpoint.

    Args:
        words: A single string, or a list of strings. A list is joined with
            newline characters before being sent.
        coding: Value sent as the ``coding`` form field (default ``'json'``).
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a stalled connection would block the notebook
            indefinitely. Pass ``None`` to wait forever.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: Propagated from ``requests``
            on connection failure or timeout.
    """
    url = 'https://skrutten.csc.kth.se/granskaapi/taggstava/'

    # The API receives all words in one form field, one word per line.
    if isinstance(words, list):
        words = '\n'.join(words)

    params = {'coding': coding, 'words': words}

    response = requests.post(url, data=params, timeout=timeout)

    # Callers rely on None to signal any non-200 answer.
    if response.status_code != 200:
        return None
    return response.json()

### Särstava

In [74]:
def compound_check(words, coding='json', timeout=120):
    """Send words to the Skrutten Särstava (compound analysis) endpoint.

    Args:
        words: A single string, or a list of strings. A list is joined with
            newline characters before being sent.
        coding: Value sent as the ``coding`` form field (default ``'json'``).
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a stalled connection would block the notebook
            indefinitely. Pass ``None`` to wait forever.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: Propagated from ``requests``
            on connection failure or timeout.
    """
    url = 'https://skrutten.csc.kth.se/granskaapi/compound/best'

    # The API receives all words in one form field, one word per line.
    if isinstance(words, list):
        words = '\n'.join(words)

    params = {'coding': coding, 'words': words}

    response = requests.post(url, data=params, timeout=timeout)

    # Callers rely on None to signal any non-200 answer.
    if response.status_code != 200:
        return None
    return response.json()

## Check grammar

In [75]:
def read_terms(file_path):
    """Parse a UTF-8 text file holding one JSON document per line.

    Lines that are empty or contain only whitespace are skipped.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list with the parsed value of every non-blank line, in file order.
    """
    parsed_terms = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            if not raw_line.strip():
                continue
            parsed_terms.append(json.loads(raw_line))
    return parsed_terms

def get_swedish_lemmas(terms):
    """Collect the usable Swedish lemmas from a list of term records.

    A term is skipped when its ``term['swe']['lemma']`` is empty/falsy or
    contains a space or a ``|`` character. Double quotes are stripped from
    the lemmas that are kept.

    Args:
        terms: Iterable of records indexable as ``term['swe']['lemma']``.

    Returns:
        A list of lemma strings, in input order.
    """
    kept = []
    for term in terms:
        lemma = term['swe']['lemma']
        if not lemma:
            continue
        if ' ' in lemma or '|' in lemma:
            continue
        kept.append(lemma.replace('"', ''))
    return kept

In [80]:
# Relative path of the terms file; it is parsed line by line as JSON by read_terms.
terms_file_path = 'stunda-terms.jsonl'

# Load every term record, then reduce them to the Swedish lemmas that the
# cells below send to the Skrutten services.
terms_to_check = read_terms(terms_file_path)
swedish_lemmas = get_swedish_lemmas(terms_to_check)
# Show how many lemmas remain after filtering.
print(len(swedish_lemmas))


3787


In [81]:
# SKRUTTEN STAVA
spell_checking = spell_check(swedish_lemmas)
# spell_check returns None on any non-200 response; stop with a clear message
# instead of failing below with "'NoneType' object is not subscriptable".
if spell_checking is None:
    raise RuntimeError('Skrutten Stava request failed: spell_check returned None')
print(spell_checking[:5])

# Keep only the entries whose 'correct' flag is falsy, then show how many
# there are and a small sample.
incorrect_spellings = [item for item in spell_checking if not item['correct']]
print(len(incorrect_spellings))
print(incorrect_spellings[:5])

[{'word': 'realtid', 'suggestions': ['realtid'], 'correct': True}, {'word': 'Osäkra', 'suggestions': ['Osäkra'], 'correct': True}, {'word': 'D-metod', 'suggestions': ['D-metod'], 'correct': True}, {'word': 'D-modellering', 'suggestions': ['D-modellering'], 'correct': True}, {'word': 'D-utformning', 'suggestions': ['D-utformning'], 'correct': True}]
226
[{'word': 'accelerometern', 'suggestions': [], 'correct': False}, {'word': 'atkomstkontroll', 'suggestions': ['ankomstkontroll,utkomstkontroll,avkomstkontroll'], 'correct': False}, {'word': 'agil', 'suggestions': ['agio,askil'], 'correct': False}, {'word': 'agila', 'suggestions': ['askila,argila,dagila,jagila,agaila,lagila,tagila'], 'correct': False}, {'word': 'aliaserande', 'suggestions': ['aliasreande,aliasekande,aliasenande,aliasexande,aliasärande,aliasyrande'], 'correct': False}]


In [78]:
# SKRUTTEN TAGGSTAVA
class_tagging = word_class_tagging(swedish_lemmas)
# word_class_tagging returns None on any non-200 response; stop with a clear
# message instead of failing below with "'NoneType' object is not subscriptable".
if class_tagging is None:
    raise RuntimeError('Skrutten Taggstava request failed: word_class_tagging returned None')
print(class_tagging[:5])

[{'word': 'realtid', 'tags': ['nn.utr.sin.ind.nom']}, {'word': 'Osäkra', 'tags': ['vb.inf.akt', 'vb.imp.akt', 'jj.pos.utr/neu.sin.def.nom', 'jj.pos.utr/neu.plu.ind/def.nom']}, {'word': 'D-metod', 'tags': ['nn.utr.sin.ind.nom']}, {'word': 'D-modellering', 'tags': ['nn.utr.sin.ind.nom']}, {'word': 'D-utformning', 'tags': ['nn.utr.sin.ind.nom']}]


In [79]:
# SKRUTTEN SÄRSTAVA
compounds = compound_check(swedish_lemmas)
# compound_check returns None on any non-200 response; stop with a clear
# message instead of failing below with "'NoneType' object is not subscriptable".
if compounds is None:
    raise RuntimeError('Skrutten Särstava request failed: compound_check returned None')
print(compounds[:5])

[{'word': 'realtid', 'parts': ['real|tid']}, {'word': 'Osäkra', 'parts': []}, {'word': '2D-metod', 'parts': []}, {'word': '3D-modellering', 'parts': []}, {'word': '3D-utformning', 'parts': []}]
