### Import needed functionality

In [65]:
import pandas as pd
from utils import is_word_in_english, swedish_spell_check, english_pos, granska_pos, custom_english_lemmatizer
from collections import Counter

### Read Data

In [66]:
# Load the lemma CSV and keep only the columns listed below.
path = "../../public_data/"
file = "lemma_data_silver_stage1.csv"

keep_columns = ["id", "eng_lemma", "swe_lemma", "status", "pos", "src"]
df = pd.read_csv(f"{path}{file}")
df = df[keep_columns]

### Spell check

In [67]:
# Run each language's checker (both imported from utils) over its lemma column
# and store the per-row result in a flag column.
for lemma_col, flag_col, checker in (
    ("eng_lemma", "exist_in_english", is_word_in_english),
    ("swe_lemma", "exist_in_swedish", swedish_spell_check),
):
    df[flag_col] = df[lemma_col].apply(checker)

#### Get statistics of correctly spelled words

In [None]:
# Count rows for every (Swedish flag, English flag) combination.
df.value_counts(subset=["exist_in_swedish", "exist_in_english"])

exist_in_swedish  exist_in_english
True              True                1517
                  False                237
False             True                 160
                  False                 82
Name: count, dtype: int64

#### Save incorrectly spelled words to file

In [None]:
def _classify_spelling(row):
    """Return 'ok', 'both', 'eng' or 'swe' depending on which spell-check flags failed."""
    eng_ok = row["exist_in_english"]
    swe_ok = row["exist_in_swedish"]
    if eng_ok and swe_ok:
        return "ok"
    if not eng_ok and not swe_ok:
        return "both"
    if not eng_ok:
        return "eng"
    return "swe"


df["spell"] = df.apply(_classify_spelling, axis=1)

# Rows with at least one failing flag are written out to a separate CSV.
misspelled = df[df["spell"] != "ok"]
misspelled.to_csv(path + "incorrect_spellings.csv", index=False)

# The rest of the notebook works on an independent copy of the rows that passed.
df_exist = df[df["spell"] == "ok"].copy(deep=True)

### POS tagging

#### Get statistics of length of terms

In [None]:
def _count_space_separated(term):
    """Return the number of pieces obtained by splitting `term` on single spaces."""
    return len(term.split(" "))


df_exist["words_count_english"] = df_exist["eng_lemma"].apply(_count_space_separated)
df_exist["words_count_swedish"] = df_exist["swe_lemma"].apply(_count_space_separated)

# Frequency of each (English length, Swedish length) pair.
df_exist.value_counts(subset=["words_count_english", "words_count_swedish"])

words_count_english  words_count_swedish
2                    1                      548
                     2                      385
1                    1                      242
3                    2                      111
                     1                       89
                     3                       71
4                    2                       14
                     4                       14
2                    3                       13
4                    3                       11
                     1                        7
3                    4                        4
1                    3                        2
                     2                        1
2                    4                        1
5                    4                        1
6                    2                        1
                     4                        1
                     5                        1
Name: count, dtype: int64

#### POS-tagging

In [None]:
# Show every row of the printed counts instead of a truncated view.
pd.set_option("display.max_rows", None)

# Tag each Swedish term with granska_pos (imported from utils).
df_exist["swedish_pos"] = df_exist["swe_lemma"].apply(granska_pos)

swedish_tag_counts = df_exist["swedish_pos"].value_counts()
print(swedish_tag_counts)

swedish_pos
nn.utr.sin.ind.nom                                                                                    478
jj.pos.utr.sin.ind.nom nn.utr.sin.ind.nom                                                             195
nn.neu.sin.ind.nom                                                                                    162
nn.utr.plu.ind.nom                                                                                    123
jj.pos.utr/neu.plu.ind/def.nom nn.utr.plu.ind.nom                                                      67
jj.pos.utr/neu.plu.ind/def.nom nn.neu.plu.ind.nom                                                      51
nn.utr.sin.ind.nom nn.utr.sin.ind.nom                                                                  46
jj.pos.utr.sin.ind.nom                                                                                 32
jj.pos.neu.sin.ind.nom nn.neu.sin.ind.nom                                                              20
nn.utr.sin.def.nom                

In [None]:
from utils import swedish_lemmatizing
import requests

def convert_to_simple_pos(terms):
    """Reduce a space-separated tag string to its upper-cased leading fields.

    Every tag is cut at its first "." and upper-cased, e.g.
    "jj.pos.utr.sin.ind.nom nn.utr.sin.ind.nom" -> "JJ NN".
    """
    coarse_tags = []
    for tag in terms.split(" "):
        head, _, _ = tag.partition(".")
        coarse_tags.append(head.upper())
    return " ".join(coarse_tags)

def lemmatize_noun(noun):
    """Return the "lemma" field of the first entry returned by swedish_lemmatizing(noun)."""
    analyses = swedish_lemmatizing(noun)
    return analyses[0]["lemma"]

def get_inflections(term, form, timeout=30):
    """Fetch the inflections of `term` from the Granska inflection endpoint.

    Args:
        term: Word sent as the ``word`` query parameter.
        form: Tag sent as the ``tag`` query parameter.
        timeout: Seconds to wait for the server; without it a stalled
            connection would block the caller indefinitely.

    Returns:
        The ``inflections`` entry of the first interpretation of the first
        element in the JSON response.

    Raises:
        requests.RequestException: On timeout, connection failure or a
            non-success HTTP status.
    """
    url = "https://skrutten.csc.kth.se/granskaapi/inflect.php"
    params = {"coding": "json", "word": term, "tag": form}

    response = requests.get(url, params=params, timeout=timeout)
    # Fail with the HTTP error itself rather than a confusing JSON/index error.
    response.raise_for_status()

    return response.json()[0]["interpretations"][0]["inflections"]

def lemmatize_adjective(adj, form, genus=None):
    """Return the positive singular indefinite nominative form of `adj` for `genus`.

    Without a genus the word is passed to lemmatize_noun instead. With a
    genus, the inflections returned by get_inflections(adj, form) are searched
    for the tag "jj.pos.<genus>.sin.ind.nom" (ignoring surrounding "*" and
    spaces); the first matching word is returned, or None if nothing matches.
    """
    if not genus:
        return lemmatize_noun(adj)

    wanted_tag = f"jj.pos.{genus}.sin.ind.nom"
    matching_words = (
        candidate["word"]
        for candidate in get_inflections(adj, form)
        if candidate["tag"].strip("* ") == wanted_tag
    )
    return next(matching_words, None)
    
def advanced_lemmatizer(term, simple_pos, swedish_pos):
    """Lemmatize a Swedish term and report whether it could be handled.

    Args:
        term: The Swedish term (one or more space-separated words).
        simple_pos: Coarse tag sequence such as "NN" or "JJ NN".
        swedish_pos: Full space-separated tag string for the term, one dotted
            tag per word (e.g. "jj.pos.utr.sin.ind.nom nn.utr.sin.ind.nom").

    Returns:
        A ``(lemma, status)`` tuple. Single-word terms are passed to
        lemmatize_noun and reported as "ok". Two-word "JJ NN" terms are
        rebuilt as "<adjective> <noun>" with the adjective inflected to the
        noun's genus. Everything else, including "JJ NN" terms whose tags
        carry no genus or whose adjective cannot be inflected, is returned
        unchanged with status "not ok".
    """
    # Ignore padding/double spaces so word positions line up with the tags.
    words = [word for word in term.split(" ") if word]
    if len(words) <= 1:
        return lemmatize_noun(term), "ok"

    if simple_pos != "JJ NN" or len(words) != 2:
        return term, "not ok"

    tags = swedish_pos.split(" ")
    # The genus is the second dotted field of the noun tag; a bare tag such
    # as "nn" has none, so the term cannot be handled here.
    noun_features = tags[1].split(".") if len(tags) > 1 else []
    if len(noun_features) < 2:
        return term, "not ok"
    genus = noun_features[1]

    adjective_word, noun_word = words
    noun = lemmatize_noun(noun_word)
    adj = lemmatize_adjective(adjective_word, tags[0], genus)
    if adj:
        return adj + " " + noun, "ok"
    return term, "not ok"
    

    


# Derive the coarse tag sequence for every term.
df_exist["simple_pos"] = df_exist["swedish_pos"].apply(convert_to_simple_pos)

# Breakdown of the full tags among terms whose coarse tag is exactly "NN".
is_single_noun = df_exist["simple_pos"] == "NN"
df_exist.loc[is_single_noun, "swedish_pos"].value_counts()

swedish_pos
nn.utr.sin.ind.nom    478
nn.neu.sin.ind.nom    162
nn.utr.plu.ind.nom    123
nn.utr.sin.def.nom     17
nn.neu.plu.ind.nom     16
nn.neu.sin.def.nom      7
nn.utr.sin.ind.gen      4
nn.utr.plu.def.nom      1
nn                      1
Name: count, dtype: int64

In [None]:
# Frequency of each coarse tag sequence across the correctly spelled terms.
df_exist["simple_pos"].value_counts()

simple_pos
NN                809
JJ NN             360
NN NN             110
JJ                 47
NN PP NN           37
PC NN              19
JJ NN NN           18
VB                 10
JJ JJ NN           10
NN PP JJ NN        10
NN MID              8
AB JJ NN            6
NN NN NN            6
NN VB               5
VB NN               5
AB                  5
PC                  4
PM                  4
NN JJ NN            4
AB JJ               3
JJ NN VB            2
JJ JJ NN NN         2
AB NN               2
JJ NN PM            1
AB VB               1
NN VB NN            1
AB NN NN            1
JJ JJ PP NN         1
PC NN NN            1
JJ NN NN NN NN      1
JJ NN JJ NN         1
PM NN               1
NN NN AB            1
NN JJ               1
VB NN NN            1
NN MID NN           1
VB JJ NN            1
JJ JJ VB NN         1
AB PC NN            1
JJ VB NN            1
VB KN VB            1
NN PC               1
JJ PP NN            1
JJ JJ NN AB         1
JJ AB JJ NN         1

In [None]:
def _lemmatize_swedish_row(row):
    """Run advanced_lemmatizer on one row and return its (lemma, status) pair."""
    return advanced_lemmatizer(row["swe_lemma"], row["simple_pos"], row["swedish_pos"])


# Overwrites swe_lemma in place and records whether each term could be handled.
swedish_results = df_exist.apply(_lemmatize_swedish_row, axis=1)
new_swe_lemmas, swe_statuses = zip(*swedish_results)
df_exist["swe_lemma"] = new_swe_lemmas
df_exist["swe_lemmatizer_status"] = swe_statuses

df_exist["swe_lemmatizer_status"].value_counts()

swe_lemmatizer_status
ok        1208
not ok     309
Name: count, dtype: int64

In [None]:
# Show the rows whose Swedish coarse tag sequence equals the English tag sequence.
# NOTE(review): the "english_pos" column is only created in a later cell
# (df_exist["english_pos"] = ...apply(english_pos)); run that cell first,
# otherwise this lookup fails on a fresh top-to-bottom run.
df_exist[df_exist["simple_pos"] == df_exist["english_pos"]]

Unnamed: 0,id,eng_lemma,swe_lemma,status,pos,src,exist_in_english,exist_in_swedish,spell,words_count_english,words_count_swedish,swedish_pos,simple_pos,swe_lemmatizer_status,english_pos
8,ICT13,3d-visualization,3d-visualisering,bronze,N?,ICT,True,True,ok,1,1,nn.utr.sin.ind.nom,NN,ok,NN
18,ICT66,address-centric,adress-centrerad,bronze,N?,ICT,True,True,ok,1,1,jj.pos.utr.sin.ind.nom,JJ,ok,JJ
20,ICT78,aesthetic interaction,interaktion estetisk,bronze,N?,ICT,True,True,ok,2,2,jj.pos.utr.sin.ind.nom nn.utr.sin.ind.nom,JJ NN,ok,JJ NN
25,ICT120,ambient intelligence,omgivning intelligent,bronze,N?,ICT,True,True,ok,2,2,jj.pos.utr.sin.ind.nom nn.utr.sin.ind.nom,JJ NN,ok,JJ NN
28,ICT127,amplitude-shift,amplitud-skift,bronze,N?,ICT,True,True,ok,1,1,nn.neu.sin.ind.nom,NN,ok,NN
29,ICT130,analog front-end,analog framända,bronze,N?,ICT,True,True,ok,2,2,nn.utr.sin.ind.nom nn.utr.sin.ind.nom,NN NN,not ok,NN NN
31,ICT153,application,tillämpning,bronze,N?,ICT,True,True,ok,1,1,nn.utr.sin.ind.nom,NN,ok,NN
44,ICT195,asymmetric cryptography,kryptografi asymmetrisk,bronze,N?,ICT,True,True,ok,2,2,jj.pos.utr.sin.ind.nom nn.utr.sin.ind.nom,JJ NN,ok,JJ NN
45,ICT198,asynchronous communication,kommunikation asynkron,bronze,N?,ICT,True,True,ok,2,2,jj.pos.utr.sin.ind.nom nn.utr.sin.ind.nom,JJ NN,ok,JJ NN
56,ICT230,automatic scheduling,schemaläggning automatisk,bronze,N?,ICT,True,True,ok,2,2,jj.pos.utr.sin.ind.nom nn.utr.sin.ind.nom,JJ NN,ok,JJ NN


In [None]:
# Show every row of the printed counts instead of a truncated view.
pd.set_option("display.max_rows", None)

# Tag each English term with english_pos (imported from utils).
df_exist["english_pos"] = df_exist["eng_lemma"].apply(english_pos)

english_tag_counts = df_exist["english_pos"].value_counts()
print(english_tag_counts)

english_pos
NN NN                   407
JJ NN                   201
NN                      153
NN NNS                  140
NN NN NN                 74
JJ NNS                   73
JJ NN NN                 68
JJ                       59
NNS                      28
NN VBG                   27
NN NN NNS                24
JJ NN NNS                19
VBN NN                   18
VBG NN                   15
NNS NN                   14
VBN NNS                  13
JJ NN NN NN              13
VBG NNS                  11
NNS NNS                   8
JJ JJ NN                  8
NNS NN NN                 8
VBN NN NN                 8
JJ NNS NN                 6
NN NNS NN                 5
JJ JJ NN NN               4
NN VBD NN                 4
NN NN NN NN               4
NNS NN NNS                3
NN JJ NN                  3
NNS VBG                   3
VBG NN NN                 3
CD NN                     3
NN VBN NN                 3
NNS IN NNS                3
NNS IN JJ NNS             3
VBN JJ N

### Lemmatizing

In [None]:
def _lemmatize_english_row(row):
    """Run custom_english_lemmatizer on one row and return its two-element result."""
    return custom_english_lemmatizer(row["eng_lemma"], row["english_pos"])


# Overwrites eng_lemma in place and stores the second result element as the status.
english_results = df_exist.apply(_lemmatize_english_row, axis=1)
new_eng_lemmas, eng_statuses = zip(*english_results)
df_exist["eng_lemma"] = new_eng_lemmas
df_exist["lemmatizer_status"] = eng_statuses

print(df_exist["lemmatizer_status"].value_counts())

# List the terms marked "not ok", grouped visually by tag sequence.
not_lemmatized = df_exist[df_exist["lemmatizer_status"] == "not ok"]
not_lemmatized[["eng_lemma", "swe_lemma", "english_pos", "lemmatizer_status"]].sort_values("english_pos")

ok        1351
not ok     166
Name: lemmatizer_status, dtype: int64


Unnamed: 0,eng_lemma,swe_lemma,english_pos,lemmatizer_status
556,million,miljoner,CD,not ok
68,billion,miljarder,CD,not ok
1,2d method,2d-metod,CD NN,not ok
3,3d modelling,3d-modellering,CD NN,not ok
1192,zero knowledge,noll kunskap,CD NN,not ok
624,near inductive field,nära induktivt fält,IN JJ NN,not ok
623,near field communication,närfältskommunikation,IN NN NN,not ok
871,responsible on site,ansvarig på plats,JJ IN NN,not ok
1549,high-speed arithmetic,höghastighetsaritmetik,JJ JJ,not ok
728,parallel discrete event simulation,parallella diskret händelsestyrd simulering,JJ JJ NN NN,not ok


In [None]:
# Coarse word-class code for the leading field of a dotted Swedish tag
# (the lookup below uses tag.split(".")[0] as the key).
swedish_mapping = {
    "nn": "N",
    "vb": "V",
    "jj": "A",
    "ab": "Ab",
    "pc": "P"
}

# Coarse word-class code for English tags. The shared codes ("N", "V", "A",
# "Ab", "P") are what allow the two tag sets to be intersected below.
# NOTE(review): the keys are names like "NOUN"/"VERB", while english_pos
# output elsewhere in this notebook looks like "NN"/"CD" — confirm which
# tag set english_pos returns.
english_mapping = {
    "NOUN" : "N",
    "VERB" : "V",
    "ADJ" : "A",
    "X" : "X",
    "ADV": "Ab",
    "PRT": "P",
    "NUM": "NUM"
}

In [None]:
# Assign one coarse word class per term, collected as {"pos": ..., "id": ...}
# records. A term gets the class that the English and Swedish taggers agree
# on, or the marker "no pos" / "too many" when they share none / several.
word_class = []

eng_lemmas = df["eng_lemma"].to_list()
swe_lemmas = df["swe_lemma"].to_list()
ids = df["id"].to_list()

# `term_id` rather than `id`, so the builtin id() is not shadowed for the
# rest of the session.
for term_id, eng_lemma, swe_lemma in zip(ids, eng_lemmas, swe_lemmas):
    if len(eng_lemma.split()) > 1:
        # Multi-word English terms are labelled as nouns without tagging.
        word_class.append({"pos": "N", "id": term_id})
        continue

    # NOTE(review): the traceback recorded for this cell shows KeyError: 'C'
    # from this lookup, i.e. english_pos yields items whose first element is
    # not a key of english_mapping — confirm the helper's return format.
    eng_tags = {english_mapping[x[0]] for x in english_pos(eng_lemma.split()[-1])}

    # NOTE(review): swedish_pos_tagging is not among the imports visible in
    # this notebook — verify it is imported from utils before running.
    swe_raw_tags = swedish_pos_tagging(swe_lemma)[-1]["tags"]
    swe_tags = {swedish_mapping[tag.split(".")[0]] for tag in swe_raw_tags}

    shared = swe_tags & eng_tags
    if len(shared) == 1:
        pos = shared.pop()
    elif not shared:
        pos = "no pos"
    else:
        pos = "too many"
    word_class.append({"pos": pos, "id": term_id})

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3378, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/03/r7dnhp8d4jb_ljhhgr_4gw3r0000gn/T/ipykernel_37652/717200778.py", line 16, in <module>
    eng_tags = set([english_mapping[x[0]] for x in eng_tags])
  File "/var/folders/03/r7dnhp8d4jb_ljhhgr_4gw3r0000gn/T/ipykernel_37652/717200778.py", line 16, in <listcomp>
    eng_tags = set([english_mapping[x[0]] for x in eng_tags])
KeyError: 'C'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 1997, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/IPython/core

In [None]:
# Turn the collected {"pos", "id"} records into a DataFrame for merging.
df_pos = pd.DataFrame(word_class)

# NOTE: only the last expression of a cell is displayed, so the head() call
# below produces no output; the cell shows `df`.
df_pos.head()
df

Unnamed: 0,id,eng_lemma,swe_lemma,status,pos,src,exist_in_english,exist_in_swedish,spell
1,ICT5,2d method,2d-metod,bronze,N?,ICT,True,True,ok
3,ICT9,3d modelling,3d-modellering,bronze,N?,ICT,True,True,ok
11,ICT24,absolute semantic,absolut semantisk,bronze,N?,ICT,True,True,ok
12,ICT26,abstract program models,abstrakta programmodeller,bronze,N?,ICT,True,True,ok
14,ICT34,access model,åtkomstmetod,bronze,N?,ICT,True,True,ok
...,...,...,...,...,...,...,...,...,...
1984,ICT3555,two-way,tvåvägs,silver,N?,ICT,True,True,ok
1987,ICT3626,user-friendly,användarvänlig,silver,N?,ICT,True,True,ok
1991,ICT3668,vehicle,fordon,silver,N?,ICT,True,True,ok
1992,ICT3732,walkie-talkie,komradio,silver,N?,ICT,True,True,ok


In [None]:
# Attach the computed word class (pos_y) to each term by id and drop the
# original pos column (suffixed pos_x by the merge).
df_merged = df.merge(df_pos, how="inner", on="id").drop(columns="pos_x")

In [None]:
# Distribution of the computed word classes, including the "no pos" and
# "too many" markers.
df_merged.pos_y.value_counts()

N           1297
no pos        46
A              8
V              5
too many       1
Name: pos_y, dtype: int64

In [None]:
# Terms for which the English and Swedish tag sets shared more than one class.
df_merged[df_merged["pos_y"] == "too many"]

Unnamed: 0,id,eng_lemma,swe_lemma,status,src,exist_in_english,exist_in_swedish,spell,pos_y
1299,ICT79,affect,påverka,silver,ICT,True,True,ok,too many


In [None]:
# Terms for which the English and Swedish tag sets shared no class.
df_merged[df_merged["pos_y"] == "no pos"]

Unnamed: 0,id,eng_lemma,swe_lemma,status,src,exist_in_english,exist_in_swedish,spell,pos_y
43,ICT291,billion,miljarder,bronze,ICT,True,True,ok,no pos
112,ICT599,consults,konsulterar,bronze,ICT,True,True,ok,no pos
200,ICT997,e-mail,e-post,bronze,ICT,True,True,ok,no pos
239,ICT1074,environmental,miljö-,bronze,ICT,True,True,ok,no pos
285,ICT1348,golden,guld-,bronze,ICT,True,True,ok,no pos
398,ICT1957,million,miljoner,bronze,ICT,True,True,ok,no pos
465,ICT2227,off-the-shelf,färdig,bronze,ICT,True,True,ok,no pos
509,ICT2414,person-to-person,personlig,bronze,ICT,True,True,ok,no pos
669,ICT3162,solar,sol-,bronze,ICT,True,True,ok,no pos
694,ICT3283,suburban,förorts-,bronze,ICT,True,True,ok,no pos


In [None]:
# Keep only terms with exactly one agreed word class, then renumber the rows
# 0..n-1 (the previous index is kept as an "index" column).
has_single_pos = ~df_merged["pos_y"].isin(["no pos", "too many"])
df_ready_for_lemmatisering = df_merged[has_single_pos].reset_index()

In [None]:
from utils import english_lemmatizer, swedish_lemmatizing

# Translates the coarse word class stored in pos_y into the pos argument
# passed to english_lemmatizer.
eng_pos_mapping = {
    "N": "n",
    "V": "v",
    "A": "a",
    "Ab": "r",
}


def _lemmatize_last_word(term, lemmatize):
    """Return `term` with its last whitespace-separated word replaced by lemmatize(word).

    The same tokenisation is used to find the last word and to rebuild the
    term, so padding or repeated spaces cannot duplicate or drop a word.
    Words are re-joined with single spaces; a term containing no word is
    returned unchanged.
    """
    words = term.split()
    if not words:
        return term
    words[-1] = lemmatize(words[-1])
    return " ".join(words)


def _swedish_lemma(word):
    """Return the "lemma" field of the first entry from swedish_lemmatizing(word)."""
    return swedish_lemmatizing(word)[0]["lemma"]


eng_lemmas = df_ready_for_lemmatisering["eng_lemma"].to_list()
swe_lemmas = df_ready_for_lemmatisering["swe_lemma"].to_list()
ids = df_ready_for_lemmatisering.index.to_list()

# Only the last word of each term is lemmatized; single-word terms are simply
# the case where the last word is the whole term.
for row_label, eng_lemma, swe_lemma in zip(ids, eng_lemmas, swe_lemmas):
    # NOTE(review): raises KeyError for a pos_y value missing from
    # eng_pos_mapping (e.g. "P", which both tag mappings can produce) —
    # confirm which pos values english_lemmatizer accepts.
    pos = eng_pos_mapping[df_ready_for_lemmatisering.at[row_label, "pos_y"]]

    full_eng_lemma = _lemmatize_last_word(
        eng_lemma, lambda word: english_lemmatizer(word, pos)
    )
    full_swe_lemma = _lemmatize_last_word(swe_lemma, _swedish_lemma)

    # Label-based writes, matching the label-based read of pos_y above.
    df_ready_for_lemmatisering.at[row_label, "eng_lemma"] = full_eng_lemma
    df_ready_for_lemmatisering.at[row_label, "swe_lemma"] = full_swe_lemma


In [None]:
df_ready_for_lemmatisering

Unnamed: 0,index,id,eng_lemma,swe_lemma,status,src,exist_in_english,exist_in_swedish,spell,pos_y
0,0,ICT5,2d method,2d-metod,bronze,ICT,True,True,ok,N
1,1,ICT9,3d modelling,3d-modellering,bronze,ICT,True,True,ok,N
2,2,ICT24,absolute semantic,absolut semantisk,bronze,ICT,True,True,ok,N
3,3,ICT26,abstract program model,abstrakta programmodell,bronze,ICT,True,True,ok,N
4,4,ICT34,access model,åtkomstmetod,bronze,ICT,True,True,ok,N
...,...,...,...,...,...,...,...,...,...,...
1305,1343,ICT2927,scenario,scenarier,silver,ICT,True,True,ok,N
1306,1346,ICT3066,short-range,kortdistans,silver,ICT,True,True,ok,N
1307,1348,ICT3146,socket,uttag,silver,ICT,True,True,ok,N
1308,1354,ICT3668,vehicle,fordon,silver,ICT,True,True,ok,N
