### Import needed functionality

In [1]:
import pandas as pd
from utils import is_word_in_english, swedish_spell_check, english_pos, swedish_pos_tagging, custom_english_lemmatizer
from collections import Counter

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/chickenthug/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     /home/chickenthug/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/chickenthug/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/chickenthug/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


('abstract network', 'ok')
type


### Read Data

In [2]:
path = "../../public_data/"
file = "lemma_data_silver_stage1.csv"

df = pd.read_csv(path+file)[["id", "eng_lemma", "swe_lemma", "status", "pos", "src"]]

### Spell check

In [3]:
df["exist_in_english"] = df["eng_lemma"].apply(is_word_in_english)
df["exist_in_swedish"] = df["swe_lemma"].apply(swedish_spell_check)

#### Get statistics of correctly spelled words

In [4]:
df[["exist_in_swedish", "exist_in_english"]].value_counts()

exist_in_swedish  exist_in_english
True              True                1517
                  False                237
False             True                 160
                  False                 82
Name: count, dtype: int64

#### Save incorreclty spelled words to file

In [5]:
df['spell'] = df.apply(lambda row: 'ok' if row['exist_in_english'] and row['exist_in_swedish'] else 
                             'both' if not row['exist_in_english'] and not row['exist_in_swedish'] else 
                             'eng' if not row['exist_in_english'] else 'swe', axis=1)

df[df.spell != 'ok'].to_csv(path+'incorrect_spellings.csv',index=False)

df_exist = df[df.spell == 'ok'].copy(deep=True)

### POS tagging

#### Get statistics of length of terms

In [6]:
df_exist["words_count_english"] = df_exist["eng_lemma"].apply(lambda x: len(x.split(" ")))
df_exist["words_count_swedish"] = df_exist["swe_lemma"].apply(lambda x: len(x.split(" ")))


df_exist[["words_count_english", "words_count_swedish"]].value_counts()

words_count_english  words_count_swedish
2                    1                      548
                     2                      385
1                    1                      242
3                    2                      111
                     1                       89
                     3                       71
4                    2                       14
                     4                       14
2                    3                       13
4                    3                       11
                     1                        7
3                    4                        4
1                    3                        2
                     2                        1
2                    4                        1
5                    4                        1
6                    2                        1
                     4                        1
                     5                        1
Name: count, dtype: int64

#### English POS-tagging

In [7]:
df_exist["english_pos"] = df_exist["eng_lemma"].apply(english_pos)
pd.set_option("display.max_rows", None)
print(df_exist.english_pos.value_counts())

english_pos
NN NN                   407
JJ NN                   201
NN                      153
NN NNS                  140
NN NN NN                 74
JJ NNS                   73
JJ NN NN                 68
JJ                       59
NNS                      28
NN VBG                   27
NN NN NNS                24
JJ NN NNS                19
VBN NN                   18
VBG NN                   15
NNS NN                   14
VBN NNS                  13
JJ NN NN NN              13
VBG NNS                  11
NNS NNS                   8
JJ JJ NN                  8
NNS NN NN                 8
VBN NN NN                 8
JJ NNS NN                 6
NN NNS NN                 5
JJ JJ NN NN               4
NN VBD NN                 4
NN NN NN NN               4
NNS NN NNS                3
NN JJ NN                  3
NNS VBG                   3
VBG NN NN                 3
CD NN                     3
NN VBN NN                 3
NNS IN NNS                3
NNS IN JJ NNS             3
VBN JJ N

### Lemmatizing

In [11]:
df_exist["eng_lemma"], df_exist["lemmatizer_status"] = zip(*df_exist.apply(lambda x: custom_english_lemmatizer(x["eng_lemma"], x["english_pos"]), axis=1))

print(df_exist.lemmatizer_status.value_counts())

df_exist[df_exist["lemmatizer_status"] == "not ok"][["eng_lemma", "swe_lemma", "english_pos", "lemmatizer_status"]].sort_values("english_pos")

lemmatizer_status
ok        1351
not ok     166
Name: count, dtype: int64


Unnamed: 0,eng_lemma,swe_lemma,english_pos,lemmatizer_status
556,million,miljoner,CD,not ok
68,billion,miljarder,CD,not ok
1,2d method,2d-metod,CD NN,not ok
3,3d modelling,3d-modellering,CD NN,not ok
1192,zero knowledge,noll kunskap,CD NN,not ok
624,near inductive field,nära induktivt fält,IN JJ NN,not ok
623,near field communication,närfältskommunikation,IN NN NN,not ok
871,responsible on site,ansvarig på plats,JJ IN NN,not ok
1549,high-speed arithmetic,höghastighetsaritmetik,JJ JJ,not ok
728,parallel discrete event simulation,parallella diskret händelsestyrd simulering,JJ JJ NN NN,not ok


In [9]:
swedish_mapping = {
    "nn": "N",
    "vb": "V",
    "jj": "A",
    "ab": "Ab",
    "pc": "P"
}

english_mapping = {
    "NOUN" : "N",
    "VERB" : "V",
    "ADJ" : "A",
    "X" : "X",
    "ADV": "Ab",
    "PRT": "P",
    "NUM": "NUM"
}

In [10]:
word_class = []

eng_lemmas = df["eng_lemma"].to_list()
swe_lemmas = df["swe_lemma"].to_list()
ids = df["id"].to_list()

for i in range(len(eng_lemmas)):
    eng_lemma = eng_lemmas[i]
    swe_lemma = swe_lemmas[i]
    id = ids[i]
    
    if len(eng_lemma.split()) > 1:
        word_class.append({"pos": "N", "id":id})
    else:
        eng_tags = english_pos(eng_lemma.split()[-1])
        eng_tags = set([english_mapping[x[0]] for x in eng_tags])

        swe_tags = swedish_pos_tagging(swe_lemma)[-1]["tags"]

        
        swe_tags = set([swedish_mapping[x.split(".")[0]] for x in swe_tags])
        tag = swe_tags.intersection(eng_tags)

        if len(tag) == 1:
            word_class.append({"pos": tag.pop(), "id":id})
        elif len(tag) == 0:
            word_class.append({"pos": "no pos", "id":id})
        else:
            word_class.append({"pos": "too many", "id":id})

KeyError: 'C'

In [None]:
df_pos = pd.DataFrame(word_class)

df_pos.head()
df

Unnamed: 0,id,eng_lemma,swe_lemma,status,pos,src,exist_in_english,exist_in_swedish,spell
1,ICT5,2d method,2d-metod,bronze,N?,ICT,True,True,ok
3,ICT9,3d modelling,3d-modellering,bronze,N?,ICT,True,True,ok
11,ICT24,absolute semantic,absolut semantisk,bronze,N?,ICT,True,True,ok
12,ICT26,abstract program models,abstrakta programmodeller,bronze,N?,ICT,True,True,ok
14,ICT34,access model,åtkomstmetod,bronze,N?,ICT,True,True,ok
...,...,...,...,...,...,...,...,...,...
1984,ICT3555,two-way,tvåvägs,silver,N?,ICT,True,True,ok
1987,ICT3626,user-friendly,användarvänlig,silver,N?,ICT,True,True,ok
1991,ICT3668,vehicle,fordon,silver,N?,ICT,True,True,ok
1992,ICT3732,walkie-talkie,komradio,silver,N?,ICT,True,True,ok


In [None]:
df_merged = pd.merge(df, df_pos, how="inner", on="id").drop("pos_x", axis=1)

In [None]:
df_merged.pos_y.value_counts()

N           1297
no pos        46
A              8
V              5
too many       1
Name: pos_y, dtype: int64

In [None]:
df_merged[df_merged["pos_y"] == "too many"]

Unnamed: 0,id,eng_lemma,swe_lemma,status,src,exist_in_english,exist_in_swedish,spell,pos_y
1299,ICT79,affect,påverka,silver,ICT,True,True,ok,too many


In [None]:
df_merged[df_merged["pos_y"] == "no pos"]

Unnamed: 0,id,eng_lemma,swe_lemma,status,src,exist_in_english,exist_in_swedish,spell,pos_y
43,ICT291,billion,miljarder,bronze,ICT,True,True,ok,no pos
112,ICT599,consults,konsulterar,bronze,ICT,True,True,ok,no pos
200,ICT997,e-mail,e-post,bronze,ICT,True,True,ok,no pos
239,ICT1074,environmental,miljö-,bronze,ICT,True,True,ok,no pos
285,ICT1348,golden,guld-,bronze,ICT,True,True,ok,no pos
398,ICT1957,million,miljoner,bronze,ICT,True,True,ok,no pos
465,ICT2227,off-the-shelf,färdig,bronze,ICT,True,True,ok,no pos
509,ICT2414,person-to-person,personlig,bronze,ICT,True,True,ok,no pos
669,ICT3162,solar,sol-,bronze,ICT,True,True,ok,no pos
694,ICT3283,suburban,förorts-,bronze,ICT,True,True,ok,no pos


In [None]:
df_ready_for_lemmatisering = df_merged[~df_merged["pos_y"].isin(["no pos", "too many"])]

df_ready_for_lemmatisering = df_ready_for_lemmatisering.reset_index()

In [None]:
from utils import english_lemmatizer, swedish_lemmatizing

eng_pos_mapping = {
    "N": "n",
    "V": "v",
    "A": "a",
    "Ab": "r"
}

eng_lemmas = df_ready_for_lemmatisering["eng_lemma"].to_list()
swe_lemmas = df_ready_for_lemmatisering["swe_lemma"].to_list()
ids = df_ready_for_lemmatisering.index.to_list()

for i in range(len(eng_lemmas)):
    eng_lemma = eng_lemmas[i]
    swe_lemma = swe_lemmas[i]
    id = ids[i]

    if len(eng_lemma.split()) > 1 or len(swe_lemma.split()) > 1:
        last_eng_word = eng_lemma.split()[-1]
        last_swe_word = swe_lemma.split()[-1]

        pos = eng_pos_mapping[df_ready_for_lemmatisering.at[id, "pos_y"]]
        new_eng_lemma = english_lemmatizer(last_eng_word, pos)
        new_swe_lemma = swedish_lemmatizing(last_swe_word)[0]["lemma"]

        t = eng_lemma.split(" ")[:-1]
        t.append(new_eng_lemma)
        full_eng_lemma = " ".join(t)

        t = swe_lemma.split(" ")[:-1]
        t.append(new_swe_lemma)
        full_swe_lemma = " ".join(t)

        df_ready_for_lemmatisering.iloc[id, df_ready_for_lemmatisering.columns.get_loc('eng_lemma')] = full_eng_lemma
        df_ready_for_lemmatisering.iloc[id, df_ready_for_lemmatisering.columns.get_loc('swe_lemma')] = full_swe_lemma

    else:
        pos = eng_pos_mapping[df_ready_for_lemmatisering.at[id, "pos_y"]]
        new_eng_lemma = english_lemmatizer(eng_lemma, pos)
        new_swe_lemma = swedish_lemmatizing(swe_lemma)[0]["lemma"]
                             
        df_ready_for_lemmatisering.iloc[id, df_ready_for_lemmatisering.columns.get_loc('eng_lemma')] = new_eng_lemma
        df_ready_for_lemmatisering.iloc[id, df_ready_for_lemmatisering.columns.get_loc('swe_lemma')] = new_swe_lemma


In [None]:
df_ready_for_lemmatisering

Unnamed: 0,index,id,eng_lemma,swe_lemma,status,src,exist_in_english,exist_in_swedish,spell,pos_y
0,0,ICT5,2d method,2d-metod,bronze,ICT,True,True,ok,N
1,1,ICT9,3d modelling,3d-modellering,bronze,ICT,True,True,ok,N
2,2,ICT24,absolute semantic,absolut semantisk,bronze,ICT,True,True,ok,N
3,3,ICT26,abstract program model,abstrakta programmodell,bronze,ICT,True,True,ok,N
4,4,ICT34,access model,åtkomstmetod,bronze,ICT,True,True,ok,N
...,...,...,...,...,...,...,...,...,...,...
1305,1343,ICT2927,scenario,scenarier,silver,ICT,True,True,ok,N
1306,1346,ICT3066,short-range,kortdistans,silver,ICT,True,True,ok,N
1307,1348,ICT3146,socket,uttag,silver,ICT,True,True,ok,N
1308,1354,ICT3668,vehicle,fordon,silver,ICT,True,True,ok,N
