In [32]:
import pandas as pd
from utils import is_word_in_english, swedish_spell_check, english_pos, swedish_pos_tagging
from collections import Counter

In [33]:
# Directory holding the shared data files, and the stage-1 silver lemma file
# processed by this notebook.
path = "../../public_data/"
file = "lemma_data_silver_stage1.csv"

# Load the file and keep only the columns used below, in this order.
df = pd.read_csv(f"{path}{file}").loc[
    :, ["id", "eng_lemma", "swe_lemma", "status", "pos", "src"]
]

In [34]:
# Run the `utils` checkers element-wise over the lemma columns and store the
# results next to them (the value counts below show True/False values).
df["exist_in_english"] = df["eng_lemma"].map(is_word_in_english)
df["exist_in_swedish"] = df["swe_lemma"].map(swedish_spell_check)

In [35]:
# Tally the Swedish check results.
df.exist_in_swedish.value_counts()

exist_in_swedish
True     1754
False     242
Name: count, dtype: int64

In [36]:
# Tally the English check results.
df.exist_in_english.value_counts()

exist_in_english
True     1486
False     510
Name: count, dtype: int64

In [37]:
# Collapse the two check columns into one `spell` label per row:
#   "ok"   - both checks are truthy
#   "both" - both checks are falsy
#   "eng"  - only the English check is falsy
#   "swe"  - only the Swedish check is falsy
df["spell"] = [
    ("ok" if swe_ok else "swe") if eng_ok else ("eng" if swe_ok else "both")
    for eng_ok, swe_ok in zip(df["exist_in_english"], df["exist_in_swedish"])
]

In [38]:
# Write every row whose `spell` label is not "ok" to a CSV in the data directory.
df.loc[df["spell"].ne("ok")].to_csv(path + "incorrect_spellings.csv", index=False)

In [39]:
# Continue with only the rows whose `spell` label is "ok".
df = df.loc[df["spell"].eq("ok")]

In [40]:
# Lookup tables that translate tagger-specific part-of-speech tags into the
# shared labels used to compare the two languages.
#
# NOTE(review): if the Swedish tagger emits SUC-style tags, "pc" is a participle,
# whereas "PRT" in the universal tagset is a particle; both map to "P" here, so
# they will be treated as the same class. Confirm that this is intended.

# Keys are the leading component of a Swedish tag (the part before the first ".").
swedish_mapping = dict(nn="N", vb="V", jj="A", ab="Ab", pc="P")

# Keys are the English tagger's tag names.
english_mapping = dict(
    NOUN="N",
    VERB="V",
    ADJ="A",
    X="X",
    ADV="Ab",
    PRT="P",
    NUM="NUM",
)

In [41]:
# Assign one coarse word class to every lemma pair.
#
# Multi-word English lemmas are labelled "N" without tagging. For single words the
# English and Swedish tags are mapped onto the shared labels and intersected:
# exactly one common label is used as-is, none gives "no pos", several give
# "too many".
word_class = []

eng_lemmas = df["eng_lemma"].to_list()
swe_lemmas = df["swe_lemma"].to_list()
ids = df["id"].to_list()

# `lemma_id` rather than `id`, so the built-in `id()` is not shadowed in the kernel.
for lemma_id, eng_lemma, swe_lemma in zip(ids, eng_lemmas, swe_lemmas):
    eng_words = eng_lemma.split()

    if len(eng_words) > 1:
        word_class.append({"pos": "N", "id": lemma_id})
        continue

    # Tags without an entry in the mapping tables are skipped instead of raising
    # KeyError and aborting the whole run; a pair left without a common label
    # ends up as "no pos".
    eng_tags = {
        english_mapping[entry[0]]
        for entry in english_pos(eng_words[-1])
        if entry[0] in english_mapping
    }

    # Only the last item returned by the Swedish tagger is considered; the part
    # of each tag before the first "." is the key into `swedish_mapping`.
    swe_tags = {
        swedish_mapping[raw_tag.split(".")[0]]
        for raw_tag in swedish_pos_tagging(swe_lemma)[-1]["tags"]
        if raw_tag.split(".")[0] in swedish_mapping
    }

    common = swe_tags & eng_tags
    if len(common) == 1:
        pos = next(iter(common))
    elif not common:
        pos = "no pos"
    else:
        pos = "too many"
    word_class.append({"pos": pos, "id": lemma_id})



    
    

In [42]:
# Collect the per-lemma records (keys `pos` and `id`) into a frame.
df_pos = pd.DataFrame(word_class)

# Only the last expression of a cell is rendered, so the spell-checked frame is
# the single thing shown here.
df

Unnamed: 0,id,eng_lemma,swe_lemma,status,pos,src,exist_in_english,exist_in_swedish,spell
1,ICT5,2d method,2d-metod,bronze,N?,ICT,True,True,ok
3,ICT9,3d modelling,3d-modellering,bronze,N?,ICT,True,True,ok
11,ICT24,absolute semantic,absolut semantisk,bronze,N?,ICT,True,True,ok
12,ICT26,abstract program models,abstrakta programmodeller,bronze,N?,ICT,True,True,ok
14,ICT34,access model,åtkomstmetod,bronze,N?,ICT,True,True,ok
...,...,...,...,...,...,...,...,...,...
1984,ICT3555,two-way,tvåvägs,silver,N?,ICT,True,True,ok
1987,ICT3626,user-friendly,användarvänlig,silver,N?,ICT,True,True,ok
1991,ICT3668,vehicle,fordon,silver,N?,ICT,True,True,ok
1992,ICT3732,walkie-talkie,komradio,silver,N?,ICT,True,True,ok


In [43]:
# Attach the derived tag to each row by `id`. Both frames carry a `pos` column, so
# pandas suffixes them: `pos_x` (from `df`) is dropped, `pos_y` (from `df_pos`)
# is kept.
df_merged = df.merge(df_pos, how="inner", on="id").drop(columns="pos_x")

In [46]:
# Distribution of the derived labels, including the "no pos" / "too many" markers.
df_merged["pos_y"].value_counts()

pos_y
N           1297
no pos        46
A              8
V              5
too many       1
Name: count, dtype: int64

In [47]:
# Rows where the two languages share more than one label.
df_merged.loc[df_merged["pos_y"].eq("too many")]

Unnamed: 0,id,eng_lemma,swe_lemma,status,src,exist_in_english,exist_in_swedish,spell,pos_y
1299,ICT79,affect,påverka,silver,ICT,True,True,ok,too many


In [50]:
# Rows where the two languages share no label at all.
df_merged.loc[df_merged["pos_y"].eq("no pos")]

Unnamed: 0,id,eng_lemma,swe_lemma,status,src,exist_in_english,exist_in_swedish,spell,pos_y
43,ICT291,billion,miljarder,bronze,ICT,True,True,ok,no pos
112,ICT599,consults,konsulterar,bronze,ICT,True,True,ok,no pos
200,ICT997,e-mail,e-post,bronze,ICT,True,True,ok,no pos
239,ICT1074,environmental,miljö-,bronze,ICT,True,True,ok,no pos
285,ICT1348,golden,guld-,bronze,ICT,True,True,ok,no pos
398,ICT1957,million,miljoner,bronze,ICT,True,True,ok,no pos
465,ICT2227,off-the-shelf,färdig,bronze,ICT,True,True,ok,no pos
509,ICT2414,person-to-person,personlig,bronze,ICT,True,True,ok,no pos
669,ICT3162,solar,sol-,bronze,ICT,True,True,ok,no pos
694,ICT3283,suburban,förorts-,bronze,ICT,True,True,ok,no pos


In [None]:
# Keep only rows with a single resolved label. The excluded markers must match
# the strings written by the tagging loop exactly: "no pos" and "too many"
# (the previous "too many pos" matched nothing, so ambiguous rows slipped through).
df_ready_for_lemmatisering = df_merged[~df_merged["pos_y"].isin(["no pos", "too many"])]