#### File for initial data exploration

In [67]:
import json
import pandas as pd

In [68]:
# Location of the raw term list, relative to this notebook.
file = "stunda-terms.jsonl"  # name of the JSON Lines file
path = "../data/"  # directory prefix; ends with "/" because it is joined to `file` with "+"

In [69]:
def read_jsonl_file(file_path):
    """Read a JSON Lines file into a list of parsed objects.

    Args:
        file_path: Path of the file to read. Every non-blank line must hold
            exactly one JSON document.

    Returns:
        list: The parsed objects, in the order they appear in the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # (e.g. cp1252 on Windows), which would garble characters such as å/ä/ö.
    with open(file_path, "r", encoding="utf-8") as jsonl_file:
        # Blank lines (typically a trailing newline at the end of the file)
        # are skipped instead of crashing json.loads.
        return [json.loads(line) for line in jsonl_file if line.strip()]

#### Read data and show an entry

In [70]:
# Parse every line of the term file into a list of entries
data = read_jsonl_file(path + file)
# Show a single sample entry (index 17) to illustrate what a record looks like
print(data[17])

{'eng': {'lemma': '3D (three dimensional)', 'inflection': []}, 'swe': {'lemma': 'tre dimensionell', 'inflection': []}, 'pos': 'N?', 'status': '0', 'src': 'ICT', 'row': '7', 'comment': 'from data', 'synonyms': [], 'definition': None}


#### Basic data statistics

In [71]:
# Pair the English lemma of every entry with its Swedish counterpart
simplified_data = [
    (entry["eng"]["lemma"], entry["swe"]["lemma"])
    for entry in data
]

# A two-column dataframe makes grouping and counting straightforward
df_simplified_data = pd.DataFrame(simplified_data, columns=["english", "swedish"])
print("Number of terms in data", len(df_simplified_data))

# Count how often each (english, swedish) pair occurs, most frequent first
df_no_duplicates = (
    df_simplified_data
    .groupby(list(df_simplified_data.columns), as_index=False)
    .size()
    .sort_values(by="size", ascending=False)
)

# Every remaining row is one distinct pair
print("Number of unique entries", len(df_no_duplicates))
df_no_duplicates.head()

Number of terms in data 6883
Number of unique entries 6547


Unnamed: 0,english,swedish,size
839,Miscellaneous,Blandat,39
514,General,Allmänt,38
780,MISCELLANEOUS,BLANDAT,11
501,GENERAL,ALLMÄNT,11
1386,Test generation [**],Testgenerering [**],7


In [72]:
# Most frequent Swedish lemmas, counted across all entries
(
    df_simplified_data
    .groupby(by=["swedish"], as_index=False)
    .size()
    .sort_values(by="size", ascending=False)
    .head()
)

Unnamed: 0,swedish,size
0,,565
388,Blandat,39
307,Allmänt,38
357,BLANDAT,11
287,ALLMÄNT,11


**TODO:** 565 terms have an empty Swedish lemma, i.e. no translation. Decide how these entries should be handled.

In [73]:
# Most frequent English lemmas, counted across all entries
(
    df_simplified_data
    .groupby(by=["english"], as_index=False)
    .size()
    .sort_values(by="size", ascending=False)
    .head()
)

Unnamed: 0,english,size
827,Miscellaneous,39
505,General,38
492,GENERAL,11
768,MISCELLANEOUS,11
72,Applications,7


In [74]:
# Lower-case every string cell so that casing no longer separates entries;
# non-string values are passed through unchanged
df_ignore_case = df_simplified_data.map(
    lambda value: value.lower() if isinstance(value, str) else value
)

# Count occurrences of each lower-cased (english, swedish) pair, most frequent first
df_no_duplicates_ignore_case = (
    df_ignore_case
    .groupby(list(df_ignore_case.columns), as_index=False)
    .size()
    .sort_values(by="size", ascending=False)
)

# Distinct pairs once case is ignored
print("Number of unique entries ignoring case", len(df_no_duplicates_ignore_case))
df_no_duplicates_ignore_case.head()

Number of unique entries ignoring case 6480


Unnamed: 0,english,swedish,size
3519,miscellaneous,blandat,50
2343,general,allmänt,49
5536,standards,standarder,7
1967,error-checking [**],felkontroll [**],7
5850,test generation [**],testgenerering [**],7


#### Entries that contain a space in the Swedish lemma

In [75]:
# Keep only the rows whose Swedish lemma contains at least one space
df_simplified_data.loc[
    df_simplified_data["swedish"].str.contains(" ")
]

Unnamed: 0,english,swedish
4,"""""""a lot"""" """"lots of""""""","""""""mycket av"""" """"massor av"""""""
5,"""""""energy efficiency"""" """"energy-efficient""""""","""energieffektivisering/energieffektivitet """"en..."
7,"""ad hoc [Latin: """"for this"""" (purpose/task/...)]""",till detta
13,1 pulse per second,1 puls per sekund
15,2G (Third Generation),2G (andra generationen)
...,...,...
6866,write,skriva
6869,X Windowing system,X Fönstersystem
6875,year years,år åratal
6881,zero knowledge,noll kunskap


#### Investigate the use of | to handle inflections

In [76]:
# Select rows whose English lemma contains a pipe character. The pipe is
# matched literally (regex=False): writing it as "\|" in a normal string is an
# invalid escape sequence, which Python reports with a warning.
df_simplified_data[df_simplified_data["english"].str.contains("|", regex=False)]

Unnamed: 0,english,swedish
51,acceleration|s,acceleration|er
69,accommodate|s,ackommodera
90,acoustic|s,akustik
107,actor|s,aktör|er
147,affect|s,påverka|r
...,...,...
6436,tutorial|s,handledning
6600,variant|s,variant varianter
6611,vehicle|s,fordon
6797,winner|s,vinnare vinnare


In [110]:
# Re-read the file so this cell starts from unmodified entries: the function
# defined below edits the entry dicts in place.
data = read_jsonl_file(path + file)
# Function to handle inflections when there is a pipe character in the lemma
def inflections_with_pipe_char_in_lemma(data, verbose=True):
    """Split English lemmas written as ``stem|suffix`` into lemma and inflection.

    For every entry whose ``entry["eng"]["lemma"]`` contains a pipe and whose
    ``entry["eng"]["inflection"]`` list is empty, the lemma is replaced by the
    text before the first pipe, and the original lemma with all pipes removed
    is appended to the inflection list. Example: ``"actor|s"`` becomes lemma
    ``"actor"`` with inflection ``["actors"]``.

    Entries that contain a pipe but already have inflections are not changed;
    they are only reported when ``verbose`` is true.

    Note:
        Entries are modified in place, and the returned list is the very same
        object that was passed in.

    Args:
        data: List of entry dicts. Each entry must provide ``entry["eng"]``
            with a string under ``"lemma"`` and a list under ``"inflection"``.
        verbose: When true (the default), print the state of each affected
            entry before and after processing. Pass ``False`` to suppress
            the output.

    Returns:
        list: ``data`` itself, after the in-place changes.
    """
    for entry in data:
        eng_entry = entry["eng"]
        eng_lemma = eng_entry["lemma"]
        if "|" not in eng_lemma:
            continue

        if verbose:
            print("Before:", eng_entry)

        if not eng_entry["inflection"]:
            # Inflected form: the lemma with the pipe marker(s) removed
            eng_entry["inflection"].append(eng_lemma.replace("|", ""))
            # Base form: everything in front of the first pipe
            eng_entry["lemma"] = eng_lemma.split("|")[0]
        elif verbose:
            print("Inflections not empty:", eng_entry["inflection"])

        if verbose:
            print("After:", eng_entry)
    return data

# The function returns the list it was given, so `processed_data` and `data`
# refer to the same (now modified) list object.
processed_data = inflections_with_pipe_char_in_lemma(data)

Before: {'lemma': 'acceleration|s', 'inflection': []}
After: {'lemma': 'acceleration', 'inflection': ['accelerations']}
Before: {'lemma': 'accommodate|s', 'inflection': []}
After: {'lemma': 'accommodate', 'inflection': ['accommodates']}
Before: {'lemma': 'acoustic|s', 'inflection': []}
After: {'lemma': 'acoustic', 'inflection': ['acoustics']}
Before: {'lemma': 'actor|s', 'inflection': []}
After: {'lemma': 'actor', 'inflection': ['actors']}
Before: {'lemma': 'affect|s', 'inflection': []}
After: {'lemma': 'affect', 'inflection': ['affects']}
Before: {'lemma': 'amendment|s', 'inflection': []}
After: {'lemma': 'amendment', 'inflection': ['amendments']}
Before: {'lemma': 'amount|s', 'inflection': []}
After: {'lemma': 'amount', 'inflection': ['amounts']}
Before: {'lemma': 'antenna|s', 'inflection': []}
After: {'lemma': 'antenna', 'inflection': ['antennas']}
Before: {'lemma': 'argument|s', 'inflection': []}
After: {'lemma': 'argument', 'inflection': ['arguments']}
Before: {'lemma': 'array|s',