#### File for initial data exploration

In [34]:
import json
import pandas as pd

In [35]:
path = "../data/"
file = "stunda-terms.jsonl"

In [36]:
def read_jsonl_file(file_path):
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            # Load each line as a JSON object
            json_object = json.loads(line)
            data.append(json_object)
    return data

#### Read data and show an entry

In [37]:
data = read_jsonl_file(path + file)
print(data[17])

{'eng': {'lemma': '3D (three dimensional)', 'inflection': []}, 'swe': {'lemma': 'tre dimensionell', 'inflection': []}, 'pos': 'N?', 'status': '0', 'src': 'ICT', 'row': '7', 'comment': 'from data', 'synonyms': [], 'definition': None}


#### Basic data statistics

In [45]:
# Extract the English and Swedish lemmas 
simplified_data = [(entry["eng"]["lemma"], entry["swe"]["lemma"]) for entry in data]

# Convert data to pandas dataframe for easier manipulations
df_simplified_data = pd.DataFrame(simplified_data, columns=["english", "swedish"])

print("Number of terms in data", len(df_simplified_data))

# Group by english and swedish lemma to find number of duplicate entries
df_no_duplicates = df_simplified_data.groupby(df_simplified_data.columns.to_list(), as_index=False).size().sort_values(by="size", ascending=False)

# Number of unique entries
print("Number of unique entries", len(df_no_duplicates))
df_no_duplicates.head()

Number of terms in data 6883
Number of unique entries 6524


Unnamed: 0,english,swedish,size
839,Miscellaneous,Blandat,39
514,General,Allmänt,38
501,GENERAL,ALLMÄNT,11
780,MISCELLANEOUS,BLANDAT,11
73,Applications,Tillämpningar,7


In [39]:
df_simplified_data.groupby(by=["swedish"], as_index=False).size().sort_values(by="size", ascending=False).head()

Unnamed: 0,swedish,size
0,,565
388,Blandat,39
307,Allmänt,38
357,BLANDAT,11
287,ALLMÄNT,11


565 terms without translation what to do with these??

In [40]:
df_simplified_data.groupby(by=["english"], as_index=False).size().sort_values(by="size", ascending=False).head()

Unnamed: 0,english,size
827,Miscellaneous,39
505,General,38
492,GENERAL,11
768,MISCELLANEOUS,11
72,Applications,7


In [41]:
# Convert all strings to lowercase
df_ignore_case = df_simplified_data.map(lambda x: x.lower() if isinstance(x, str) else x)

# Group by english and swedish lemma to find number of duplicate entries
df_no_duplicates_ignore_case = df_ignore_case.groupby(df_ignore_case.columns.to_list(), as_index=False).size().sort_values(by="size", ascending=False)

# Number of unique entries
print("Number of unique entries ignoring case", len(df_no_duplicates_ignore_case))
df_no_duplicates_ignore_case.head()

Number of unique entries ignoring case 6480


Unnamed: 0,english,swedish,size
3519,miscellaneous,blandat,50
2343,general,allmänt,49
5536,standards,standarder,7
1967,error-checking [**],felkontroll [**],7
5850,test generation [**],testgenerering [**],7


#### Entries which contains a space in the swedish lemma

In [42]:
df_simplified_data[df_simplified_data["swedish"].str.contains(" ")]

Unnamed: 0,english,swedish
4,"""""""a lot"""" """"lots of""""""","""""""mycket av"""" """"massor av"""""""
5,"""""""energy efficiency"""" """"energy-efficient""""""","""energieffektivisering/energieffektivitet """"en..."
7,"""ad hoc [Latin: """"for this"""" (purpose/task/...)]""",till detta
13,1 pulse per second,1 puls per sekund
15,2G (Third Generation),2G (andra generationen)
...,...,...
6866,write,skriva
6869,X Windowing system,X Fönstersystem
6875,year years,år åratal
6881,zero knowledge,noll kunskap


#### Investigate the use of | to handle inflections

In [43]:
df_simplified_data[df_simplified_data["english"].str.contains("\|")]

Unnamed: 0,english,swedish
51,acceleration|s,acceleration|er
69,accommodate|s,ackommodera
90,acoustic|s,akustik
107,actor|s,aktör|er
147,affect|s,påverka|r
...,...,...
6436,tutorial|s,handledning
6600,variant|s,variant varianter
6611,vehicle|s,fordon
6797,winner|s,vinnare vinnare


In [44]:
data = read_jsonl_file(path + file)

# Function to handle inflections when there is a pipe character in lemma
def inflections_with_pipe_char_in_lemma(data):
    for entry in data:
        # Get variables
        eng_entry = entry["eng"]
        eng_lemma = eng_entry["lemma"]

        swe_entry = entry["swe"]
        swe_lemma = swe_entry["lemma"]

        # Check if entry satifies "word|inflection_ending" format
        if "|" in eng_lemma:
            if not eng_entry["inflection"]:

                # Remove | char
                inflection = ''.join([char for char in eng_lemma if char != "|"])

                # Get actual lemma
                lemma = eng_lemma.split('|')[0]

                # Update entry
                eng_entry["inflection"].append(inflection)
                eng_entry["lemma"] = lemma

                # Handle swedish case

                # We already have something in inflections
                if not swe_entry["inflection"]:
                    # Pipe char in swedish case
                    if "|" in swe_lemma:
                        # Remove | char
                        inflection = ''.join([char for char in swe_lemma if char != "|"])

                        # Get actual lemma
                        lemma = swe_lemma.split('|')[0]

                        # Update entry
                        swe_entry["inflection"].append(inflection)
                        swe_entry["lemma"] = lemma
                    else:
                        print("No pipe char in swedish lemma", swe_entry["lemma"], eng_lemma)
                else:
                    print("Inflections not empty:", swe_entry["inflection"])
            else:
                print("Inflections not empty:", eng_entry["inflection"])
    return data

processed_data = inflections_with_pipe_char_in_lemma(data)

No pipe char in swedish lemma ackommodera accommodate|s
No pipe char in swedish lemma akustik acoustic|s
No pipe char in swedish lemma belopp uppgår amount|s
No pipe char in swedish lemma antenn antenner antenna|s
No pipe char in swedish lemma argumentet/argument argument|s
No pipe char in swedish lemma tillgångs tillgångar asset|s
No pipe char in swedish lemma märke märken badge|s
No pipe char in swedish lemma bandet banden band|s
No pipe char in swedish lemma bärare bearer|s
No pipe char in swedish lemma sändning/sänder sändningsföretag broadcast|s broadcasting broadcasters
No pipe char in swedish lemma avboka/avbryter cancel|s
No pipe char in swedish lemma kort card|s
No pipe char in swedish lemma upphör/upphöra cease|s
No pipe char in swedish lemma chip chip|s
No pipe char in swedish lemma välja/väljer valda choose|s chosen
No pipe char in swedish lemma "kluster/""samlas i klunga"" kluster klustrade klustring" cluster|s clustered clustering
No pipe char in swedish lemma fullständig

#### Convert data to dataframe

In [108]:
data = read_jsonl_file(path + file)

df = pd.json_normalize(data).drop(columns = ["synonyms", "definition"])


# Explode the lists in the specified columns
df = df.explode("eng.inflection").explode("swe.inflection")


print(df["src"].value_counts())

# Reset the index after exploding
df = df.reset_index(drop=True)

# Drop empty lemmas
df = df[~((df["eng.lemma"] == "") & (df["swe.lemma"] == ""))]

print(df["src"].value_counts())

df_no_swe_lemma = df[(df["swe.lemma"] == "")]

# write to file
df_no_swe_lemma.to_csv('no_swedish_tranlsation.csv', index=False)


df = df[(df["swe.lemma"] != "")]

print(df["src"].value_counts())

df = df[(df["eng.inflection"].isna()) & (df["swe.inflection"].isna())]

print(df[(df["src"] == "ICT") & ((~df["eng.inflection"].isna()) | (~df["swe.inflection"].isna()))])

df["eng.lemma"] = df["eng.lemma"].str.replace(r'\[\*\*?\]', '', regex=True)

df["swe.lemma"] = df["swe.lemma"].str.replace(r'\[\*\*?\]', '', regex=True)

df_acm = df[(df["src"] == "ACM")]

df_ict = df[(df["src"] == "ICT")]

df_gf = df[(df["src"] == "GF")]

print(len(df_ict))
print(len(df_acm))
print(len(df_gf))

src
ICT    3867
GF     3574
ACM    1708
Name: count, dtype: int64
src
ICT    3864
GF     3574
ACM    1707
Name: count, dtype: int64
src
ICT    3633
GF     3574
ACM    1352
Name: count, dtype: int64
Empty DataFrame
Columns: [pos, status, src, row, comment, eng.lemma, eng.inflection, swe.lemma, swe.inflection]
Index: []
3614
1061
0


In [119]:
pattern = r'^[a-z]*$'
df_ict
filtered_df = df_ict[(df_ict['eng.lemma'].str.contains(pattern, case=False, na=False)) & (df_ict['swe.lemma'].str.contains(pattern, case=False, na=False))]
print(len(filtered_df))

filtered_df

776


Unnamed: 0,pos,status,src,row,comment,eng.lemma,eng.inflection,swe.lemma,swe.inflection
37,N?,0,ICT,25,from data,abstract,,abstrakt,
50,N?,0,ICT,28,from data,abstraction,,abstraktion,
64,N?,0,ICT,30,from data,accelerometer,,accelerometern,
93,N?,0,ICT,39,from data,according,,enligt,
97,N?,0,ICT,41,from data,accounting,,redovisning,
...,...,...,...,...,...,...,...,...,...
9110,N?,0,ICT,3827,from data,workstation,,arbetsstation,
9126,N?,0,ICT,3835,from data,would,,skulle,
9134,N?,0,ICT,3838,from data,written,,skriven,
9143,N?,0,ICT,3847,from data,yields,,utbyten,
