In [421]:
import tiktoken
import pandas as pd
import csv
import json
import re
import os
from pathlib import Path

In [524]:
# Code used to check how many tokens are in a single html
enc = tiktoken.get_encoding("cl100k_base")
file_path = "/Users/tollymon/Desktop/research_mars/Lojban Wave Lessons_Single page - La Lojban.html"
with open(file_path, "r", encoding="utf-8") as f:
    html_string = f.read()
len(enc.encode(html_string))

96385

In [523]:
def remove_html_tags(html_content):
    clean_text = re.sub(r'<[^>]*>', '', html_content)
    return clean_text

def process_html_file(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            html_content = f.read()
            cleaned_content = remove_html_tags(html_content)
            print("--- Cleaned Text ---")
            print(cleaned_content)
    except Exception as e:
        print(f"An error occurred: {e}")

# dummy_file_name = "sample.html"/÷
with open(file_path, 'w', encoding='utf-8') as f:
    f.write(file_path)
print(f"Created a dummy HTML file: '{file_path}'")

process_html_file(file_path)


Created a dummy HTML file: '/Users/tollymon/Desktop/research_mars/la karda - La Lojban.html'
--- Cleaned Text ---
/Users/tollymon/Desktop/research_mars/la karda - La Lojban.html


In [446]:
notebook_dir = Path(__file__).parent if '__file__' in locals() else Path.cwd()
input_tsv_gismu = notebook_dir / "input_data" / "gismu_list_format_2_(less_info_but_tab_delimited).txt"
input_txt_lujvo = notebook_dir / "input_data" / "lujvo.txt"
input_txt_cmavo = notebook_dir / "input_data" / "cmavo.txt"
output_csv_cmavo = notebook_dir / "input_data" / "output_cmavo.csv"
input_txt_rafsi = notebook_dir / "input_data" / "rafsi.txt"
output_csv_rafsi = notebook_dir / "input_data" / "output_rafsi.csv"


In [447]:
column_titles = ["Lojban", "Lojban definition", "English"] 
column_titles_lujvo = ["Lojban", "Lojban Composition", "English", "Arguments"] 
column_titles_cmavo = ["Lojban", "Formal Language", "English", "Definition", "Confer"]
column_titles_rafsi= ["Lojban", "Lojban Gismu", "English"]

In [None]:

def convert_cmavo_rafsi(input_txt, output_txt, rafsi_flag):
    """
    Converts cmavo and rafsi dubious formatting

    """
    if rafsi_flag:
        split_pattern = r"[ \u00A0]{1,}" # any space like character
    else:
        split_pattern = r"[ \u00A0]{2,}"
    with open(input_txt, "r", newline="", encoding="utf-8") as infile:
        with open(output_txt, "w", newline="", encoding="utf-8") as outfile:
            csv_writer = csv.writer(outfile, delimiter=",")

            for line in infile:
                clean_line = line.strip()
                parts = re.split(split_pattern, clean_line)
                csv_writer.writerow(parts)

def handle_bad_line(bad_line):
    if len(bad_line) > 3:
        return bad_line[:3]
    return bad_line

def handle_bad_rafsi(bad_line):
    joined_rest = " ".join(bad_line[2:])
    return [bad_line[0], bad_line[1], joined_rest]

In [449]:
data_lujvo = pd.read_csv(input_txt_lujvo, sep=":", encoding='utf-8', on_bad_lines='warn', names=column_titles_lujvo)
data_lujvo["Type"] = "lujvo"
data_gismu = pd.read_csv(input_tsv_gismu, sep="\t", on_bad_lines='warn', names=column_titles)
data_gismu["Type"] = "gismu"
convert_cmavo_rafsi(input_txt_cmavo, output_csv_cmavo, rafsi_flag=False)
data_cmavo = pd.read_csv(output_csv_cmavo, on_bad_lines=handle_bad_line, names=column_titles_cmavo, engine="python")
data_cmavo["Type"] = "cmavo"
convert_cmavo_rafsi(input_txt_rafsi, output_csv_rafsi, rafsi_flag=True)
data_rafsi = pd.read_csv(output_csv_rafsi, sep=",", encoding='utf-8', on_bad_lines=handle_bad_rafsi, header=None, names=column_titles_rafsi, engine="python")
data_rafsi["Type"] = "rafsi"


  data_lujvo = pd.read_csv(input_txt_lujvo, sep=":", encoding='utf-8', on_bad_lines='warn', names=column_titles_lujvo)


In [425]:
data_lujvo.head()

Unnamed: 0,Lojban,Lojban Composition,English,Arguments,Type
0,ba'armo'a,barna+morna,pattern of marks,$morna1 $morna2 $=barna1 $morna3 $barna2,lujvo
1,ba'orzu'e,banro+zukte,to grow something,$zukte1 $banro1 $banro2 $banro3,lujvo
2,ba'ostu,banro+stuzi,nursery; growth/colony of germs,$stuzi1 $banro1 $banro2 $banro3,lujvo
3,ba'urdu'u,bacru+dunku,to bitch/whine about something,$dunku1 $=bacru1 $dunku2 $bacru2,lujvo
4,ba'urnoi,bacru+notci,spoken message; to tell someone something,$notci1 $=bacru2 $notci2 $notci3 $=bacru1 $notci4,lujvo


In [429]:
data_gismu.head()

Unnamed: 0,Lojban,Lojban definition,English,Type
0,bacru,x_{1} utters verbally/says/phonates/speaks [vo...,- utter,gismu
1,badna,x_{1} is a banana/plantain [fruit/plant] of sp...,- plantain; banana<br>- banana; fruit or plant,gismu
2,badri,x_{1} is sad/depressed/dejected/[unhappy/feels...,- dejected<br>- sad<br>- depressed; sad,gismu
3,bajra,x_{1} runs on surface x_{2} using limbs x_{3} ...,- run,gismu
4,bakfu,x_{1} is a bundle/package/cluster/clump/pack [...,- bundle,gismu


In [430]:
data_cmavo.head()

Unnamed: 0,Lojban,Formal Language,English,Definition,Confer,Type
0,.a,A,sumti or,logical connective: sumti afterthought or,,cmavo
1,.a'a,UI1,attentive,attitudinal: attentive - inattentive - avoiding,"(cf. jundi, rivbi)",cmavo
2,.a'acu'i,UI*1,inattentive,attitudinal: attentive - inattentive - avoiding,,cmavo
3,.a'anai,UI*1,avoiding,attitudinal: attentive - inattentive - avoiding,,cmavo
4,.a'e,UI1,alertness,attitudinal: alertness - exhaustion,"(cf. sanji, cikna, tatpi)",cmavo


In [None]:
def searching_match(word, datasets):
    """
    Code that searches through the datasets for matching Lojban words
    """
    for d in datasets:
        for w in d["Lojban"]:
            if str(w) == word:
                exact_matches = d[d['Lojban'] == word]
                return exact_matches
    return None

In [None]:
def csv_converter(csv_file, jsonl_file):
    """
    Convert the csv file to the jsonl
    """
    with open(csv_file, 'r', encoding='utf-8') as csv_file:
        csv_reader = csv.DictReader(csv_file)

        with open(jsonl_file, 'w', encoding='utf-8') as jsonl_file:
            for row in csv_reader:
                json_line = json.dumps(row, ensure_ascii=False)
                jsonl_file.write(json_line + '\n')
        

csv_converter("data.csv", "converted_data")

In [None]:
# pattern = r'"(.*?)"'
def return_sentences():
    """
    Return the Lojban sentences from questions.
    """
    all_senteces = []
    with open("converted_data", "r", encoding="utf-8") as f:
            for line in f:
                data_json = json.loads(line.strip())
                lines = data_json["prompt"].splitlines()
                for l in lines:
                    if l.startswith("(A)"):
                        all_senteces.append(l)
                    elif l.startswith("(B)"):
                        all_senteces.append(l)
    
    return all_senteces

            # matches = re.findall(pattern, data_json["prompt"])
            # all_senteces.append(matches[0])

In [None]:
def grab_unique_words(sentences):
    """
    Obtain the unique words from the list of sentences
    """
    uniqueWords = []
    for s in sentences:
        for w in s.split():
            if w not in uniqueWords:
                uniqueWords.append(w)

    return uniqueWords

In [440]:
list_unique_words = grab_unique_words(return_sentences())

In [485]:
# jsonl_file = "to_pass_definitions"

def create_jsonl_definitions(gismu_def, lujvo_def, cmavo_def):
    """
    Creates the jsonl for each of the type.
    """
    not_found_words = []
    found_gismu = []
    found_lujvo = []
    found_cmavo = []
    with open(gismu_def, 'w', encoding='utf-8', newline="") as gismu_f, \
         open(lujvo_def, 'w', encoding='utf-8', newline="") as lujvo_f, \
         open(cmavo_def, 'w', encoding='utf-8', newline="") as cmavo_f: 
        for w in list_unique_words:
            matched = searching_match(w, [data_gismu,data_lujvo, data_cmavo])
            if matched is not None:
                matched_dict = matched.iloc[0].to_dict()
                # cleaned_dict = {}
                # for key, value in matched_dict.items():
                #     if isinstance(value, dict) and len(value) == 1:
                #         cleaned_dict[key] = list(value.values())[0]
                #     else:
                #         cleaned_dict[key] = value
                json_line = json.dumps(matched_dict, ensure_ascii=False)

                if matched_dict["Type"] == "gismu":
                    gismu_f.write(json_line + "\n")
                    found_gismu.append(matched_dict)
                elif matched_dict["Type"] == "lujvo":
                    lujvo_f.write(json_line + "\n")
                    found_lujvo.append(matched_dict)
                elif matched_dict["Type"] == "cmavo":
                    cmavo_f.write(json_line + "\n")
                    found_cmavo.append(matched_dict)

            else:
                not_found_words.append(w)
            
    
    return not_found_words, pd.DataFrame(found_gismu), pd.DataFrame(found_lujvo), pd.DataFrame(found_cmavo)


In [486]:
gismu_jsonl = "gismu_def"
lujvo_jsonl = "lujvo_def"
cmavo_jsonl = "cmavo_def"
rasfi_def = "rasfi_def"
not_found_words, found_gismu, found_lujvo, found_cmavo = create_jsonl_definitions(gismu_jsonl, lujvo_jsonl, cmavo_jsonl)

In [444]:
len(not_found_words)

73

In [464]:
def return_rasfi(rasfi_def):
    """
    Returns the rasfi from the **not** matched words
    """
    rafsi_search = data_rafsi["Lojban"].to_list()
    rest_found = set()
    compiled_patterns = [(re.escape(str(s))) for s in rafsi_search]

    for i in compiled_patterns:
        for j in not_found_words:
            match = re.findall(str(i), j)
            for k in match:
                if k is not None:
                    rest_found.add(str(match[0]))


    with open(rasfi_def, 'w', encoding='utf-8', newline="") as rasfi_f:
        for w in rest_found:
            matched = searching_match(w, [data_rafsi])
            if matched is not None:
                matched_dict = matched.iloc[0].to_dict()
                json_line = json.dumps(matched_dict, ensure_ascii=False)
                rasfi_f.write(json_line + "\n")
    

    filtered_rasfi = data_rafsi[data_rafsi["Lojban"].isin(rest_found)]        

    return filtered_rasfi

In [467]:
filtered_data_rasfi = return_rasfi(rasfi_def)

In [474]:
filtered_rasfi_gismu = data_gismu[data_gismu["Lojban"].isin(filtered_data_rasfi["Lojban Gismu"])]

In [490]:
filtered_gismu = pd.concat([filtered_rasfi_gismu, found_gismu], ignore_index=True).drop_duplicates()
filtered_gismu.head()

Unnamed: 0,Lojban,Lojban definition,English,Type
0,bakni,x_{1} is a cow/cattle/kine/ox/[bull/steer/calf...,- bovine,gismu
1,bartu,x_{1} is on the outside of x_{2}; x_{1} is ext...,- out,gismu
2,berti,x_{1} is to the north/northern side [right-han...,- north,gismu
3,bitmu,x_{1} is a wall/fence separating x_{2} and x_{...,- wall,gismu
4,bolci,x_{1} is a ball/sphere/orb/globe [shape/form] ...,- ball,gismu


In [494]:
not_filtered_rasfi_gismu = filtered_data_rasfi[~filtered_data_rasfi["Lojban Gismu"].isin(filtered_rasfi_gismu["Lojban"])]
not_filtered_rasfi_gismu

Unnamed: 0,Lojban,Lojban Gismu,English,Type
766,mei,mei,cardinal selbri,rafsi
903,nun,nu,event abstract,rafsi
1047,rel,re,2,rafsi
1142,sel,se,2nd conversion,rafsi
1232,su'o,su'o,at least,rafsi
1292,ter,te,3rd conversion,rafsi
1325,tol,to'e,polar opposite,rafsi
1449,xel,xe,5th conversion,rafsi
1535,zor,zo'i,inward,rafsi


In [497]:
filtered_rasfi_cmavo = data_cmavo[data_cmavo["Lojban"].isin(not_filtered_rasfi_gismu["Lojban Gismu"])]
filtered_cmavo = pd.concat([filtered_rasfi_cmavo, found_cmavo]).drop_duplicates()
filtered_cmavo.head()

Unnamed: 0,Lojban,Formal Language,English,Definition,Confer,Type
467,mei,MOI,cardinal selbri,convert number to cardinality selbri; x1 is th...,,cmavo
561,nu,NU,event abstract,abstractor: generalized event abstractor; x1 i...,,cmavo
689,re,PA1,2,digit/number: 2 (digit) [two],,cmavo
749,se,SE,2nd conversion,2nd conversion; switch 1st/2nd places,,cmavo
859,su'o,PA4,at least,digit/number: at least (some); no less than,,cmavo


In [511]:
filtered_lujvo_gismu_list = []
for i in found_lujvo["Lojban Composition"].to_list():
    for j in i.split("+"):
        filtered_lujvo_gismu_list.append(j)

In [512]:
filtered_lujvo_gismu = data_gismu[data_gismu["Lojban"].isin(filtered_lujvo_gismu_list)]

In [514]:
filtered_lujvo_gismu

Unnamed: 0,Lojban,Lojban definition,English,Type
11,balvi,x_{1} is in the future of/later than/after x_{...,- future; unspecified,gismu
42,bevri,x_{1} carries/hauls/bears/transports cargo x_{...,- carry; transport,gismu
82,bukpu,x_{1} is an amount of cloth/fabric of type/mat...,- cloth,gismu
94,calku,"x_{1} is a shell/husk [hard, protective coveri...",- shell,gismu
125,cfari,x_{1} [state/event/process] commences/initiate...,- happen; occur<br>- initiate; non-agentive,gismu
132,cidja,x_{1} is food/feed/nutriment for x_{2}; x_{1} ...,- food<br>- nutriment,gismu
150,cinki,x_{1} is an insect/arthropod of species x_{2};...,- insect,gismu
161,ciska,x_{1} inscribes/writes x_{2} on display/storag...,- write,gismu
184,clani,x_{1} is long in dimension/direction x_{2} (de...,- long,gismu
301,djedi,x_{1} is x_{2} full days in duration (default ...,- full day,gismu


In [518]:
final_filtered_gismus = pd.concat([filtered_gismu, filtered_lujvo_gismu]).drop_duplicates()

In [519]:
final_filtered_gismus

Unnamed: 0,Lojban,Lojban definition,English,Type
0,bakni,x_{1} is a cow/cattle/kine/ox/[bull/steer/calf...,- bovine,gismu
1,bartu,x_{1} is on the outside of x_{2}; x_{1} is ext...,- out,gismu
2,berti,x_{1} is to the north/northern side [right-han...,- north,gismu
3,bitmu,x_{1} is a wall/fence separating x_{2} and x_{...,- wall,gismu
4,bolci,x_{1} is a ball/sphere/orb/globe [shape/form] ...,- ball,gismu
...,...,...,...,...
899,purci,x_{1} is in the past of/earlier than/before x_...,- past,gismu
1083,sraji,x_{1} is vertical/upright/erect/plumb/oriented...,- vertical,gismu
1267,xamsi,x_{1} is a sea/ocean/gulf/[atmosphere] of plan...,- sea,gismu
1323,zekri,x_{1} (event/state) is a punishable crime/[tab...,- crime,gismu
