# Filter Sentences for Training Data

In [1]:
import pandas as pd
import ast

# Location of the CSV that holds the sentences and their parser outputs.
file_path = "parsed_sentences.csv"

# Load the CSV into a DataFrame.
data = pd.read_csv(file_path)

# The parser outputs are stored in the CSV as text; turn each cell back into
# the Python literal it represents (later cells call len() on it and index it).
# The column is addressed by name, not by position, so this stays correct if
# the CSV's column order changes and matches how later cells access it.
data["Parser Outputs"] = data["Parser Outputs"].apply(ast.literal_eval)

# Preview the first few rows.
print(data.head())

                      Sentence  \
0                           a!   
1                       a a a!   
2                       a a a.   
3  akesi ike a li moku e moku!   
4    akesi ike li moku e moku.   

                                      Parser Outputs  
0  [s(exc(inj(expletive(interjection(a))))), s(in...  
1  [s(exc(inj(laughing(interjection(a),interjecti...  
2  [s(dec(inj(laughing(interjection(a),interjecti...  
3  [s(exc(inj_to_do(subj_p((noun(akesi),adjective...  
4  [s(dec(subj_p((noun(akesi),adjective(ike))),pr...  


In [2]:
# Keep only the rows whose parser-output list has exactly one entry.
# `.copy()` makes the selection an independent DataFrame, so the assignment
# below modifies `filtered_data` only and does not raise pandas'
# SettingWithCopyWarning about writing into a slice of `data`.
filtered_data = data[data["Parser Outputs"].apply(len) == 1].copy()

# Unwrap each single-element list so the column holds the entry itself.
filtered_data["Parser Outputs"] = filtered_data["Parser Outputs"].apply(lambda x: x[0])

In [3]:
# Inspect the filtered frame; as the cell's last expression it is rendered as the cell output.
filtered_data

Unnamed: 0,Sentence,Parser Outputs
2,a a a.,"s(dec(inj(laughing(interjection(a),interjectio..."
4,akesi ike li moku e moku.,"s(dec(subj_p((noun(akesi),adjective(ike))),pre..."
5,akesi kin li moku e moku.,"s(dec(subj_p((noun(akesi),adjective(kin))),pre..."
6,"akesi kin, tu kin li moku e moku.","s(dec(subj_p((noun(akesi),adjective(kin),card(..."
7,"akesi kin, tu mi kin li moku e moku.","s(dec(subj_p((noun(akesi),adjective(kin),card(..."
...,...,...
2109,wan taso,"headline(obj_i((noun(wan),adjective(taso))))"
2122,wile ala moku.,"s(dec(answer(no(verb_p(wile),adverb(ala),verb_..."
2124,wile moku.,"s(dec(answer(yes(verb_p(wile),verb_t(moku)))))"
2125,wile pi jan ike li pakala e ijo.,"s(dec(subj_p((noun(wile),noun(jan),adjective(i..."


In [4]:
# Persist the filtered rows. The index (the original row labels from `data`)
# is written as the first, unnamed column -- this is to_csv's default, spelled
# out here so the extra column in the file is not a surprise.
filtered_data.to_csv("filtered_sentences.csv", index=True)

# Categorize Official Words List

In [5]:
import json

# Load the official word list. The encoding is given explicitly so the file is
# decoded the same way on every platform instead of using the locale default.
with open('official_words.json', 'r', encoding='utf-8') as file:
    input_json = json.load(file)

# Reduce every category to the list of its keys, echoing each category as it
# is processed. The loop expects each top-level value to be a JSON object.
output_json = {}
for key, value in input_json.items():
    words = list(value)  # iterating a dict yields its keys, in file order
    print(f"Category: {key}, Words: {words}")
    output_json[key] = words

Category: adjective, Words: ['akesi', 'ala', 'alasa', 'ale', 'ali', 'anpa', 'ante', 'awen', 'esun', 'ijo', 'ike', 'ilo', 'insa', 'jaki', 'jan', 'jelo', 'jo', 'kala', 'kalama', 'kama', 'kasi', 'kili', 'kin', 'kiwen', 'kon', 'kule', 'kulupu', 'kute', 'lape', 'laso', 'lawa', 'len', 'lete', 'lili', 'linja', 'lipu', 'loje', 'lon', 'luka', 'lukin', 'lupa', 'ma', 'mama', 'mani', 'meli', 'mije', 'moku', 'moli', 'monsi', 'monsuta', 'mu', 'mun', 'musi', 'mute', 'namako', 'nanpa', 'nasa', 'nasin', 'nena', 'noka', 'oko', 'olin', 'open', 'pakala', 'pali', 'palisa', 'pana', 'pilin', 'pimeja', 'pini', 'poka', 'pona', 'pu', 'sama', 'seli', 'sewi', 'sijelo', 'sike', 'sin', 'sinpin', 'sitelen', 'sona', 'soweli', 'suli', 'suno', 'supa', 'suwi', 'tan', 'taso', 'tawa', 'telo', 'tenpo', 'toki', 'tomo', 'unpa', 'uta', 'utala', 'walo', 'waso', 'wawa', 'weka', 'wile']
Category: adverb, Words: ['ala', 'ale', 'ali', 'anpa', 'awen', 'ijo', 'ike', 'ilo', 'jaki', 'jan', 'kama', 'kili', 'kin', 'kiwen', 'kon', 'lape'

In [6]:
import pprint

# Pretty-print the category -> words mapping using the default printer settings.
pprint.PrettyPrinter().pprint(output_json)

{'adjective': ['akesi',
               'ala',
               'alasa',
               'ale',
               'ali',
               'anpa',
               'ante',
               'awen',
               'esun',
               'ijo',
               'ike',
               'ilo',
               'insa',
               'jaki',
               'jan',
               'jelo',
               'jo',
               'kala',
               'kalama',
               'kama',
               'kasi',
               'kili',
               'kin',
               'kiwen',
               'kon',
               'kule',
               'kulupu',
               'kute',
               'lape',
               'laso',
               'lawa',
               'len',
               'lete',
               'lili',
               'linja',
               'lipu',
               'loje',
               'lon',
               'luka',
               'lukin',
               'lupa',
               'ma',
               'mama',
               'm

# Prompt Generation

In [9]:
# Print one prompt per filtered row, pairing the sentence with its parser output.
# Iterating the two columns directly avoids building a Series for every row
# (which is what iterrows() does) just to read two fields from it.
# NOTE(review): the format name is normally spelled "CoNLL-U"; the prompt text
# is kept byte-identical here in case existing prompts/outputs depend on it --
# confirm whether the spelling should be corrected.
for sentence, parser_output in zip(filtered_data["Sentence"], filtered_data["Parser Outputs"]):
    print("Give ONLY the CoNNL-U output for the following sentence-parser output pair")
    print(f"Sentence : {sentence}")
    print(f"Parser Output : {parser_output}")
    print()

Give ONLY the CoNNL-U output for the following sentence-parser output pair
Sentence : a a a.
Parser Output : s(dec(inj(laughing(interjection(a),interjection(a),interjection(a)))))

Give ONLY the CoNNL-U output for the following sentence-parser output pair
Sentence : akesi ike li moku e moku.
Parser Output : s(dec(subj_p((noun(akesi),adjective(ike))),pred_p(verb_t(moku),obj_d(noun(moku)))))

Give ONLY the CoNNL-U output for the following sentence-parser output pair
Sentence : akesi kin li moku e moku.
Parser Output : s(dec(subj_p((noun(akesi),adjective(kin))),pred_p(verb_t(moku),obj_d(noun(moku)))))

Give ONLY the CoNNL-U output for the following sentence-parser output pair
Sentence : akesi kin, tu kin li moku e moku.
Parser Output : s(dec(subj_p((noun(akesi),adjective(kin),card(numeral(tu)),adjective(kin))),pred_p(verb_t(moku),obj_d(noun(moku)))))

Give ONLY the CoNNL-U output for the following sentence-parser output pair
Sentence : akesi kin, tu mi kin li moku e moku.
Parser Output : 