Skip to content

Commit

Permalink
Added a code that checks for ambiguous words in a dix file (For Lexic…
Browse files Browse the repository at this point in the history
…al Selection Rules)
  • Loading branch information
kamush901 committed Jul 13, 2021
1 parent 5a75d8f commit 820767f
Showing 1 changed file with 75 additions and 0 deletions.
75 changes: 75 additions & 0 deletions texts/lexical_selection_check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# lexical_selection_check.py
# Checks the dix file for entries that have multiple translations, in both directions.
# Generates one file per side listing the ambiguous translations.
# Output file names: left_variants.txt, right_variants.txt

from argparse import ArgumentParser
from collections import defaultdict

# importing element tree
# under the alias of ET
import xml.etree.ElementTree as ET

def get_variants(side):
    """Collect every entry of *side* that has more than one translation.

    Each ambiguous entry is also reported on standard output.

    Args:
        side: Mapping from an entry (string) to the list of its
            translations (strings).

    Returns:
        A list of ``"<entry> <translation>"`` strings, one per translation
        of every entry whose translation list holds more than one item.
        Entries keep the mapping's iteration order and translations keep
        their list order.
    """
    found_pairs = []
    for key, variants in side.items():
        # Only entries with several recorded translations are of interest.
        if len(variants) > 1:
            print("Found: ", key, ": ", variants)
            found_pairs.extend(key + " " + variant for variant in variants)
    return found_pairs

def write_variants(file_name, side_list):
    """Write the collected variants to *file_name*, one per line.

    Consecutive items are treated as one group while the text before their
    first space stays the same; a blank line is written between groups.
    Note that the grouping key is only that leading text, so two items
    whose entries differ after the first space fall into the same group.

    Args:
        file_name: Path of the file to create (an existing file is
            overwritten).
        side_list: List of strings to write. An empty list produces an
            empty file.
    """
    # UTF-8 is requested explicitly so that non-ASCII text does not depend
    # on the platform's default encoding.
    with open(file_name, "w", encoding="utf-8") as out_file:
        previous_el = None
        for el in side_list:
            current_el = el.split(" ")[0]
            # A change of the leading token starts a new group, except
            # before the very first item.
            if previous_el is not None and current_el != previous_el:
                out_file.write("\n")
            previous_el = current_el
            out_file.write(el)
            out_file.write("\n")

if __name__ == "__main__":
    # Script entry point: parse the dix file given on the command line,
    # record the translations of every entry in both directions, and write
    # the ambiguous ones to left_variants.txt and right_variants.txt.
    print("------------------------------------------------------------------------------------------------------------------------------------")

    # Parsing arguments:
    arg_parser = ArgumentParser()
    arg_parser.add_argument(
        "--input_file",
        help="Path to the dix file you want to analyse.",
        required=True,
    )
    args = arg_parser.parse_args()

    # Example value: '../../apertium-kaz-uzb.kaz-uzb.dix'
    input_file = args.input_file
    print("Starting analysis with file:", input_file)

    # Parse the XML document and take its root element.
    root = ET.parse(input_file).getroot()

    l_side = defaultdict(list)
    r_side = defaultdict(list)
    # Look the sections up by tag instead of by a fixed child position, so
    # the script does not depend on how many elements precede them.
    for section in root.findall("section"):
        for element in section:
            # Checking regex, skipping if present:
            if element.find("re") is not None:
                continue
            left = element.find("p/l")
            right = element.find("p/r")
            # Entries without a left/right pair cannot be compared; skip
            # them instead of failing on a missing element.
            if left is None or right is None:
                continue
            left_text = ET.tostring(left, encoding="unicode").strip()
            right_text = ET.tostring(right, encoding="unicode").strip()
            l_side[left_text].append(right_text)
            r_side[right_text].append(left_text)

    l_list = get_variants(l_side)
    r_list = get_variants(r_side)
    print("Writing results to a file:")
    write_variants("left_variants.txt", l_list)
    write_variants("right_variants.txt", r_list)
    print("Finished writing to files.")


0 comments on commit 820767f

Please sign in to comment.