In [154]:
import re
import sys
import pandas as pd
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)

In [155]:
# first, read in file

with open("Texts-UTF8.txt", encoding="utf-8") as f:
    contents = f.read()

# now, split contents into useful segments

# Each match starts at a \tx marker and runs up to (but not including) the
# newline before the next \ref marker. The `|\Z` alternative lets the final
# record, which has no following \ref, be captured as well; without it the
# last \tx block in the file would be silently dropped.
segments = re.findall(r"\\tx.*?(?=\n\\ref|\Z)", contents, re.DOTALL)
# no need to filter out irrelevant data with this form of matching

""" for segment in segments:
    print(segment, end='\n') """

" for segment in segments:\n    print(segment, end='\n') "

In [156]:
# create a function that converts segments into iterables

def convert(segment):
    """Turn one raw text segment into a dict mapping line marker -> token list.

    Each non-blank line of ``segment`` is normalised as follows:
      * runs of whitespace are collapsed to single spaces,
      * the sequences "- ", "= " and " -" lose their space, so tokens that
        were separated around a hyphen / equals sign are glued together,
      * every backslash is removed (so a leading "\\tx" becomes "tx").

    The first whitespace-separated token of the normalised line becomes the
    dictionary key and the remaining tokens become its value (a list of str).
    If the same key occurs on several lines, the last line wins.

    Lines that contain no tokens after normalisation (blank lines, or lines
    made only of backslashes and spaces) are skipped instead of raising
    IndexError.

    Args:
        segment: str, newline-separated lines of one record.

    Returns:
        dict[str, list[str]]
    """
    lines_dict = {}
    for raw_line in segment.split('\n'):
        # split()/join collapses all whitespace (including a trailing '\r').
        line = ' '.join(raw_line.split())
        line = line.replace("- ", "-").replace("= ", "=").replace(" -", "-").replace("\\", "")
        # Re-split after the replacements: removing backslashes can leave a
        # line with nothing but spaces, which must not reach fields[0].
        fields = line.split()
        if not fields:
            continue
        lines_dict[fields[0]] = fields[1:]

    return lines_dict

file_data = [] # this will be a list of dictionaries, with each dictionary representing one conversation segment

for segment in segments:
    data = convert(segment)
    data.pop('mb', None)

    tx_tokens = data.get('tx')
    # filter out empty data (and records that ended up with no 'tx' line at all)
    if tx_tokens is None or tx_tokens == ['EMPTY']:
        continue

    # Lower-case each tier independently so that a record whose 'ps' line is
    # missing or shorter than its 'tx' line no longer raises KeyError/IndexError.
    data['tx'] = [token.lower() for token in tx_tokens]
    if 'ps' in data:
        data['ps'] = [tag.lower() for tag in data['ps']]
    file_data.append(data)

# print out contents of file_data just to check
""" for element in file_data:
    for items in element.items():
        print(items, end='\n')
    print('\n')  """

" for element in file_data:\n    for items in element.items():\n        print(items, end='\n')\n    print('\n')  "

In [157]:
# function definitions for search function(s)

def generate_ngrams(input_list, n):
    """Return all runs of ``n`` consecutive items of ``input_list`` as tuples.

    The result is a list ordered by starting position. It is empty when
    ``n`` is not positive or when ``input_list`` has fewer than ``n`` items.
    """
    if n <= 0:
        return []
    window_count = len(input_list) - n + 1
    return [tuple(input_list[start:start + n]) for start in range(window_count)]

def print_search_results(element):
    """Print one record as an aligned table followed by its translation.

    The 'tx', 'ph', 'ge' and 'ps' lists of ``element`` (whichever are present)
    become the rows "Text:", "IPA:", "Gloss:" and "POS:" of a pandas
    DataFrame, in the order the keys appear in ``element``. Shorter rows are
    padded with empty strings so every row has the same number of columns.
    The tokens under 'ft' are joined with spaces and printed as "Trans:".

    ``element`` itself is left untouched: the padded table is built from
    copies. (Previously the record was modified in place, which removed 'ft'
    and every other non-table key, so printing the same record a second time
    lost its translation.)

    Args:
        element: dict mapping tier name -> list of str.

    Returns:
        None. Output goes to stdout.
    """
    tiers = ['tx', 'ph', 'ge', 'ps']
    translation = " ".join(element.get("ft", []))
    width = max(len(element.get(tier, [])) for tier in tiers)

    # Padded copy of the displayable tiers; iterating over element keeps the
    # record's own row order.
    table = {
        key: list(values) + [''] * (width - len(values))
        for key, values in element.items()
        if key in tiers
    }

    # now, create pd dataframe
    # pandas allows us to easily identify and align each index with one another
    df = pd.DataFrame(table).transpose()
    df = df.rename({'tx': 'Text:', 'ph': 'IPA:', 'ge': 'Gloss:', 'ps': 'POS:'}, axis='index')
    print(df, '\n')
    print("Trans:\t"+translation)
    print('\n' + '='*100 + '\n')


# these search functions check if the input tokens occur in one tier of element
# the tier is turned into a list of n-grams (n = number of input tokens), and
# the start index of every n-gram equal to the input tokens is returned
# then, in the main search() function, these respective indexes can be compared to see if they match up with each other

def _matching_positions(element, tier, tokens):
    """Return the start indexes at which ``tokens`` occurs in ``element[tier]``.

    A record that has no ``tier`` key is treated as having an empty tier, so
    the result is an empty list rather than a KeyError. ``tokens`` may be any
    sequence of strings; it is converted to a tuple before comparison because
    generate_ngrams yields tuples.
    """
    tokens = tuple(tokens)
    ngrams = generate_ngrams(element.get(tier, []), len(tokens))
    return [index for index, ngram in enumerate(ngrams) if ngram == tokens]


def searchByText(element, text_tokens):
    """Start indexes of ``text_tokens`` within the 'tx' tier of ``element``."""
    return _matching_positions(element, 'tx', text_tokens)


def searchByGloss(element, gloss_tokens):
    """Start indexes of ``gloss_tokens`` within the 'ge' tier of ``element``."""
    return _matching_positions(element, 'ge', gloss_tokens)


def searchByPos(element, pos_tokens):
    """Start indexes of ``pos_tokens`` within the 'ps' tier of ``element``."""
    return _matching_positions(element, 'ps', pos_tokens)


def searchByPhono(element, phono_tokens):
    """Start indexes of ``phono_tokens`` within the 'ph' tier of ``element``."""
    return _matching_positions(element, 'ph', phono_tokens)

def search(file_data, text, phono, gloss, pos):
    """Search all records for the given token sequences and write a report.

    Each of ``text``, ``phono``, ``gloss`` and ``pos`` is a whitespace-separated
    query string for the 'tx', 'ph', 'ge' and 'ps' tier respectively; an empty
    string means "do not constrain this tier". All non-empty queries must
    contain the same number of tokens. A record matches when every non-empty
    query occurs in its tier starting at the same index.

    Everything printed during the search (matching records, error messages,
    "No matches.") is written to 'output.txt' (UTF-8, overwritten on each
    call). stdout is restored and the file closed before returning, including
    when an exception is raised.

    Args:
        file_data: iterable of record dicts (tier name -> list of str).
        text, phono, gloss, pos: str queries as described above.

    Returns:
        list of the matching record dicts; an empty list when nothing matched
        or the queries were invalid.
    """
    # tokenize inputs
    text_tokens = tuple(text.split())
    gloss_tokens = tuple(gloss.split())
    pos_tokens = tuple(pos.split())
    phono_tokens = tuple(phono.split())

    # Distinct token counts among the non-empty queries.
    num_tokens = {
        len(tokens)
        for tokens in (text_tokens, gloss_tokens, pos_tokens, phono_tokens)
        if tokens
    }

    matches = []
    saved_stdout = sys.stdout
    # utf-8 so IPA characters can be written regardless of the platform default
    with open('output.txt', 'wt', encoding='utf-8') as report:
        sys.stdout = report  # writes output to a file
        try:
            if len(num_tokens) > 1:
                print("Error: number of tokens in non-empty categories are not the same.")
                return []
            if not num_tokens:
                print("Error: There should be at least one non-empty search category.")
                return []

            for element in file_data:
                indexes = []
                if text_tokens:
                    indexes.append(searchByText(element, text_tokens))
                if gloss_tokens:
                    indexes.append(searchByGloss(element, gloss_tokens))
                if pos_tokens:
                    indexes.append(searchByPos(element, pos_tokens))
                if phono_tokens:
                    indexes.append(searchByPhono(element, phono_tokens))

                # A start index shared by every queried tier is a hit; with a
                # single tier this is simply "that tier had any hit".
                shared = set(indexes[0]).intersection(*indexes[1:])
                if shared:
                    print_search_results(element)
                    matches.append(element)

            if not matches:
                print("No matches.")
            return matches
        finally:
            # Always hand stdout back so later cells print normally again.
            sys.stdout = saved_stdout


In [158]:
# this search works on multiple words, in all categories
# just make sure that the number of words in each category is the same
# unless you are intentionally leaving that category blank

# use 'run all' instead of re-running only this cell whenever you change the search parameters;
# otherwise the translation will go missing from the results

# sample outputs (filenames):
# output_1: 
# gloss: village
# pos: n

# output_2:
# gloss: LOC village
# pos: case n

# output_3:
# phono: ˀi ki
# pos: pn pn

# output_4:
# pos: pn pn

# Query strings: whitespace-separated tokens, '' leaves a category unconstrained.
text = ''
phono = ''
gloss = ''
pos = ''

# Keyword arguments make it explicit which query goes to which category.
search(file_data, text=text, phono=phono, gloss=gloss, pos=pos)

[]