# Correcting Wordlists

This notebook corrects the data in the currently available wordlists for Greek, Aramaic, Hebrew, and Latin to be more fully normalized and to correctly link back to the wordIDs in question.

Further, for the case of Greek, this notebook collapses cases of variant accentuation into a single row of data. It is not desirable for καί and καὶ to be treated as separate grammatical forms.

In [1]:
import csv
import re
import unicodedata
from lxml import etree
from greek_accentuation.accentuation import *
from greek_accentuation.syllabify import *
from greek_accentuation.characters import *

In [2]:
grcCSV = open('csvs_old/Tagged GreekMasterNew - GreekMasterNew.csv', newline='')
grc = list(csv.reader(grcCSV))
grcCSV.close()

latCSV = open('csvs_old/Tagged LatinMasterNew - NewFormat.csv', newline='')
lat = list(csv.reader(latCSV))
latCSV.close()

hebCSV = open('csvs_old/Tagged HebrewMasterNew - HebrewTestOutput.csv', newline='')
heb = list(csv.reader(hebCSV))
hebCSV.close()

arcCSV = open('csvs_old/Tagged AramaicMasterNew - aramaic-master.csv', newline='')
arc = list(csv.reader(arcCSV))
arcCSV.close()

langCSVs = {'grc': grc, 'lat': lat, 'heb': heb, 'arc': arc}

In [3]:
#from greek_accentuation.characters import *
#import unicodedata

def grave_to_accute(word):
    characters = []
    for ch in word:
        if 'VARIA' in unicodedata.name(ch):
            characters.append(add_diacritic(strip_accents(ch), ACUTE))
        else:
            characters.append(ch)
    normalizedWord = ''.join(characters)
    
    return normalizedWord

In [4]:
#import unicodedata

def oxia_to_tonos(word):
    characters = []
    for ch in word:
        if 'OXIA' in unicodedata.name(ch):
            characters.append(unicodedata.normalize('NFC',ch))
        else:
            characters.append(ch)
    normalizedWord = ''.join(characters)
    
    return normalizedWord

In [5]:
#import unicodedata

def remove_diaeresis(word):
    convDict = {
        'ΐ': 'ί', #GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
        'Ϊ': 'Ι', #GREEK CAPITAL LETTER IOTA WITH DIALYTIKA
        'Ϋ': 'Υ', #GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA
        'ΰ': 'ύ', #GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
        'ϊ': 'ι', #GREEK SMALL LETTER IOTA WITH DIALYTIKA
        'ϋ': 'υ', #GREEK SMALL LETTER UPSILON WITH DIALYTIKA
        'ῒ': 'ὶ', #GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA
        'ΐ': 'ί', #GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
        'ῗ': 'ῖ', #GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI
        'ῢ': 'ὺ', #GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA
        'ΰ': 'ύ', #GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
        'ῧ': 'ῦ', #GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI
    }
    characters = []
    for ch in word:
        if 'DIALYTIKA' in unicodedata.name(ch):
            characters.append(convDict[ch])
        else:
            characters.append(ch)
            
    return "".join(characters)

In [6]:
#import unicodedata

def check_for_diacritics(word):
    diacritics = []
    for ch in word:
        chName = unicodedata.name(ch)
        if 'DIALYTIKA' in chName:
            diacritics.append('diaeresis')
        if 'TONOS' in chName or 'OXIA' in chName or 'VARIA' in chName or 'PERISPOMENI' in chName:
            diacritics.append('accent')
        if 'DASIA' in chName or 'PSILI' in chName:
            diacritics.append('breathing')
    diacritics = list(set(diacritics))
    
    return diacritics

In [7]:
#from greek_accentuation.syllabify import *
#import unicodedata

def check_for_breathing(word):
    breathing = False
    if is_vowel(word[0]) == True:
        syllables = syllabify(word)
        for ch in syllables[0]:
            chName = unicodedata.name(ch)
            if 'DASIA' in chName or 'PSILI' in chName:
                breathing = True
    return breathing

In [8]:
#import re

def handle_lunate_sigmas(word):
    while re.search(r"ϲ",word.lower()):
        word = re.sub(r"Ϲ","Σ",word)
        word = re.sub(r"(.*)ϲ$",'\\1ς',word)
        word = re.sub(r"(.*)ϲ(.+)",'\\1σ\\2',word)
        
    return word

In [9]:
#import re

def handle_apostrophes(word):
    # APOSTROPHE
    # MODIFIER LETTER APOSTROPHE
    # GREEK TONOS
    # GREEK KORONIS
    # GREEK PSILI
    # GREEK OXIA
    # RIGHT SINGLE QUOTATION MARK
    # GREEK NUMERAL SIGN
    
    # all replaced with RIGHT SINGLE QUOTATION MARK
    word = re.sub(r"['ʼ΄᾽᾿´’ʹ]","’", word)
    
    return word

In [10]:
def only_greek_characters(word):
    codeblocks = {'grc1': [int('0370',16),int('03FF',16)],
                  'grc2': [int('1F00',16),int('1FFF',16)]}
    characters = []
    for ch in word:
        if ch == '’':
            characters.append(ch)
        # Greek and Coptic
        elif codeblocks['grc1'][0] <= ord(ch) <= codeblocks['grc1'][1]:
            characters.append(ch)
        # Greek Extended
        elif codeblocks['grc2'][0] <= ord(ch) <= codeblocks['grc2'][1]:
            characters.append(ch)
        
    return "".join(characters)

In [11]:
#import re

def only_latin_characters(word):
    word = "".join([ch for ch in word if re.match(r"[a-zA-Z]", ch)])
    return word

In [12]:
#import re

def only_hebrew_characters(word):
    word = "".join([ch for ch in word if re.match(r"[֑-ת]", ch) or ch == '_'])
    return word

In [13]:
#import re

def unpointed_hebrew(word):
    word = "".join([ch for ch in word if re.match(r"[א-ת]", ch) or ch == '_'])
    return word

In [14]:
#import unicodedata

def count_accents(word):
    count = 0
    for ch in word:
        chName = unicodedata.name(ch)
        if 'TONOS' in chName or 'OXIA' in chName or 'VARIA' in chName or 'PERISPOMENI' in chName:
            count+=1
            
    return count

In [15]:
def normalize_lang(word, pos=None, lang=None, point=False):
    if lang == None:
        print('normalize_lang: No language declared.')
        exit()
    # strip whitespace
    word = word.strip()
    # combining characters
    word = unicodedata.normalize('NFC', word)
    # handle apostrophes
    word = handle_apostrophes(word)
    # handle upper/lowercase
    if lang == 'grc' or lang == 'lat':
        if pos != None:
            if pos.strip().upper() == 'PROPN':
                word = word.title()
            else:
                word = word.lower()
                
    # LATIN
    if lang == 'lat':
        # strip non-Latin characters
        word = only_latin_characters(word)
    # GREEK
    if lang == 'grc':
        # Latin o to Greek omicron
        word = word.replace('o','ο')
        # strip non-Greek characters
        word = only_greek_characters(word)
        # handle lunate sigmas
        word = handle_lunate_sigmas(word)
        # handle diacritics
        diacritics = check_for_diacritics(word)
        if 'diaeresis' in diacritics:
            word = remove_diaeresis(word)
        if 'accent' in diacritics:
            word = oxia_to_tonos(grave_to_accute(word))
    # HEBREW / ARAMAIC
    if lang == 'heb' or lang == 'arc':
        if point == True:
            word = only_hebrew_characters(word)
        else:
            word = unpointed_hebrew(word)
    
    return word

In [16]:
def get_occur_list(occurString):
    occurs = occurString.split()
    cleanedOccurs = []
    for occur in occurs:
        fileID, count = occur.split('-')
        count = count.strip(',')
        fileID = fileID.split('.')[0]
        cleanedOccurs.append(f"{fileID}-{count}")
    cleanedOccurs = list(set(cleanedOccurs))
    return cleanedOccurs

In [17]:
def normalize_row(row, lang=None):
    if lang == None:
        print('normalize_row: No language declared.')
        exit()

    word = normalize_lang(row[0], pos=row[3], lang=lang)
    occurr = ", ".join(get_occur_list(row[1]))
    lemma = normalize_lang(row[2], pos=row[3], lang=lang, point=True)
    normID = f"w-{word}"
    
    normalizedRow = [word, occurr, lemma, row[3].strip(), normID]
    normalizedRow = normalizedRow + [item.strip() for item in row[5:]]

    return normalizedRow

# ===============
# > The Process
# ===============

The following is the process for Greek:

1. Strip whitespace.
2. Handle combining characters. Combine if possible, delete if not (step 4).
3. Normalize apostrophes / number signs (end of word only).
  - Shouldn't be any numbers since we're only looking at words.
4. Strip non-Greek characters.
  - To be robust about this, should check that all characters are within the relevant Unicode code blocks (Greek and Coptic, Greek Extended, ~~Combining Diacritical Marks~~ allowed apostrophe).
5. Check POS. If PROPN, titlecase. Otherwise, lowercase.
6. Handle lunate sigmas.
7. Remove diaereses.
8. Convert grave accents to acute accents.
9. Handle acute accent duplication (standardize oxia to tonos).

If exact match after this, great! Merge the wordID occurrences. If not, pick the wordform with the most occurrences and use that as the default form.

And so in the future, when matching segmented wods to these CSVs, they will be normalized following the same steps as above (minus the PROPN check) and compared to the sheet. The researcher will then select which entry is correct, or input a new entry.allowed

Other languages follow similar steps, without the Greek particulars. For Hebrew and Aramaic, pointing is removed for the Normalized wordform column and the NormalizedID, but maintained for the Lemma.

In [18]:
newCSVs = {}

for lang in langCSVs:
    
    seen = {}

    for row in langCSVs[lang]:
        # skip header rows
        if row[0].strip() == 'Normalized' or row[0].strip() == '':
            continue
            
        # normalize data
        normalizedRow = normalize_row(row, lang=lang)

        word = normalizedRow[0]
        lemma = normalizedRow[2]

        noAccent = "".join([unicodedata.normalize("NFD", ch)[0].lower() for ch in word])
        newData = " | ".join([noAccent, lemma, normalizedRow[3], normalizedRow[5]])
        if newData not in seen:
            seen[newData] = {}
        if word not in seen[newData]:
            seen[newData][word] = []

        # condense any duplicates if they differ ONLY in occurrences
        noOccRow = [word, lemma] + [normalizedRow[3], normalizedRow[5]]
        matchBool = False
        for i, seenRow in enumerate(seen[newData][word]):
            noOccSeenRow = [seenRow[0], seenRow[2], seenRow[3], seenRow[5]]
            if noOccRow == noOccSeenRow:
                matchBool = True
                # combine occurrences
                oldOcc = seen[newData][word][i][1]
                newOcc = row[1]
                seen[newData][word][i][1] = ", ".join(get_occur_list(f"{oldOcc} {newOcc}"))
    #             print(seen[newData][word][i])

        if matchBool == False:
            seen[newData][word].append(normalizedRow)


    # build new CSV
    newCSV = []
    for noAccent in seen:
        for wordform in seen[noAccent]:
            for row in seen[noAccent][wordform]:
                newCSV.append(row)
                
    newCSVs[lang] = newCSV

# Check against XML. Correct Occurrences.

This process is drawing from the code in the notebook validate_create-new-sheets.

In [19]:
workdir = "/data/Data/Projects/iip/github/iip-texts/epidoc-files/"
parser = etree.XMLParser(ns_clean=True, remove_blank_text=False)

wordsWithXML = {}
    
allWordOccurr = {'grc': [], 'lat': [], 'heb': [], 'arc': []}
allFileOccurr = {'grc': [], 'lat': [], 'heb': [], 'arc': []}

for lang in newCSVs:

    for row in newCSVs[lang]:
        if row[0].strip() == 'Normalized' or row[0].strip() == '':
            pass  # This is to skip header rows
        else:
            occurrs = row[1].split(", ")
            for occurr in occurrs:
                allWordOccurr[lang].append(occurr)
                allFileOccurr[lang].append(occurr.split('-')[0])

    print(f"\n{lang.upper()}\n===")
    print(len(allWordOccurr[lang]),"wordIDs, with duplicates.")
    allWordOccurr[lang] = list(set(allWordOccurr[lang]))
    allFileOccurr[lang] = list(set(allFileOccurr[lang]))
    print(len(allWordOccurr[lang]),"unique wordIDs.")
    print(len(allFileOccurr[lang]),"unique files.")
    
    for fileID in allFileOccurr[lang]:
        fileID = fileID.split('.')[0]
        if fileID not in wordsWithXML:
            wordsWithXML[fileID] = {}
        file = fileID + '.xml'
        try:
            xmlText = etree.parse(workdir+file, parser)

            nsmap = {'tei': "http://www.tei-c.org/ns/1.0"}
            ns = {'tei': "http://www.tei-c.org/ns/1.0"}
            TEI_NS = "{http://www.tei-c.org/ns/1.0}"
            XML_NS = "{http://www.w3.org/XML/1998/namespace}"

            segmented_words = xmlText.findall(".//tei:div[@subtype=\'transcription_segmented\']/tei:p/*", namespaces=nsmap)

            for segmented in segmented_words:
                #print(etree.tostring(segmented))
                wordID = segmented.attrib['{http://www.w3.org/XML/1998/namespace}id']
                if wordID.split('-')[0] != fileID:
                    print('Error: mismatch between',fileID,'and',wordID)

                wordElem = etree.tostring(segmented, encoding='unicode').strip()

                if wordID in wordsWithXML[fileID]:
                    pass
                else:
                    wordsWithXML[fileID][wordID] = wordElem
        except:
            print('Error for file',file)


GRC
===
91978 wordIDs, with duplicates.
17124 unique wordIDs.
2049 unique files.
Error for file hmti0003.xml
Error for file jeru0196.xml
Error for file jeru0237.xml
Error for file caes0412.xml
Error for file gers0001.xml
Error for file hmti0004.xml
Error for file halu0001.xml
Error for file anri0001.xml
Error for file rehn0001.xml
Error for file dora0002.xml
Error for file hmti0005.xml
Error for file knah0002.xml

LAT
===
2213 wordIDs, with duplicates.
1045 unique wordIDs.
122 unique files.

HEB
===
1624 wordIDs, with duplicates.
1624 unique wordIDs.
369 unique files.
Error for file masa0037.xml
Error for file masa0039.xml
Error for file masa0038.xml
Error for file seph0100.xml
Error for file jeru0305.xml
Error for file jent0006.xml

ARC
===
7951 wordIDs, with duplicates.
7951 unique wordIDs.
1334 unique files.


In [20]:
# Function to remove element without losing tail text, if present.
# Function either attaches tail text of removed element to tail
# text of previous element, if extant, or adds it to the text of
# the parent element.
# From https://github.com/OaklandPeters/til/blob/master/til/python/lxml-and-tail-text.md
def remove_element(elem):
    parent = elem.getparent()
    if elem.tail:
        prev = elem.getprevious()
        if prev is not None:
            if prev.tail:
                prev.tail += elem.tail
            else:
                prev.tail = elem.tail
        else:
            if parent.text:
                parent.text += elem.tail
            else:
                parent.text = elem.tail
    parent.remove(elem)

In [21]:
wordsFromXML = dict()
chars = []

for fileID in wordsWithXML:
    wordsFromXML[fileID] = dict()
    for tokenID in wordsWithXML[fileID]:
        xmlText = etree.fromstring(wordsWithXML[fileID][tokenID], parser)
        if xmlText.tag == '{http://www.tei-c.org/ns/1.0}w':
            
            for sic in xmlText.xpath(".//tei:choice//tei:sic", namespaces=nsmap):
                remove_element(sic)
            for orig in xmlText.xpath(".//tei:choice//tei:orig", namespaces=nsmap):
                remove_element(orig)
            for am in xmlText.xpath(".//tei:expan//tei:am", namespaces=nsmap):
                remove_element(am)
            for surplus in xmlText.xpath(".//tei:surplus", namespaces=nsmap):
                remove_element(surplus)
            for g in xmlText.xpath(".//tei:g", namespaces=nsmap):
                remove_element(g)
                
            # We do need something to handle multiple <unclear>s in a <choice>.
            # Not handled yet. Maybe duplicate the wordID with the two alternatives?
            
            grcBool = False
            xmlLang = xmlText.attrib['{http://www.w3.org/XML/1998/namespace}lang']
            if xmlLang == 'grc':
                grcBool = True
            # note, nothing in XML to indicate proper noun. so when checking, will
            # have to use .lower() on the words from both the CSV and the XML
            
            word = ''.join(xmlText.itertext())
            if xmlLang == 'grc':
                word = normalize_lang(word, lang='grc')
            elif 'la' in xmlLang:
                word = normalize_lang(word, lang='lat')
            elif 'arc' in xmlLang:
                word = normalize_lang(word, lang='arc')
            elif 'he' in xmlLang:
                word = normalize_lang(word, lang='heb')
            
            word2 = ''
            for char in word:
                # This is to remove control characters which were appearing in the chars list
                if unicodedata.category(char) == 'Cc':
                    pass
                else:
                    word2 = word2+char
                    if char not in chars:
                        chars.append(char)
                        
            wordsFromXML[fileID][tokenID] = word2

In [22]:
# new CSVs with corrections for what wordIDs are actual occurrences,
# per the XML
validatedCSVs = {}
# keeping track of wordIDs in this new CSV
validatedWordIDs = []

for lang in newCSVs:
    validatedCSV = []

    for row in newCSVs[lang]:
        occurrs = row[1].split(', ')
        correctOccurrs = []

        for occurr in occurrs:
            fileID, count = occurr.split('-')

            if fileID not in wordsFromXML:
                continue
            if occurr not in wordsFromXML[fileID]:
                continue

            wordToCompare = wordsFromXML[fileID][occurr].lower()

            if row[0].lower() == wordToCompare:
                correctOccurrs.append(occurr)
                if occurr not in validatedWordIDs:
                    validatedWordIDs.append(occurr)

        newRow = [row[0], ", ".join(correctOccurrs)] + row[2:]
        validatedCSV.append(newRow)
        
    validatedCSVs[lang] = validatedCSV

In [23]:
orphanedWordIDs = []

for lang in allWordOccurr:
    for i in allWordOccurr[lang]:
        if i not in validatedWordIDs:
            orphanedWordIDs.append(i)

orphanedWordIDs = list(set(orphanedWordIDs))
        
print(len(orphanedWordIDs))
print(orphanedWordIDs)

1826
['haif0042-3', 'zoor0247-3', 'huqo0001-21', 'jord0001-176', 'jaff0057-6', 'zoor0243-14', 'jord0001-52', 'jord0001-420', 'jeru0449-14', 'zoor0035-9', 'zoor0331-10', 'khus0003-2', 'zoor0373-10', 'emma0002-1', 'jord0001-322', 'hmti0004-4', 'jord0001-294', 'beth0105-4', 'jord0001-97', 'caes0267-2', 'jord0001-375', 'zoor0270-1', 'zoor0355-7', 'jord0001-272', 'zoor0104-1', 'jord0001-343', 'zoor0337-18', 'jord0001-178', 'zoor0245-6', 'jord0001-53', 'zoor0294-4', 'zoor0202-17', 'zoor0262-26', 'zoor0368-12', 'masa0430-1', 'beth0263-2', 'jord0001-494', 'zoor0001-5', 'masa0416-3', 'zoor0339-18', 'jord0001-398', 'jeru0305-8', 'mare0301-1', 'jord0001-88', 'zoor0332-2', 'jord0001-470', 'jeru0357-5', 'jord0001-95', 'masa0549-1', 'gers0001-7', 'jord0001-487', 'rafi0008-3', 'hefz0001-62', 'hamm0002-11', 'bshe0024-73', 'beth0246-2', 'emma0002-3', 'zoor0115-1', 'jord0001-241', 'idum0303-3', 'jeru0492-29', 'masa0696-3', 'birs0005-6', 'bshe0131-1', 'jord0001-483', 'shik0003-6', 'caes0256-4', 'qumr0001

Let's see if we can condense the validated CSV down to eliminate multiple entries before trying to match the orphaned wordIDs back to the spreadsheet.

# For duplicates, select option with most occurrences

In [24]:
validatedCSVs2 = {}

for lang in validatedCSVs:
    seen = {}

    for row in validatedCSVs[lang]:
        # normalize data
        word = row[0]
        lemma = row[2]

        noAccent = "".join([unicodedata.normalize("NFD", ch)[0].lower() for ch in word])
        newData = " | ".join([noAccent, lemma, row[3].strip(), row[5].strip()])
        if newData not in seen:
            seen[newData] = {}
        if word not in seen[newData]:
            seen[newData][word] = []

        seen[newData][word].append(row)

    # build new CSV
    validatedCSV2 = []
    for noAccent in seen:
        multipleBool = False
        multipleCount = 0

        selection_options = []

        for wordform in seen[noAccent]:
            if len(seen[noAccent][wordform]) > 1:
                multipleBool = True
                print(seen[noAccent][wordform])
                print('')
                # This should be false, per current dictionary construction
            for i in seen[noAccent][wordform]:
                multipleCount+=1
        if len(seen[noAccent]) > 1:
            multipleBool = True

        # Handle the easy case: only one option
        if multipleBool == False:
            for wordform in seen[noAccent]:
                for row in seen[noAccent][wordform]:
    #                 print(row)
                    validatedCSV2.append(row)

        elif multipleBool == True:
    #         print(seen[noAccent])
    #         print('\n')

            for wordform in seen[noAccent]:
                wordform_count = len(seen[noAccent][wordform][0][1].split(', '))
                # Don't want to select option with multiple accents, so
                # set its count to zero
                if count_accents(wordform) > 1:
                    wordform_count = 0

                row = seen[noAccent][wordform][0]
                selection_options.append([wordform_count, row])

            selection_options = sorted(selection_options, reverse=True)
            keep = selection_options[0][1]
            keepOccurr = keep[1]

            # Condense other occurrences
            otherOccurr = " ".join([pair[1][1] for pair in selection_options[1:]])
            allOccurr = ", ".join(get_occur_list(f"{keepOccurr} {otherOccurr}"))
            keep[1] = allOccurr
    #         print(selection_options)
    #         print('')
            if selection_options[0][1][6:] != selection_options[1][1][6:]:
                print(selection_options)

            validatedCSV2.append(keep)
    
    validatedCSVs2[lang] = validatedCSV2

[[10, ['ἀθανατος', 'zoor0244-29, zoor0208-21, zoor0207-35, zoor0147-22, zoor0304-14, zoor0233-29, zoor0290-9, zoor0290-6, zoor0260-30, zoor0217-22, zoor0248-27, zoor0146-17', 'ἀθανατός', 'ADJ', 'w-ἀθανατος', 'Case=Nom|Degree=Pos|Gender=Masc|Number=Sing', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'y', 'immortality', '', '', '']], [2, ['ἀθἀνατος', 'zoor0147-22, zoor0146-17', 'ἀθανατός', 'ADJ', 'w-ἀθἀνατος', 'Case=Nom|Degree=Pos|Gender=Masc|Number=Sing', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'y', 'immortal', '', '', '']]]
[[10, ['Ἀλφίου', 'zoor0172-3, zoor0002-3, zoor0100-3, zoor0026-3, zoor0056-3, zoor0287-3, zoor0341-7, birs0003-3, zoor0248-8, zoor0050-3, zoor0170-3', 'Ἄλφιος', 'PROPN', 'w-Ἀλφίου', 'Case=Gen|Gender=Masc|Number=Sing', '', '', 'y', 'm', 'Jewish', 'Alphios', 'http://clas-lgpn2.classics.ox.ac.uk/name/%E1%BF%8E%CE%91%CE%BB%CF%86%CE%B

# Re-matching some of the orphaned wordIDs

If there is only one possible match for a wordID in the new validated CSV, then it can be added back in.

In [25]:
oneMatch = []
multMatches = []
noMatches = []
notInXML = []

for wordID in orphanedWordIDs:
    fileID = wordID.split('-')[0]
    if fileID not in wordsFromXML:
        notInXML.append(wordID)
        continue
    if wordID not in wordsFromXML[fileID]:
        notInXML.append(wordID)
        continue
    #print(wordID, '\t', wordsFromXML[fileID][wordID])
    wordFromXML = wordsFromXML[fileID][wordID]
    matchCount = 0
    noAccentWordFromXML = "".join([unicodedata.normalize("NFD", ch)[0].lower() for ch in wordFromXML])
        
    for lang in validatedCSVs2:
        for i, row in enumerate(validatedCSVs2[lang]):
            noAccentWordFromCSV = "".join([unicodedata.normalize("NFD", ch)[0].lower() for ch in row[0]])
            if wordFromXML == row[0]:
                matchCount+=1
            elif wordFromXML.lower() == row[0].lower():
                matchCount+=1
            elif noAccentWordFromXML.lower() == noAccentWordFromCSV.lower():
                matchCount+=1

        if matchCount == 0:
            noMatches.append(wordID)
        elif matchCount == 1:
            oneMatch.append(wordID)
        elif matchCount > 1:
            multMatches.append(wordID)
        
oneMatch = sorted(list(set(oneMatch)))
multMatches = sorted(list(set(multMatches)))
noMatches = sorted(list(set(noMatches)))
notInXML = sorted(list(set(notInXML)))
        
print(f"NOT IN XML:\n {len(notInXML)} wordIDs.")
print("\nNO MATCHES:\n", noMatches)
print("\nONE MATCH:\n", oneMatch)
print("\nMULTIPLE MATCHES:\n", multMatches)

NOT IN XML:
 1154 wordIDs.

NO MATCHES:
 ['akld0019-1', 'akld0024-3', 'apol0100-4', 'aris0001-3', 'balf0001-2', 'balf0001-4', 'beth0127-34', 'beth0174-10', 'beth0174-11', 'beth0178-2', 'beth0195-1', 'beth0236-2', 'beth0237-32', 'beth0237-33', 'beth0237-34', 'beth0238-7', 'beth0238-8', 'beth0239-10', 'beth0239-11', 'beth0239-12', 'beth0239-13', 'beth0239-8', 'beth0239-9', 'beth0246-10', 'beth0246-11', 'beth0246-12', 'beth0246-2', 'beth0246-3', 'beth0246-4', 'beth0246-5', 'beth0246-6', 'beth0246-7', 'beth0246-8', 'beth0246-9', 'beth0248-10', 'beth0248-11', 'beth0248-12', 'beth0248-13', 'beth0248-14', 'beth0248-15', 'beth0248-16', 'beth0248-17', 'beth0248-3', 'beth0248-4', 'beth0248-5', 'beth0248-6', 'beth0248-7', 'beth0248-8', 'beth0248-9', 'beth0251-1', 'beth0251-2', 'beth0261-1', 'beth0261-2', 'beth0261-3', 'beth0261-4', 'beth0261-5', 'beth0263-2', 'beth0265-1', 'beth0266-1', 'beth0271-1', 'beth0271-2', 'beth0272-1', 'beth0276-1', 'bguv0102-5', 'birs0008-8', 'bshe0003-7', 'bshe0024-66'

In [26]:
for wordID in oneMatch:
    fileID = wordID.split('-')[0]
    wordFromXML = wordsFromXML[fileID][wordID]
    noAccentWordFromXML = "".join([unicodedata.normalize("NFD", ch)[0].lower() for ch in wordFromXML])
        
    for lang in validatedCSVs2:
        for i, row in enumerate(validatedCSVs2[lang]):
            noAccentWordFromCSV = "".join([unicodedata.normalize("NFD", ch)[0].lower() for ch in row[0]])
            if wordFromXML == row[0]:
                validatedCSVs2[lang][i][1] = ", ".join(get_occur_list(f"{validatedCSVs2[lang][i][1]} {wordID}"))
            elif wordFromXML.lower() == row[0].lower():
                validatedCSVs2[lang][i][1] = ", ".join(get_occur_list(f"{validatedCSVs2[lang][i][1]} {wordID}"))
            elif noAccentWordFromXML.lower() == noAccentWordFromCSV.lower():
                validatedCSVs2[lang][i][1] = ", ".join(get_occur_list(f"{validatedCSVs2[lang][i][1]} {wordID}"))

In [27]:
for lang in validatedCSVs2:
    with open(f'csvs_new/corr_{lang}-validated.csv','w') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerows(validatedCSVs2[lang])