## import statements

In [1]:
from MyCapytain.resolvers.cts.api import HttpCtsResolver
from MyCapytain.retrievers.cts5 import HttpCtsRetriever
from dicesapi.jupyter import NotebookPBar
import pandas as pd
import os

## global values

In [2]:
# directory that receives every file this notebook writes
output_dir = 'output'

# spreadsheet to read, and the worksheet inside it
xls_input = 'data/input.xlsx'
xls_worksheet = 'Homerocentones_linesfromHomer'

# base URL of the CTS API used to download the texts
cts_server = 'https://scaife-cts.perseus.org/api/cts'

## initialize connection to CTS server

In [3]:
# the retriever issues the HTTP requests against the CTS endpoint;
# the resolver is built on top of it
retriever = HttpCtsRetriever(cts_server)
resolver = HttpCtsResolver(retriever)

# already-downloaded texts, so that no text has to be requested twice
cts_cache = {}

In [4]:
# CTS URNs of the two poems, keyed by the abbreviation of the work
works = dict([
    ('Il.', 'urn:cts:greekLit:tlg0012.tlg001.perseus-grc2'),
    ('Od.', 'urn:cts:greekLit:tlg0012.tlg002.perseus-grc2'),
])

## build index by verse text

In [5]:
# Maps the text of a verse to the list of loci ("<work> <book>.<line>")
# at which exactly that text occurs.
index_text = dict()

for work, urn in works.items():
    # request each poem only once; afterwards reuse the cached XML tree
    if urn not in cts_cache:
        cts_cache[urn] = resolver.getTextualNode(urn).xml
    xml = cts_cache[urn]

    # empty every <note> nested inside an <l> element so its content does not
    # end up in the verse text; keep_tail=True preserves the text after it
    for note in xml.findall('.//l//note', namespaces=xml.nsmap):
        note.clear(keep_tail=True)

    # book-level divs are looked up as subtype "Book" for the Iliad and
    # "book" for the Odyssey
    book_pattern = (
        './/div[@subtype="Book"]' if work == 'Il.'
        else './/div[@subtype="book"]'
    )

    for book in xml.findall(book_pattern, namespaces=xml.nsmap):
        book_no = book.get('n')

        for verse in book.findall('.//l', namespaces=xml.nsmap):
            verse_no = verse.get('n')
            # full text of the line, including text of nested elements
            text = ''.join(verse.itertext()).strip()

            index_text.setdefault(text, []).append(f'{work} {book_no}.{verse_no}')

## re-index by locus

In [6]:
# Maps each locus whose verse text occurs more than once to the list of the
# *other* loci sharing that text (insertion order, no duplicates).
index_line = dict()

# make sure the output directory exists before writing into it
os.makedirs(output_dir, exist_ok=True)

# The context manager closes the report even if the loop raises part-way.
# The encoding is fixed to UTF-8 so that the verse text (taken from the
# greekLit editions, hence presumably non-ASCII) is written the same way on
# every platform instead of depending on the locale default.
with open(os.path.join(output_dir, 'repeated_lines.txt'), 'w', encoding='utf-8') as f:
    for key, refs in index_text.items():
        # a text attested at a single locus is not a repetition
        if len(refs) == 1:
            continue

        for ref1 in refs:
            # keep a list that may already exist for this locus
            others = index_line.setdefault(ref1, [])

            for ref2 in refs:
                if ref1 != ref2 and ref2 not in others:
                    others.append(ref2)

        # report entry: the verse text, then all loci where it occurs
        f.write(key + '\n')
        f.write(', '.join(refs) + '\n\n')

## read the input spreadsheet

In [7]:
# Read the first six columns of the configured worksheet.
# keep_default_na=False stops pandas from turning its default NA markers
# into NaN, so such cells keep the value that was read.
df = pd.read_excel(
    xls_input,
    sheet_name=xls_worksheet,
    usecols=[0, 1, 2, 3, 4, 5],
    keep_default_na=False,
)

## add rows for repeated lines

In [8]:
# rows to append: one per repetition of a locus found in the spreadsheet
new_rows = []
# positions (0-based row numbers) of spreadsheet rows that have repetitions
has_rep = []

# itertuples() puts the index at position 0, so positions 1-6 are the six
# columns that were read from the worksheet
for pos, record in enumerate(df.itertuples()):
    # same "<work> <book>.<line>" format as the keys of index_line
    # (assumes columns 3-5 hold work, book and line -- TODO confirm)
    locus = f'{record[3]} {record[4]}.{record[5]}'

    if locus not in index_line:
        continue

    has_rep.append(pos)

    for parallel in index_line[locus]:
        # split "<work> <book>.<line>" back into its three parts
        par_work, par_loc = parallel.split()
        par_book, par_line = par_loc.split('.')

        # copy the non-Homeric fields from the source row, swap in the
        # parallel locus, and mark the row as a repetition
        new_rows.append({
            'eud_seq': record[1],
            'eud_line': record[2],
            'hom_work': par_work,
            'hom_book': par_book,
            'hom_line': par_line,
            'part': record[6],
            'rep': True,
        })

## add new column marking repetitions

In [9]:
# start with every row unflagged ...
df['rep'] = False
# ... then flag the rows collected in has_rep
df.loc[has_rep, 'rep'] = True

## add new rows to data frame

In [10]:
# Append the generated rows to the existing ones.
# ignore_index=True renumbers the result 0..n-1. Without it the appended
# rows keep their own labels starting at 0 again, which leaves duplicate
# index labels and makes any later label-based lookup ambiguous.
df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)

## save new data frame

In [11]:
# write the combined table as CSV into the output directory;
# index=False leaves the row index out of the file
df.to_csv(
    os.path.join(output_dir, 'input_with_repeated_lines.csv'),
    index=False,
)