## Import statements

In [1]:
from MyCapytain.resolvers.cts.api import HttpCtsResolver
from MyCapytain.retrievers.cts5 import HttpCtsRetriever
from dicesapi.jupyter import NotebookPBar
from cltk import NLP
import pandas as pd
import os
import re
import collections
import pickle

## Global values

In [2]:
# CTS endpoint that the texts are requested from
cts_server = 'https://scaife-cts.perseus.org/api/cts'

# input document: the spreadsheet to read, and the worksheet within it
xls_input = 'data/input.xlsx'
xls_worksheet = 'Homerocentones_linesfromHomer'

# output directories: CSV results go to output_dir, cached intermediate
#   results go to cache_dir
output_dir = 'output'
cache_dir = 'temp'

# make sure both directories exist, so that later cells writing CSVs or
#   the lemmatization cache don't fail on a fresh checkout
for directory in (output_dir, cache_dir):
    os.makedirs(directory, exist_ok=True)

## Setup NLP

- The default NLP pipeline includes word embeddings and named entity recognition, which we don't need here. Cutting them out shortens the processing time significantly.

In [3]:
# build the CLTK pipeline for language code 'grc', without the startup banner
cltk_nlp = NLP('grc', suppress_banner=True)
# keep only the first two processes in the pipeline and drop the rest
#   NOTE(review): presumably the dropped ones are the embeddings and NER
#   steps mentioned above -- confirm against the CLTK default pipeline order
cltk_nlp.pipeline.processes = cltk_nlp.pipeline.processes[:2]

## Initialize connection to CTS server

- use MyCapytain to talk to the Perseus CTS server and request the text of the Iliad and Odyssey

In [4]:
# resolver that talks to the CTS endpoint through an HTTP retriever
retriever = HttpCtsRetriever(cts_server)
resolver = HttpCtsResolver(retriever)

# texts already downloaded, so that nothing is requested twice
cts_cache = {}

In [5]:
# the two poems: abbreviation used in loci -> CTS urn of the edition
works = dict([
    ('Il.', 'urn:cts:greekLit:tlg0012.tlg001.perseus-grc2'),
    ('Od.', 'urn:cts:greekLit:tlg0012.tlg002.perseus-grc2'),
])

## Build index by verse text

- the first step to finding repeated lines is creating an index by verse text
- this dictionary will have the verse lines as keys, values will be loci where that text occurs
- for now, the keys include punctuation, whitespace, and anything else that was in the Perseus text
- deletes all intra-line `<note>` entities

In [6]:
# verse text -> list of loci where exactly that text occurs
index_text = {}

for work, urn in works.items():

    # reuse the xml object if this urn was downloaded before;
    #   otherwise fetch the whole text and remember it
    if urn in cts_cache:
        xml = cts_cache[urn]
    else:
        xml = cts_cache[urn] = resolver.getTextualNode(urn).xml

    # empty out notes inside verse lines (they hold editorial text),
    #   keeping whatever verse text follows each note
    for note in xml.findall('.//l//note', namespaces=xml.nsmap):
        note.clear(keep_tail=True)

    # the tag marking book divisions is slightly different in the two docs
    book_pattern = './/div[@subtype="Book"]' if work == 'Il.' else './/div[@subtype="book"]'

    for book in xml.findall(book_pattern, namespaces=xml.nsmap):
        bn = book.get('n')

        for line in book.findall('.//l', namespaces=xml.nsmap):
            ln = line.get('n')

            # all text inside the line element, trimmed at both ends
            text = ''.join(line.itertext()).strip()

            # file this locus under the text of its line
            index_text.setdefault(text, []).append(f'{work} {bn}.{ln}')

## Index by normalized text

- There are some lines that have identical Greek but different punctuation, whitespace, etc.
- Here we go through the distinct verse lines (keys to our first index) and try to normalize by removing everything except Greek letters and apostrophes.
- I hand-checked all distinct characters to get this list.
- I suspect it would be worth normalizing grave accents to acute; I haven't checked to see how many, or which exact tokens would be affected.

In [7]:
# normalized text -> list of loci
index_normalized = {}

# runs of these characters are collapsed to a single space
punct = re.compile(r'[ ,·.;\n—]+')

# normalize each distinct verse text and pool the loci of texts that
#   become identical after normalization
for text, loci in index_text.items():
    normalized = punct.sub(' ', text).strip()
    index_normalized.setdefault(normalized, []).extend(loci)

## Build a normalized line-lookup

- A forwards index whose keys are loci and values are normalized lines. Useful for checking results later, when all we have are loci.

In [8]:
# locus -> normalized text, obtained by inverting the normalized index
line_lookup = {
    loc: normalized
    for normalized, loci in index_normalized.items()
    for loc in loci
}

## Lemmatize lines

**⚠️ This step is a bit slow**
- I usually save the lemmatized results to a pickle, to save redoing it.
- Fully lemmatizing the Iliad and the Odyssey takes about 90 minutes on my 2014 MacBook Pro.

In [9]:
# cache file to use
file_cached = os.path.join(cache_dir, 'lemmatized_lines_norm.pickle')

# check for the cache; if it exists, use it
# NOTE: unpickling can execute arbitrary code -- only load a cache file
#   that this notebook wrote itself
if os.path.exists(file_cached):
    with open(file_cached, 'rb') as f:
        cache_lemmatized = pickle.load(f)
    
    print(f'loaded {len(cache_lemmatized)} records from {file_cached}.')

# otherwise, start from scratch
else:
    # make sure the cache directory exists *before* the slow part, so the
    #   results can't be lost to a missing directory at save time
    os.makedirs(cache_dir, exist_ok=True)
    
    cache_lemmatized = {}
    
    # progress bar for long process
    pbar = NotebookPBar(max=len(index_normalized))

    # iterate over normalized lines (keys to normalized index) and lemmatize;
    #   each value is whatever the CLTK pipeline returns for that line
    for i,l in enumerate(index_normalized):
        cache_lemmatized[l] = cltk_nlp(l)
        pbar.update(i)
    
    # write the cache for next time
    print(f'saving {len(cache_lemmatized)} records to {file_cached}.')
        
    with open(file_cached, 'wb') as f:
        pickle.dump(cache_lemmatized, f)

loaded 25130 records from temp/lemmatized_lines_norm.pickle.


## Index by lemmatized

- Now further consolidate our index by lemmatizing the normalized strings
- Each key to this new index will be a sequence of dictionary headwords
- All inflected forms of the same lemma will be folded together

In [10]:
# string of lemmata -> list of normalized lines that lemmatize to it
index_lemmata = {}

for normalized in index_normalized:
    # headword of every token in the cached analysis of this line
    headwords = [word.lemma for word in cache_lemmatized[normalized]]
    lemmata = ' '.join(headwords).strip()
    index_lemmata.setdefault(lemmata, []).append(normalized)

## Examine the output

Let's take a look at the index to see which lines recur most, and how often.
- This pandas dataframe organizes lines in groups according to lemmata, with the most common lemmatized pattern first.
- But it also breaks down each lemmatized pattern into the inflected versions that actually occur.

**Counting up all the loci for a given pattern**

- The reverse index of lemmatized lines gives us, for a given string of lemmata, all the normalized lines that correspond to instances of that pattern.
- Each of these must then be looked up in the reverse index of normalized lines to get the actual loci.
- The helper function below tallies all the loci of all instances of a given pattern.

In [11]:
def total_refs(lemmata):
    '''count the loci of every normalized line filed under `lemmata`'''
    
    n_loci = 0
    for variant in index_lemmata[lemmata]:
        n_loci += len(index_normalized[variant])
    return n_loci

**Mark lines with capital letters**

- Proper names presumably aren't allowable in Eudocia

In [12]:
def has_caps(lemmata):
    '''return True if lower-casing `lemmata` would change it'''
    
    lowered = lemmata.lower()
    return lowered != lemmata

**Build the dataframe**

In [13]:
# lemmatized patterns, most frequent first; the inner sort fixes the
#   order among patterns with the same number of loci
patterns = sorted(sorted(index_lemmata), key=total_refs, reverse=True)

variant_rows = []
for pattern in patterns:
    
    # within one pattern, list the inflected variant with most loci first
    variants = sorted(
        sorted(index_lemmata[pattern]),
        key=lambda variant: len(index_normalized[variant]),
        reverse=True,
    )
    
    for variant in variants:
        variant_loci = index_normalized[variant]
        variant_rows.append(dict(
            line = variant,
            lemmata = pattern,
            count = len(variant_loci),
            caps = has_caps(pattern),
            loci = '; '.join(variant_loci),
        ))

df = pd.DataFrame(variant_rows)
df

Unnamed: 0,line,lemmata,count,caps,loci
0,τὸν δʼ ἀπαμειβόμενος προσέφη πολύμητις Ὀδυσσεύς,ὁ δʼ ἀπαμείβω προσφέρω πολύμητις Ὀδυσσεύς,30,True,Il. 10.382; Il. 10.423; Il. 10.554; Il. 19.154...
1,τὴν δʼ ἀπαμειβόμενος προσέφη πολύμητις Ὀδυσσεύς,ὁ δʼ ἀπαμείβω προσφέρω πολύμητις Ὀδυσσεύς,19,True,Od. 5.214; Od. 7.240; Od. 13.311; Od. 13.382; ...
2,τὸν δʼ αὖ Τηλέμαχος πεπνυμένος ἀντίον ηὔδα,ὁ δʼ αὗ Τηλέμαχος πνύω ἀντίος οἶδα,30,True,Od. 1.388; Od. 1.412; Od. 2.129; Od. 2.208; Od...
3,τὴν δʼ αὖ Τηλέμαχος πεπνυμένος ἀντίον ηὔδα,ὁ δʼ αὗ Τηλέμαχος πνύω ἀντίος οἶδα,13,True,Od. 1.213; Od. 1.230; Od. 1.306; Od. 1.345; Od...
4,καί μιν φωνήσας ἔπεα πτερόεντα προσηύδα,καί μιν φωνέω ἔπος πτερός προσεύω,30,False,Il. 1.201; Il. 2.7; Il. 4.312; Il. 4.369; Il. ...
...,...,...,...,...,...
25125,Ῥίγμον ὃς ἐκ Θρῄκης ἐριβώλακος εἰληλούθει,Ῥίγμων ὅς ἐκ Θρῄκης ἐριβάλλω εἰληλόω,1,True,Il. 20.485
25126,Ῥίπην τε Στρατίην τε καὶ ἠνεμόεσσαν Ἐνίσπην,Ῥίπη τε Στρατία τε καί ἀναμοέω Ἐνίσπα,1,True,Il. 2.606
25127,Ῥῆσος δʼ ἐν μέσῳ εὗδε παρʼ αὐτῷ δʼ ὠκέες ἵπποι,Ῥῆσος δʼ ἐν μέσος ὁράω παρά αὐτός δʼ ὠκέες ἵππος,1,True,Il. 10.474
25128,Ῥήσου ἀνεψιὸν ἐσθλόν ὃ δʼ ἐξ ὕπνου ἀνορούσας,Ῥῆσος ἀνεψιός ἔσθλός ὅς δʼ ἐκ ὕπνος ἀνορούω,1,True,Il. 10.519


**Save to a CSV**

In [14]:
# write the table without its index column
repeated_lines_csv = os.path.join(output_dir, 'repeated_lines.csv')
df.to_csv(repeated_lines_csv, index=False)

## Merge all inflected forms of a given pattern

While it's helpful for debugging to be able to see all the actual variants of a given pattern, at the end of the day we want to treat all loci sharing the same lemmata as equivalent.
- Here we redo our table, merging results from multiple normalized lines

**Helper functions to group and sort loci**

- The first of these returns a sortable string for loci by padding book and line numbers with zeros. 
- The second is similar to `total_refs` above, but instead of counting loci, it sorts and returns them.

In [15]:
def sorter(loc):
    '''zero-pad loci to make string-based sorting easy
    
    Turns a locus such as 'Il. 2.7' into 'Il. 02.007', so that plain
    string comparison orders loci by work, then book, then line.
    
    If `loc` does not start with '<Il|Od>. <book>.<line>', a message is
    printed and `loc` is returned unchanged; the result is therefore
    always a string and can be compared with the padded keys.
    '''
    
    # the separator between book and line is a literal dot, so it must be
    #   escaped -- a bare '.' would accept any character there
    m = re.match(r'(Il|Od)\. (\d+)\.(\d+)', loc)
    if m:
        w, b, l = m.groups()
        return '{w}. {b:02d}.{l:03d}'.format(w=w, b=int(b), l=int(l))
    
    print(f'failed: {loc}')
    return loc
    

def sorted_refs(lemmata):
    '''collect the loci of every normalized line filed under `lemmata`,
    ordered by their `sorter` key'''
    
    refs = []
    for variant in index_lemmata[lemmata]:
        refs.extend(index_normalized[variant])
    return sorted(refs, key=sorter)

**Build the table**

In [16]:
# one row per lemmatized pattern, most frequent first
lemmata_rows = []
for pattern in sorted(sorted(index_lemmata), key=total_refs, reverse=True):
    lemmata_rows.append(dict(
        lemmata = pattern,
        count = total_refs(pattern),
        caps = has_caps(pattern),
        loci = '; '.join(sorted_refs(pattern)),
    ))

df_lemmata = pd.DataFrame(lemmata_rows)

df_lemmata.to_csv(os.path.join(output_dir, 'repeated_lines_by_lemmata.csv'), index=False)
df_lemmata

Unnamed: 0,lemmata,count,caps,loci
0,ὁ δʼ ἀπαμείβω προσφέρω πολύμητις Ὀδυσσεύς,49,True,Il. 10.382; Il. 10.423; Il. 10.554; Il. 19.154...
1,ὁ δʼ αὗ Τηλέμαχος πνύω ἀντίος οἶδα,43,True,Od. 1.213; Od. 1.230; Od. 1.306; Od. 1.345; Od...
2,καί μιν φωνέω ἔπος πτερός προσεύω,39,False,Il. 1.201; Il. 2.7; Il. 4.312; Il. 4.369; Il. ...
3,ἦμος δʼ ἠριγένεια φαίνω ῥοδοδάκτυλος Ἠώς,22,True,Il. 1.477; Il. 24.788; Od. 2.1; Od. 3.404; Od....
4,αὐτάνω ἐπεί πόσις καί ἐδητύος ἐκ ἔρος αί,21,False,Il. 1.469; Il. 2.432; Il. 7.323; Il. 9.92; Il....
...,...,...,...,...
24844,Ῥίγμων ὅς ἐκ Θρῄκης ἐριβάλλω εἰληλόω,1,True,Il. 20.485
24845,Ῥίπη τε Στρατία τε καί ἀναμοέω Ἐνίσπα,1,True,Il. 2.606
24846,Ῥῆσος δʼ ἐν μέσος ὁράω παρά αὐτός δʼ ὠκέες ἵππος,1,True,Il. 10.474
24847,Ῥῆσος ἀνεψιός ἔσθλός ὅς δʼ ἐκ ὕπνος ἀνορούω,1,True,Il. 10.519


## Index by locus

For the purposes of matching Eudocia references, we need an index by Homeric locus.
- keys are loci in Homer
- values are all the other loci that share a given string of lemmata

In [63]:
# locus -> the other loci that share its string of lemmata
index_locus = {}

for lemmata in index_lemmata:
    
    # a pattern with a single locus has no repetitions to record
    if total_refs(lemmata) == 1:
        continue
    
    refs = sorted_refs(lemmata)
    
    for ref1 in refs:
        others = index_locus.setdefault(ref1, [])
        
        # add every other locus of the group, once each
        for ref2 in refs:
            if ref2 == ref1 or ref2 in others:
                continue
            others.append(ref2)

**Does it work?**

In [64]:
# spot check: display the loci indexed under Od. 7.257
# should match Od. 5.136 and Od. 23.336
index_locus['Od. 7.257']

['Od. 5.136', 'Od. 23.336']

## Read the input spreadsheet

- Read in the list of correspondences between Eudocia and Homer.
- Each row is a line in Eudocia matched with a line in Homer.

**Format**

- `eud_seq`: I think this groups lines into conversations or "speech scenes"
- `eud_line`: Line number in Eudocia
- `hom_work`: Must be `Il.` or `Od.`
- `hom_book`: Book number in Homer
- `hom_line`: Line number in Homer
- `segment`: For those lines in Eudocia drawing on multiple Homeric sources. See note.

**Notes**
- Some lines in Eudocia draw from multiple Homeric lines. In this case, multiple rows have the same Eudocia line no. The `segment` column orders Homeric correspondences from left to right. (`eud_line`,`segment`) pairs should be unique.
- Two lines in Eudocia have no Homeric correspondence: 445 and 1131. The Homer columns are left blank.

In [65]:
# read the first six columns of the worksheet; keep_default_na=False stops
#   pandas from applying its default NaN markers while parsing
df_eudocia = pd.read_excel(
    xls_input,
    sheet_name=xls_worksheet,
    usecols=list(range(6)),
    keep_default_na=False,
)
df_eudocia

Unnamed: 0,eud_seq,eud_line,hom_work,hom_book,hom_line,segment
0,1,42,Il.,14,190,1
1,1,42,Il.,7,28,2
2,1,43,Il.,14,191,1
3,1,44,Od.,11,146,1
4,1,45,Il.,10,324,1
...,...,...,...,...,...,...
1125,92,2342,Od.,22,392,1
1126,92,2343,Od.,15,65,1
1127,92,2343,Il.,9,42,2
1128,92,2344,Od.,3,2,1


## Add rows for repeated lines

- For every Homeric reference in this table, we're going to look up all the additional loci that share the same lemmata, using our index by locus.
- We'll create new rows that duplicate `eud_seq`, `eud_line` and `segment`, one for each additional Homeric locus
- We're adding a new column, `rep`, that marks these as duplicates. It's set to true for all new rows.
- We're also noting which rows in the original table generated repeats, so we can mark them later.

In [66]:
# rows to be appended: one per additional Homeric locus
new_rows = []

# positions of the rows in the eudocia table that had repetitions
has_rep = []

for i, row in enumerate(df_eudocia.itertuples()):
    
    # row[0] is the frame index; the six spreadsheet columns follow in order
    eud_seq, eud_line, hom_work, hom_book, hom_line, segment = row[1:7]
    
    # the Homeric locus, in the same format as the keys of the index
    key = f'{hom_work} {hom_book}.{hom_line}'
    
    # nothing to add unless other loci are indexed under this one
    if key not in index_locus:
        continue
    
    # record row id for later
    has_rep.append(i)
    
    # one new row per other locus, copying the Eudocia side of this row
    for ref in index_locus[key]:
        rep_work, rep_loc = ref.split()
        rep_book, rep_line = rep_loc.split('.')
        
        new_rows.append({
            'eud_seq': eud_seq,
            'eud_line': eud_line,
            'hom_work': rep_work,
            'hom_book': rep_book,
            'hom_line': rep_line,
            'segment': segment,
            'rep': True,
        })

## Add new col marking repetitions

- Add a `rep` column to the original Eudocia data.
- Set to `False` by default, then mark the matching row ids as `True`
- Working on a copy of the original data just to make do-overs easier.

In [67]:
# working copy of the input, with a new `rep` column that starts out False
df = df_eudocia.assign(rep=False)

# flag the rows recorded in has_rep
df.loc[has_rep, 'rep'] = True

## Add new rows to data frame

- Now append the new rows

In [68]:
# append the rows for repeated lines; ignore_index renumbers the result,
#   so the appended rows don't reuse index labels of the existing rows
df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
df

Unnamed: 0,eud_seq,eud_line,hom_work,hom_book,hom_line,segment,rep
0,1,42,Il.,14,190,1,True
1,1,42,Il.,7,28,2,True
2,1,43,Il.,14,191,1,True
3,1,44,Od.,11,146,1,True
4,1,45,Il.,10,324,1,True
...,...,...,...,...,...,...,...
214,86,2274,Od.,10,547,1,True
215,86,2274,Od.,12,207,1,True
216,89,2305,Od.,16,61,1,True
217,89,2307,Il.,17,179,1,True


**Sanity check: how many new rows did we add?**

In [69]:
# number of rows generated for repeated lines
print(len(new_rows))

219


## Add the Homeric text

Just to help keep things straight, I'm going to add the normalized Homeric lines to the table. That way we can spot anything weird, and get a sense of whether Eudocia is using the words that stay the same or the words that vary.

In [101]:
# a row whose Homer columns are blank produces the key ' .';
#   register it so that the lookup below yields None for such rows
line_lookup[' .'] = None

# build the locus key of every row, then look up its normalized text
hom_keys = [f'{row[3]} {row[4]}.{row[5]}' for row in df.itertuples()]
df['hom_text'] = [line_lookup[hom_key] for hom_key in hom_keys]

In [26]:
# display the table
df

Unnamed: 0,eud_seq,eud_line,hom_work,hom_book,hom_line,segment,rep,hom_text
0,1,42,Il.,14,190,1,False,ἦ ῥά νύ μοί τι πίθοιο φίλον τέκος ὅττί κεν εἴπω
1,1,42,Il.,7,28,2,True,ἀλλʼ εἴ μοί τι πίθοιο τό κεν πολὺ κέρδιον εἴη
2,1,43,Il.,14,191,1,False,ἦέ κεν ἀρνήσαιο κοτεσσαμένη τό γε θυμῷ
3,1,44,Od.,11,146,1,False,ῥηΐδιόν τοι ἔπος ἐρέω καὶ ἐπὶ φρεσὶ θήσω
4,1,45,Il.,10,324,1,False,σοὶ δʼ ἐγὼ οὐχ ἅλιος σκοπὸς ἔσσομαι οὐδʼ ἀπὸ δ...
...,...,...,...,...,...,...,...,...
214,86,2274,Od.,10,547,1,True,μειλιχίοις ἐπέεσσι παρασταδὸν ἄνδρα ἕκαστον
215,86,2274,Od.,12,207,1,True,μειλιχίοις ἐπέεσσι παρασταδὸν ἄνδρα ἕκαστον
216,89,2305,Od.,16,61,1,True,τοιγὰρ ἐγώ τοι τέκνον ἀληθέα πάντʼ ἀγορεύσω
217,89,2307,Il.,17,179,1,True,ἀλλʼ ἄγε δεῦρο πέπον παρʼ ἔμʼ ἵστασο καὶ ἴδε ἔ...


## Save new data frame

In [27]:
# write the table without its index column
eudocia_lemmatized_csv = os.path.join(output_dir, 'eudocia_with_repeated_lines_lemmatized.csv')
df.to_csv(eudocia_lemmatized_csv, index=False)



# Part 2: matching with missing lemmata

We want to make our groups of repeated lines a little fuzzier by allowing matches with one (or more?) lemma substitutions. One option would be shingles; I'm going to start with just dropping one or two lemmata at a time.

- For a line of n lemmata, I'll create n new keys, each leaving out one lemma.
- Because each line now matches multiple keys, we'll have a lot of redundancy in our index
- We can simplify it after

In [87]:
# new index: masked pattern -> the full strings of lemmata that reduce to
#   it when one lemma is left out
index_masked = {}

for lem_key in index_lemmata:
    lemmata = lem_key.split()
    
    # with fewer than two lemmata nothing is left after masking; report the
    #   pattern and skip it, so that such lines aren't all filed together
    #   under the empty key and treated as matches of one another
    if len(lemmata) < 2:
        print(f'problem with {lem_key}')
        continue
    
    # one key per position, each leaving out the lemma at that position
    for i in range(len(lemmata)):
        masked = ' '.join(lemmata[:i] + lemmata[i+1:])
        index_masked.setdefault(masked, []).append(lem_key)

# drop duplicate entries (e.g. the same lemma twice in a row yields the same
#   mask twice), keeping first-seen order so that the index -- and everything
#   built from it -- comes out the same on every run
for masked, lem_keys in index_masked.items():
    index_masked[masked] = list(dict.fromkeys(lem_keys))

### Examine the results

In [91]:
# list only the masked keys shared by more than one string of lemmata,
#   one row per (masked key, full string) pair
masked_rows = []
for masked_key, group in index_masked.items():
    if len(group) > 1:
        for full_key in group:
            masked_rows.append(dict(
                masked = masked_key,
                lemmata = full_key,
                n = len(group),
            ))

pd.DataFrame(masked_rows)

Unnamed: 0,masked,lemmata,n
0,τε καί ἄλλος ἐϋκνήμις Ἀχαιός,Ἀτρεΐδης τε καί ἄλλος ἐϋκνήμις Ἀχαιός,2
1,τε καί ἄλλος ἐϋκνήμις Ἀχαιός,Ἀτρεΐδαι τε καί ἄλλος ἐϋκνήμις Ἀχαιός,2
2,τε ζαθύς Τενέδοιος τε ἶφις ἀνίσσημι,Κίλλα τε ζαθύς Τενέδοιος τε ἶφις ἀνίσσημι,2
3,τε ζαθύς Τενέδοιος τε ἶφις ἀνίσσημι,Κίλλας τε ζαθύς Τενέδοιος τε ἶφις ἀνίσσημι,2
4,ὁ δʼ προσφέρω πούς οὖξ Ἀχιλλεύς,ὁ δʼ ἀπαμείβω προσφέρω πούς οὖξ Ἀχιλλεύς,2
...,...,...,...
1269,ὁ δʼ ὅστις ψυχή προσφωνέω,ὁ δʼ ὅστις ψυχή προσφωνέω Ἀτρείδαος,3
1270,ὁ δʼ ὅστις ψυχή προσφωνέω,ὁ δʼ ὅστις ψυχή προσφωνέω Ἀμφιμέδων,3
1271,ὁ δʼ ὅστις ψυχή προσφωνέω,ὁ δʼ ὅστις ψυχή προσφωνέω Ἀτρεΐδαι,3
1272,αἶψος ἀμπεπαλής πρόειμι δολιχόσκιος ἔγχος,αἶψος μαγος ἀμπεπαλής πρόειμι δολιχόσκιος ἔγχος,2


## Index by locus

In [106]:
# locus -> the other loci found under any masked key it shares
index_locus = {}

for lem_group in index_masked.values():
    
    # pool the loci of every string of lemmata filed under this masked key
    refs = [ref for lemmata in lem_group for ref in sorted_refs(lemmata)]
    
    # a single locus has nothing to be paired with
    if len(refs) == 1:
        continue
    
    for ref1 in refs:
        others = index_locus.setdefault(ref1, [])
        
        # add every other locus of the group, once each
        for ref2 in refs:
            if ref2 == ref1 or ref2 in others:
                continue
            others.append(ref2)

In [107]:
# This cell repeats the augment-and-save steps used for the exact-lemmata
#   matches, this time against the masked index built in the previous cell.

# hold new row data
new_rows = []

# collect the ids of rows in the eudocia table that matched
has_rep = []

# iterate over the eudocia table
for i, row in enumerate(df_eudocia.itertuples()):
    
    # look up homeric locus in the index
    #   (row[0] is the frame index; row[3], row[4], row[5] are the Homeric
    #   work, book and line columns)
    key = f'{row[3]} {row[4]}.{row[5]}'
    
    if key in index_locus:
        
        # record row id for later
        has_rep.append(i)
        
        # create a new row and save: same Eudocia columns and segment,
        #   with the Homeric columns replaced by the matching locus
        for ref in index_locus[key]:
            work, loc = ref.split()
            book, line = loc.split('.')
            
            new_rows.append(dict(
                eud_seq = row[1],
                eud_line = row[2],
                hom_work = work,
                hom_book = book,
                hom_line = line,
                segment = row[6],
                rep = True,
            ))
            
# create working copy
df = df_eudocia.copy()

# create new column, set to False
df.loc[:, 'rep'] = False

# mark matching rows as True
df.loc[has_rep, 'rep'] = True

# append the new rows; ignore_index renumbers the result, so the appended
#   rows don't reuse index labels of the original rows
df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)

# value for missing Homer lines
#   (blank Homer columns produce the key ' .')
line_lookup[' .'] = None

# add column with the normalized Homeric text of each row's locus
df['hom_text'] = [line_lookup[f'{row[3]} {row[4]}.{row[5]}'] for row in df.itertuples()]

# save
df.to_csv(os.path.join(output_dir, 'eudocia_with_repeated_lines_masked.csv'), index=False)

# display
df

Unnamed: 0,eud_seq,eud_line,hom_work,hom_book,hom_line,segment,rep,hom_text
0,1,42,Il.,14,190,1,False,ἦ ῥά νύ μοί τι πίθοιο φίλον τέκος ὅττί κεν εἴπω
1,1,42,Il.,7,28,2,True,ἀλλʼ εἴ μοί τι πίθοιο τό κεν πολὺ κέρδιον εἴη
2,1,43,Il.,14,191,1,False,ἦέ κεν ἀρνήσαιο κοτεσσαμένη τό γε θυμῷ
3,1,44,Od.,11,146,1,False,ῥηΐδιόν τοι ἔπος ἐρέω καὶ ἐπὶ φρεσὶ θήσω
4,1,45,Il.,10,324,1,False,σοὶ δʼ ἐγὼ οὐχ ἅλιος σκοπὸς ἔσσομαι οὐδʼ ἀπὸ δ...
...,...,...,...,...,...,...,...,...
285,86,2274,Od.,10,547,1,True,μειλιχίοις ἐπέεσσι παρασταδὸν ἄνδρα ἕκαστον
286,86,2274,Od.,12,207,1,True,μειλιχίοις ἐπέεσσι παρασταδὸν ἄνδρα ἕκαστον
287,89,2305,Od.,16,61,1,True,τοιγὰρ ἐγώ τοι τέκνον ἀληθέα πάντʼ ἀγορεύσω
288,89,2307,Il.,17,179,1,True,ἀλλʼ ἄγε δεῦρο πέπον παρʼ ἔμʼ ἵστασο καὶ ἴδε ἔ...


In [100]:
# number of rows generated for repeated lines
print(len(new_rows))

290
