In [1]:
import docx
import pandas as pd
import numpy as np
import regex as re

In [103]:
# all of the definitions I used

def filter_greek_roots(test):
    test = test.expandtabs(1)
    test = test.split(' ')
    test = [re.findall('[\p{InGreekExtended}|\p{InGreekAndCoptic}]+/', test[i]) for i in range(len(test))]
    test = list(filter(None, test))
    flat_list = [item for sublist in test for item in sublist]
    return flat_list

def filter_greek(test):
    test = test.expandtabs(1)
    test = test.split(' ')
    test = [re.findall('[\p{InGreekExtended}|\p{InGreekAndCoptic}]+', test[i]) for i in range(len(test))]
    test = list(filter(None, test))
    test = [''.join(i) for i in test]
    return test

def remove_articles(text):
    articles = ['ἡ', 'ὁ', 'τό']
    for i in articles:
        try:
            text.remove(i)
        except:
            pass
    return text

def remove_prefix(text):
    try:
        del text[-2]
    except:
        pass
    return text

In [95]:
# import the .docx file I'll be working on

doc = docx.Document('lexis corpus.docx')

In [96]:
# extract text from each paragraph object

docs = [doc.paragraphs[i].text for i in range(len(doc.paragraphs))]

In [104]:
# create dataframe from text

df = pd.DataFrame(docs, columns=['root'])
df[:10]

Unnamed: 0,root
0,Α
1,
2,"Α, ΑΝ\t\t\t\tMAY BE A PREFIX OF A COMPOUND VERB"
3,"ἀ-, ἀν- (Ṇ-)\t\t\ta-, an-, un-, in-, non- (neg..."
4,"ἀβουλία/, ἡ \t\t\tsee βουλευ/\t\t\t\t"
5,"ἀγ/ (1) \t\t\t\tlead, act, do (Ital. agent, ag..."
6,"\tἀν/αγ/\t\t\tlead up, celebrate\n\tἀπ/αγ/\t\t..."
7,"εἰσ/αγ/ (Attic)\t\tlead in, introduce\n\tἐσ/αγ..."
8,"ἀγ/ (2)\t\t\t\tshatter, smash"
9,"\t\t\t\tἄγνυμι, ἄξω, ἔαξα, ἔαγα/ἔηγα, ἔαγμαι, ..."


In [105]:
# separate the roots from their definitions etc. A little messy

df[['root', 'definition']] = df.root.str.split(r'\t\t', n=1, expand = True)
df.head()

Unnamed: 0,root,definition
0,Α,
1,,
2,"Α, ΑΝ",\t\tMAY BE A PREFIX OF A COMPOUND VERB
3,"ἀ-, ἀν- (Ṇ-)","\ta-, an-, un-, in-, non- (negative prefix)"
4,"ἀβουλία/, ἡ",\tsee βουλευ/\t\t\t\t


In [106]:
# for some rows the above code put definitions into the root column, this moves them back

for i in range(1, len(df)):
    if df['definition'][i] == None:
        df['definition'][i] = df['root'][i]
        df['root'][i] = ''
        
df.head()

Unnamed: 0,root,definition
0,Α,
1,,
2,"Α, ΑΝ",\t\tMAY BE A PREFIX OF A COMPOUND VERB
3,"ἀ-, ἀν- (Ṇ-)","\ta-, an-, un-, in-, non- (negative prefix)"
4,"ἀβουλία/, ἡ",\tsee βουλευ/\t\t\t\t


In [107]:
# filter out the greek roots from earlier messy separation 

df['root'] = df.apply(lambda x: filter_greek_roots(x['root']), axis=1)
df.head()

Unnamed: 0,root,definition
0,[],
1,[],
2,[],\t\tMAY BE A PREFIX OF A COMPOUND VERB
3,[],"\ta-, an-, un-, in-, non- (negative prefix)"
4,[ἀβουλία/],\tsee βουλευ/\t\t\t\t


In [108]:
# removes empty lists from root column

df.root = np.where(df.root.str.len() == 0, '', df.root)
df.head()

Unnamed: 0,root,definition
0,,
1,,
2,,\t\tMAY BE A PREFIX OF A COMPOUND VERB
3,,"\ta-, an-, un-, in-, non- (negative prefix)"
4,[ἀβουλία/],\tsee βουλευ/\t\t\t\t


In [109]:
''' 
the format of the original document had some issues which lead to definitions
being off by a row from their root.
'''
for i in range(1, len(df)):
    if df['root'][i] == '':
        df['definition'][i-1] = df['definition'][i]
        
df.drop_duplicates('definition', inplace=True, keep='first')
df.reset_index(inplace=True, drop=True)

df.head()

Unnamed: 0,root,definition
0,,
1,,\t\tMAY BE A PREFIX OF A COMPOUND VERB
2,,"\ta-, an-, un-, in-, non- (negative prefix)"
3,[ἀβουλία/],\tsee βουλευ/\t\t\t\t
4,[ἀγ/],"\t\tlead, act, do (Ital. agent, agenda, actor)..."


In [110]:
# drop rows that contain references to other lexis entries

df.drop(index=df.loc[df['definition'].str.contains('see')].index, inplace=True)
df.head()

Unnamed: 0,root,definition
0,,
1,,\t\tMAY BE A PREFIX OF A COMPOUND VERB
2,,"\ta-, an-, un-, in-, non- (negative prefix)"
4,[ἀγ/],"\t\tlead, act, do (Ital. agent, agenda, actor)..."
5,"[ἀν/, αγ/]","\tlead up, celebrate\n\tἀπ/αγ/\t\t\tlead away"


In [111]:
# filter out the english from the definitions

df['definition'] = df.apply(lambda x: filter_greek(x['definition']), axis=1)
df.head()

Unnamed: 0,root,definition
0,,[]
1,,[]
2,,[]
4,[ἀγ/],"[ἄγω, ἄξω, ἤγαγον, ἦχα, ἦγμαι, ἤχθην]"
5,"[ἀν/, αγ/]",[ἀπαγ]


In [112]:
# remove the articles from the definitions

df.definition = df.apply(lambda x: remove_articles(x['definition']), axis=1)
df.head()

Unnamed: 0,root,definition
0,,[]
1,,[]
2,,[]
4,[ἀγ/],"[ἄγω, ἄξω, ἤγαγον, ἦχα, ἦγμαι, ἤχθην]"
5,"[ἀν/, αγ/]",[ἀπαγ]


In [113]:
# drop empty rows or columns again

df.drop(df.loc[(df.root.str.len() == 0) | (df.definition.str.len() == 0)].index, inplace=True)
df.reset_index(inplace=True, drop=True)
df.head()

Unnamed: 0,root,definition
0,[ἀγ/],"[ἄγω, ἄξω, ἤγαγον, ἦχα, ἦγμαι, ἤχθην]"
1,"[ἀν/, αγ/]",[ἀπαγ]
2,"[εἰσ/, αγ/]","[ἐσαγ, περιαγ, προαγ]"
3,[ἀγ/],"[ἄγνυμι, ἄξω, ἔαξα, ἔαγαἔηγα, ἔαγμαι, ἐάγην]"
4,[ἀγγελ/],"[ἄγγελλω, ἀγγελέω, ἤγγειλα, ἤγγελκα, ἤγγελμαι,..."


In [82]:
# df = pd.read_pickle('./data/vocab.pkl')

In [115]:
# remove the prefixes from roots

df['root'] = df.apply(lambda x: remove_prefix(x['root']), axis=1)
df.head()

In [117]:
# turn root back into string

df.root = df.root.apply(''.join)
df.head()

Unnamed: 0,root,definition
0,ἀγ/,"[ἄγω, ἄξω, ἤγαγον, ἦχα, ἦγμαι, ἤχθην]"
1,αγ/,[ἀπαγ]
2,αγ/,"[ἐσαγ, περιαγ, προαγ]"
3,ἀγ/,"[ἄγνυμι, ἄξω, ἔαξα, ἔαγαἔηγα, ἔαγμαι, ἐάγην]"
4,ἀγγελ/,"[ἄγγελλω, ἀγγελέω, ἤγγειλα, ἤγγελκα, ἤγγελμαι,..."


In [118]:
# # save to pkl 

# df.to_pickle('./data/vocab.pkl')