In [100]:
chars = 'άέήίαβγδεζηθικλμνξοπρςστυφχψωόύώἀἁἂἃἄἅἆἐἑἓἔἕἠἡἢἣἤἥἦἧἰἱἳἴἵἶἷὀὁὂὃὄὅὐὑὓὔὕὖὗὠὡὢὣὤὥὦὧὰὲὴὶὸὺὼᾄᾐᾑᾔᾖᾗᾠᾤᾦᾧᾳᾴᾶᾷῂῃῄῆῇῖῥῦῳῴῶῷ'

In [61]:
from cltk.corpus.swadesh import Swadesh
from cltk.stop.greek.stops import STOPS_LIST
from cltk.corpus.readers import get_corpus_reader
from cltk.tag.pos import POSTag
from greek_accentuation.syllabify import syllabify
import pandas as pd
import os
from betacode import conv

In [46]:
def pull_data(path:str, authors=None, swadesh=False):
    
    # get the greek files from path which match the author's name
    fileids = []
    if authors != None:
        for root, dirs, files in os.walk(path):
            for author in authors:
                text = [filename for filename in files if author in filename if 'grc' in filename]
                fileids += text
    else:
        for root, dirs, files in os.walk(path):
            text = [filename for filename in files if 'grc' in filename]
            fileids += text
        
            
    # instantiate the reader with those files
    reader = get_corpus_reader(corpus_name= 'greek_text_perseus', language='greek')
    reader._fileids = fileids
    
    # use the reader to get a list of words (vocab) from the files
    words = list(reader.words())
    
    # process the text, removing stopwords etc. 
    punct = "!#$%&'()*+,-./:;<=>?@[\]^_`{|}~᾽"
    swadesh = Swadesh('gr').words()
    words = [word.lower() for word in words]
    words = [word for word in words if word not in punct]
    words = [word for word in words if word not in STOPS_LIST]
    words_noswadesh = [word for word in words if word not in STOPS_LIST if not word in swadesh]
    
    # decide which output we want from the parameter 'swadesh'
    if swadesh:
        to_tag = words_noswadesh
    else:
        to_tag = words
        
    # instantiate POS tagger
    tagger = POSTag('greek')
    
    # tag words using backoff tagger
    joined = ' '.join(to_tag)
    tagged = tagger.tag_ngram_123_backoff(joined)
    
    # create dataframe from tagged words
    df = pd.DataFrame(tagged, columns=['word', 'pos'])
    
    # drop punctuation and useless words
    df = df[(df.pos != 'U--------') & (df.pos != 'D--------')]
    df.pos = df.pos.str.lower()
    
    # create separate dataframe containing the None's to compare later
    nones = df[(df.pos.isnull() == True) & (df.word.isnull() != True)]
    nones.drop_duplicates(ignore_index=True, inplace=True)
    
    # drop duplicates and nulls
    df.dropna(inplace=True)
    df.drop_duplicates(ignore_index=True, inplace=True)
    
    # return dataframe
    return df, nones

In [47]:
df, nones = pull_data('/Users/mackmcgowen/cltk_data/greek/text/greek_text_perseus/cltk_json/')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [80]:
df.to_csv('../data/corpus.csv', index=False)

In [52]:
nones.to_csv('../data/nones.csv', index=False)

In [58]:
df.word[0]

'δημήτηρ'

In [62]:
syllabify(df.word[0])

['δη', 'μή', 'τηρ']

In [88]:
df['syllables'] = df.word.apply(lambda x: ' '.join(syllabify(x)))

In [90]:
df.to_pickle('../data/corpus.pkl')

In [70]:
test = df.word.values

In [76]:

syll = list(map(lambda x: syllabify(x), test))

In [78]:
syll[0]

['δη', 'μή', 'τηρ']

In [87]:
' '.join(syll[0])

'δη μή τηρ'

In [1]:
from cltk.corpus.utils.importer import CorpusImporter
from cltk.corpus.readers import get_corpus_reader

In [2]:
CorpusImporter('greek').list_corpora

['greek_software_tlgu',
 'greek_text_perseus',
 'phi7',
 'tlg',
 'greek_proper_names_cltk',
 'greek_models_cltk',
 'greek_treebank_perseus',
 'greek_treebank_gorman',
 'greek_lexica_perseus',
 'greek_training_set_sentence_cltk',
 'greek_word2vec_cltk',
 'greek_text_lacus_curtius',
 'greek_text_first1kgreek',
 'greek_text_tesserae']

In [3]:
CorpusImporter('greek').import_corpus('greek_treebank_perseus')

In [5]:
reader = get_corpus_reader(corpus_name= 'greek_treebank_perseus')

ValueError: Specified corpus data not found, please install greek_treebank_perseus for language: None