/
scanner.py
265 lines (238 loc) · 10.5 KB
/
scanner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
"""
This program returns the prosimetric scansion of Greek texts.
A user is first prompted to supply the file path of the text they wish to scan.
Note that this text must be a relatively 'clean' text, as the opening function
(i.e., tokenize) will only remove numbers and all punctuation that is not a
period. The tokenizer will also force lower case on the text. The text will
then be tokenized and syllabified. Before the text undergoes the actual
scansion functions, the text will be re-tokenized into a simple list of words
and syllables. Finally, the simplified tokenized text will be scanned according
to typical Greek scansion rules. The details of these rules are delineated in
the docstrings of the specific scansion functions. The final output is the
resulting scansion.
Known bugs:
1) Reduplicated syllables in a single sentence are not scanned separately
"""
from cltk.utils.cltk_logger import logger
__author__ = ['Tyler Kirby <tyler.kirby9398@gmail.com>']
__license__ = 'MIT License'
class Scansion:
    """Scan Greek texts into long and short syllables.

    The text is not macronized: vowel quantity is inferred only from long
    vowels, circumflexed vowels, diphthongs and the consonants that follow
    a syllable. The public entry point is :meth:`scan_text`.
    """

    def __init__(self):
        """Set up the character classes used by the scansion rules."""
        self.vowels = ['ε', 'ι', 'ο', 'α', 'η', 'ω', 'υ', 'ῖ', 'ᾶ']
        self.sing_cons = ['ς', 'ρ', 'τ', 'θ', 'π', 'σ', 'δ', 'φ', 'γ', 'ξ',
                          'κ', 'λ', 'χ', 'β', 'ν', 'μ']
        self.doub_cons = ['ξ', 'ζ', 'ψ']
        self.long_vowels = ['η', 'ω', 'ῖ', 'ᾶ', 'ῦ']
        self.diphthongs = ['αι', 'αῖ', 'ευ', 'εῦ', 'αυ', 'αῦ', 'οι', 'οῖ',
                           'ου', 'οῦ', 'ει', 'εῖ', 'υι', 'υῖ', 'ηῦ']
        self.stops = ['π', 'τ', 'κ', 'β', 'δ', 'γ']
        self.liquids = ['ρ', 'λ']
        # Characters that are dropped from the text entirely.
        self.punc = ['!', '@', '#', '$', '%', '^', '&', '*', '(', ')',
                     '-', '_', '=', '+', '}', '{', '[', ']', '1', '2',
                     '3', '4', '5', '6', '7', '8', '9', '0', ',', '\'',
                     '᾽', '(', ')']
        # Characters that are normalised to a sentence-ending '.'.
        self.punc_stops = ['·', ':', ';']

    def _clean_text(self, text):
        """Clean the text of extraneous punctuation and force lower case.

        Every character in ``self.punc_stops`` is replaced by '.', every
        character in ``self.punc`` is removed, and everything else is kept.

        :param text: raw text
        :return: clean, lower-cased text
        :rtype : string
        """
        cleaned = []
        for char in text:
            if char in self.punc_stops:
                cleaned.append('.')
            elif char not in self.punc:
                cleaned.append(char)
        return ''.join(cleaned).lower()

    def _clean_accents(self, text):
        """Remove most accent marks.

        Circumflexes over alphas, iotas and upsilons are kept (as 'ᾶ', 'ῖ'
        and 'ῦ') since they determine vocalic quantity. The text is first
        passed through :meth:`_clean_text`.

        :param text: raw text
        :return: clean text with minimum accent marks
        :rtype : string
        """
        accents = {
            'ὲέἐἑἒἓἕἔ': 'ε',
            'ὺύὑὐὒὓὔὕ': 'υ',
            'ὸόὀὁὂὃὄὅ': 'ο',
            'ὶίἰἱἲἳἵἴ': 'ι',
            'ὰάἁἀἂἃἅἄᾳᾂᾃ': 'α',
            'ὴήἠἡἢἣἥἤἧἦῆῄῂῇῃᾓᾒᾗᾖᾑᾐ': 'η',
            'ὼώὠὡὢὣὤὥὦὧῶῲῴῷῳᾧᾦᾢᾣᾡᾠ': 'ω',
            'ἶἷ': 'ῖ',
            'ἆἇᾷᾆᾇ': 'ᾶ',
            'ὖὗ': 'ῦ',
        }
        # One translation table lets the whole text be rewritten in a
        # single pass instead of one str.replace per accented character.
        table = {}
        for accented, plain in accents.items():
            for char in accented:
                table[ord(char)] = plain
        return self._clean_text(text).translate(table)

    def _tokenize(self, text):
        """Tokenize the text into a list of sentences with a list of words.

        Words are separated by any run of whitespace; a word containing '.'
        closes the current sentence. Words after the last stop are kept as a
        final sentence rather than being discarded.

        :param text: raw text
        :return: tokenized text
        :rtype : list
        """
        sentences = []
        tokens = []
        for word in self._clean_accents(text).split():
            tokens.append(word)
            if '.' in word:
                sentences.append(tokens)
                tokens = []
        if tokens:
            # Text that does not end with a stop still forms a sentence.
            sentences.append(tokens)
        return sentences

    def _syllable_condenser(self, words_syllables):
        """Reduce a list of [sentence [word [syllable]]] to [sentence [syllable]].

        :param words_syllables: tokenized text
        :return: text tokenized only at the sentence and syllable level
        :rtype : list
        """
        return [[syllable for word in sentence for syllable in word]
                for sentence in words_syllables]

    def _long_by_nature(self, syllable):
        """Check if syllable is long by nature.

        Long by nature includes:
        1) Syllable contains a diphthong
        2) Syllable contains a long vowel

        :param syllable: current syllable
        :return: True if long by nature, otherwise False
        :rtype : bool
        """
        vowel_group = []
        for char in syllable:
            if char in self.long_vowels:
                return True
            if char not in self.sing_cons and char not in self.doub_cons:
                vowel_group.append(char)
        # Whatever is not a consonant forms the vocalic group of the syllable.
        return ''.join(vowel_group) in self.diphthongs

    def _long_by_position(self, syllable, sentence, index=None):
        """Check if syllable is long by position.

        Long by position includes:
        1) Next syllable begins with two consonants, unless those consonants
        are a stop + liquid combination
        2) Next syllable begins with a double consonant
        3) Syllable ends with a consonant and the next syllable begins with a
        consonant

        :param syllable: Current syllable
        :param sentence: Current sentence (list of syllables)
        :param index: Position of ``syllable`` in ``sentence``. When omitted,
            the first occurrence of ``syllable`` is used, which is ambiguous
            if the same syllable occurs more than once in the sentence.
        :return: True if syllable is long by position, otherwise False
        :rtype : bool
        """
        if index is None:
            index = sentence.index(syllable)
        if index + 1 >= len(sentence):
            # The last syllable of a sentence has nothing following it.
            return False
        next_syll = sentence[index + 1]
        if not syllable or not next_syll:
            return False

        first = next_syll[0]
        second = next_syll[1] if len(next_syll) > 1 else ''

        # Long by position by case 1
        if first in self.sing_cons and second in self.sing_cons:
            if not (first in self.stops and second in self.liquids):
                return True
        # Long by position by case 2
        if first in self.doub_cons:
            return True
        # Long by position by case 3
        return syllable[-1] in self.sing_cons and first in self.sing_cons

    def _scansion(self, sentence_syllables):
        """Replace long and short values for each input syllable.

        The final syllable of every sentence with more than one syllable is
        marked 'x' (anceps) regardless of its quantity.

        :param sentence_syllables: A list of sentences, each a list of
            syllable strings
        :return: '˘' and '¯' to represent short and long syllables,
        respectively
        :rtype : list
        """
        scanned_text = []
        for sentence in sentence_syllables:
            scanned_sent = []
            # The position is passed explicitly so that a syllable occurring
            # twice in a sentence is judged by its own following syllable.
            for position, syllable in enumerate(sentence):
                if self._long_by_position(syllable, sentence, position) or \
                        self._long_by_nature(syllable):
                    scanned_sent.append('¯')
                else:
                    scanned_sent.append('˘')
            if len(scanned_sent) > 1:
                scanned_sent[-1] = 'x'
            scanned_text.append(''.join(scanned_sent))
        return scanned_text

    def _make_syllables(self, sentences_words):
        """Divide the word tokens into a list of syllables.

        Note that a syllable in this instance is defined as a vocalic group
        (i.e., vowel or a diphthong). This means that all syllables which are
        not the last syllable in the word will end with a vowel or diphthong.
        Words without any vowel yield no syllables and are skipped.
        TODO: Determine whether a CLTK syllabifier could replace this

        :param sentences_words: raw text; it is tokenized here
        :return: Syllabified words
        :rtype : list
        """
        all_syllables = []
        for sentence in self._tokenize(sentences_words):
            syll_per_sent = []
            for word in sentence:
                syll_per_word = []
                syll_start = 0  # Index where the current syllable begins
                position = 0  # Index of the letter being examined
                while position < len(word):
                    letter = word[position]
                    # At the last letter the slice is one character long and
                    # therefore never matches a (two-letter) diphthong.
                    if word[position:position + 2] in self.diphthongs:
                        position += 1
                        # Syllable ends with a diphthong
                        syll_per_word.append(word[syll_start:position + 1])
                        syll_start = position + 1
                    elif letter in self.vowels or letter in self.long_vowels:
                        # Syllable ends with a vowel
                        syll_per_word.append(word[syll_start:position + 1])
                        syll_start = position + 1
                    position += 1
                # Everything after the last vowel (minus the sentence stop)
                # belongs to the last syllable of the word.
                leftovers = word[syll_start:].replace('.', '')
                try:
                    syll_per_word[-1] += leftovers
                except IndexError:
                    logger.info("IndexError while making syllables of '%s'. Continuing.", word)
                else:
                    syll_per_sent.append(syll_per_word)
            all_syllables.append(syll_per_sent)
        return all_syllables

    def scan_text(self, input_string):
        """The primary method for the class.

        :param input_string: A string of Greek text.
        :return: meter of text, one string of '¯', '˘' and 'x' per sentence
        :rtype : list
        """
        syllables = self._make_syllables(input_string)
        sentence_syllables = self._syllable_condenser(syllables)
        return self._scansion(sentence_syllables)