-
Notifications
You must be signed in to change notification settings - Fork 4
/
babel.lua
7243 lines (6794 loc) · 241 KB
/
babel.lua
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
-- Add random languages to the game
--@ enable = true
local utils = require('utils')
--[[
TODO:
* Use df2console for portable printing.
* Split this giant file up into smaller pieces.
* Find names for anon and unk fields.
* Put words in GEN_DIVINE.
* Update any new names in GEN_DIVINE to point to the moved GEN_DIVINE.
* Get more dimension values if none have positive scores in the cross-product.
* Change the adventurer's forced goodbye response to "I don't understand you".
* Sign languages
* Accents and dialects
* Calquing
* Taboo syllables, e.g. those present in the king's name
* Mother-in-law language
* Audibility: loud sibilants, quiet whispers, silent signs
* [LISP] (which is not lisping but hissing)
* [UTTERANCES]
* Lisping, stuttering, Broca's aphasia, Wernicke's aphasia, muteness
* Language acquisition: babbling, jargon, holophrastic stage, telegraphic stage
* Effects of missing the critical period
* Creolization by kidnapped children
* Pidgins for merchants
* Orthography and spelling pronunciations
]]
-- Whether to run the inline self-tests (the `if TEST then` blocks that follow
-- the helper functions) while the script loads.
local TEST = true
-- NOTE(review): presumably the column at which report text is wrapped --
-- confirm at the use site.
local REPORT_LINE_LENGTH = 73
local DEFAULT_NODE_PROBABILITY_GIVEN_PARENT = 0.5
-- Bounds on fluency values; numerically the range of a signed 16-bit integer.
local MINIMUM_FLUENCY = -32768
local MAXIMUM_FLUENCY = 32767
local UTTERANCES_PER_XP = 16
local MINIMUM_DIMENSION_CACHE_SIZE = 32
local WORD_SEPARATOR = ' '
-- NOTE(review): nil presumably means that morphemes are joined without any
-- separator -- confirm at the use site.
local MORPHEME_SEPARATOR = nil
local WORD_ID_CHAR = '/'
local STANDARD_AMBIENT_TEMPERATURE = 10045
local NO_TEMPERATURE = 60001
-- NOTE(review): presumably the possible values of a node's `feature_class`,
-- which the data definitions mark as "should be used or removed" -- confirm.
local FEATURE_CLASS_NEUTRAL = 0
local FEATURE_CLASS_VOWEL = 1
local FEATURE_CLASS_CONSONANT = 2
-- Handler registry. `get` and `process_new` start out empty and `types` is a
-- fixed list of names. NOTE(review): presumably the two empty tables are
-- filled in, keyed by the names in `types`, further down -- confirm.
local total_handlers = {
get={},
process_new={},
types={
-- TODO: Track buildings.
'historical_figures',
'entities',
'sites',
'artifacts',
'regions',
'events',
}
}
-- Mutable script state with its initial values. NOTE(review): the -1 and nil
-- values look like "nothing loaded yet" sentinels -- confirm.
local next_report_index = 0
local region_x = -1
local region_y = -1
local region_z = -1
local unit_count = -1
local phonologies = nil
local lects = nil
local fluency_data = nil
-- `enabled` is assigned without `local`, so it is a global of the script's
-- environment. It is only initialized when it is still nil, which keeps any
-- value it already has. NOTE(review): presumably this lets the flag survive
-- between invocations of the script -- confirm.
if enabled == nil then
enabled = false
end
local dirty = true
--[[
Data definitions:
Lect:
A language or dialect.
seed: A random number generator seed for generating the lect's
language parameter table.
parameters: A language parameter table generated with `seed`, or nil
if it hasn't been generated yet.
lemmas: A translation containing the lemmas of this lect.
community: The civilization that speaks this lect.
phonology: The phonology this lect uses.
morphemes: A map of morpheme IDs to morphemes containing all the
morphemes used in this lect, including those only used in other
morphemes.
constituents: A lexicon, i.e. a map of constituent IDs to constituents
containing only the top-level constituents of this lect.
All the IDs, features, and feature values of all morphemes and
constituents in a lect must contain no characters invalid in raw tags.
Morpheme IDs must be positive integers such that `morphemes` is a
sequence, and the others must be strings.
Phonology:
name: A name, for internal use.
constraints: A sequence of constraints.
scalings: A sequence of scalings.
dispersions: A sequence of dispersions.
nodes: A sequence of nodes.
dimension: An optional dimension tree. If it is nil, a dimension tree
will be automatically generated.
symbols: A sequence of symbols.
articulators: A sequence of articulators.
Symbol:
symbol: A string.
features: The phoneme the symbol represents.
Constraint:
-- TODO
Articulator:
A conjunction of conditions which can apply to a unit. Each key in the
table can be nil, in which case that condition always applies. If all
the conditions apply to a unit, the articulator is present in that unit.
bp: A body part token string. The unit must have a body part with this
token. Example: 'U_LIP'.
bp_category: A body part category string. The unit must have a body
part with this category. Example: 'LIP'.
bp_flag: A body part flag. The unit must have a body part with this
flag. Example: 'HEAR'. At most one of `bp`, `bp_category`, and
`bp_flag` can be non-nil.
creature: A `creature_raw` from `df.global.world.raws.creatures.all`.
The unit must be an instance of this creature.
creature_class: A creature class string. The unit must be an instance
of a creature of this creature class. Example: 'GENERAL_POISON'.
creature_flag: A creature flag. The unit must be an instance of a
creature with this flag. Example: 'FANCIFUL'. At most one of
`creature`, `creature_class`, and `creature_flag` can be non-nil.
caste_index: An index into `creature.caste`. The unit's caste must be
at this index. If `creature` is nil, this field must be too.
caste_flag: A caste flag. The unit's caste must have this flag.
Example: 'EXTRAVISION'. At most one of `caste_index` and
`caste_flag` can be non-nil.
-- TODO: sex
Node:
name: A string, used only for parsing the phonology raw file.
parent: The index of the node's parent node in the associated
phonology, or 0 if the parent is the root node.
add: A string to append to the symbol of a phoneme that does not
have this node to denote a phoneme that does have it but is
otherwise identical.
remove: The opposite of `add`.
sonority: How much sonority this node adds to a phoneme. The number is
only meaningful in relation to other nodes' sonorities.
feature_class: TODO: This should be used or removed.
feature: Whether this node is a feature node, as opposed to a class
node.
prob: The probability that a phoneme has this node. It is 1 for class
nodes.
articulators: A sequence of articulators. At least one must be present
in a unit for that unit to produce a phoneme with this node.
Environment:
A table whose keys are indices of features and whose values are feature
environments for those features.
Feature environment:
A sequence of two sequences, each of whose keys are variable indices and
whose values are assignments. The two subsequences correspond to two
patterns in the scope of which the feature environment is being used.
Assignment:
A table representing a value bound to a variable name in an
environment.
val: A boolean for whether this variable has the same value as the
value of the variable that this variable is defined in terms of.
var: The index of the variable that this variable is defined in
terms of.
Phoneme:
A table whose keys are feature indices and whose values are booleans.
Dimension value:
A sequence of node indices sorted in increasing order.
score: A score of how good this dimension value is. It only has
meaning in comparison to other scores. A higher score means the
dimension value is more likely to be chosen.
Indexed dimension value:
A pair of a dimension value and an index.
i: The index of `candidate` in some unspecified sequence. This is only
useful if a function specifies what the index means.
candidate: A dimension value.
Bitfield:
A sequence of 32-bit unsigned integers representing a bitfield. Bit `b`
of element `e` represents bit `(e - 1) * 32 + b` in that bitfield. The
minimum bit is 0. There is no maximum bit; the bitfield will grow when
needed.
Grid metadata object:
Metadata about rows or columns in a grid.
values: A sequence of dimension values associated with the rows or
columns.
mask: A bitfield representing the dimensions associated with this
row or column.
Plus a sequence of the following, one per row or column:
score: The total score of the row or column.
value: The dimension value associated with the row or column.
family: A sequence of the indices of all the rows or columns split
from the same original row or column as this one, or nil if this
row or column has not been split.
Grid:
A grid of dimension values and metadata for the rows and columns.
grid: A sequence of sequences, each of which is the same length and
represents a row. The values in the grid are numbers, representing
the scores of dimension values.
rows: A grid metadata object for the rows.
cols: A grid metadata object for the columns.
Scaling:
A scaling factor to apply to the score of a dimension value if it matches
a pattern of specific feature values.
mask: A bitfield. If and only if bit `b` is set, this scaling depends
on the value of node `b + 1` in the appropriate sequence of nodes.
values: A bitfield. If and only if bit `b` is set here and in `mask`,
this scaling applies only when the feature is present.
scalar: How much to scale the score when this scaling applies. It is
non-negative.
strength: How strong the scaling factor is as a function of scalar.
A scalar of 0 gets the maximum strength, then the strength decreases
monotonically for scalars from 0 to 1, with a minimum strength at 1,
then increases monotonically for scalars greater than 1.
Dispersion:
A scaling factor to apply to the score of a dimension value if another
dimension value is picked.
mask: A bitfield. If and only if bit `b` is set, this dispersion
depends on the value of `b + 1` in the appropriate sequence of
nodes.
values_1: A bitfield. If and only if bit `b` is set here and in
`mask`, this dispersion applies only when the feature is present.
values_2: The same as `values_1`, but for the other dimension value.
scalar: How much to scale the score when this dispersion applies. It
is non-negative.
Dimension:
A producer of dimension values. A dimension may have two subdimensions
from whose cross product its values are drawn.
id: A sequence of one or two node indices. If there is only one, this
dimension corresponds to only one node. If there are two, they are
the indices of the two nodes in `nodes` which are most separated
from each other in the dimension tree.
cache: A sequence of dimension values.
nodes: A sequence of the node indices covered by this dimension.
mask: A bitfield corresponding to `nodes`.
d1: A dimension or nil.
d2: A dimension or nil. It is nil if and only if `d1` is.
values_1: A sequence of values chosen from `d1`, or nil if `d1` is
nil.
values_2: A sequence of values chosen from `d2`, or nil if `d2` is
nil.
scalings: A sequence of scalings which apply to the nodes of this
dimension but not to either of its subdimensions'. That is, each
scaling's `node_1` and `node_2` are present in `d1.nodes` and
`d2.nodes`, respectively or vice versa. If `d1` is nil, so is
`scalings`.
dispersions: Like `scalings`, but for dispersions.
peripheral: Whether to use a different algorithm to choose dimension
values based on picking from the periphery of the dimension's grid
and ignoring the interior.
Link:
A relationship between two dimensions, and how close the relationship
is. The details of the relationship are not specified here.
d1: A dimension.
d2: A dimension.
scalings: A sequence of scalings which apply between the two
dimensions. See `scalings` in dimension.
dispersions: A sequence of dispersions which apply between the two
dimensions. See `dispersions` in dimension.
strength: How strong the link is. See `strength` in scaling.
Boundary:
A string representing a boundary.
-- TODO: Enumerate them.
Pword:
A sequence of phonemes.
Mword:
A sequence of morphemes.
Utterable:
An mword or string.
SFI:
A syntactic feature instance.
feature: A feature.
head: The constituent this instance of `feature` is on.
depth: The depth of `head`.
Language parameter table:
inventory: A sequence of phoneme/sonority pairs used in this language.
[1]: A phoneme.
[2]: Its sonority.
min_sonority: The minimum sonority of all phonemes in `inventory`.
max_sonority: The maximum sonority of all phonemes in `inventory`.
constraints: TODO
strategies: A map from features to movement strategies or nil.
overt_trace: Whether the language keeps traces in the phonological
form.
swap: Whether the language is head-final.
Movement strategy:
What sort of movement to do when checking a certain feature.
lower: Whether to lower rather than raise.
pied_piping: Whether to pied-pipe the constituents dominated by the
maximal projection of the moving constituent along with it.
Context:
A table containing whatever is necessary to complete a syntax tree based
on the speaker, hearers, and any other non-constant information which
may differ between utterances of basically the same sentence. See
`context_key` and `context_callback` in constituent.
-- TODO: Should the keys be standardized?
Constituent:
A node in a syntax tree.
n1: A child constituent.
n2: A child constituent, which is nil if `n1` is.
features: A map of features to feature values.
morphemes: A sequence of morphemes. Unspecified if `ref` is not nil.
is_phrase: Whether this constituent is a phrase, i.e. a maximal
projection.
depth: The depth of the constituent from the root, where the root has
a depth of 0 and all others have depths one greater than their
parents'.
ref: The key of another constituent in the lexicon that this
constituent is to be replaced with.
maximal: The maximal projection of this constituent, or nil if none.
moved_to: The constituent to which this constituent was moved, or nil
if none.
text: A string to use verbatim in the output. If this is non-nil, then
`features` and `morphemes` must both be empty.
context_key: A key to look up in a context. At most one of `n1`,
`word`, `text`, and `context_key` can be non-nil.
context_callback: A function returning a constituent to replace
this one given `context[context_key]` and `context` where `context`
is a context. It is nil if and only if `context_key` is.
Morpheme:
id: A unique ID for this morpheme within its language.
text: A string to print for debugging.
pword: A sequence of phonemes.
features: A map of features to feature values.
affix: Whether this is a bound morpheme.
after: Whether this morpheme goes after (as opposed to before) another
morpheme when dislocating.
initial: Whether `after` should be taken in reference to the first
subunit of the unit relative to which this morpheme is dislocated.
fusion: A map of morpheme IDs to morphemes, representing the fusion of
this morpheme and the key to produce the value.
dummy: A morpheme to insert if this is a bound morpheme but there are
no morphemes to bind to, or nil if this morpheme should just be
deleted in that case.
Translation:
A sequence of tag strings in the format of the values of
`df.global.world.raws.language.translations`.
Loan:
A sequence of tables, each with the keys:
prefix: The string to prepend to the ID of the referent.
type: The DFHack struct type of the referent.
id: The name of the field where a referent of type `type` has an ID.
get: A function:
Gets the referents of type `type` from a civilization.
Args:
civ: A civilization.
Returns:
The array of referents.
]]
--[[
Prints the help message for the script.

The message lists the two subcommands, `babel start` and `babel stop`,
each followed by a one-line description.
]]
local function usage()
  local help_lines = {
    'Usage:',
    'babel start',
    'Start the script.',
    'babel stop',
    'Stop the script.',
    '',  -- Makes the message end with a newline.
  }
  print(table.concat(help_lines, '\n'))
end
--[[
Asserts that two values are equal.

Two tables are compared entry by entry, recursively and in both
directions, so a key present in only one of them counts as a mismatch.
Any other pair of values is compared with `==`.

Args:
  actual: The value under test.
  expected: The value it should be equal to.
  _k: For internal use by the recursion: the key under which the two
    values were found. It is appended to the failure message.

Raises:
  An assertion error naming the expected and the actual value (and the
  key, if any) at the first mismatch.
]]
local function assert_eq(actual, expected, _k)
  if type(actual) ~= 'table' or type(expected) ~= 'table' then
    local where = ''
    if _k then
      where = ', index: ' .. tostring(_k)
    end
    assert(expected == actual,
           'expected: ' .. tostring(expected) ..
           ', actual: ' .. tostring(actual) .. where)
    return
  end
  -- Everything that is expected must be there...
  for key, wanted in pairs(expected) do
    assert_eq(actual[key], wanted, key)
  end
  -- ...and nothing unexpected may be there.
  for key, found in pairs(actual) do
    assert_eq(found, expected[key], key)
  end
end
--[[
Joins two sequences.

The result is a new table obtained from `copyall(a)`; `b` is only read.

Args:
  a: A sequence, whose elements come first.
  b: A sequence, whose elements come after those of `a`.

Returns:
  The concatenation of `a` and `b`.
]]
local function concatenate(a, b)
  local offset = #a
  local joined = copyall(a)
  for index, element in ipairs(b) do
    joined[offset + index] = element
  end
  return joined
end

if TEST then
  assert_eq(concatenate({1, '2'}, {{n=3}, false}), {1, '2', {n=3}, false})
end
--[[
Shuffles a sequence randomly, in place.

Going from the last position down to the second, each position is
swapped with a position chosen by `rng` among those up to and including
it.

Args:
  t! A sequence.
  rng! A random number generator. NOTE(review): `rng:random(n)` is
    presumably an integer from 0 to n - 1, as the `+ 1` suggests --
    confirm against the generator in use.

Returns:
  `t`, randomly shuffled.
]]
local function shuffle(t, rng)
  local position = #t
  while position >= 2 do
    local pick = rng:random(position) + 1
    t[position], t[pick] = t[pick], t[position]
    position = position - 1
  end
  return t
end
--[[
Finds the last instance of an element in a vector.

The search runs from the end towards the start. Objects for which
`df.isvalid` returns a true value are indexed from 0; anything else is
treated as a Lua sequence indexed from 1.

Args:
  vector: A vector or sequence.
  key: What to search for.
  field: The field which is equal to `key` in the sought element, or
    nil to compare the element itself to `key`.

Returns:
  The last index of a matching element, and that element. Nothing is
  returned if no element matches.
]]
local function reverse_linear_index(vector, key, field)
  local shift = df.isvalid(vector) and 1 or 0
  local first, last = 1 - shift, #vector - shift
  for index = last, first, -1 do
    local element = vector[index]
    local probe = element
    if field then
      probe = element[field]
    end
    if probe == key then
      return index, element
    end
  end
end

if TEST then
  assert_eq({reverse_linear_index({1, 2, 1}, 3)}, {})
  assert_eq({reverse_linear_index({1, 2, 1}, 1)}, {3, 1})
  assert_eq({reverse_linear_index({{k=1}, {j=1}, {j=1, k=1}, {k=2}}, 1, 'k')},
            {3, {j=1, k=1}})
end
--[[
Percent-encodes certain characters of a string.

Every NUL, line feed, carriage return, 0x1A byte, percent sign, colon,
and closing square bracket is replaced by a percent sign followed by the
byte's value as two uppercase hexadecimal digits. Everything else is
kept as it is. `unescape` decodes the result.

Args:
  str: A string.

Returns:
  The encoded string.
]]
local function escape(str)
  local function encode(char)
    return ('%%%02X'):format(char:byte())
  end
  local escaped = str:gsub('[\x00\n\r\x1a%%:%]]', encode)
  return escaped
end

if TEST then
  assert_eq(escape('<]:\r\n|%\x1a\x00>'), '<%5D%3A%0D%0A|%25%1A%00>')
end
--[[
Decodes the percent-encoding produced by `escape`.

Every percent sign followed by two hexadecimal digits (of either case)
is replaced by the byte with that value. A percent sign not followed by
two hexadecimal digits is left alone.

Args:
  str: A string.

Returns:
  The decoded string.
]]
local function unescape(str)
  local decoded = str:gsub('%%(%x%x)', function(hex)
    return string.char(tonumber(hex, 16))
  end)
  return decoded
end

if TEST then
  assert_eq(unescape('(%5D%3A%0a|%25%1A)'), '(]:\n|%\x1a)')
end
--[[
Serializes a pword.

Each phoneme is packed into bytes, one bit per feature node, in node
order, starting from the most significant bit. Class nodes take up no
bits. A byte is written out as soon as it holds eight feature bits, and
also when the last node has been handled, in which case its unused low
bits are 0.

Args:
  nodes: A sequence of the nodes used in `pword`'s lect.
  pword: A pword.

Returns:
  An opaque string serialization of the pword, which can be deserialized
  with `deserialize_pword`.
]]
local function serialize_pword(nodes, pword)
  local chunks = {}
  local node_count = #nodes
  for _, phoneme in ipairs(pword) do
    local value = 0
    local bits_used = 0
    for index = 1, node_count do
      if nodes[index].feature then
        bits_used = bits_used + 1
        if phoneme[index] then
          value = value + 2 ^ (8 - bits_used)
        end
      end
      if index == node_count or bits_used == 8 then
        chunks[#chunks + 1] = string.format('%c', value)
        value = 0
        bits_used = 0
      end
    end
  end
  return table.concat(chunks)
end

if TEST then
  local fn = {feature=true}
  local cn = {feature=false}
  local nodes = {fn, fn, fn, fn, fn, fn, fn, fn}
  assert_eq(serialize_pword(nodes, {}), '')
  assert_eq(serialize_pword(nodes, {{}}), '\x00')
  assert_eq(serialize_pword(nodes, {{[1]=true}}), '\x80')
  assert_eq(serialize_pword(nodes, {{[2]=true}}), '\x40')
  assert_eq(serialize_pword(nodes, {{[3]=true}}), '\x20')
  assert_eq(serialize_pword(nodes, {{[4]=true}}), '\x10')
  assert_eq(serialize_pword(nodes, {{[5]=true}}), '\x08')
  assert_eq(serialize_pword(nodes, {{[6]=true}}), '\x04')
  assert_eq(serialize_pword(nodes, {{[7]=true}}), '\x02')
  assert_eq(serialize_pword(nodes, {{[8]=true}}), '\x01')
  assert_eq(serialize_pword(concatenate(nodes, {fn}), {{[9]=true}}),
            '\x00\x80')
  assert_eq(serialize_pword(concatenate(nodes, {cn, fn}),
                            {{[9]=true, [10]=true}}),
            '\x00\x80')
  assert_eq(serialize_pword(concatenate(nodes, concatenate(nodes, nodes)),
                            {{[12]=true}}),
            '\x00\x10\x00')
  assert_eq(serialize_pword(nodes, {{false, true, true, true, true, true}}),
            '\x7c')
  assert_eq(serialize_pword(nodes, {{true}, {true}}), '\x80\x80')
end
--[[
Deserializes a pword.

This undoes `serialize_pword`: the bits of each byte are read from the
most significant one downwards and assigned to the feature nodes in node
order. A class node uses no bit and is always reported as present
(true). When the nodes run out in the middle of a byte, the rest of that
byte is ignored. A phoneme is complete only once all the nodes,
including the last one, have been handled.

Args:
  nodes: A sequence of the nodes used in the target lect.
  str: A serialized pword.

Returns:
  A pword. Each phoneme is a sequence with one boolean per node.
]]
local function deserialize_pword(nodes, str)
  local pword = {}
  local phoneme = {}
  local node_count = #nodes
  local ni = 1
  for i = 1, #str do
    local code = str:byte(i)
    local b = 8
    while b >= 1 do
      local node = nodes[ni]
      if not node then
        -- Past the last node: the remaining bits are padding.
        b = 0
      elseif node.feature then
        -- Test bit `b`, where bit 8 is the most significant one.
        table.insert(phoneme, (code % (2 ^ b)) >= (2 ^ (b - 1)))
        b = b - 1
      else
        table.insert(phoneme, true)
      end
      ni = ni + 1
    end
    -- `ni` is the next node to handle, so the phoneme is complete only
    -- when `ni` is past the last node. Comparing with `>=` here would cut
    -- the phoneme short whenever a byte ends just before the last node.
    if ni > node_count then
      pword[#pword + 1] = phoneme
      phoneme = {}
      ni = 1
    end
  end
  return pword
end

if TEST then
  local fn = {feature=true}
  local cn = {feature=false}
  local n1 = {fn, fn, fn, fn, fn, fn, fn, fn}
  local n2 = concatenate(n1, n1)
  local n3 = concatenate(n1, n2)
  assert_eq(deserialize_pword(n1, '\x00'),
            {{false, false, false, false, false, false, false, false}})
  assert_eq(deserialize_pword(n1, '\x80'),
            {{true, false, false, false, false, false, false, false}})
  assert_eq(deserialize_pword(n1, '\x40'),
            {{false, true, false, false, false, false, false, false}})
  assert_eq(deserialize_pword(n1, '\x20'),
            {{false, false, true, false, false, false, false, false}})
  assert_eq(deserialize_pword(n1, '\x10'),
            {{false, false, false, true, false, false, false, false}})
  assert_eq(deserialize_pword(n1, '\x08'),
            {{false, false, false, false, true, false, false, false}})
  assert_eq(deserialize_pword(n1, '\x04'),
            {{false, false, false, false, false, true, false, false}})
  assert_eq(deserialize_pword(n1, '\x02'),
            {{false, false, false, false, false, false, true, false}})
  assert_eq(deserialize_pword(n1, '\x01'),
            {{false, false, false, false, false, false, false, true}})
  assert_eq(deserialize_pword(n2, '\x00\x80'),
            {{false, false, false, false, false, false, false, false, true,
              false, false, false, false, false, false, false}})
  assert_eq(deserialize_pword(n3, '\x00\x10\x00'),
            {{false, false, false, false, false, false, false, false, false,
              false, false, true, false, false, false, false, false, false,
              false, false, false, false, false, false}})
  assert_eq(deserialize_pword(n1, '\x7c'),
            {{false, true, true, true, true, true, false, false}})
  assert_eq(deserialize_pword(n1, '\x80\x80'),
            {{true, false, false, false, false, false, false, false},
             {true, false, false, false, false, false, false, false}})
  assert_eq(deserialize_pword(concatenate({cn}, n1), '\x80'),
            {{true, true, false, false, false, false, false, false, false}})
  -- A byte boundary right before the last node must not end the phoneme.
  assert_eq(deserialize_pword(concatenate(n1, {fn}), '\x00\x80'),
            {{false, false, false, false, false, false, false, false, true}})
  assert_eq(deserialize_pword(concatenate(n1, {cn}), '\x80\x00'),
            {{true, false, false, false, false, false, false, false, true}})
end
--[[
Merges two sorted sequences that have no duplicates.

The elements of `s1` are copied in order into a new sequence, and each
element of `s2` is then added to it with `utils.insert_sorted`. An
element present in both inputs appears only once in the result. Neither
input is modified.

Args:
  s1: A sequence, sorted according to `cmpfun`.
  s2: A sequence, sorted according to `cmpfun`.
  cmpfun: A comparator function, or `utils.compare` by default.

Returns:
  A merged sorted sequence.
]]
local function merge_sorted_sequences(s1, s2, cmpfun)
  local merged = {}
  for index, element in ipairs(s1) do
    merged[index] = element
  end
  for _, element in ipairs(s2) do
    utils.insert_sorted(merged, element, nil, cmpfun)
  end
  return merged
end

if TEST then
  assert_eq(merge_sorted_sequences({1, 2, 5}, {-1, 3, 4, 5, 100}),
            {-1, 1, 2, 3, 4, 5, 100})
  assert_eq(
    merge_sorted_sequences(
      {5, 2, 1}, {100, 5, 4, 3, -1},
      function(a, b) return utils.compare(b, a) end),
    {100, 5, 4, 3, 2, 1, -1})
end
--[[
Determines whether one node dominates another.

Every node dominates itself. The check climbs from `index_2` towards the
root for as long as the current index is greater than `index_1`, so it
relies on every node having a greater index than its parent. Index 0
stands for the root.

Args:
  index_1: The index of a node in `nodes`.
  index_2: The index of a node in `nodes`.
  nodes: A sequence of nodes.

Returns:
  Whether `nodes[index_1]` dominates `nodes[index_2]`.
]]
local function dominates(index_1, index_2, nodes)
  local current = index_2
  while current > index_1 do
    current = nodes[current].parent
  end
  return current == index_1
end

if TEST then
  local nodes = {{name='1', parent=0, sonority=0},
                 {name='2', parent=0, sonority=0},
                 {name='3', parent=2, sonority=0}}
  assert_eq(dominates(0, 0, nodes), true)
  assert_eq(dominates(0, 1, nodes), true)
  assert_eq(dominates(0, 2, nodes), true)
  assert_eq(dominates(0, 3, nodes), true)
  assert_eq(dominates(1, 0, nodes), false)
  assert_eq(dominates(1, 1, nodes), true)
  assert_eq(dominates(1, 2, nodes), false)
  assert_eq(dominates(1, 3, nodes), false)
  assert_eq(dominates(2, 0, nodes), false)
  assert_eq(dominates(2, 1, nodes), false)
  assert_eq(dominates(2, 2, nodes), true)
  assert_eq(dominates(2, 3, nodes), true)
  assert_eq(dominates(3, 0, nodes), false)
  assert_eq(dominates(3, 1, nodes), false)
  assert_eq(dominates(3, 2, nodes), false)
  assert_eq(dominates(3, 3, nodes), true)
end
--[[
Placeholder for the phonological optimization of a pword.

For the time being this only returns `copyall(input)`; `parameters` and
`is_loan` are not used.

Args:
  parameters: A language parameter table. Not used yet.
  input: A pword.
  is_loan: Whether `input` is a loanword. Not used yet.

Returns:
  The result of `copyall(input)`.
]]
local function optimize(parameters, input, is_loan)
  -- TODO: Sketch of the intended behavior (not valid Lua yet):
  --   if is_loan then
  --     for _, phoneme in pairs(output) do
  --       if phoneme not in parameters.inventory then
  --         phoneme = closest_phoneme(phoneme, parameters.inventory)
  --       end
  --     end
  --   end
  --   output = best_candidate(1, parameters.constraints, input, output)
  return (copyall(input))
end
--[[
Gets the lemma of a pword.

For every phoneme, the symbol of `phonology.symbols` which agrees with
the phoneme on the most feature nodes is chosen, where a missing value
counts as false and the earliest symbol wins a tie. The chosen symbols
are concatenated in order.

Args:
  phonology: A phonology.
  pword: A pword.

Returns:
  The lemma.
]]
local function get_lemma(phonology, pword)
  local pieces = {}
  for _, phoneme in ipairs(pword) do
    local chosen_symbol = ''
    local chosen_score = -1
    local chosen_base_score = -1
    for _, candidate in ipairs(phonology.symbols) do
      local symbol = candidate.symbol
      local candidate_features = candidate.features
      -- The base score is the number of feature nodes on which the
      -- candidate and the phoneme agree.
      local base_score = 0
      for index, node in ipairs(phonology.nodes) do
        if node.feature then
          local wanted = phoneme[index] or false
          local offered = candidate_features[index] or false
          if wanted == offered then
            base_score = base_score + 1
          end
        end
      end
      local score = base_score
      -- Disabled: adjusting the symbol with `add` and `remove` strings.
      --[[
      for i, node in pairs(phonology.nodes) do
        if not phoneme[i] ~= not symbol_features[i] then
          if node.add and phoneme[i] then
            symbol = symbol .. node.add
            score = score + 1
          elseif node.remove and not phoneme[i] then
            symbol = symbol .. node.remove
            score = score + 1
          elseif phoneme[i] and node.feature then
            symbol = symbol .. '[+' .. node.name .. ']'
          elseif not phoneme[i] and node.feature then
            symbol = symbol .. '[-' .. node.name .. ']'
          end
        end
      end
      ]]
      local is_better = score > chosen_score
      local wins_tie = (score == chosen_score and
                        base_score > chosen_base_score)
      if is_better or wins_tie then
        chosen_symbol = symbol
        chosen_score = score
        chosen_base_score = base_score
      end
    end
    pieces[#pieces + 1] = chosen_symbol
  end
  return table.concat(pieces)
end

if TEST and TODO then
  assert_eq(get_lemma({nodes={}, symbols={}}, {{}}), '')
  local phonology = {nodes={{name='a', parent=0, add='+a', remove='-a',
                             feature=true},
                            {name='b', parent=0, add='+b', remove='-b',
                             feature=true},
                            {name='c', parent=0, add='+c', remove='-c',
                             feature=true}},
                     symbols={{symbol='x', features={false, false, false}},
                              {symbol='abc', features={true, true, true}}}}
  assert_eq(get_lemma(phonology, {{false, false, false}}), 'x')
  assert_eq(get_lemma(phonology, {{false, false, true}}), 'x+c')
  assert_eq(get_lemma(phonology, {{false, true, false}}), 'x+b')
  assert_eq(get_lemma(phonology, {{false, true, true}}), 'abc-a')
  assert_eq(get_lemma(phonology, {{true, false, false}}), 'x+a')
  assert_eq(get_lemma(phonology, {{true, false, true}}), 'abc-b')
  assert_eq(get_lemma(phonology, {{true, true, false}}), 'abc-c')
  assert_eq(get_lemma(phonology, {{true, true, true}}), 'abc')
  assert_eq(get_lemma(phonology, {{true, false, true}, {true, true, false}}),
            'abc-babc-c')
  table.remove(phonology.symbols, 1)
  assert_eq(get_lemma(phonology, {{false, false, false}}), 'abc-a-b-c')
end
--[[
local function best_candidate(constraint_index, constraints, original,
candidate, violation_counts)
local violations =
violations(constraints[constraint_index], original, candidate)
if #violations == 0 then
if constraint_index == #constraints then
return candidate, i, 0
else
return best_candidate(constraint_index + 1, constraints, original,
candidate)
end
end
local actions =
actions(constraint_index, constraints, candidate, violations[1])
if #actions == 0 then
return candidate, i, #violations
end
let best_violated_constraint_index = 1
let best_violated_constraint_count = math.huge
for action in actions do
local new_candidate, new_constraint_index, new_constraint_count =
best_candidate(constraint_index, apply_action(action, candidate))
end
end
--]]
--[[
Updates a binding in an environment.

Does `feature_env[lvalue_i][lvalue_var] = new` and modifies
`feature_env` to be consistent with the new assignment: every assignment
in either pattern which was defined in terms of the rebound variable is
redirected to what `new` is defined in terms of, and its `val` is
combined with `new.val`.

Args:
  feature_env! A feature environment.
  lvalue_i: Which pattern (1 or 2) the identifier is from.
  lvalue_var: The identifier to bind to a new assignment.
  new: An assignment. Besides `val` and `var`, its `i` field says which
    pattern `var` belongs to.

Returns:
  Whether the new binding is consistent with the original feature
  environment.
]]
local function update_binding(feature_env, lvalue_i, lvalue_var, new)
  if lvalue_i == new.i and lvalue_var == new.var then
    -- The variable is being defined in terms of itself. That is
    -- consistent only if it is said to be equal to itself. Nothing is
    -- stored in this case.
    return new.val
  end
  feature_env[lvalue_i][lvalue_var] = new
  for i = 1, 2 do
    for _, other in pairs(feature_env[i]) do
      if other.i == lvalue_i and other.var == lvalue_var then
        other.i = new.i
        -- `other` keeps the same value as its new target only if it
        -- agreed with the old target exactly when the old target agrees
        -- with the new one.
        other.val = other.val == new.val
        other.var = new.var
      end
    end
  end
  return true
end
-- Makes two assignments equal, recording in a feature environment whatever
-- that requires of the variables involved.
--
-- A feature assignment is a pair of a value and a var. A value is a
-- boolean. A var is a non-negative integer representing a variable.
-- The var 0 is always true.
-- Every assignment is one of:
-- * a literal boolean
-- * a variable with no prior information (i.e. nil in feature_env)
-- * a variable with a known boolean value
-- * a variable with a known relationship to another variable
-- i=0 means i=<don't care>
--
-- Args:
--   a1: An assignment from the first pattern.
--   a2: An assignment from the second pattern.
--   feature_env! A feature environment. New bindings are added to it with
--     `update_binding`.
--
-- Returns:
--   Whether the two assignments can be equal.
local function equalize(a1, a2, feature_env)
  if a1.var == 0 then
    -- `a1` is a literal.
    if a2.var == 0 then
      return a1.val == a2.val
    elseif not feature_env[2][a2.var] then
      return update_binding(feature_env, 2, a2.var,
                            {i=0, val=a1.val, var=a1.var})
    elseif feature_env[2][a2.var].var == 0 then
      return a1.val == feature_env[2][a2.var].val
    else
      return equalize(a1, feature_env[2][a2.var], feature_env)
    end
  elseif not feature_env[1][a1.var] then
    -- `a1` is a variable with no prior information.
    if a2.var == 0 then
      return update_binding(feature_env, 1, a1.var,
                            {i=0, val=a2.val, var=a2.var})
    elseif not feature_env[2][a2.var] then
      return update_binding(feature_env, 1, a1.var,
                            {i=2, val=a2.val, var=a2.var})
    else
      -- The same binding is made whether `a2`'s variable is known to be
      -- a literal or is related to another variable.
      return update_binding(feature_env, 1, a1.var, feature_env[2][a2.var])
    end
  elseif feature_env[1][a1.var].var == 0 then
    -- `a1` is a variable with a known boolean value.
    if a2.var == 0 then
      return feature_env[1][a1.var].val == a2.val
    elseif not feature_env[2][a2.var] then
      return update_binding(feature_env, 2, a2.var, feature_env[1][a1.var])
    elseif feature_env[2][a2.var].var == 0 then
      return feature_env[1][a1.var].val == feature_env[2][a2.var].val
    else
      return equalize(feature_env[1][a1.var], feature_env[2][a2.var],
                      feature_env)
    end
  else
    -- `a1` is a variable with a known relationship to another variable.
    if a2.var == 0 then
      return equalize(feature_env[1][a1.var], a2, feature_env)
    elseif not feature_env[2][a2.var] then
      return update_binding(feature_env, 2, a2.var, feature_env[1][a1.var])
    else
      -- The same recursion applies whether `a2`'s variable is known to be
      -- a literal or is related to another variable.
      return equalize(feature_env[1][a1.var], feature_env[2][a2.var],
                      feature_env)
    end
  end
end
--[[
Merges the assignments of a second feature set into an overlap.

An assignment of `phoneme_2` for a feature which `overlap` lacks is
copied into `overlap`, marked as coming from pattern 2. For a feature
which both have, the two assignments are equalized in that feature's
feature environment, which is created if `env` does not have one yet.

Args:
  overlap! A table whose keys are feature indices and whose values are
    assignments.
  phoneme_2: A table whose keys are feature indices and whose values are
    assignments.
  env! An environment.

Returns:
  `overlap`, or nil if the assignments for some feature cannot be made
  equal. `overlap` and `env` may already have been modified in that
  case.
]]
local function get_feature_set_overlap(overlap, phoneme_2, env)
  for i, a2 in pairs(phoneme_2) do
    local a1 = overlap[i]
    if a1 then
      if not env[i] then
        env[i] = {{}, {}}
      end
      -- `equalize` takes the two assignments and the feature
      -- environment, and nothing else.
      if not equalize(a1, a2, env[i]) then
        return nil
      end
    else
      overlap[i] = {i=2, val=a2.val, var=a2.var}
    end
  end
  return overlap
end
local function get_overlap(element_1, element_2, env)
if element_1.type == 'phoneme' then
if element_2.type == 'phoneme' then
return get_feature_set_overlap(copyall(element_1), element_2, env)
elseif element_2.type == 'boundary' then
return nil
else
return get_feature_set_overlap({[element_2.feature]={val=false, var=0}},
element_1, env)
end
elseif element_1.type == 'boundary' then
if element_2.type == 'phoneme' then
return nil
elseif element_2.type == 'boundary' then
return get_feature_set_overlap(copyall(element_1), element_2, {})
else
return get_feature_set_overlap(copyall(element_1), element_2.boundaries,
{})
end
else
if element_2.type == 'phoneme' then
return get_feature_set_overlap({[element_1.feature]={val=false, var=0}},
element_2, env)