#! /usr/bin/python3

# mtu_tur.py
#
# Extracts data from MTU.TUR, which is required for Turkish-English dictionary,
# Türkçe Eş Anlamlılar dictionary and Türkçe Leb Demeden feature.
#
# MTU.TUR consists of seven parts:
# 1- Header (12 bytes)
# 2- 1st section (66 bytes)
# 3- 2nd section (2050 bytes)
# 4- 3rd section (45052 bytes)
# 5- 4th section (107100 bytes)
# 6- 5th section (62800 bytes)
# 7- 6th section (3640 bytes)

import os
import struct

# MTU.TUR encodes all text in its own custom alphabet, where 0x00 is 'a', 0x01
# is 'b' and so on.
alphabet = "abcçdefgğhıijklmnoöpqrsştuüvwxyzâ..........î..............û"


def GetSuffixLength(value):
    """Return the suffix length encoded by an instruction byte.

    Values 0x00-0xb7 map to lengths 0-22 in steps of 8 (0x00-0x07 -> 0,
    0x08-0x0f -> 1, ..., 0xb0-0xb7 -> 22). Values 0xb8-0xff map to lengths
    3-5 in steps of 0x18 (0xb8-0xcf -> 3, 0xd0-0xe7 -> 4, 0xe8-0xff -> 5).

    Returns None for any value outside 0x00-0xff.
    """
    # 0x00-0x08: 0, 0x08-0x10: 1, 0x10-0x18: 2, (...), 0xb0-0xb8: 22
    if 0x00 <= value < 0xb8:
        return value // 8
    # 0xb8-0xd0: 3, 0xd0-0xe8: 4, 0xe8-0x100: 5
    elif 0xb8 <= value < 0x100:
        return 3 + ((value - 0xb8) // 0x18)
    else:
        return None


def GetSuffixReodered(suffix, value):
    """Return `suffix` with its letters reordered as selected by `value`.

    Values of 0xb8 and above are first reduced to their position inside
    their 0x18-wide band. The resulting number then selects the operation:
    0x00-0x07 rotates right by one, 0x08-0x0f rotates left by one,
    0x10-0x17 reverses the string. Any other value leaves the suffix
    unchanged.

    An empty suffix is returned unchanged instead of raising IndexError.
    """
    # NOTE(review): the indentation of this function was lost in the text it
    # was recovered from. The dispatch below is assumed to sit at the same
    # level as the `value >= 0xb8` check rather than nested inside it; the
    # two readings only differ for values below 0x18 -- confirm against the
    # original file.
    if value >= 0xb8:
        value = (value - 0xb8) % 0x18
    if 0x00 <= value < 0x08:
        # 'abcd' -> 'dabc'
        # Slices (not indexing) so that an empty suffix stays empty.
        suffix = suffix[-1:] + suffix[:-1]
    elif 0x08 <= value < 0x10:
        # 'abcd' -> 'bcda'
        suffix = suffix[1:] + suffix[:1]
    elif 0x10 <= value < 0x18:
        # 'abcd' -> 'dcba'
        suffix = suffix[::-1]

    return suffix


# NOTE(review): the definition of GetSuffix is cut off in the text this file
# was recovered from (it breaks off inside a struct.unpack call), so it is
# kept below exactly as far as it is visible instead of being completed by
# guesswork. Restore it from the original file.
#
# def GetSuffix(data, instructions, base_offset):
#     suffix = ''
#     suffix_length = GetSuffixLength(instructions[1])
#
#     if suffix_length == 0:
#         # TODO: What's the purpose of [2] and [3] here?
#         pass
#     # One/Two-letter suffixes are formed directly from our custom alphabet.
#     elif 1 <= suffix_length <= 2:
#         for i in range(0, suffix_length):
#             suffix += alphabet[instructions[2 + i]]
#     # For anything else, we need to read the suffix from the 5th section.
#     else:
#         offset = struct.unpack("
#
# [text missing here]
#
# # "aba yak"). Doesn't seem to affect Leb Demeden.
+ section3 = [] + for i in range(0, header[1]): # 3218 + pos += 1 + value = struct.unpack("