diff --git a/chardet/langarabicmodel.py b/chardet/langarabicmodel.py index 02a02b07..e96f3d01 100644 --- a/chardet/langarabicmodel.py +++ b/chardet/langarabicmodel.py @@ -1,9 +1,7 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -62768,1070 +62766,1077 @@ # Character Mapping Table(s): ISO_8859_6_ARABIC_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' 
- 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, # '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' - 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 255, # None - 162: 255, # None - 163: 255, # None - 164: 253, # '¤' - 165: 255, # None - 166: 255, # None - 167: 255, # None - 168: 255, # None - 169: 255, # None - 170: 255, # 
None - 171: 255, # None - 172: 253, # '،' - 173: 251, # '\xad' - 174: 255, # None - 175: 255, # None - 176: 255, # None - 177: 255, # None - 178: 255, # None - 179: 255, # None - 180: 255, # None - 181: 255, # None - 182: 255, # None - 183: 255, # None - 184: 255, # None - 185: 255, # None - 186: 255, # None - 187: 253, # '؛' - 188: 255, # None - 189: 255, # None - 190: 255, # None - 191: 253, # '؟' - 192: 255, # None - 193: 255, # 'ء' - 194: 255, # 'آ' - 195: 255, # 'أ' - 196: 255, # 'ؤ' - 197: 255, # 'إ' - 198: 255, # 'ئ' - 199: 255, # 'ا' - 200: 255, # 'ب' - 201: 255, # 'ة' - 202: 255, # 'ت' - 203: 255, # 'ث' - 204: 255, # 'ج' - 205: 255, # 'ح' - 206: 255, # 'خ' - 207: 255, # 'د' - 208: 255, # 'ذ' - 209: 255, # 'ر' - 210: 255, # 'ز' - 211: 255, # 'س' - 212: 255, # 'ش' - 213: 255, # 'ص' - 214: 255, # 'ض' - 215: 255, # 'ط' - 216: 255, # 'ظ' - 217: 255, # 'ع' - 218: 255, # 'غ' - 219: 255, # None - 220: 255, # None - 221: 255, # None - 222: 255, # None - 223: 255, # None - 224: 255, # 'ـ' - 225: 255, # 'ف' - 226: 255, # 'ق' - 227: 255, # 'ك' - 228: 255, # 'ل' - 229: 255, # 'م' - 230: 255, # 'ن' - 231: 255, # 'ه' - 232: 255, # 'و' - 233: 255, # 'ى' - 234: 255, # 'ي' - 235: 253, # 'ً' - 236: 253, # 'ٌ' - 237: 253, # 'ٍ' - 238: 253, # 'َ' - 239: 253, # 'ُ' - 240: 253, # 'ِ' - 241: 253, # 'ّ' - 242: 253, # 'ْ' - 243: 255, # None - 244: 255, # None - 245: 255, # None - 246: 255, # None - 247: 255, # None - 248: 255, # None - 249: 255, # None - 250: 255, # None - 251: 255, # None - 252: 255, # None - 253: 255, # None - 254: 255, # None - 255: 255, # None + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 
22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, 
# '\x86' + 135: 251, # '\x87' + 136: 251, # '\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 255, # None + 162: 255, # None + 163: 255, # None + 164: 253, # '¤' + 165: 255, # None + 166: 255, # None + 167: 255, # None + 168: 255, # None + 169: 255, # None + 170: 255, # None + 171: 255, # None + 172: 253, # '،' + 173: 251, # '\xad' + 174: 255, # None + 175: 255, # None + 176: 255, # None + 177: 255, # None + 178: 255, # None + 179: 255, # None + 180: 255, # None + 181: 255, # None + 182: 255, # None + 183: 255, # None + 184: 255, # None + 185: 255, # None + 186: 255, # None + 187: 253, # '؛' + 188: 255, # None + 189: 255, # None + 190: 255, # None + 191: 253, # '؟' + 192: 255, # None + 193: 255, # 'ء' + 194: 255, # 'آ' + 195: 255, # 'أ' + 196: 255, # 'ؤ' + 197: 255, # 'إ' + 198: 255, # 'ئ' + 199: 255, # 'ا' + 200: 255, # 'ب' + 201: 255, # 'ة' + 202: 255, # 'ت' + 203: 255, # 'ث' + 204: 255, # 'ج' + 205: 255, # 'ح' + 206: 255, # 'خ' + 207: 255, # 'د' + 208: 255, # 'ذ' + 209: 255, # 'ر' + 210: 255, # 'ز' + 211: 255, # 'س' + 212: 255, # 'ش' + 213: 255, # 'ص' + 214: 255, # 'ض' + 215: 255, # 'ط' + 216: 255, # 'ظ' + 217: 255, # 'ع' + 218: 255, # 'غ' + 219: 255, # None + 220: 255, # None + 221: 255, # None + 222: 255, # None + 223: 255, # None + 224: 255, # 'ـ' + 225: 255, # 'ف' + 226: 255, # 'ق' + 227: 255, # 'ك' + 228: 255, # 'ل' + 229: 255, # 'م' + 230: 255, # 'ن' + 231: 255, # 'ه' + 232: 255, # 'و' + 233: 255, # 'ى' + 234: 255, # 'ي' + 235: 253, # 'ً' + 236: 253, # 'ٌ' + 237: 253, # 'ٍ' + 238: 253, # 'َ' + 
239: 253, # 'ُ' + 240: 253, # 'ِ' + 241: 253, # 'ّ' + 242: 253, # 'ْ' + 243: 255, # None + 244: 255, # None + 245: 255, # None + 246: 255, # None + 247: 255, # None + 248: 255, # None + 249: 255, # None + 250: 255, # None + 251: 255, # None + 252: 255, # None + 253: 255, # None + 254: 255, # None + 255: 255, # None } -ISO_8859_6_ARABIC_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-6', - language='Arabic', - char_to_order_map=ISO_8859_6_ARABIC_CHAR_TO_ORDER, - language_model=ARABIC_LANG_MODEL, - typical_positive_ratio=0.917216226959467, - keep_ascii_letters=False, - alphabet='ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ') +ISO_8859_6_ARABIC_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-6", + language="Arabic", + char_to_order_map=ISO_8859_6_ARABIC_CHAR_TO_ORDER, + language_model=ARABIC_LANG_MODEL, + typical_positive_ratio=0.917216226959467, + keep_ascii_letters=False, + alphabet="ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ", +) WINDOWS_1256_ARABIC_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' 
- 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 253, # '€' - 129: 255, # 'پ' - 130: 253, # '‚' - 131: 255, # 'ƒ' - 132: 253, # '„' - 133: 253, # '…' - 134: 253, # '†' - 135: 253, # '‡' - 136: 255, # 'ˆ' - 137: 253, # '‰' - 138: 255, # 'ٹ' - 139: 253, # '‹' - 140: 255, # 'Œ' - 141: 255, # 'چ' - 142: 255, # 'ژ' - 143: 255, # 'ڈ' - 144: 255, # 'گ' - 145: 253, # '‘' - 146: 253, # '’' - 147: 253, # '“' - 148: 253, # '”' - 149: 253, # '•' - 150: 253, # '–' - 151: 253, # '—' - 152: 255, # 'ک' - 153: 253, # '™' - 154: 255, # 'ڑ' - 155: 253, # '›' - 156: 255, # 'œ' - 157: 251, # '\u200c' - 158: 251, # '\u200d' - 159: 255, # 'ں' - 
160: 251, # '\xa0' - 161: 253, # '،' - 162: 253, # '¢' - 163: 253, # '£' - 164: 253, # '¤' - 165: 253, # '¥' - 166: 253, # '¦' - 167: 253, # '§' - 168: 253, # '¨' - 169: 253, # '©' - 170: 255, # 'ھ' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 253, # '¯' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 252, # '³' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 253, # '¸' - 185: 252, # '¹' - 186: 253, # '؛' - 187: 253, # '»' - 188: 252, # '¼' - 189: 252, # '½' - 190: 252, # '¾' - 191: 253, # '؟' - 192: 255, # 'ہ' - 193: 255, # 'ء' - 194: 255, # 'آ' - 195: 255, # 'أ' - 196: 255, # 'ؤ' - 197: 255, # 'إ' - 198: 255, # 'ئ' - 199: 255, # 'ا' - 200: 255, # 'ب' - 201: 255, # 'ة' - 202: 255, # 'ت' - 203: 255, # 'ث' - 204: 255, # 'ج' - 205: 255, # 'ح' - 206: 255, # 'خ' - 207: 255, # 'د' - 208: 255, # 'ذ' - 209: 255, # 'ر' - 210: 255, # 'ز' - 211: 255, # 'س' - 212: 255, # 'ش' - 213: 255, # 'ص' - 214: 255, # 'ض' - 215: 253, # '×' - 216: 255, # 'ط' - 217: 255, # 'ظ' - 218: 255, # 'ع' - 219: 255, # 'غ' - 220: 255, # 'ـ' - 221: 255, # 'ف' - 222: 255, # 'ق' - 223: 255, # 'ك' - 224: 255, # 'à' - 225: 255, # 'ل' - 226: 255, # 'â' - 227: 255, # 'م' - 228: 255, # 'ن' - 229: 255, # 'ه' - 230: 255, # 'و' - 231: 255, # 'ç' - 232: 255, # 'è' - 233: 255, # 'é' - 234: 255, # 'ê' - 235: 255, # 'ë' - 236: 255, # 'ى' - 237: 255, # 'ي' - 238: 255, # 'î' - 239: 255, # 'ï' - 240: 253, # 'ً' - 241: 253, # 'ٌ' - 242: 253, # 'ٍ' - 243: 253, # 'َ' - 244: 255, # 'ô' - 245: 253, # 'ُ' - 246: 253, # 'ِ' - 247: 253, # '÷' - 248: 253, # 'ّ' - 249: 255, # 'ù' - 250: 253, # 'ْ' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 251, # '\u200e' - 254: 251, # '\u200f' - 255: 255, # 'ے' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, 
# '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 
253, # '~' + 127: 251, # '\x7f' + 128: 253, # '€' + 129: 255, # 'پ' + 130: 253, # '‚' + 131: 255, # 'ƒ' + 132: 253, # '„' + 133: 253, # '…' + 134: 253, # '†' + 135: 253, # '‡' + 136: 255, # 'ˆ' + 137: 253, # '‰' + 138: 255, # 'ٹ' + 139: 253, # '‹' + 140: 255, # 'Œ' + 141: 255, # 'چ' + 142: 255, # 'ژ' + 143: 255, # 'ڈ' + 144: 255, # 'گ' + 145: 253, # '‘' + 146: 253, # '’' + 147: 253, # '“' + 148: 253, # '”' + 149: 253, # '•' + 150: 253, # '–' + 151: 253, # '—' + 152: 255, # 'ک' + 153: 253, # '™' + 154: 255, # 'ڑ' + 155: 253, # '›' + 156: 255, # 'œ' + 157: 251, # '\u200c' + 158: 251, # '\u200d' + 159: 255, # 'ں' + 160: 251, # '\xa0' + 161: 253, # '،' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '¤' + 165: 253, # '¥' + 166: 253, # '¦' + 167: 253, # '§' + 168: 253, # '¨' + 169: 253, # '©' + 170: 255, # 'ھ' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 253, # '¯' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 253, # '¸' + 185: 252, # '¹' + 186: 253, # '؛' + 187: 253, # '»' + 188: 252, # '¼' + 189: 252, # '½' + 190: 252, # '¾' + 191: 253, # '؟' + 192: 255, # 'ہ' + 193: 255, # 'ء' + 194: 255, # 'آ' + 195: 255, # 'أ' + 196: 255, # 'ؤ' + 197: 255, # 'إ' + 198: 255, # 'ئ' + 199: 255, # 'ا' + 200: 255, # 'ب' + 201: 255, # 'ة' + 202: 255, # 'ت' + 203: 255, # 'ث' + 204: 255, # 'ج' + 205: 255, # 'ح' + 206: 255, # 'خ' + 207: 255, # 'د' + 208: 255, # 'ذ' + 209: 255, # 'ر' + 210: 255, # 'ز' + 211: 255, # 'س' + 212: 255, # 'ش' + 213: 255, # 'ص' + 214: 255, # 'ض' + 215: 253, # '×' + 216: 255, # 'ط' + 217: 255, # 'ظ' + 218: 255, # 'ع' + 219: 255, # 'غ' + 220: 255, # 'ـ' + 221: 255, # 'ف' + 222: 255, # 'ق' + 223: 255, # 'ك' + 224: 255, # 'à' + 225: 255, # 'ل' + 226: 255, # 'â' + 227: 255, # 'م' + 228: 255, # 'ن' + 229: 255, # 'ه' + 230: 255, # 'و' + 231: 255, # 'ç' + 232: 255, # 'è' + 233: 255, # 'é' + 234: 255, # 'ê' + 235: 255, # 'ë' + 236: 
255, # 'ى' + 237: 255, # 'ي' + 238: 255, # 'î' + 239: 255, # 'ï' + 240: 253, # 'ً' + 241: 253, # 'ٌ' + 242: 253, # 'ٍ' + 243: 253, # 'َ' + 244: 255, # 'ô' + 245: 253, # 'ُ' + 246: 253, # 'ِ' + 247: 253, # '÷' + 248: 253, # 'ّ' + 249: 255, # 'ù' + 250: 253, # 'ْ' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 251, # '\u200e' + 254: 251, # '\u200f' + 255: 255, # 'ے' } -WINDOWS_1256_ARABIC_MODEL = SingleByteCharSetModel(charset_name='WINDOWS-1256', - language='Arabic', - char_to_order_map=WINDOWS_1256_ARABIC_CHAR_TO_ORDER, - language_model=ARABIC_LANG_MODEL, - typical_positive_ratio=0.917216226959467, - keep_ascii_letters=False, - alphabet='ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ') +WINDOWS_1256_ARABIC_MODEL = SingleByteCharSetModel( + charset_name="WINDOWS-1256", + language="Arabic", + char_to_order_map=WINDOWS_1256_ARABIC_CHAR_TO_ORDER, + language_model=ARABIC_LANG_MODEL, + typical_positive_ratio=0.917216226959467, + keep_ascii_letters=False, + alphabet="ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ", +) CP720_ARABIC_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' 
- 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 255, # 'é' - 131: 255, # 'â' - 132: 251, # '\x84' - 133: 255, # 'à' - 134: 251, # '\x86' - 135: 255, # 'ç' - 136: 255, # 'ê' - 137: 255, # 'ë' - 138: 255, # 'è' - 139: 255, # 'ï' - 140: 255, # 'î' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 253, # 'ّ' - 146: 253, # 'ْ' - 147: 255, # 'ô' - 148: 253, # '¤' - 149: 255, # 'ـ' - 150: 255, # 'û' - 151: 255, # 'ù' - 152: 255, # 'ء' - 153: 255, # 'آ' - 154: 255, # 'أ' - 155: 255, # 'ؤ' - 156: 253, # '£' - 157: 255, # 'إ' - 158: 255, # 'ئ' - 159: 
255, # 'ا' - 160: 255, # 'ب' - 161: 255, # 'ة' - 162: 255, # 'ت' - 163: 255, # 'ث' - 164: 255, # 'ج' - 165: 255, # 'ح' - 166: 255, # 'خ' - 167: 255, # 'د' - 168: 255, # 'ذ' - 169: 255, # 'ر' - 170: 255, # 'ز' - 171: 255, # 'س' - 172: 255, # 'ش' - 173: 255, # 'ص' - 174: 253, # '«' - 175: 253, # '»' - 176: 253, # '░' - 177: 253, # '▒' - 178: 253, # '▓' - 179: 253, # '│' - 180: 253, # '┤' - 181: 253, # '╡' - 182: 253, # '╢' - 183: 253, # '╖' - 184: 253, # '╕' - 185: 253, # '╣' - 186: 253, # '║' - 187: 253, # '╗' - 188: 253, # '╝' - 189: 253, # '╜' - 190: 253, # '╛' - 191: 253, # '┐' - 192: 253, # '└' - 193: 253, # '┴' - 194: 253, # '┬' - 195: 253, # '├' - 196: 253, # '─' - 197: 253, # '┼' - 198: 253, # '╞' - 199: 253, # '╟' - 200: 253, # '╚' - 201: 253, # '╔' - 202: 253, # '╩' - 203: 253, # '╦' - 204: 253, # '╠' - 205: 253, # '═' - 206: 253, # '╬' - 207: 253, # '╧' - 208: 253, # '╨' - 209: 253, # '╤' - 210: 253, # '╥' - 211: 253, # '╙' - 212: 253, # '╘' - 213: 253, # '╒' - 214: 253, # '╓' - 215: 253, # '╫' - 216: 253, # '╪' - 217: 253, # '┘' - 218: 253, # '┌' - 219: 253, # '█' - 220: 253, # '▄' - 221: 253, # '▌' - 222: 253, # '▐' - 223: 253, # '▀' - 224: 255, # 'ض' - 225: 255, # 'ط' - 226: 255, # 'ظ' - 227: 255, # 'ع' - 228: 255, # 'غ' - 229: 255, # 'ف' - 230: 255, # 'µ' - 231: 255, # 'ق' - 232: 255, # 'ك' - 233: 255, # 'ل' - 234: 255, # 'م' - 235: 255, # 'ن' - 236: 255, # 'ه' - 237: 255, # 'و' - 238: 255, # 'ى' - 239: 255, # 'ي' - 240: 253, # '≡' - 241: 253, # 'ً' - 242: 253, # 'ٌ' - 243: 253, # 'ٍ' - 244: 253, # 'َ' - 245: 253, # 'ُ' - 246: 253, # 'ِ' - 247: 253, # '≈' - 248: 253, # '°' - 249: 253, # '∙' - 250: 253, # '·' - 251: 253, # '√' - 252: 255, # 'ⁿ' - 253: 252, # '²' - 254: 253, # '■' - 255: 251, # '\xa0' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, 
# '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 
253, # '~' + 127: 251, # '\x7f' + 128: 251, # '\x80' + 129: 251, # '\x81' + 130: 255, # 'é' + 131: 255, # 'â' + 132: 251, # '\x84' + 133: 255, # 'à' + 134: 251, # '\x86' + 135: 255, # 'ç' + 136: 255, # 'ê' + 137: 255, # 'ë' + 138: 255, # 'è' + 139: 255, # 'ï' + 140: 255, # 'î' + 141: 251, # '\x8d' + 142: 251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 253, # 'ّ' + 146: 253, # 'ْ' + 147: 255, # 'ô' + 148: 253, # '¤' + 149: 255, # 'ـ' + 150: 255, # 'û' + 151: 255, # 'ù' + 152: 255, # 'ء' + 153: 255, # 'آ' + 154: 255, # 'أ' + 155: 255, # 'ؤ' + 156: 253, # '£' + 157: 255, # 'إ' + 158: 255, # 'ئ' + 159: 255, # 'ا' + 160: 255, # 'ب' + 161: 255, # 'ة' + 162: 255, # 'ت' + 163: 255, # 'ث' + 164: 255, # 'ج' + 165: 255, # 'ح' + 166: 255, # 'خ' + 167: 255, # 'د' + 168: 255, # 'ذ' + 169: 255, # 'ر' + 170: 255, # 'ز' + 171: 255, # 'س' + 172: 255, # 'ش' + 173: 255, # 'ص' + 174: 253, # '«' + 175: 253, # '»' + 176: 253, # '░' + 177: 253, # '▒' + 178: 253, # '▓' + 179: 253, # '│' + 180: 253, # '┤' + 181: 253, # '╡' + 182: 253, # '╢' + 183: 253, # '╖' + 184: 253, # '╕' + 185: 253, # '╣' + 186: 253, # '║' + 187: 253, # '╗' + 188: 253, # '╝' + 189: 253, # '╜' + 190: 253, # '╛' + 191: 253, # '┐' + 192: 253, # '└' + 193: 253, # '┴' + 194: 253, # '┬' + 195: 253, # '├' + 196: 253, # '─' + 197: 253, # '┼' + 198: 253, # '╞' + 199: 253, # '╟' + 200: 253, # '╚' + 201: 253, # '╔' + 202: 253, # '╩' + 203: 253, # '╦' + 204: 253, # '╠' + 205: 253, # '═' + 206: 253, # '╬' + 207: 253, # '╧' + 208: 253, # '╨' + 209: 253, # '╤' + 210: 253, # '╥' + 211: 253, # '╙' + 212: 253, # '╘' + 213: 253, # '╒' + 214: 253, # '╓' + 215: 253, # '╫' + 216: 253, # '╪' + 217: 253, # '┘' + 218: 253, # '┌' + 219: 253, # '█' + 220: 253, # '▄' + 221: 253, # '▌' + 222: 253, # '▐' + 223: 253, # '▀' + 224: 255, # 'ض' + 225: 255, # 'ط' + 226: 255, # 'ظ' + 227: 255, # 'ع' + 228: 255, # 'غ' + 229: 255, # 'ف' + 230: 255, # 'µ' + 231: 255, # 'ق' + 232: 255, # 'ك' + 233: 255, # 'ل' + 234: 255, # 'م' + 235: 255, # 'ن' 
+ 236: 255, # 'ه' + 237: 255, # 'و' + 238: 255, # 'ى' + 239: 255, # 'ي' + 240: 253, # '≡' + 241: 253, # 'ً' + 242: 253, # 'ٌ' + 243: 253, # 'ٍ' + 244: 253, # 'َ' + 245: 253, # 'ُ' + 246: 253, # 'ِ' + 247: 253, # '≈' + 248: 253, # '°' + 249: 253, # '∙' + 250: 253, # '·' + 251: 253, # '√' + 252: 255, # 'ⁿ' + 253: 252, # '²' + 254: 253, # '■' + 255: 251, # '\xa0' } -CP720_ARABIC_MODEL = SingleByteCharSetModel(charset_name='CP720', - language='Arabic', - char_to_order_map=CP720_ARABIC_CHAR_TO_ORDER, - language_model=ARABIC_LANG_MODEL, - typical_positive_ratio=0.9043281113849888, - keep_ascii_letters=False, - alphabet='ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ') +CP720_ARABIC_MODEL = SingleByteCharSetModel( + charset_name="CP720", + language="Arabic", + char_to_order_map=CP720_ARABIC_CHAR_TO_ORDER, + language_model=ARABIC_LANG_MODEL, + typical_positive_ratio=0.9043281113849888, + keep_ascii_letters=False, + alphabet="ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ", +) CP864_ARABIC_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '٪' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' 
- 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 253, # '°' - 129: 253, # '·' - 130: 253, # '∙' - 131: 253, # '√' - 132: 253, # '▒' - 133: 253, # '─' - 134: 253, # '│' - 135: 253, # '┼' - 136: 253, # '┤' - 137: 253, # '┬' - 138: 253, # '├' - 139: 253, # '┴' - 140: 253, # '┐' - 141: 253, # '┌' - 142: 253, # '└' - 143: 253, # '┘' - 144: 255, # 'β' - 145: 253, # '∞' - 146: 255, # 'φ' - 147: 253, # '±' - 148: 252, # '½' - 149: 252, # '¼' - 150: 253, # '≈' - 151: 253, # '«' - 152: 253, # '»' - 153: 255, # 'ﻷ' - 154: 255, # 'ﻸ' - 155: 255, # None - 156: 255, # None - 157: 255, # 'ﻻ' - 158: 255, # 'ﻼ' - 159: 255, # None - 160: 251, 
# '\xa0' - 161: 251, # '\xad' - 162: 255, # 'ﺂ' - 163: 253, # '£' - 164: 253, # '¤' - 165: 255, # 'ﺄ' - 166: 255, # None - 167: 255, # None - 168: 255, # 'ﺎ' - 169: 255, # 'ﺏ' - 170: 255, # 'ﺕ' - 171: 255, # 'ﺙ' - 172: 253, # '،' - 173: 255, # 'ﺝ' - 174: 255, # 'ﺡ' - 175: 255, # 'ﺥ' - 176: 252, # '٠' - 177: 252, # '١' - 178: 252, # '٢' - 179: 252, # '٣' - 180: 252, # '٤' - 181: 252, # '٥' - 182: 252, # '٦' - 183: 252, # '٧' - 184: 252, # '٨' - 185: 252, # '٩' - 186: 255, # 'ﻑ' - 187: 253, # '؛' - 188: 255, # 'ﺱ' - 189: 255, # 'ﺵ' - 190: 255, # 'ﺹ' - 191: 253, # '؟' - 192: 253, # '¢' - 193: 255, # 'ﺀ' - 194: 255, # 'ﺁ' - 195: 255, # 'ﺃ' - 196: 255, # 'ﺅ' - 197: 255, # 'ﻊ' - 198: 255, # 'ﺋ' - 199: 255, # 'ﺍ' - 200: 255, # 'ﺑ' - 201: 255, # 'ﺓ' - 202: 255, # 'ﺗ' - 203: 255, # 'ﺛ' - 204: 255, # 'ﺟ' - 205: 255, # 'ﺣ' - 206: 255, # 'ﺧ' - 207: 255, # 'ﺩ' - 208: 255, # 'ﺫ' - 209: 255, # 'ﺭ' - 210: 255, # 'ﺯ' - 211: 255, # 'ﺳ' - 212: 255, # 'ﺷ' - 213: 255, # 'ﺻ' - 214: 255, # 'ﺿ' - 215: 255, # 'ﻁ' - 216: 255, # 'ﻅ' - 217: 255, # 'ﻋ' - 218: 255, # 'ﻏ' - 219: 253, # '¦' - 220: 253, # '¬' - 221: 253, # '÷' - 222: 253, # '×' - 223: 255, # 'ﻉ' - 224: 255, # 'ـ' - 225: 255, # 'ﻓ' - 226: 255, # 'ﻗ' - 227: 255, # 'ﻛ' - 228: 255, # 'ﻟ' - 229: 255, # 'ﻣ' - 230: 255, # 'ﻧ' - 231: 255, # 'ﻫ' - 232: 255, # 'ﻭ' - 233: 255, # 'ﻯ' - 234: 255, # 'ﻳ' - 235: 255, # 'ﺽ' - 236: 255, # 'ﻌ' - 237: 255, # 'ﻎ' - 238: 255, # 'ﻍ' - 239: 255, # 'ﻡ' - 240: 255, # 'ﹽ' - 241: 253, # 'ّ' - 242: 255, # 'ﻥ' - 243: 255, # 'ﻩ' - 244: 255, # 'ﻬ' - 245: 255, # 'ﻰ' - 246: 255, # 'ﻲ' - 247: 255, # 'ﻐ' - 248: 255, # 'ﻕ' - 249: 255, # 'ﻵ' - 250: 255, # 'ﻶ' - 251: 255, # 'ﻝ' - 252: 255, # 'ﻙ' - 253: 255, # 'ﻱ' - 254: 253, # '■' - 255: 255, # None + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, 
# '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '٪' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 
251, # '\x7f' + 128: 253, # '°' + 129: 253, # '·' + 130: 253, # '∙' + 131: 253, # '√' + 132: 253, # '▒' + 133: 253, # '─' + 134: 253, # '│' + 135: 253, # '┼' + 136: 253, # '┤' + 137: 253, # '┬' + 138: 253, # '├' + 139: 253, # '┴' + 140: 253, # '┐' + 141: 253, # '┌' + 142: 253, # '└' + 143: 253, # '┘' + 144: 255, # 'β' + 145: 253, # '∞' + 146: 255, # 'φ' + 147: 253, # '±' + 148: 252, # '½' + 149: 252, # '¼' + 150: 253, # '≈' + 151: 253, # '«' + 152: 253, # '»' + 153: 255, # 'ﻷ' + 154: 255, # 'ﻸ' + 155: 255, # None + 156: 255, # None + 157: 255, # 'ﻻ' + 158: 255, # 'ﻼ' + 159: 255, # None + 160: 251, # '\xa0' + 161: 251, # '\xad' + 162: 255, # 'ﺂ' + 163: 253, # '£' + 164: 253, # '¤' + 165: 255, # 'ﺄ' + 166: 255, # None + 167: 255, # None + 168: 255, # 'ﺎ' + 169: 255, # 'ﺏ' + 170: 255, # 'ﺕ' + 171: 255, # 'ﺙ' + 172: 253, # '،' + 173: 255, # 'ﺝ' + 174: 255, # 'ﺡ' + 175: 255, # 'ﺥ' + 176: 252, # '٠' + 177: 252, # '١' + 178: 252, # '٢' + 179: 252, # '٣' + 180: 252, # '٤' + 181: 252, # '٥' + 182: 252, # '٦' + 183: 252, # '٧' + 184: 252, # '٨' + 185: 252, # '٩' + 186: 255, # 'ﻑ' + 187: 253, # '؛' + 188: 255, # 'ﺱ' + 189: 255, # 'ﺵ' + 190: 255, # 'ﺹ' + 191: 253, # '؟' + 192: 253, # '¢' + 193: 255, # 'ﺀ' + 194: 255, # 'ﺁ' + 195: 255, # 'ﺃ' + 196: 255, # 'ﺅ' + 197: 255, # 'ﻊ' + 198: 255, # 'ﺋ' + 199: 255, # 'ﺍ' + 200: 255, # 'ﺑ' + 201: 255, # 'ﺓ' + 202: 255, # 'ﺗ' + 203: 255, # 'ﺛ' + 204: 255, # 'ﺟ' + 205: 255, # 'ﺣ' + 206: 255, # 'ﺧ' + 207: 255, # 'ﺩ' + 208: 255, # 'ﺫ' + 209: 255, # 'ﺭ' + 210: 255, # 'ﺯ' + 211: 255, # 'ﺳ' + 212: 255, # 'ﺷ' + 213: 255, # 'ﺻ' + 214: 255, # 'ﺿ' + 215: 255, # 'ﻁ' + 216: 255, # 'ﻅ' + 217: 255, # 'ﻋ' + 218: 255, # 'ﻏ' + 219: 253, # '¦' + 220: 253, # '¬' + 221: 253, # '÷' + 222: 253, # '×' + 223: 255, # 'ﻉ' + 224: 255, # 'ـ' + 225: 255, # 'ﻓ' + 226: 255, # 'ﻗ' + 227: 255, # 'ﻛ' + 228: 255, # 'ﻟ' + 229: 255, # 'ﻣ' + 230: 255, # 'ﻧ' + 231: 255, # 'ﻫ' + 232: 255, # 'ﻭ' + 233: 255, # 'ﻯ' + 234: 255, # 'ﻳ' + 235: 255, # 'ﺽ' + 236: 255, # 'ﻌ' + 237: 255, 
# 'ﻎ' + 238: 255, # 'ﻍ' + 239: 255, # 'ﻡ' + 240: 255, # 'ﹽ' + 241: 253, # 'ّ' + 242: 255, # 'ﻥ' + 243: 255, # 'ﻩ' + 244: 255, # 'ﻬ' + 245: 255, # 'ﻰ' + 246: 255, # 'ﻲ' + 247: 255, # 'ﻐ' + 248: 255, # 'ﻕ' + 249: 255, # 'ﻵ' + 250: 255, # 'ﻶ' + 251: 255, # 'ﻝ' + 252: 255, # 'ﻙ' + 253: 255, # 'ﻱ' + 254: 253, # '■' + 255: 255, # None } -CP864_ARABIC_MODEL = SingleByteCharSetModel(charset_name='CP864', - language='Arabic', - char_to_order_map=CP864_ARABIC_CHAR_TO_ORDER, - language_model=ARABIC_LANG_MODEL, - typical_positive_ratio=0.0330622314588194, - keep_ascii_letters=False, - alphabet='ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ') - +CP864_ARABIC_MODEL = SingleByteCharSetModel( + charset_name="CP864", + language="Arabic", + char_to_order_map=CP864_ARABIC_CHAR_TO_ORDER, + language_model=ARABIC_LANG_MODEL, + typical_positive_ratio=0.0330622314588194, + keep_ascii_letters=False, + alphabet="ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ", +) diff --git a/chardet/langbelarusianmodel.py b/chardet/langbelarusianmodel.py index cf82eace..b4f7556d 100644 --- a/chardet/langbelarusianmodel.py +++ b/chardet/langbelarusianmodel.py @@ -1,9 +1,7 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -62768,1070 +62766,1077 @@ # Character Mapping Table(s): ISO_8859_5_BELARUSIAN_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 
251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 
251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, # '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' - 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 255, # 'Ё' - 162: 255, # 'Ђ' - 163: 255, # 'Ѓ' - 164: 255, # 'Є' - 165: 255, # 'Ѕ' - 166: 255, # 'І' - 167: 255, # 'Ї' - 168: 255, # 'Ј' - 169: 255, # 'Љ' - 170: 255, # 'Њ' - 171: 255, # 'Ћ' - 172: 255, # 'Ќ' - 173: 251, # '\xad' - 174: 255, # 'Ў' - 175: 255, # 'Џ' - 176: 255, # 'А' - 177: 255, # 'Б' - 178: 255, # 'В' - 179: 255, # 'Г' - 180: 255, # 'Д' - 181: 255, # 'Е' - 182: 255, # 'Ж' - 183: 255, # 'З' - 184: 255, # 'И' - 185: 255, # 'Й' - 186: 255, # 'К' - 187: 255, # 'Л' - 188: 255, # 'М' - 189: 255, # 'Н' - 190: 255, # 'О' - 191: 255, # 'П' - 192: 255, # 'Р' - 193: 255, # 'С' - 194: 255, # 'Т' - 195: 255, # 'У' - 196: 255, # 'Ф' - 197: 255, # 'Х' - 198: 255, # 'Ц' - 199: 255, # 'Ч' - 200: 255, # 'Ш' - 201: 255, # 'Щ' - 202: 255, # 'Ъ' - 203: 255, # 'Ы' - 204: 255, # 'Ь' - 205: 255, # 'Э' - 206: 255, # 'Ю' - 207: 255, # 'Я' - 208: 255, # 'а' - 209: 255, # 'б' - 210: 255, # 'в' - 211: 255, # 'г' - 212: 255, # 'д' - 213: 255, # 'е' - 214: 255, # 'ж' - 215: 255, # 'з' - 216: 255, # 'и' - 217: 255, # 'й' - 218: 255, # 'к' - 219: 255, # 'л' - 220: 255, # 'м' - 221: 255, # 'н' - 222: 255, # 'о' - 223: 255, # 'п' - 224: 255, # 'р' - 225: 255, # 'с' - 226: 255, # 'т' - 227: 255, # 'у' - 228: 255, # 'ф' - 229: 255, # 'х' - 230: 255, # 'ц' - 231: 255, # 'ч' - 232: 255, # 'ш' - 233: 255, # 'щ' - 234: 255, # 'ъ' - 235: 255, # 'ы' - 236: 255, # 'ь' - 237: 255, # 'э' - 238: 255, # 'ю' - 239: 255, # 'я' - 240: 253, # '№' - 241: 255, # 'ё' - 242: 255, # 'ђ' - 243: 255, # 'ѓ' - 244: 255, # 'є' - 245: 255, # 'ѕ' - 246: 255, # 'і' - 247: 255, # 'ї' - 248: 255, # 'ј' - 249: 255, # 'љ' - 
250: 255, # 'њ' - 251: 255, # 'ћ' - 252: 255, # 'ќ' - 253: 253, # '§' - 254: 255, # 'ў' - 255: 255, # 'џ' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' 
+ 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # '\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 255, # 'Ё' + 162: 255, # 'Ђ' + 163: 255, # 'Ѓ' + 164: 255, # 'Є' + 165: 255, # 'Ѕ' + 166: 255, # 'І' + 167: 255, # 'Ї' + 168: 255, # 'Ј' + 169: 255, # 'Љ' + 170: 255, # 'Њ' + 171: 
255, # 'Ћ' + 172: 255, # 'Ќ' + 173: 251, # '\xad' + 174: 255, # 'Ў' + 175: 255, # 'Џ' + 176: 255, # 'А' + 177: 255, # 'Б' + 178: 255, # 'В' + 179: 255, # 'Г' + 180: 255, # 'Д' + 181: 255, # 'Е' + 182: 255, # 'Ж' + 183: 255, # 'З' + 184: 255, # 'И' + 185: 255, # 'Й' + 186: 255, # 'К' + 187: 255, # 'Л' + 188: 255, # 'М' + 189: 255, # 'Н' + 190: 255, # 'О' + 191: 255, # 'П' + 192: 255, # 'Р' + 193: 255, # 'С' + 194: 255, # 'Т' + 195: 255, # 'У' + 196: 255, # 'Ф' + 197: 255, # 'Х' + 198: 255, # 'Ц' + 199: 255, # 'Ч' + 200: 255, # 'Ш' + 201: 255, # 'Щ' + 202: 255, # 'Ъ' + 203: 255, # 'Ы' + 204: 255, # 'Ь' + 205: 255, # 'Э' + 206: 255, # 'Ю' + 207: 255, # 'Я' + 208: 255, # 'а' + 209: 255, # 'б' + 210: 255, # 'в' + 211: 255, # 'г' + 212: 255, # 'д' + 213: 255, # 'е' + 214: 255, # 'ж' + 215: 255, # 'з' + 216: 255, # 'и' + 217: 255, # 'й' + 218: 255, # 'к' + 219: 255, # 'л' + 220: 255, # 'м' + 221: 255, # 'н' + 222: 255, # 'о' + 223: 255, # 'п' + 224: 255, # 'р' + 225: 255, # 'с' + 226: 255, # 'т' + 227: 255, # 'у' + 228: 255, # 'ф' + 229: 255, # 'х' + 230: 255, # 'ц' + 231: 255, # 'ч' + 232: 255, # 'ш' + 233: 255, # 'щ' + 234: 255, # 'ъ' + 235: 255, # 'ы' + 236: 255, # 'ь' + 237: 255, # 'э' + 238: 255, # 'ю' + 239: 255, # 'я' + 240: 253, # '№' + 241: 255, # 'ё' + 242: 255, # 'ђ' + 243: 255, # 'ѓ' + 244: 255, # 'є' + 245: 255, # 'ѕ' + 246: 255, # 'і' + 247: 255, # 'ї' + 248: 255, # 'ј' + 249: 255, # 'љ' + 250: 255, # 'њ' + 251: 255, # 'ћ' + 252: 255, # 'ќ' + 253: 253, # '§' + 254: 255, # 'ў' + 255: 255, # 'џ' } -ISO_8859_5_BELARUSIAN_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-5', - language='Belarusian', - char_to_order_map=ISO_8859_5_BELARUSIAN_CHAR_TO_ORDER, - language_model=BELARUSIAN_LANG_MODEL, - typical_positive_ratio=0.9000111172280231, - keep_ascii_letters=False, - alphabet='ʼЁІЎАБВГДЕЖЗЙКЛМНОПРСТУФХЦЧШЫЬЭЮЯабвгдежзйклмнопрстуфхцчшыьэюяёіў') +ISO_8859_5_BELARUSIAN_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-5", + language="Belarusian", + 
char_to_order_map=ISO_8859_5_BELARUSIAN_CHAR_TO_ORDER, + language_model=BELARUSIAN_LANG_MODEL, + typical_positive_ratio=0.9000111172280231, + keep_ascii_letters=False, + alphabet="ʼЁІЎАБВГДЕЖЗЙКЛМНОПРСТУФХЦЧШЫЬЭЮЯабвгдежзйклмнопрстуфхцчшыьэюяёіў", +) WINDOWS_1251_BELARUSIAN_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' 
- 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 255, # 'Ђ' - 129: 255, # 'Ѓ' - 130: 253, # '‚' - 131: 255, # 'ѓ' - 132: 253, # '„' - 133: 253, # '…' - 134: 253, # '†' - 135: 253, # '‡' - 136: 253, # '€' - 137: 253, # '‰' - 138: 255, # 'Љ' - 139: 253, # '‹' - 140: 255, # 'Њ' - 141: 255, # 'Ќ' - 142: 255, # 'Ћ' - 143: 255, # 'Џ' - 144: 255, # 'ђ' - 145: 253, # '‘' - 146: 253, # '’' - 147: 253, # '“' - 148: 253, # '”' - 149: 253, # '•' - 150: 253, # '–' - 151: 253, # '—' - 152: 255, # None - 153: 253, # '™' - 154: 255, # 'љ' - 155: 253, # '›' - 156: 255, # 'њ' - 157: 255, # 'ќ' - 158: 255, # 'ћ' - 159: 255, # 'џ' - 160: 251, # '\xa0' - 161: 255, # 'Ў' - 162: 255, # 'ў' - 163: 255, # 'Ј' - 164: 253, # '¤' - 165: 255, # 'Ґ' - 166: 253, # '¦' - 167: 253, # '§' - 168: 255, # 'Ё' - 169: 253, # '©' - 170: 255, # 'Є' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 255, # 'Ї' - 176: 
253, # '°' - 177: 253, # '±' - 178: 255, # 'І' - 179: 255, # 'і' - 180: 255, # 'ґ' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 255, # 'ё' - 185: 253, # '№' - 186: 255, # 'є' - 187: 253, # '»' - 188: 255, # 'ј' - 189: 255, # 'Ѕ' - 190: 255, # 'ѕ' - 191: 255, # 'ї' - 192: 255, # 'А' - 193: 255, # 'Б' - 194: 255, # 'В' - 195: 255, # 'Г' - 196: 255, # 'Д' - 197: 255, # 'Е' - 198: 255, # 'Ж' - 199: 255, # 'З' - 200: 255, # 'И' - 201: 255, # 'Й' - 202: 255, # 'К' - 203: 255, # 'Л' - 204: 255, # 'М' - 205: 255, # 'Н' - 206: 255, # 'О' - 207: 255, # 'П' - 208: 255, # 'Р' - 209: 255, # 'С' - 210: 255, # 'Т' - 211: 255, # 'У' - 212: 255, # 'Ф' - 213: 255, # 'Х' - 214: 255, # 'Ц' - 215: 255, # 'Ч' - 216: 255, # 'Ш' - 217: 255, # 'Щ' - 218: 255, # 'Ъ' - 219: 255, # 'Ы' - 220: 255, # 'Ь' - 221: 255, # 'Э' - 222: 255, # 'Ю' - 223: 255, # 'Я' - 224: 255, # 'а' - 225: 255, # 'б' - 226: 255, # 'в' - 227: 255, # 'г' - 228: 255, # 'д' - 229: 255, # 'е' - 230: 255, # 'ж' - 231: 255, # 'з' - 232: 255, # 'и' - 233: 255, # 'й' - 234: 255, # 'к' - 235: 255, # 'л' - 236: 255, # 'м' - 237: 255, # 'н' - 238: 255, # 'о' - 239: 255, # 'п' - 240: 255, # 'р' - 241: 255, # 'с' - 242: 255, # 'т' - 243: 255, # 'у' - 244: 255, # 'ф' - 245: 255, # 'х' - 246: 255, # 'ц' - 247: 255, # 'ч' - 248: 255, # 'ш' - 249: 255, # 'щ' - 250: 255, # 'ъ' - 251: 255, # 'ы' - 252: 255, # 'ь' - 253: 255, # 'э' - 254: 255, # 'ю' - 255: 255, # 'я' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 
29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 255, # 'Ђ' + 129: 255, # 'Ѓ' + 130: 253, # '‚' + 131: 255, # 'ѓ' + 132: 253, # '„' + 133: 253, # '…' + 134: 253, # '†' + 135: 253, # '‡' + 136: 253, # '€' + 137: 253, # '‰' + 138: 255, # 'Љ' + 139: 253, # '‹' + 140: 255, # 'Њ' + 141: 255, # 'Ќ' + 142: 255, # 'Ћ' + 143: 
255, # 'Џ' + 144: 255, # 'ђ' + 145: 253, # '‘' + 146: 253, # '’' + 147: 253, # '“' + 148: 253, # '”' + 149: 253, # '•' + 150: 253, # '–' + 151: 253, # '—' + 152: 255, # None + 153: 253, # '™' + 154: 255, # 'љ' + 155: 253, # '›' + 156: 255, # 'њ' + 157: 255, # 'ќ' + 158: 255, # 'ћ' + 159: 255, # 'џ' + 160: 251, # '\xa0' + 161: 255, # 'Ў' + 162: 255, # 'ў' + 163: 255, # 'Ј' + 164: 253, # '¤' + 165: 255, # 'Ґ' + 166: 253, # '¦' + 167: 253, # '§' + 168: 255, # 'Ё' + 169: 253, # '©' + 170: 255, # 'Є' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 255, # 'Ї' + 176: 253, # '°' + 177: 253, # '±' + 178: 255, # 'І' + 179: 255, # 'і' + 180: 255, # 'ґ' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 255, # 'ё' + 185: 253, # '№' + 186: 255, # 'є' + 187: 253, # '»' + 188: 255, # 'ј' + 189: 255, # 'Ѕ' + 190: 255, # 'ѕ' + 191: 255, # 'ї' + 192: 255, # 'А' + 193: 255, # 'Б' + 194: 255, # 'В' + 195: 255, # 'Г' + 196: 255, # 'Д' + 197: 255, # 'Е' + 198: 255, # 'Ж' + 199: 255, # 'З' + 200: 255, # 'И' + 201: 255, # 'Й' + 202: 255, # 'К' + 203: 255, # 'Л' + 204: 255, # 'М' + 205: 255, # 'Н' + 206: 255, # 'О' + 207: 255, # 'П' + 208: 255, # 'Р' + 209: 255, # 'С' + 210: 255, # 'Т' + 211: 255, # 'У' + 212: 255, # 'Ф' + 213: 255, # 'Х' + 214: 255, # 'Ц' + 215: 255, # 'Ч' + 216: 255, # 'Ш' + 217: 255, # 'Щ' + 218: 255, # 'Ъ' + 219: 255, # 'Ы' + 220: 255, # 'Ь' + 221: 255, # 'Э' + 222: 255, # 'Ю' + 223: 255, # 'Я' + 224: 255, # 'а' + 225: 255, # 'б' + 226: 255, # 'в' + 227: 255, # 'г' + 228: 255, # 'д' + 229: 255, # 'е' + 230: 255, # 'ж' + 231: 255, # 'з' + 232: 255, # 'и' + 233: 255, # 'й' + 234: 255, # 'к' + 235: 255, # 'л' + 236: 255, # 'м' + 237: 255, # 'н' + 238: 255, # 'о' + 239: 255, # 'п' + 240: 255, # 'р' + 241: 255, # 'с' + 242: 255, # 'т' + 243: 255, # 'у' + 244: 255, # 'ф' + 245: 255, # 'х' + 246: 255, # 'ц' + 247: 255, # 'ч' + 248: 255, # 'ш' + 249: 255, # 'щ' + 250: 255, # 'ъ' + 251: 255, # 'ы' + 252: 255, # 'ь' + 253: 255, # 'э' + 
254: 255, # 'ю' + 255: 255, # 'я' } -WINDOWS_1251_BELARUSIAN_MODEL = SingleByteCharSetModel(charset_name='WINDOWS-1251', - language='Belarusian', - char_to_order_map=WINDOWS_1251_BELARUSIAN_CHAR_TO_ORDER, - language_model=BELARUSIAN_LANG_MODEL, - typical_positive_ratio=0.9071761992924257, - keep_ascii_letters=False, - alphabet='ʼЁІЎАБВГДЕЖЗЙКЛМНОПРСТУФХЦЧШЫЬЭЮЯабвгдежзйклмнопрстуфхцчшыьэюяёіў') +WINDOWS_1251_BELARUSIAN_MODEL = SingleByteCharSetModel( + charset_name="WINDOWS-1251", + language="Belarusian", + char_to_order_map=WINDOWS_1251_BELARUSIAN_CHAR_TO_ORDER, + language_model=BELARUSIAN_LANG_MODEL, + typical_positive_ratio=0.9071761992924257, + keep_ascii_letters=False, + alphabet="ʼЁІЎАБВГДЕЖЗЙКЛМНОПРСТУФХЦЧШЫЬЭЮЯабвгдежзйклмнопрстуфхцчшыьэюяёіў", +) IBM866_BELARUSIAN_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' 
- 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 255, # 'А' - 129: 255, # 'Б' - 130: 255, # 'В' - 131: 255, # 'Г' - 132: 255, # 'Д' - 133: 255, # 'Е' - 134: 255, # 'Ж' - 135: 255, # 'З' - 136: 255, # 'И' - 137: 255, # 'Й' - 138: 255, # 'К' - 139: 255, # 'Л' - 140: 255, # 'М' - 141: 255, # 'Н' - 142: 255, # 'О' - 143: 255, # 'П' - 144: 255, # 'Р' - 145: 255, # 'С' - 146: 255, # 'Т' - 147: 255, # 'У' - 148: 255, # 'Ф' - 149: 255, # 'Х' - 150: 255, # 'Ц' - 151: 255, # 'Ч' - 152: 255, # 'Ш' - 153: 255, # 'Щ' - 154: 255, # 'Ъ' - 155: 255, # 'Ы' - 156: 255, # 'Ь' - 157: 255, # 'Э' - 158: 255, # 'Ю' - 159: 255, # 'Я' - 160: 255, # 'а' - 161: 255, # 'б' - 162: 255, # 'в' - 163: 255, # 'г' - 164: 255, # 'д' - 165: 255, # 'е' - 166: 255, # 'ж' - 167: 255, # 'з' - 168: 255, # 'и' - 169: 255, # 'й' - 170: 255, # 'к' - 171: 255, # 'л' - 172: 255, # 'м' - 173: 255, # 'н' - 174: 255, # 'о' - 175: 255, # 'п' - 176: 253, # 
'░' - 177: 253, # '▒' - 178: 253, # '▓' - 179: 253, # '│' - 180: 253, # '┤' - 181: 253, # '╡' - 182: 253, # '╢' - 183: 253, # '╖' - 184: 253, # '╕' - 185: 253, # '╣' - 186: 253, # '║' - 187: 253, # '╗' - 188: 253, # '╝' - 189: 253, # '╜' - 190: 253, # '╛' - 191: 253, # '┐' - 192: 253, # '└' - 193: 253, # '┴' - 194: 253, # '┬' - 195: 253, # '├' - 196: 253, # '─' - 197: 253, # '┼' - 198: 253, # '╞' - 199: 253, # '╟' - 200: 253, # '╚' - 201: 253, # '╔' - 202: 253, # '╩' - 203: 253, # '╦' - 204: 253, # '╠' - 205: 253, # '═' - 206: 253, # '╬' - 207: 253, # '╧' - 208: 253, # '╨' - 209: 253, # '╤' - 210: 253, # '╥' - 211: 253, # '╙' - 212: 253, # '╘' - 213: 253, # '╒' - 214: 253, # '╓' - 215: 253, # '╫' - 216: 253, # '╪' - 217: 253, # '┘' - 218: 253, # '┌' - 219: 253, # '█' - 220: 253, # '▄' - 221: 253, # '▌' - 222: 253, # '▐' - 223: 253, # '▀' - 224: 255, # 'р' - 225: 255, # 'с' - 226: 255, # 'т' - 227: 255, # 'у' - 228: 255, # 'ф' - 229: 255, # 'х' - 230: 255, # 'ц' - 231: 255, # 'ч' - 232: 255, # 'ш' - 233: 255, # 'щ' - 234: 255, # 'ъ' - 235: 255, # 'ы' - 236: 255, # 'ь' - 237: 255, # 'э' - 238: 255, # 'ю' - 239: 255, # 'я' - 240: 255, # 'Ё' - 241: 255, # 'ё' - 242: 255, # 'Є' - 243: 255, # 'є' - 244: 255, # 'Ї' - 245: 255, # 'ї' - 246: 255, # 'Ў' - 247: 255, # 'ў' - 248: 253, # '°' - 249: 253, # '∙' - 250: 253, # '·' - 251: 253, # '√' - 252: 253, # '№' - 253: 253, # '¤' - 254: 253, # '■' - 255: 251, # '\xa0' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 
251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 255, # 'А' + 129: 255, # 'Б' + 130: 255, # 'В' + 131: 255, # 'Г' + 132: 255, # 'Д' + 133: 255, # 'Е' + 134: 255, # 'Ж' + 135: 255, # 'З' + 136: 255, # 'И' + 137: 255, # 'Й' + 138: 255, # 'К' + 139: 255, # 'Л' + 140: 255, # 'М' + 141: 255, # 'Н' + 142: 255, # 'О' + 143: 255, 
# 'П' + 144: 255, # 'Р' + 145: 255, # 'С' + 146: 255, # 'Т' + 147: 255, # 'У' + 148: 255, # 'Ф' + 149: 255, # 'Х' + 150: 255, # 'Ц' + 151: 255, # 'Ч' + 152: 255, # 'Ш' + 153: 255, # 'Щ' + 154: 255, # 'Ъ' + 155: 255, # 'Ы' + 156: 255, # 'Ь' + 157: 255, # 'Э' + 158: 255, # 'Ю' + 159: 255, # 'Я' + 160: 255, # 'а' + 161: 255, # 'б' + 162: 255, # 'в' + 163: 255, # 'г' + 164: 255, # 'д' + 165: 255, # 'е' + 166: 255, # 'ж' + 167: 255, # 'з' + 168: 255, # 'и' + 169: 255, # 'й' + 170: 255, # 'к' + 171: 255, # 'л' + 172: 255, # 'м' + 173: 255, # 'н' + 174: 255, # 'о' + 175: 255, # 'п' + 176: 253, # '░' + 177: 253, # '▒' + 178: 253, # '▓' + 179: 253, # '│' + 180: 253, # '┤' + 181: 253, # '╡' + 182: 253, # '╢' + 183: 253, # '╖' + 184: 253, # '╕' + 185: 253, # '╣' + 186: 253, # '║' + 187: 253, # '╗' + 188: 253, # '╝' + 189: 253, # '╜' + 190: 253, # '╛' + 191: 253, # '┐' + 192: 253, # '└' + 193: 253, # '┴' + 194: 253, # '┬' + 195: 253, # '├' + 196: 253, # '─' + 197: 253, # '┼' + 198: 253, # '╞' + 199: 253, # '╟' + 200: 253, # '╚' + 201: 253, # '╔' + 202: 253, # '╩' + 203: 253, # '╦' + 204: 253, # '╠' + 205: 253, # '═' + 206: 253, # '╬' + 207: 253, # '╧' + 208: 253, # '╨' + 209: 253, # '╤' + 210: 253, # '╥' + 211: 253, # '╙' + 212: 253, # '╘' + 213: 253, # '╒' + 214: 253, # '╓' + 215: 253, # '╫' + 216: 253, # '╪' + 217: 253, # '┘' + 218: 253, # '┌' + 219: 253, # '█' + 220: 253, # '▄' + 221: 253, # '▌' + 222: 253, # '▐' + 223: 253, # '▀' + 224: 255, # 'р' + 225: 255, # 'с' + 226: 255, # 'т' + 227: 255, # 'у' + 228: 255, # 'ф' + 229: 255, # 'х' + 230: 255, # 'ц' + 231: 255, # 'ч' + 232: 255, # 'ш' + 233: 255, # 'щ' + 234: 255, # 'ъ' + 235: 255, # 'ы' + 236: 255, # 'ь' + 237: 255, # 'э' + 238: 255, # 'ю' + 239: 255, # 'я' + 240: 255, # 'Ё' + 241: 255, # 'ё' + 242: 255, # 'Є' + 243: 255, # 'є' + 244: 255, # 'Ї' + 245: 255, # 'ї' + 246: 255, # 'Ў' + 247: 255, # 'ў' + 248: 253, # '°' + 249: 253, # '∙' + 250: 253, # '·' + 251: 253, # '√' + 252: 253, # '№' + 253: 253, # '¤' + 254: 253, # 
'■' + 255: 251, # '\xa0' } -IBM866_BELARUSIAN_MODEL = SingleByteCharSetModel(charset_name='IBM866', - language='Belarusian', - char_to_order_map=IBM866_BELARUSIAN_CHAR_TO_ORDER, - language_model=BELARUSIAN_LANG_MODEL, - typical_positive_ratio=0.8118540733199303, - keep_ascii_letters=False, - alphabet='ʼЁІЎАБВГДЕЖЗЙКЛМНОПРСТУФХЦЧШЫЬЭЮЯабвгдежзйклмнопрстуфхцчшыьэюяёіў') +IBM866_BELARUSIAN_MODEL = SingleByteCharSetModel( + charset_name="IBM866", + language="Belarusian", + char_to_order_map=IBM866_BELARUSIAN_CHAR_TO_ORDER, + language_model=BELARUSIAN_LANG_MODEL, + typical_positive_ratio=0.8118540733199303, + keep_ascii_letters=False, + alphabet="ʼЁІЎАБВГДЕЖЗЙКЛМНОПРСТУФХЦЧШЫЬЭЮЯабвгдежзйклмнопрстуфхцчшыьэюяёіў", +) MACCYRILLIC_BELARUSIAN_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' 
- 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 255, # 'А' - 129: 255, # 'Б' - 130: 255, # 'В' - 131: 255, # 'Г' - 132: 255, # 'Д' - 133: 255, # 'Е' - 134: 255, # 'Ж' - 135: 255, # 'З' - 136: 255, # 'И' - 137: 255, # 'Й' - 138: 255, # 'К' - 139: 255, # 'Л' - 140: 255, # 'М' - 141: 255, # 'Н' - 142: 255, # 'О' - 143: 255, # 'П' - 144: 255, # 'Р' - 145: 255, # 'С' - 146: 255, # 'Т' - 147: 255, # 'У' - 148: 255, # 'Ф' - 149: 255, # 'Х' - 150: 255, # 'Ц' - 151: 255, # 'Ч' - 152: 255, # 'Ш' - 153: 255, # 'Щ' - 154: 255, # 'Ъ' - 155: 255, # 'Ы' - 156: 255, # 'Ь' - 157: 255, # 'Э' - 158: 255, # 'Ю' - 159: 255, # 'Я' - 160: 253, # '†' - 161: 253, # '°' - 162: 255, # 'Ґ' - 163: 253, # '£' - 164: 253, # '§' - 165: 253, # '•' - 166: 253, # '¶' - 167: 255, # 'І' - 168: 253, # '®' - 169: 253, # '©' - 170: 253, # '™' - 171: 255, # 'Ђ' - 172: 255, # 'ђ' - 173: 253, # '≠' - 174: 255, # 'Ѓ' - 175: 255, # 'ѓ' - 176: 253, # 
'∞' - 177: 253, # '±' - 178: 253, # '≤' - 179: 253, # '≥' - 180: 255, # 'і' - 181: 255, # 'µ' - 182: 255, # 'ґ' - 183: 255, # 'Ј' - 184: 255, # 'Є' - 185: 255, # 'є' - 186: 255, # 'Ї' - 187: 255, # 'ї' - 188: 255, # 'Љ' - 189: 255, # 'љ' - 190: 255, # 'Њ' - 191: 255, # 'њ' - 192: 255, # 'ј' - 193: 255, # 'Ѕ' - 194: 253, # '¬' - 195: 253, # '√' - 196: 255, # 'ƒ' - 197: 253, # '≈' - 198: 253, # '∆' - 199: 253, # '«' - 200: 253, # '»' - 201: 253, # '…' - 202: 251, # '\xa0' - 203: 255, # 'Ћ' - 204: 255, # 'ћ' - 205: 255, # 'Ќ' - 206: 255, # 'ќ' - 207: 255, # 'ѕ' - 208: 253, # '–' - 209: 253, # '—' - 210: 253, # '“' - 211: 253, # '”' - 212: 253, # '‘' - 213: 253, # '’' - 214: 253, # '÷' - 215: 253, # '„' - 216: 255, # 'Ў' - 217: 255, # 'ў' - 218: 255, # 'Џ' - 219: 255, # 'џ' - 220: 253, # '№' - 221: 255, # 'Ё' - 222: 255, # 'ё' - 223: 255, # 'я' - 224: 255, # 'а' - 225: 255, # 'б' - 226: 255, # 'в' - 227: 255, # 'г' - 228: 255, # 'д' - 229: 255, # 'е' - 230: 255, # 'ж' - 231: 255, # 'з' - 232: 255, # 'и' - 233: 255, # 'й' - 234: 255, # 'к' - 235: 255, # 'л' - 236: 255, # 'м' - 237: 255, # 'н' - 238: 255, # 'о' - 239: 255, # 'п' - 240: 255, # 'р' - 241: 255, # 'с' - 242: 255, # 'т' - 243: 255, # 'у' - 244: 255, # 'ф' - 245: 255, # 'х' - 246: 255, # 'ц' - 247: 255, # 'ч' - 248: 255, # 'ш' - 249: 255, # 'щ' - 250: 255, # 'ъ' - 251: 255, # 'ы' - 252: 255, # 'ь' - 253: 255, # 'э' - 254: 255, # 'ю' - 255: 253, # '€' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 
251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 255, # 'А' + 129: 255, # 'Б' + 130: 255, # 'В' + 131: 255, # 'Г' + 132: 255, # 'Д' + 133: 255, # 'Е' + 134: 255, # 'Ж' + 135: 255, # 'З' + 136: 255, # 'И' + 137: 255, # 'Й' + 138: 255, # 'К' + 139: 255, # 'Л' + 140: 255, # 'М' + 141: 255, # 'Н' + 142: 255, # 'О' + 143: 255, 
# 'П' + 144: 255, # 'Р' + 145: 255, # 'С' + 146: 255, # 'Т' + 147: 255, # 'У' + 148: 255, # 'Ф' + 149: 255, # 'Х' + 150: 255, # 'Ц' + 151: 255, # 'Ч' + 152: 255, # 'Ш' + 153: 255, # 'Щ' + 154: 255, # 'Ъ' + 155: 255, # 'Ы' + 156: 255, # 'Ь' + 157: 255, # 'Э' + 158: 255, # 'Ю' + 159: 255, # 'Я' + 160: 253, # '†' + 161: 253, # '°' + 162: 255, # 'Ґ' + 163: 253, # '£' + 164: 253, # '§' + 165: 253, # '•' + 166: 253, # '¶' + 167: 255, # 'І' + 168: 253, # '®' + 169: 253, # '©' + 170: 253, # '™' + 171: 255, # 'Ђ' + 172: 255, # 'ђ' + 173: 253, # '≠' + 174: 255, # 'Ѓ' + 175: 255, # 'ѓ' + 176: 253, # '∞' + 177: 253, # '±' + 178: 253, # '≤' + 179: 253, # '≥' + 180: 255, # 'і' + 181: 255, # 'µ' + 182: 255, # 'ґ' + 183: 255, # 'Ј' + 184: 255, # 'Є' + 185: 255, # 'є' + 186: 255, # 'Ї' + 187: 255, # 'ї' + 188: 255, # 'Љ' + 189: 255, # 'љ' + 190: 255, # 'Њ' + 191: 255, # 'њ' + 192: 255, # 'ј' + 193: 255, # 'Ѕ' + 194: 253, # '¬' + 195: 253, # '√' + 196: 255, # 'ƒ' + 197: 253, # '≈' + 198: 253, # '∆' + 199: 253, # '«' + 200: 253, # '»' + 201: 253, # '…' + 202: 251, # '\xa0' + 203: 255, # 'Ћ' + 204: 255, # 'ћ' + 205: 255, # 'Ќ' + 206: 255, # 'ќ' + 207: 255, # 'ѕ' + 208: 253, # '–' + 209: 253, # '—' + 210: 253, # '“' + 211: 253, # '”' + 212: 253, # '‘' + 213: 253, # '’' + 214: 253, # '÷' + 215: 253, # '„' + 216: 255, # 'Ў' + 217: 255, # 'ў' + 218: 255, # 'Џ' + 219: 255, # 'џ' + 220: 253, # '№' + 221: 255, # 'Ё' + 222: 255, # 'ё' + 223: 255, # 'я' + 224: 255, # 'а' + 225: 255, # 'б' + 226: 255, # 'в' + 227: 255, # 'г' + 228: 255, # 'д' + 229: 255, # 'е' + 230: 255, # 'ж' + 231: 255, # 'з' + 232: 255, # 'и' + 233: 255, # 'й' + 234: 255, # 'к' + 235: 255, # 'л' + 236: 255, # 'м' + 237: 255, # 'н' + 238: 255, # 'о' + 239: 255, # 'п' + 240: 255, # 'р' + 241: 255, # 'с' + 242: 255, # 'т' + 243: 255, # 'у' + 244: 255, # 'ф' + 245: 255, # 'х' + 246: 255, # 'ц' + 247: 255, # 'ч' + 248: 255, # 'ш' + 249: 255, # 'щ' + 250: 255, # 'ъ' + 251: 255, # 'ы' + 252: 255, # 'ь' + 253: 255, # 'э' + 254: 
255, # 'ю' + 255: 253, # '€' } -MACCYRILLIC_BELARUSIAN_MODEL = SingleByteCharSetModel(charset_name='MacCyrillic', - language='Belarusian', - char_to_order_map=MACCYRILLIC_BELARUSIAN_CHAR_TO_ORDER, - language_model=BELARUSIAN_LANG_MODEL, - typical_positive_ratio=0.9071761992924257, - keep_ascii_letters=False, - alphabet='ʼЁІЎАБВГДЕЖЗЙКЛМНОПРСТУФХЦЧШЫЬЭЮЯабвгдежзйклмнопрстуфхцчшыьэюяёіў') - +MACCYRILLIC_BELARUSIAN_MODEL = SingleByteCharSetModel( + charset_name="MacCyrillic", + language="Belarusian", + char_to_order_map=MACCYRILLIC_BELARUSIAN_CHAR_TO_ORDER, + language_model=BELARUSIAN_LANG_MODEL, + typical_positive_ratio=0.9071761992924257, + keep_ascii_letters=False, + alphabet="ʼЁІЎАБВГДЕЖЗЙКЛМНОПРСТУФХЦЧШЫЬЭЮЯабвгдежзйклмнопрстуфхцчшыьэюяёіў", +) diff --git a/chardet/langcroatianmodel.py b/chardet/langcroatianmodel.py index 9a92b5ee..4f154235 100644 --- a/chardet/langcroatianmodel.py +++ b/chardet/langcroatianmodel.py @@ -1,9 +1,7 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -3324,536 +3322,539 @@ # Character Mapping Table(s): ISO_8859_2_CROATIAN_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' 
- 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, 
# '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' - 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 255, # 'Ą' - 162: 253, # '˘' - 163: 255, # 'Ł' - 164: 253, # '¤' - 165: 255, # 'Ľ' - 166: 255, # 'Ś' - 167: 253, # '§' - 168: 253, # '¨' - 169: 255, # 'Š' - 170: 255, # 'Ş' - 171: 255, # 'Ť' - 172: 255, # 'Ź' - 173: 251, # '\xad' - 174: 255, # 'Ž' - 175: 255, # 'Ż' - 176: 253, # '°' - 177: 255, # 'ą' - 178: 253, # '˛' - 179: 255, # 'ł' - 180: 253, # '´' - 181: 255, # 'ľ' - 182: 255, # 'ś' - 183: 255, # 'ˇ' - 184: 253, # '¸' - 185: 255, # 'š' - 186: 255, # 'ş' - 187: 255, # 'ť' - 188: 255, # 'ź' - 189: 253, # '˝' - 190: 255, # 'ž' - 191: 255, # 'ż' - 192: 255, # 'Ŕ' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ă' - 196: 255, # 'Ä' - 197: 255, # 'Ĺ' - 198: 255, # 'Ć' - 199: 255, # 'Ç' - 200: 255, # 'Č' - 201: 255, # 'É' - 202: 255, # 'Ę' - 203: 255, # 'Ë' - 204: 255, # 'Ě' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ď' - 208: 255, # 'Đ' - 209: 255, # 'Ń' - 210: 255, # 'Ň' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Ő' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ř' - 217: 255, # 'Ů' - 218: 255, # 'Ú' - 219: 255, # 'Ű' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Ţ' - 223: 255, # 'ß' - 224: 255, # 'ŕ' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ă' - 228: 255, # 'ä' - 229: 255, # 'ĺ' - 230: 255, # 'ć' - 231: 255, # 'ç' - 232: 255, # 'č' - 233: 255, # 'é' - 234: 255, # 'ę' - 235: 255, # 'ë' - 236: 255, # 'ě' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ď' - 240: 255, # 'đ' - 241: 255, # 'ń' - 242: 255, # 'ň' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'ő' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ř' - 249: 255, # 'ů' - 250: 255, # 'ú' - 251: 255, # 'ű' - 252: 255, # 'ü' - 253: 255, # 
'ý' - 254: 255, # 'ţ' - 255: 253, # '˙' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' 
+ 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # '\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 255, # 'Ą' + 162: 253, # '˘' + 163: 255, # 'Ł' + 164: 253, # '¤' + 165: 255, # 'Ľ' + 166: 255, # 'Ś' + 167: 253, # '§' + 168: 253, # '¨' + 169: 255, # 'Š' + 170: 255, # 'Ş' + 171: 
255, # 'Ť' + 172: 255, # 'Ź' + 173: 251, # '\xad' + 174: 255, # 'Ž' + 175: 255, # 'Ż' + 176: 253, # '°' + 177: 255, # 'ą' + 178: 253, # '˛' + 179: 255, # 'ł' + 180: 253, # '´' + 181: 255, # 'ľ' + 182: 255, # 'ś' + 183: 255, # 'ˇ' + 184: 253, # '¸' + 185: 255, # 'š' + 186: 255, # 'ş' + 187: 255, # 'ť' + 188: 255, # 'ź' + 189: 253, # '˝' + 190: 255, # 'ž' + 191: 255, # 'ż' + 192: 255, # 'Ŕ' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ă' + 196: 255, # 'Ä' + 197: 255, # 'Ĺ' + 198: 255, # 'Ć' + 199: 255, # 'Ç' + 200: 255, # 'Č' + 201: 255, # 'É' + 202: 255, # 'Ę' + 203: 255, # 'Ë' + 204: 255, # 'Ě' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ď' + 208: 255, # 'Đ' + 209: 255, # 'Ń' + 210: 255, # 'Ň' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Ő' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ř' + 217: 255, # 'Ů' + 218: 255, # 'Ú' + 219: 255, # 'Ű' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Ţ' + 223: 255, # 'ß' + 224: 255, # 'ŕ' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ă' + 228: 255, # 'ä' + 229: 255, # 'ĺ' + 230: 255, # 'ć' + 231: 255, # 'ç' + 232: 255, # 'č' + 233: 255, # 'é' + 234: 255, # 'ę' + 235: 255, # 'ë' + 236: 255, # 'ě' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ď' + 240: 255, # 'đ' + 241: 255, # 'ń' + 242: 255, # 'ň' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'ő' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ř' + 249: 255, # 'ů' + 250: 255, # 'ú' + 251: 255, # 'ű' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'ţ' + 255: 253, # '˙' } -ISO_8859_2_CROATIAN_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-2', - language='Croatian', - char_to_order_map=ISO_8859_2_CROATIAN_CHAR_TO_ORDER, - language_model=CROATIAN_LANG_MODEL, - typical_positive_ratio=0.9816070469661222, - keep_ascii_letters=False, - alphabet='ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzĆćČčĐ𩹮ž') +ISO_8859_2_CROATIAN_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-2", + language="Croatian", + 
char_to_order_map=ISO_8859_2_CROATIAN_CHAR_TO_ORDER, + language_model=CROATIAN_LANG_MODEL, + typical_positive_ratio=0.9816070469661222, + keep_ascii_letters=False, + alphabet="ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzĆćČčĐ𩹮ž", +) WINDOWS_1250_CROATIAN_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' 
- 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 253, # '€' - 129: 255, # None - 130: 253, # '‚' - 131: 255, # None - 132: 253, # '„' - 133: 253, # '…' - 134: 253, # '†' - 135: 253, # '‡' - 136: 255, # None - 137: 253, # '‰' - 138: 255, # 'Š' - 139: 253, # '‹' - 140: 255, # 'Ś' - 141: 255, # 'Ť' - 142: 255, # 'Ž' - 143: 255, # 'Ź' - 144: 255, # None - 145: 253, # '‘' - 146: 253, # '’' - 147: 253, # '“' - 148: 253, # '”' - 149: 253, # '•' - 150: 253, # '–' - 151: 253, # '—' - 152: 255, # None - 153: 253, # '™' - 154: 255, # 'š' - 155: 253, # '›' - 156: 255, # 'ś' - 157: 255, # 'ť' - 158: 255, # 'ž' - 159: 255, # 'ź' - 160: 251, # '\xa0' - 161: 255, # 'ˇ' - 162: 253, # '˘' - 163: 255, # 'Ł' - 164: 253, # '¤' - 165: 255, # 'Ą' - 166: 253, # '¦' - 167: 253, # '§' - 168: 253, # '¨' - 169: 253, # '©' - 170: 255, # 'Ş' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 255, # 'Ż' - 
176: 253, # '°' - 177: 253, # '±' - 178: 253, # '˛' - 179: 255, # 'ł' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 253, # '¸' - 185: 255, # 'ą' - 186: 255, # 'ş' - 187: 253, # '»' - 188: 255, # 'Ľ' - 189: 253, # '˝' - 190: 255, # 'ľ' - 191: 255, # 'ż' - 192: 255, # 'Ŕ' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ă' - 196: 255, # 'Ä' - 197: 255, # 'Ĺ' - 198: 255, # 'Ć' - 199: 255, # 'Ç' - 200: 255, # 'Č' - 201: 255, # 'É' - 202: 255, # 'Ę' - 203: 255, # 'Ë' - 204: 255, # 'Ě' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ď' - 208: 255, # 'Đ' - 209: 255, # 'Ń' - 210: 255, # 'Ň' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Ő' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ř' - 217: 255, # 'Ů' - 218: 255, # 'Ú' - 219: 255, # 'Ű' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Ţ' - 223: 255, # 'ß' - 224: 255, # 'ŕ' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ă' - 228: 255, # 'ä' - 229: 255, # 'ĺ' - 230: 255, # 'ć' - 231: 255, # 'ç' - 232: 255, # 'č' - 233: 255, # 'é' - 234: 255, # 'ę' - 235: 255, # 'ë' - 236: 255, # 'ě' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ď' - 240: 255, # 'đ' - 241: 255, # 'ń' - 242: 255, # 'ň' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'ő' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ř' - 249: 255, # 'ů' - 250: 255, # 'ú' - 251: 255, # 'ű' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'ţ' - 255: 253, # '˙' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # 
'\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 253, # '€' + 129: 255, # None + 130: 253, # '‚' + 131: 255, # None + 132: 253, # '„' + 133: 253, # '…' + 134: 253, # '†' + 135: 253, # '‡' + 136: 255, # None + 137: 253, # '‰' + 138: 255, # 'Š' + 139: 253, # '‹' + 140: 255, # 'Ś' + 141: 255, # 'Ť' + 142: 255, # 
'Ž' + 143: 255, # 'Ź' + 144: 255, # None + 145: 253, # '‘' + 146: 253, # '’' + 147: 253, # '“' + 148: 253, # '”' + 149: 253, # '•' + 150: 253, # '–' + 151: 253, # '—' + 152: 255, # None + 153: 253, # '™' + 154: 255, # 'š' + 155: 253, # '›' + 156: 255, # 'ś' + 157: 255, # 'ť' + 158: 255, # 'ž' + 159: 255, # 'ź' + 160: 251, # '\xa0' + 161: 255, # 'ˇ' + 162: 253, # '˘' + 163: 255, # 'Ł' + 164: 253, # '¤' + 165: 255, # 'Ą' + 166: 253, # '¦' + 167: 253, # '§' + 168: 253, # '¨' + 169: 253, # '©' + 170: 255, # 'Ş' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 255, # 'Ż' + 176: 253, # '°' + 177: 253, # '±' + 178: 253, # '˛' + 179: 255, # 'ł' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 253, # '¸' + 185: 255, # 'ą' + 186: 255, # 'ş' + 187: 253, # '»' + 188: 255, # 'Ľ' + 189: 253, # '˝' + 190: 255, # 'ľ' + 191: 255, # 'ż' + 192: 255, # 'Ŕ' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ă' + 196: 255, # 'Ä' + 197: 255, # 'Ĺ' + 198: 255, # 'Ć' + 199: 255, # 'Ç' + 200: 255, # 'Č' + 201: 255, # 'É' + 202: 255, # 'Ę' + 203: 255, # 'Ë' + 204: 255, # 'Ě' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ď' + 208: 255, # 'Đ' + 209: 255, # 'Ń' + 210: 255, # 'Ň' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Ő' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ř' + 217: 255, # 'Ů' + 218: 255, # 'Ú' + 219: 255, # 'Ű' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Ţ' + 223: 255, # 'ß' + 224: 255, # 'ŕ' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ă' + 228: 255, # 'ä' + 229: 255, # 'ĺ' + 230: 255, # 'ć' + 231: 255, # 'ç' + 232: 255, # 'č' + 233: 255, # 'é' + 234: 255, # 'ę' + 235: 255, # 'ë' + 236: 255, # 'ě' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ď' + 240: 255, # 'đ' + 241: 255, # 'ń' + 242: 255, # 'ň' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'ő' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ř' + 249: 255, # 'ů' + 250: 255, # 'ú' + 251: 255, # 'ű' + 252: 255, # 'ü' + 253: 
255, # 'ý' + 254: 255, # 'ţ' + 255: 253, # '˙' } -WINDOWS_1250_CROATIAN_MODEL = SingleByteCharSetModel(charset_name='WINDOWS-1250', - language='Croatian', - char_to_order_map=WINDOWS_1250_CROATIAN_CHAR_TO_ORDER, - language_model=CROATIAN_LANG_MODEL, - typical_positive_ratio=0.99254272216937, - keep_ascii_letters=False, - alphabet='ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzĆćČčĐ𩹮ž') - +WINDOWS_1250_CROATIAN_MODEL = SingleByteCharSetModel( + charset_name="WINDOWS-1250", + language="Croatian", + char_to_order_map=WINDOWS_1250_CROATIAN_CHAR_TO_ORDER, + language_model=CROATIAN_LANG_MODEL, + typical_positive_ratio=0.99254272216937, + keep_ascii_letters=False, + alphabet="ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzĆćČčĐ𩹮ž", +) diff --git a/chardet/langczechmodel.py b/chardet/langczechmodel.py index e1e66997..9c11b1d7 100644 --- a/chardet/langczechmodel.py +++ b/chardet/langczechmodel.py @@ -1,9 +1,7 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -62768,536 +62766,539 @@ # Character Mapping Table(s): ISO_8859_2_CZECH_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' 
- 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 53, # 'A' - 66: 56, # 'B' - 67: 66, # 'C' - 68: 62, # 'D' - 69: 70, # 'E' - 70: 71, # 'F' - 71: 78, # 'G' - 72: 67, # 'H' - 73: 63, # 'I' - 74: 59, # 'J' - 75: 58, # 'K' - 76: 68, # 'L' - 77: 51, # 'M' - 78: 57, # 'N' - 79: 65, # 'O' - 80: 46, # 'P' - 81: 120, # 'Q' - 82: 60, # 'R' - 83: 43, # 'S' - 84: 64, # 'T' - 85: 80, # 'U' - 86: 48, # 'V' - 87: 79, # 'W' - 88: 99, # 'X' - 89: 105, # 'Y' - 90: 76, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 4, # 'a' - 98: 24, # 'b' - 99: 18, # 'c' - 100: 13, # 'd' - 101: 3, # 'e' - 102: 44, # 'f' - 103: 39, # 'g' - 104: 19, # 'h' - 105: 7, # 'i' - 106: 26, # 'j' - 107: 12, # 'k' - 108: 10, # 'l' - 109: 15, # 'm' - 110: 5, # 'n' - 111: 2, # 'o' - 112: 16, # 'p' - 113: 110, # 'q' - 114: 9, # 'r' - 115: 8, # 's' - 116: 6, # 't' - 117: 14, # 'u' - 118: 11, # 'v' - 119: 81, # 'w' - 120: 72, # 'x' - 121: 22, # 'y' - 122: 21, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, # '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 
251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' - 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 255, # 'Ą' - 162: 253, # '˘' - 163: 255, # 'Ł' - 164: 253, # '¤' - 165: 255, # 'Ľ' - 166: 255, # 'Ś' - 167: 253, # '§' - 168: 253, # '¨' - 169: 255, # 'Š' - 170: 255, # 'Ş' - 171: 255, # 'Ť' - 172: 255, # 'Ź' - 173: 251, # '\xad' - 174: 255, # 'Ž' - 175: 255, # 'Ż' - 176: 253, # '°' - 177: 255, # 'ą' - 178: 253, # '˛' - 179: 255, # 'ł' - 180: 253, # '´' - 181: 255, # 'ľ' - 182: 255, # 'ś' - 183: 255, # 'ˇ' - 184: 253, # '¸' - 185: 255, # 'š' - 186: 255, # 'ş' - 187: 255, # 'ť' - 188: 255, # 'ź' - 189: 253, # '˝' - 190: 255, # 'ž' - 191: 255, # 'ż' - 192: 255, # 'Ŕ' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ă' - 196: 255, # 'Ä' - 197: 255, # 'Ĺ' - 198: 255, # 'Ć' - 199: 255, # 'Ç' - 200: 255, # 'Č' - 201: 255, # 'É' - 202: 255, # 'Ę' - 203: 255, # 'Ë' - 204: 255, # 'Ě' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ď' - 208: 255, # 'Đ' - 209: 255, # 'Ń' - 210: 255, # 'Ň' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Ő' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ř' - 217: 255, # 'Ů' - 218: 255, # 'Ú' - 219: 255, # 'Ű' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Ţ' - 223: 255, # 'ß' - 224: 255, # 'ŕ' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ă' - 228: 255, # 'ä' - 229: 255, # 'ĺ' - 230: 255, # 'ć' - 231: 255, # 'ç' - 232: 255, # 'č' - 233: 255, # 'é' - 234: 255, # 'ę' - 235: 255, # 'ë' - 236: 255, # 'ě' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ď' - 240: 255, # 'đ' - 241: 255, # 'ń' - 242: 255, # 'ň' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'ő' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ř' - 249: 255, # 'ů' - 250: 255, # 'ú' - 251: 255, # 'ű' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'ţ' - 255: 253, # '˙' + 0: 251, # '\x00' + 
1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' 
+ 64: 253, # '@' + 65: 53, # 'A' + 66: 56, # 'B' + 67: 66, # 'C' + 68: 62, # 'D' + 69: 70, # 'E' + 70: 71, # 'F' + 71: 78, # 'G' + 72: 67, # 'H' + 73: 63, # 'I' + 74: 59, # 'J' + 75: 58, # 'K' + 76: 68, # 'L' + 77: 51, # 'M' + 78: 57, # 'N' + 79: 65, # 'O' + 80: 46, # 'P' + 81: 120, # 'Q' + 82: 60, # 'R' + 83: 43, # 'S' + 84: 64, # 'T' + 85: 80, # 'U' + 86: 48, # 'V' + 87: 79, # 'W' + 88: 99, # 'X' + 89: 105, # 'Y' + 90: 76, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 4, # 'a' + 98: 24, # 'b' + 99: 18, # 'c' + 100: 13, # 'd' + 101: 3, # 'e' + 102: 44, # 'f' + 103: 39, # 'g' + 104: 19, # 'h' + 105: 7, # 'i' + 106: 26, # 'j' + 107: 12, # 'k' + 108: 10, # 'l' + 109: 15, # 'm' + 110: 5, # 'n' + 111: 2, # 'o' + 112: 16, # 'p' + 113: 110, # 'q' + 114: 9, # 'r' + 115: 8, # 's' + 116: 6, # 't' + 117: 14, # 'u' + 118: 11, # 'v' + 119: 81, # 'w' + 120: 72, # 'x' + 121: 22, # 'y' + 122: 21, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # '\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 255, # 'Ą' + 162: 253, # '˘' + 163: 255, # 'Ł' + 164: 253, # '¤' + 165: 255, # 'Ľ' + 166: 255, # 'Ś' + 167: 253, # '§' + 168: 253, # '¨' + 169: 255, # 'Š' + 170: 255, # 'Ş' + 171: 255, # 'Ť' + 172: 255, # 'Ź' + 173: 251, # '\xad' + 174: 
255, # 'Ž' + 175: 255, # 'Ż' + 176: 253, # '°' + 177: 255, # 'ą' + 178: 253, # '˛' + 179: 255, # 'ł' + 180: 253, # '´' + 181: 255, # 'ľ' + 182: 255, # 'ś' + 183: 255, # 'ˇ' + 184: 253, # '¸' + 185: 255, # 'š' + 186: 255, # 'ş' + 187: 255, # 'ť' + 188: 255, # 'ź' + 189: 253, # '˝' + 190: 255, # 'ž' + 191: 255, # 'ż' + 192: 255, # 'Ŕ' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ă' + 196: 255, # 'Ä' + 197: 255, # 'Ĺ' + 198: 255, # 'Ć' + 199: 255, # 'Ç' + 200: 255, # 'Č' + 201: 255, # 'É' + 202: 255, # 'Ę' + 203: 255, # 'Ë' + 204: 255, # 'Ě' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ď' + 208: 255, # 'Đ' + 209: 255, # 'Ń' + 210: 255, # 'Ň' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Ő' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ř' + 217: 255, # 'Ů' + 218: 255, # 'Ú' + 219: 255, # 'Ű' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Ţ' + 223: 255, # 'ß' + 224: 255, # 'ŕ' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ă' + 228: 255, # 'ä' + 229: 255, # 'ĺ' + 230: 255, # 'ć' + 231: 255, # 'ç' + 232: 255, # 'č' + 233: 255, # 'é' + 234: 255, # 'ę' + 235: 255, # 'ë' + 236: 255, # 'ě' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ď' + 240: 255, # 'đ' + 241: 255, # 'ń' + 242: 255, # 'ň' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'ő' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ř' + 249: 255, # 'ů' + 250: 255, # 'ú' + 251: 255, # 'ű' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'ţ' + 255: 253, # '˙' } -ISO_8859_2_CZECH_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-2', - language='Czech', - char_to_order_map=ISO_8859_2_CZECH_CHAR_TO_ORDER, - language_model=CZECH_LANG_MODEL, - typical_positive_ratio=0.8649334824637391, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÁÉÍÓÚÝáéíóúýČčĎďĚěŇňŘřŠšŤťŮůŽž') +ISO_8859_2_CZECH_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-2", + language="Czech", + char_to_order_map=ISO_8859_2_CZECH_CHAR_TO_ORDER, + 
language_model=CZECH_LANG_MODEL, + typical_positive_ratio=0.8649334824637391, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÁÉÍÓÚÝáéíóúýČčĎďĚěŇňŘřŠšŤťŮůŽž", +) WINDOWS_1250_CZECH_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' 
- 64: 253, # '@' - 65: 53, # 'A' - 66: 56, # 'B' - 67: 66, # 'C' - 68: 62, # 'D' - 69: 70, # 'E' - 70: 71, # 'F' - 71: 78, # 'G' - 72: 67, # 'H' - 73: 63, # 'I' - 74: 59, # 'J' - 75: 58, # 'K' - 76: 68, # 'L' - 77: 51, # 'M' - 78: 57, # 'N' - 79: 65, # 'O' - 80: 46, # 'P' - 81: 120, # 'Q' - 82: 60, # 'R' - 83: 43, # 'S' - 84: 64, # 'T' - 85: 80, # 'U' - 86: 48, # 'V' - 87: 79, # 'W' - 88: 99, # 'X' - 89: 105, # 'Y' - 90: 76, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 4, # 'a' - 98: 24, # 'b' - 99: 18, # 'c' - 100: 13, # 'd' - 101: 3, # 'e' - 102: 44, # 'f' - 103: 39, # 'g' - 104: 19, # 'h' - 105: 7, # 'i' - 106: 26, # 'j' - 107: 12, # 'k' - 108: 10, # 'l' - 109: 15, # 'm' - 110: 5, # 'n' - 111: 2, # 'o' - 112: 16, # 'p' - 113: 110, # 'q' - 114: 9, # 'r' - 115: 8, # 's' - 116: 6, # 't' - 117: 14, # 'u' - 118: 11, # 'v' - 119: 81, # 'w' - 120: 72, # 'x' - 121: 22, # 'y' - 122: 21, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 253, # '€' - 129: 255, # None - 130: 253, # '‚' - 131: 255, # None - 132: 253, # '„' - 133: 253, # '…' - 134: 253, # '†' - 135: 253, # '‡' - 136: 255, # None - 137: 253, # '‰' - 138: 255, # 'Š' - 139: 253, # '‹' - 140: 255, # 'Ś' - 141: 255, # 'Ť' - 142: 255, # 'Ž' - 143: 255, # 'Ź' - 144: 255, # None - 145: 253, # '‘' - 146: 253, # '’' - 147: 253, # '“' - 148: 253, # '”' - 149: 253, # '•' - 150: 253, # '–' - 151: 253, # '—' - 152: 255, # None - 153: 253, # '™' - 154: 255, # 'š' - 155: 253, # '›' - 156: 255, # 'ś' - 157: 255, # 'ť' - 158: 255, # 'ž' - 159: 255, # 'ź' - 160: 251, # '\xa0' - 161: 255, # 'ˇ' - 162: 253, # '˘' - 163: 255, # 'Ł' - 164: 253, # '¤' - 165: 255, # 'Ą' - 166: 253, # '¦' - 167: 253, # '§' - 168: 253, # '¨' - 169: 253, # '©' - 170: 255, # 'Ş' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 255, # 'Ż' - 176: 253, # '°' - 177: 253, # '±' - 178: 253, # '˛' - 179: 
255, # 'ł' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 253, # '¸' - 185: 255, # 'ą' - 186: 255, # 'ş' - 187: 253, # '»' - 188: 255, # 'Ľ' - 189: 253, # '˝' - 190: 255, # 'ľ' - 191: 255, # 'ż' - 192: 255, # 'Ŕ' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ă' - 196: 255, # 'Ä' - 197: 255, # 'Ĺ' - 198: 255, # 'Ć' - 199: 255, # 'Ç' - 200: 255, # 'Č' - 201: 255, # 'É' - 202: 255, # 'Ę' - 203: 255, # 'Ë' - 204: 255, # 'Ě' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ď' - 208: 255, # 'Đ' - 209: 255, # 'Ń' - 210: 255, # 'Ň' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Ő' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ř' - 217: 255, # 'Ů' - 218: 255, # 'Ú' - 219: 255, # 'Ű' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Ţ' - 223: 255, # 'ß' - 224: 255, # 'ŕ' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ă' - 228: 255, # 'ä' - 229: 255, # 'ĺ' - 230: 255, # 'ć' - 231: 255, # 'ç' - 232: 255, # 'č' - 233: 255, # 'é' - 234: 255, # 'ę' - 235: 255, # 'ë' - 236: 255, # 'ě' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ď' - 240: 255, # 'đ' - 241: 255, # 'ń' - 242: 255, # 'ň' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'ő' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ř' - 249: 255, # 'ů' - 250: 255, # 'ú' - 251: 255, # 'ű' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'ţ' - 255: 253, # '˙' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # 
'\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 53, # 'A' + 66: 56, # 'B' + 67: 66, # 'C' + 68: 62, # 'D' + 69: 70, # 'E' + 70: 71, # 'F' + 71: 78, # 'G' + 72: 67, # 'H' + 73: 63, # 'I' + 74: 59, # 'J' + 75: 58, # 'K' + 76: 68, # 'L' + 77: 51, # 'M' + 78: 57, # 'N' + 79: 65, # 'O' + 80: 46, # 'P' + 81: 120, # 'Q' + 82: 60, # 'R' + 83: 43, # 'S' + 84: 64, # 'T' + 85: 80, # 'U' + 86: 48, # 'V' + 87: 79, # 'W' + 88: 99, # 'X' + 89: 105, # 'Y' + 90: 76, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 4, # 'a' + 98: 24, # 'b' + 99: 18, # 'c' + 100: 13, # 'd' + 101: 3, # 'e' + 102: 44, # 'f' + 103: 39, # 'g' + 104: 19, # 'h' + 105: 7, # 'i' + 106: 26, # 'j' + 107: 12, # 'k' + 108: 10, # 'l' + 109: 15, # 'm' + 110: 5, # 'n' + 111: 2, # 'o' + 112: 16, # 'p' + 113: 110, # 'q' + 114: 9, # 'r' + 115: 8, # 's' + 116: 6, # 't' + 117: 14, # 'u' + 118: 11, # 'v' + 119: 81, # 'w' + 120: 72, # 'x' + 121: 22, # 'y' + 122: 21, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 253, # '€' + 129: 255, # None + 130: 253, # '‚' + 131: 255, # None + 132: 253, # '„' + 133: 253, # '…' + 134: 253, # '†' + 135: 253, # '‡' + 136: 255, # None + 137: 253, # '‰' + 138: 255, # 'Š' + 139: 253, # '‹' + 140: 255, # 'Ś' + 141: 255, # 'Ť' + 142: 255, # 'Ž' + 143: 255, # 'Ź' + 144: 255, # None + 145: 253, # '‘' + 146: 253, # '’' + 147: 253, # '“' + 148: 253, # '”' + 
149: 253, # '•' + 150: 253, # '–' + 151: 253, # '—' + 152: 255, # None + 153: 253, # '™' + 154: 255, # 'š' + 155: 253, # '›' + 156: 255, # 'ś' + 157: 255, # 'ť' + 158: 255, # 'ž' + 159: 255, # 'ź' + 160: 251, # '\xa0' + 161: 255, # 'ˇ' + 162: 253, # '˘' + 163: 255, # 'Ł' + 164: 253, # '¤' + 165: 255, # 'Ą' + 166: 253, # '¦' + 167: 253, # '§' + 168: 253, # '¨' + 169: 253, # '©' + 170: 255, # 'Ş' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 255, # 'Ż' + 176: 253, # '°' + 177: 253, # '±' + 178: 253, # '˛' + 179: 255, # 'ł' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 253, # '¸' + 185: 255, # 'ą' + 186: 255, # 'ş' + 187: 253, # '»' + 188: 255, # 'Ľ' + 189: 253, # '˝' + 190: 255, # 'ľ' + 191: 255, # 'ż' + 192: 255, # 'Ŕ' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ă' + 196: 255, # 'Ä' + 197: 255, # 'Ĺ' + 198: 255, # 'Ć' + 199: 255, # 'Ç' + 200: 255, # 'Č' + 201: 255, # 'É' + 202: 255, # 'Ę' + 203: 255, # 'Ë' + 204: 255, # 'Ě' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ď' + 208: 255, # 'Đ' + 209: 255, # 'Ń' + 210: 255, # 'Ň' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Ő' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ř' + 217: 255, # 'Ů' + 218: 255, # 'Ú' + 219: 255, # 'Ű' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Ţ' + 223: 255, # 'ß' + 224: 255, # 'ŕ' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ă' + 228: 255, # 'ä' + 229: 255, # 'ĺ' + 230: 255, # 'ć' + 231: 255, # 'ç' + 232: 255, # 'č' + 233: 255, # 'é' + 234: 255, # 'ę' + 235: 255, # 'ë' + 236: 255, # 'ě' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ď' + 240: 255, # 'đ' + 241: 255, # 'ń' + 242: 255, # 'ň' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'ő' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ř' + 249: 255, # 'ů' + 250: 255, # 'ú' + 251: 255, # 'ű' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'ţ' + 255: 253, # '˙' } -WINDOWS_1250_CZECH_MODEL = 
SingleByteCharSetModel(charset_name='WINDOWS-1250', - language='Czech', - char_to_order_map=WINDOWS_1250_CZECH_CHAR_TO_ORDER, - language_model=CZECH_LANG_MODEL, - typical_positive_ratio=0.8712033740754319, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÁÉÍÓÚÝáéíóúýČčĎďĚěŇňŘřŠšŤťŮůŽž') - +WINDOWS_1250_CZECH_MODEL = SingleByteCharSetModel( + charset_name="WINDOWS-1250", + language="Czech", + char_to_order_map=WINDOWS_1250_CZECH_CHAR_TO_ORDER, + language_model=CZECH_LANG_MODEL, + typical_positive_ratio=0.8712033740754319, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÁÉÍÓÚÝáéíóúýČčĎďĚěŇňŘřŠšŤťŮůŽž", +) diff --git a/chardet/langdanishmodel.py b/chardet/langdanishmodel.py index 76c86ed4..22285c1c 100644 --- a/chardet/langdanishmodel.py +++ b/chardet/langdanishmodel.py @@ -1,9 +1,7 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -9524,803 +9522,808 @@ # Character Mapping Table(s): ISO_8859_1_DANISH_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' 
- 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 39, # 'A' - 66: 43, # 'B' - 67: 54, # 'C' - 68: 34, # 'D' - 69: 49, # 'E' - 70: 45, # 'F' - 71: 60, # 'G' - 72: 42, # 'H' - 73: 44, # 'I' - 74: 62, # 'J' - 75: 46, # 'K' - 76: 59, # 'L' - 77: 41, # 'M' - 78: 57, # 'N' - 79: 66, # 'O' - 80: 53, # 'P' - 81: 92, # 'Q' - 82: 56, # 'R' - 83: 33, # 'S' - 84: 48, # 'T' - 85: 67, # 'U' - 86: 61, # 'V' - 87: 68, # 'W' - 88: 91, # 'X' - 89: 77, # 'Y' - 90: 80, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 6, # 'a' - 98: 18, # 'b' - 99: 30, # 'c' - 100: 9, # 'd' - 101: 2, # 'e' - 102: 15, # 'f' - 103: 12, # 'g' - 104: 19, # 'h' - 105: 7, # 'i' - 106: 31, # 'j' - 107: 13, # 'k' - 108: 11, # 'l' - 109: 14, # 'm' - 110: 4, # 'n' - 111: 10, # 'o' - 112: 21, # 'p' - 113: 83, # 'q' - 114: 3, # 'r' - 115: 8, # 's' - 116: 5, # 't' - 117: 17, # 'u' - 118: 16, # 'v' - 119: 64, # 'w' - 120: 71, # 'x' - 121: 25, # 'y' - 122: 69, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, # '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' 
- 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 253, # '¡' - 162: 253, # '¢' - 163: 253, # '£' - 164: 253, # '¤' - 165: 253, # '¥' - 166: 253, # '¦' - 167: 253, # '§' - 168: 253, # '¨' - 169: 253, # '©' - 170: 255, # 'ª' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 253, # '¯' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 252, # '³' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 253, # '¸' - 185: 252, # '¹' - 186: 255, # 'º' - 187: 253, # '»' - 188: 252, # '¼' - 189: 252, # '½' - 190: 252, # '¾' - 191: 253, # '¿' - 192: 255, # 'À' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ã' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Æ' - 199: 255, # 'Ç' - 200: 255, # 'È' - 201: 255, # 'É' - 202: 255, # 'Ê' - 203: 255, # 'Ë' - 204: 255, # 'Ì' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ï' - 208: 255, # 'Ð' - 209: 255, # 'Ñ' - 210: 255, # 'Ò' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ø' - 217: 255, # 'Ù' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Þ' - 223: 255, # 'ß' - 224: 255, # 'à' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ã' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'æ' - 231: 255, # 'ç' - 232: 255, # 'è' - 233: 255, # 'é' - 234: 255, # 'ê' - 235: 255, # 'ë' - 236: 255, # 'ì' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ï' - 240: 255, # 'ð' - 241: 255, # 'ñ' - 242: 255, # 'ò' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ø' - 249: 255, # 'ù' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'þ' - 255: 255, # 'ÿ' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # 
'\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 39, # 'A' + 66: 43, # 'B' + 67: 54, # 'C' + 68: 34, # 'D' + 69: 49, # 'E' + 70: 45, # 'F' + 71: 60, # 'G' + 72: 42, # 'H' + 73: 44, # 'I' + 74: 62, # 'J' + 75: 46, # 'K' + 76: 59, # 'L' + 77: 41, # 'M' + 78: 57, # 'N' + 79: 66, # 'O' + 80: 53, # 'P' + 81: 92, # 'Q' + 82: 56, # 'R' + 83: 33, # 'S' + 84: 48, # 'T' + 85: 67, # 'U' + 86: 61, # 'V' + 87: 68, # 'W' + 88: 91, # 'X' + 89: 77, # 'Y' + 90: 80, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 6, # 'a' + 98: 18, # 'b' + 99: 30, # 'c' + 100: 9, # 'd' + 101: 2, # 'e' + 102: 15, # 'f' + 103: 12, # 'g' + 104: 19, # 'h' + 105: 7, # 'i' + 106: 31, # 'j' + 107: 13, # 'k' + 108: 11, # 'l' + 109: 14, # 'm' + 110: 4, # 'n' + 111: 10, # 'o' + 112: 21, # 'p' + 113: 83, # 'q' + 114: 3, # 'r' + 115: 8, # 's' + 116: 5, # 't' + 117: 17, # 'u' + 118: 16, # 'v' + 119: 64, # 'w' + 120: 71, # 'x' + 121: 25, # 'y' + 122: 69, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 
251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # '\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 253, # '¡' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '¤' + 165: 253, # '¥' + 166: 253, # '¦' + 167: 253, # '§' + 168: 253, # '¨' + 169: 253, # '©' + 170: 255, # 'ª' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 253, # '¯' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 253, # '¸' + 185: 252, # '¹' + 186: 255, # 'º' + 187: 253, # '»' + 188: 252, # '¼' + 189: 252, # '½' + 190: 252, # '¾' + 191: 253, # '¿' + 192: 255, # 'À' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ã' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Æ' + 199: 255, # 'Ç' + 200: 255, # 'È' + 201: 255, # 'É' + 202: 255, # 'Ê' + 203: 255, # 'Ë' + 204: 255, # 'Ì' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ï' + 208: 255, # 'Ð' + 209: 255, # 'Ñ' + 210: 255, # 'Ò' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ø' + 217: 255, # 'Ù' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Þ' + 223: 255, # 'ß' + 224: 255, # 'à' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ã' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'æ' + 231: 255, # 'ç' + 232: 255, # 'è' + 233: 255, # 
'é' + 234: 255, # 'ê' + 235: 255, # 'ë' + 236: 255, # 'ì' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ï' + 240: 255, # 'ð' + 241: 255, # 'ñ' + 242: 255, # 'ò' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ø' + 249: 255, # 'ù' + 250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'þ' + 255: 255, # 'ÿ' } -ISO_8859_1_DANISH_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-1', - language='Danish', - char_to_order_map=ISO_8859_1_DANISH_CHAR_TO_ORDER, - language_model=DANISH_LANG_MODEL, - typical_positive_ratio=0.930092242592071, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÅÆØåæø') +ISO_8859_1_DANISH_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-1", + language="Danish", + char_to_order_map=ISO_8859_1_DANISH_CHAR_TO_ORDER, + language_model=DANISH_LANG_MODEL, + typical_positive_ratio=0.930092242592071, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÅÆØåæø", +) ISO_8859_15_DANISH_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' 
- 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 39, # 'A' - 66: 43, # 'B' - 67: 54, # 'C' - 68: 34, # 'D' - 69: 49, # 'E' - 70: 45, # 'F' - 71: 60, # 'G' - 72: 42, # 'H' - 73: 44, # 'I' - 74: 62, # 'J' - 75: 46, # 'K' - 76: 59, # 'L' - 77: 41, # 'M' - 78: 57, # 'N' - 79: 66, # 'O' - 80: 53, # 'P' - 81: 92, # 'Q' - 82: 56, # 'R' - 83: 33, # 'S' - 84: 48, # 'T' - 85: 67, # 'U' - 86: 61, # 'V' - 87: 68, # 'W' - 88: 91, # 'X' - 89: 77, # 'Y' - 90: 80, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 6, # 'a' - 98: 18, # 'b' - 99: 30, # 'c' - 100: 9, # 'd' - 101: 2, # 'e' - 102: 15, # 'f' - 103: 12, # 'g' - 104: 19, # 'h' - 105: 7, # 'i' - 106: 31, # 'j' - 107: 13, # 'k' - 108: 11, # 'l' - 109: 14, # 'm' - 110: 4, # 'n' - 111: 10, # 'o' - 112: 21, # 'p' - 113: 83, # 'q' - 114: 3, # 'r' - 115: 8, # 's' - 116: 5, # 't' - 117: 17, # 'u' - 118: 16, # 'v' - 119: 64, # 'w' - 120: 71, # 'x' - 121: 25, # 'y' - 122: 69, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, # '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' 
- 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 253, # '¡' - 162: 253, # '¢' - 163: 253, # '£' - 164: 253, # '€' - 165: 253, # '¥' - 166: 255, # 'Š' - 167: 253, # '§' - 168: 255, # 'š' - 169: 253, # '©' - 170: 255, # 'ª' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 253, # '¯' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 252, # '³' - 180: 255, # 'Ž' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 255, # 'ž' - 185: 252, # '¹' - 186: 255, # 'º' - 187: 253, # '»' - 188: 255, # 'Œ' - 189: 255, # 'œ' - 190: 255, # 'Ÿ' - 191: 253, # '¿' - 192: 255, # 'À' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ã' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Æ' - 199: 255, # 'Ç' - 200: 255, # 'È' - 201: 255, # 'É' - 202: 255, # 'Ê' - 203: 255, # 'Ë' - 204: 255, # 'Ì' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ï' - 208: 255, # 'Ð' - 209: 255, # 'Ñ' - 210: 255, # 'Ò' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ø' - 217: 255, # 'Ù' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Þ' - 223: 255, # 'ß' - 224: 255, # 'à' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ã' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'æ' - 231: 255, # 'ç' - 232: 255, # 'è' - 233: 255, # 'é' - 234: 255, # 'ê' - 235: 255, # 'ë' - 236: 255, # 'ì' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ï' - 240: 255, # 'ð' - 241: 255, # 'ñ' - 242: 255, # 'ò' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ø' - 249: 255, # 'ù' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'þ' - 255: 255, # 'ÿ' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # 
'\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 39, # 'A' + 66: 43, # 'B' + 67: 54, # 'C' + 68: 34, # 'D' + 69: 49, # 'E' + 70: 45, # 'F' + 71: 60, # 'G' + 72: 42, # 'H' + 73: 44, # 'I' + 74: 62, # 'J' + 75: 46, # 'K' + 76: 59, # 'L' + 77: 41, # 'M' + 78: 57, # 'N' + 79: 66, # 'O' + 80: 53, # 'P' + 81: 92, # 'Q' + 82: 56, # 'R' + 83: 33, # 'S' + 84: 48, # 'T' + 85: 67, # 'U' + 86: 61, # 'V' + 87: 68, # 'W' + 88: 91, # 'X' + 89: 77, # 'Y' + 90: 80, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 6, # 'a' + 98: 18, # 'b' + 99: 30, # 'c' + 100: 9, # 'd' + 101: 2, # 'e' + 102: 15, # 'f' + 103: 12, # 'g' + 104: 19, # 'h' + 105: 7, # 'i' + 106: 31, # 'j' + 107: 13, # 'k' + 108: 11, # 'l' + 109: 14, # 'm' + 110: 4, # 'n' + 111: 10, # 'o' + 112: 21, # 'p' + 113: 83, # 'q' + 114: 3, # 'r' + 115: 8, # 's' + 116: 5, # 't' + 117: 17, # 'u' + 118: 16, # 'v' + 119: 64, # 'w' + 120: 71, # 'x' + 121: 25, # 'y' + 122: 69, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 
251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # '\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 253, # '¡' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '€' + 165: 253, # '¥' + 166: 255, # 'Š' + 167: 253, # '§' + 168: 255, # 'š' + 169: 253, # '©' + 170: 255, # 'ª' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 253, # '¯' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 255, # 'Ž' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 255, # 'ž' + 185: 252, # '¹' + 186: 255, # 'º' + 187: 253, # '»' + 188: 255, # 'Œ' + 189: 255, # 'œ' + 190: 255, # 'Ÿ' + 191: 253, # '¿' + 192: 255, # 'À' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ã' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Æ' + 199: 255, # 'Ç' + 200: 255, # 'È' + 201: 255, # 'É' + 202: 255, # 'Ê' + 203: 255, # 'Ë' + 204: 255, # 'Ì' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ï' + 208: 255, # 'Ð' + 209: 255, # 'Ñ' + 210: 255, # 'Ò' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ø' + 217: 255, # 'Ù' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Þ' + 223: 255, # 'ß' + 224: 255, # 'à' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ã' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'æ' + 231: 255, # 'ç' + 232: 255, # 'è' + 233: 255, # 
'é' + 234: 255, # 'ê' + 235: 255, # 'ë' + 236: 255, # 'ì' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ï' + 240: 255, # 'ð' + 241: 255, # 'ñ' + 242: 255, # 'ò' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ø' + 249: 255, # 'ù' + 250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'þ' + 255: 255, # 'ÿ' } -ISO_8859_15_DANISH_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-15', - language='Danish', - char_to_order_map=ISO_8859_15_DANISH_CHAR_TO_ORDER, - language_model=DANISH_LANG_MODEL, - typical_positive_ratio=0.930092242592071, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÅÆØåæø') +ISO_8859_15_DANISH_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-15", + language="Danish", + char_to_order_map=ISO_8859_15_DANISH_CHAR_TO_ORDER, + language_model=DANISH_LANG_MODEL, + typical_positive_ratio=0.930092242592071, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÅÆØåæø", +) WINDOWS_1252_DANISH_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' 
- 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 39, # 'A' - 66: 43, # 'B' - 67: 54, # 'C' - 68: 34, # 'D' - 69: 49, # 'E' - 70: 45, # 'F' - 71: 60, # 'G' - 72: 42, # 'H' - 73: 44, # 'I' - 74: 62, # 'J' - 75: 46, # 'K' - 76: 59, # 'L' - 77: 41, # 'M' - 78: 57, # 'N' - 79: 66, # 'O' - 80: 53, # 'P' - 81: 92, # 'Q' - 82: 56, # 'R' - 83: 33, # 'S' - 84: 48, # 'T' - 85: 67, # 'U' - 86: 61, # 'V' - 87: 68, # 'W' - 88: 91, # 'X' - 89: 77, # 'Y' - 90: 80, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 6, # 'a' - 98: 18, # 'b' - 99: 30, # 'c' - 100: 9, # 'd' - 101: 2, # 'e' - 102: 15, # 'f' - 103: 12, # 'g' - 104: 19, # 'h' - 105: 7, # 'i' - 106: 31, # 'j' - 107: 13, # 'k' - 108: 11, # 'l' - 109: 14, # 'm' - 110: 4, # 'n' - 111: 10, # 'o' - 112: 21, # 'p' - 113: 83, # 'q' - 114: 3, # 'r' - 115: 8, # 's' - 116: 5, # 't' - 117: 17, # 'u' - 118: 16, # 'v' - 119: 64, # 'w' - 120: 71, # 'x' - 121: 25, # 'y' - 122: 69, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 253, # '€' - 129: 255, # None - 130: 253, # '‚' - 131: 255, # 'ƒ' - 132: 253, # '„' - 133: 253, # '…' - 134: 253, # '†' - 135: 253, # '‡' - 136: 255, # 'ˆ' - 137: 253, # '‰' - 138: 255, # 'Š' - 139: 253, # '‹' - 140: 255, # 'Œ' - 141: 255, # None - 142: 255, # 'Ž' - 143: 255, # None - 144: 255, # None - 145: 253, # '‘' - 146: 253, # '’' - 147: 253, # '“' - 148: 253, # '”' - 149: 253, # '•' - 150: 253, # '–' - 151: 253, # '—' - 152: 253, # '˜' - 153: 253, # '™' - 154: 255, # 'š' - 155: 253, # '›' - 156: 255, # 'œ' - 157: 255, # None - 158: 255, # 'ž' - 159: 255, # 'Ÿ' - 160: 251, # '\xa0' - 161: 253, # '¡' - 162: 253, # '¢' - 163: 253, 
# '£' - 164: 253, # '¤' - 165: 253, # '¥' - 166: 253, # '¦' - 167: 253, # '§' - 168: 253, # '¨' - 169: 253, # '©' - 170: 255, # 'ª' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 253, # '¯' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 252, # '³' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 253, # '¸' - 185: 252, # '¹' - 186: 255, # 'º' - 187: 253, # '»' - 188: 252, # '¼' - 189: 252, # '½' - 190: 252, # '¾' - 191: 253, # '¿' - 192: 255, # 'À' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ã' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Æ' - 199: 255, # 'Ç' - 200: 255, # 'È' - 201: 255, # 'É' - 202: 255, # 'Ê' - 203: 255, # 'Ë' - 204: 255, # 'Ì' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ï' - 208: 255, # 'Ð' - 209: 255, # 'Ñ' - 210: 255, # 'Ò' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ø' - 217: 255, # 'Ù' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Þ' - 223: 255, # 'ß' - 224: 255, # 'à' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ã' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'æ' - 231: 255, # 'ç' - 232: 255, # 'è' - 233: 255, # 'é' - 234: 255, # 'ê' - 235: 255, # 'ë' - 236: 255, # 'ì' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ï' - 240: 255, # 'ð' - 241: 255, # 'ñ' - 242: 255, # 'ò' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ø' - 249: 255, # 'ù' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'þ' - 255: 255, # 'ÿ' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, 
# '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 39, # 'A' + 66: 43, # 'B' + 67: 54, # 'C' + 68: 34, # 'D' + 69: 49, # 'E' + 70: 45, # 'F' + 71: 60, # 'G' + 72: 42, # 'H' + 73: 44, # 'I' + 74: 62, # 'J' + 75: 46, # 'K' + 76: 59, # 'L' + 77: 41, # 'M' + 78: 57, # 'N' + 79: 66, # 'O' + 80: 53, # 'P' + 81: 92, # 'Q' + 82: 56, # 'R' + 83: 33, # 'S' + 84: 48, # 'T' + 85: 67, # 'U' + 86: 61, # 'V' + 87: 68, # 'W' + 88: 91, # 'X' + 89: 77, # 'Y' + 90: 80, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 6, # 'a' + 98: 18, # 'b' + 99: 30, # 'c' + 100: 9, # 'd' + 101: 2, # 'e' + 102: 15, # 'f' + 103: 12, # 'g' + 104: 19, # 'h' + 105: 7, # 'i' + 106: 31, # 'j' + 107: 13, # 'k' + 108: 11, # 'l' + 109: 14, # 'm' + 110: 4, # 'n' + 111: 10, # 'o' + 112: 21, # 'p' + 113: 83, # 'q' + 114: 3, # 'r' + 115: 8, # 's' + 116: 5, # 't' + 117: 17, # 'u' + 118: 16, # 'v' + 119: 64, # 'w' + 120: 71, # 'x' + 121: 25, # 'y' + 122: 69, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 253, # '€' + 129: 255, # None + 130: 253, # '‚' + 131: 255, # 'ƒ' + 132: 253, # '„' + 133: 253, # 
'…' + 134: 253, # '†' + 135: 253, # '‡' + 136: 255, # 'ˆ' + 137: 253, # '‰' + 138: 255, # 'Š' + 139: 253, # '‹' + 140: 255, # 'Œ' + 141: 255, # None + 142: 255, # 'Ž' + 143: 255, # None + 144: 255, # None + 145: 253, # '‘' + 146: 253, # '’' + 147: 253, # '“' + 148: 253, # '”' + 149: 253, # '•' + 150: 253, # '–' + 151: 253, # '—' + 152: 253, # '˜' + 153: 253, # '™' + 154: 255, # 'š' + 155: 253, # '›' + 156: 255, # 'œ' + 157: 255, # None + 158: 255, # 'ž' + 159: 255, # 'Ÿ' + 160: 251, # '\xa0' + 161: 253, # '¡' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '¤' + 165: 253, # '¥' + 166: 253, # '¦' + 167: 253, # '§' + 168: 253, # '¨' + 169: 253, # '©' + 170: 255, # 'ª' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 253, # '¯' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 253, # '¸' + 185: 252, # '¹' + 186: 255, # 'º' + 187: 253, # '»' + 188: 252, # '¼' + 189: 252, # '½' + 190: 252, # '¾' + 191: 253, # '¿' + 192: 255, # 'À' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ã' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Æ' + 199: 255, # 'Ç' + 200: 255, # 'È' + 201: 255, # 'É' + 202: 255, # 'Ê' + 203: 255, # 'Ë' + 204: 255, # 'Ì' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ï' + 208: 255, # 'Ð' + 209: 255, # 'Ñ' + 210: 255, # 'Ò' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ø' + 217: 255, # 'Ù' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Þ' + 223: 255, # 'ß' + 224: 255, # 'à' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ã' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'æ' + 231: 255, # 'ç' + 232: 255, # 'è' + 233: 255, # 'é' + 234: 255, # 'ê' + 235: 255, # 'ë' + 236: 255, # 'ì' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ï' + 240: 255, # 'ð' + 241: 255, # 'ñ' + 242: 255, # 'ò' + 243: 255, # 'ó' + 
244: 255, # 'ô' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ø' + 249: 255, # 'ù' + 250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'þ' + 255: 255, # 'ÿ' } -WINDOWS_1252_DANISH_MODEL = SingleByteCharSetModel(charset_name='WINDOWS-1252', - language='Danish', - char_to_order_map=WINDOWS_1252_DANISH_CHAR_TO_ORDER, - language_model=DANISH_LANG_MODEL, - typical_positive_ratio=0.9313804502159864, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÅÆØåæø') - +WINDOWS_1252_DANISH_MODEL = SingleByteCharSetModel( + charset_name="WINDOWS-1252", + language="Danish", + char_to_order_map=WINDOWS_1252_DANISH_CHAR_TO_ORDER, + language_model=DANISH_LANG_MODEL, + typical_positive_ratio=0.9313804502159864, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÅÆØåæø", +) diff --git a/chardet/langdutchmodel.py b/chardet/langdutchmodel.py index dd547117..9fc28d0c 100644 --- a/chardet/langdutchmodel.py +++ b/chardet/langdutchmodel.py @@ -1,9 +1,7 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -8028,536 +8026,539 @@ # Character Mapping Table(s): ISO_8859_1_DUTCH_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' 
- 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 36, # 'A' - 66: 38, # 'B' - 67: 45, # 'C' - 68: 33, # 'D' - 69: 49, # 'E' - 70: 60, # 'F' - 71: 58, # 'G' - 72: 39, # 'H' - 73: 42, # 'I' - 74: 62, # 'J' - 75: 59, # 'K' - 76: 57, # 'L' - 77: 44, # 'M' - 78: 41, # 'N' - 79: 56, # 'O' - 80: 50, # 'P' - 81: 89, # 'Q' - 82: 54, # 'R' - 83: 37, # 'S' - 84: 55, # 'T' - 85: 70, # 'U' - 86: 53, # 'V' - 87: 61, # 'W' - 88: 87, # 'X' - 89: 77, # 'Y' - 90: 65, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 4, # 'a' - 98: 20, # 'b' - 99: 18, # 'c' - 100: 9, # 'd' - 101: 2, # 'e' - 102: 28, # 'f' - 103: 12, # 'g' - 104: 14, # 'h' - 105: 5, # 'i' - 106: 22, # 'j' - 107: 17, # 'k' - 108: 11, # 'l' - 109: 15, # 'm' - 110: 3, # 'n' - 111: 8, # 'o' - 112: 19, # 'p' - 113: 74, # 'q' - 114: 6, # 'r' - 115: 10, # 's' - 116: 7, # 't' - 117: 16, # 'u' - 118: 13, # 'v' - 119: 21, # 'w' - 120: 66, # 'x' - 121: 43, # 'y' - 122: 26, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, # '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 
251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' - 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 253, # '¡' - 162: 253, # '¢' - 163: 253, # '£' - 164: 253, # '¤' - 165: 253, # '¥' - 166: 253, # '¦' - 167: 253, # '§' - 168: 253, # '¨' - 169: 253, # '©' - 170: 255, # 'ª' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 253, # '¯' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 252, # '³' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 253, # '¸' - 185: 252, # '¹' - 186: 255, # 'º' - 187: 253, # '»' - 188: 252, # '¼' - 189: 252, # '½' - 190: 252, # '¾' - 191: 253, # '¿' - 192: 255, # 'À' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ã' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Æ' - 199: 255, # 'Ç' - 200: 255, # 'È' - 201: 255, # 'É' - 202: 255, # 'Ê' - 203: 255, # 'Ë' - 204: 255, # 'Ì' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ï' - 208: 255, # 'Ð' - 209: 255, # 'Ñ' - 210: 255, # 'Ò' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ø' - 217: 255, # 'Ù' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Þ' - 223: 255, # 'ß' - 224: 255, # 'à' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ã' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'æ' - 231: 255, # 'ç' - 232: 255, # 'è' - 233: 255, # 'é' - 234: 255, # 'ê' - 235: 255, # 'ë' - 236: 255, # 'ì' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ï' - 240: 255, # 'ð' - 241: 255, # 'ñ' - 242: 255, # 'ò' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ø' - 249: 255, # 'ù' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'þ' - 255: 255, # 'ÿ' + 0: 251, # '\x00' + 
1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' 
+ 64: 253, # '@' + 65: 36, # 'A' + 66: 38, # 'B' + 67: 45, # 'C' + 68: 33, # 'D' + 69: 49, # 'E' + 70: 60, # 'F' + 71: 58, # 'G' + 72: 39, # 'H' + 73: 42, # 'I' + 74: 62, # 'J' + 75: 59, # 'K' + 76: 57, # 'L' + 77: 44, # 'M' + 78: 41, # 'N' + 79: 56, # 'O' + 80: 50, # 'P' + 81: 89, # 'Q' + 82: 54, # 'R' + 83: 37, # 'S' + 84: 55, # 'T' + 85: 70, # 'U' + 86: 53, # 'V' + 87: 61, # 'W' + 88: 87, # 'X' + 89: 77, # 'Y' + 90: 65, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 4, # 'a' + 98: 20, # 'b' + 99: 18, # 'c' + 100: 9, # 'd' + 101: 2, # 'e' + 102: 28, # 'f' + 103: 12, # 'g' + 104: 14, # 'h' + 105: 5, # 'i' + 106: 22, # 'j' + 107: 17, # 'k' + 108: 11, # 'l' + 109: 15, # 'm' + 110: 3, # 'n' + 111: 8, # 'o' + 112: 19, # 'p' + 113: 74, # 'q' + 114: 6, # 'r' + 115: 10, # 's' + 116: 7, # 't' + 117: 16, # 'u' + 118: 13, # 'v' + 119: 21, # 'w' + 120: 66, # 'x' + 121: 43, # 'y' + 122: 26, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # '\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 253, # '¡' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '¤' + 165: 253, # '¥' + 166: 253, # '¦' + 167: 253, # '§' + 168: 253, # '¨' + 169: 253, # '©' + 170: 255, # 'ª' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 
253, # '®' + 175: 253, # '¯' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 253, # '¸' + 185: 252, # '¹' + 186: 255, # 'º' + 187: 253, # '»' + 188: 252, # '¼' + 189: 252, # '½' + 190: 252, # '¾' + 191: 253, # '¿' + 192: 255, # 'À' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ã' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Æ' + 199: 255, # 'Ç' + 200: 255, # 'È' + 201: 255, # 'É' + 202: 255, # 'Ê' + 203: 255, # 'Ë' + 204: 255, # 'Ì' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ï' + 208: 255, # 'Ð' + 209: 255, # 'Ñ' + 210: 255, # 'Ò' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ø' + 217: 255, # 'Ù' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Þ' + 223: 255, # 'ß' + 224: 255, # 'à' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ã' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'æ' + 231: 255, # 'ç' + 232: 255, # 'è' + 233: 255, # 'é' + 234: 255, # 'ê' + 235: 255, # 'ë' + 236: 255, # 'ì' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ï' + 240: 255, # 'ð' + 241: 255, # 'ñ' + 242: 255, # 'ò' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ø' + 249: 255, # 'ù' + 250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'þ' + 255: 255, # 'ÿ' } -ISO_8859_1_DUTCH_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-1', - language='Dutch', - char_to_order_map=ISO_8859_1_DUTCH_CHAR_TO_ORDER, - language_model=DUTCH_LANG_MODEL, - typical_positive_ratio=0.9437677577008142, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz') +ISO_8859_1_DUTCH_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-1", + language="Dutch", + char_to_order_map=ISO_8859_1_DUTCH_CHAR_TO_ORDER, + language_model=DUTCH_LANG_MODEL, + 
typical_positive_ratio=0.9437677577008142, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", +) WINDOWS_1252_DUTCH_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' 
- 64: 253, # '@' - 65: 36, # 'A' - 66: 38, # 'B' - 67: 45, # 'C' - 68: 33, # 'D' - 69: 49, # 'E' - 70: 60, # 'F' - 71: 58, # 'G' - 72: 39, # 'H' - 73: 42, # 'I' - 74: 62, # 'J' - 75: 59, # 'K' - 76: 57, # 'L' - 77: 44, # 'M' - 78: 41, # 'N' - 79: 56, # 'O' - 80: 50, # 'P' - 81: 89, # 'Q' - 82: 54, # 'R' - 83: 37, # 'S' - 84: 55, # 'T' - 85: 70, # 'U' - 86: 53, # 'V' - 87: 61, # 'W' - 88: 87, # 'X' - 89: 77, # 'Y' - 90: 65, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 4, # 'a' - 98: 20, # 'b' - 99: 18, # 'c' - 100: 9, # 'd' - 101: 2, # 'e' - 102: 28, # 'f' - 103: 12, # 'g' - 104: 14, # 'h' - 105: 5, # 'i' - 106: 22, # 'j' - 107: 17, # 'k' - 108: 11, # 'l' - 109: 15, # 'm' - 110: 3, # 'n' - 111: 8, # 'o' - 112: 19, # 'p' - 113: 74, # 'q' - 114: 6, # 'r' - 115: 10, # 's' - 116: 7, # 't' - 117: 16, # 'u' - 118: 13, # 'v' - 119: 21, # 'w' - 120: 66, # 'x' - 121: 43, # 'y' - 122: 26, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 253, # '€' - 129: 255, # None - 130: 253, # '‚' - 131: 255, # 'ƒ' - 132: 253, # '„' - 133: 253, # '…' - 134: 253, # '†' - 135: 253, # '‡' - 136: 255, # 'ˆ' - 137: 253, # '‰' - 138: 255, # 'Š' - 139: 253, # '‹' - 140: 255, # 'Œ' - 141: 255, # None - 142: 255, # 'Ž' - 143: 255, # None - 144: 255, # None - 145: 253, # '‘' - 146: 253, # '’' - 147: 253, # '“' - 148: 253, # '”' - 149: 253, # '•' - 150: 253, # '–' - 151: 253, # '—' - 152: 253, # '˜' - 153: 253, # '™' - 154: 255, # 'š' - 155: 253, # '›' - 156: 255, # 'œ' - 157: 255, # None - 158: 255, # 'ž' - 159: 255, # 'Ÿ' - 160: 251, # '\xa0' - 161: 253, # '¡' - 162: 253, # '¢' - 163: 253, # '£' - 164: 253, # '¤' - 165: 253, # '¥' - 166: 253, # '¦' - 167: 253, # '§' - 168: 253, # '¨' - 169: 253, # '©' - 170: 255, # 'ª' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 253, # '¯' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 
252, # '³' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 253, # '¸' - 185: 252, # '¹' - 186: 255, # 'º' - 187: 253, # '»' - 188: 252, # '¼' - 189: 252, # '½' - 190: 252, # '¾' - 191: 253, # '¿' - 192: 255, # 'À' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ã' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Æ' - 199: 255, # 'Ç' - 200: 255, # 'È' - 201: 255, # 'É' - 202: 255, # 'Ê' - 203: 255, # 'Ë' - 204: 255, # 'Ì' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ï' - 208: 255, # 'Ð' - 209: 255, # 'Ñ' - 210: 255, # 'Ò' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ø' - 217: 255, # 'Ù' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Þ' - 223: 255, # 'ß' - 224: 255, # 'à' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ã' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'æ' - 231: 255, # 'ç' - 232: 255, # 'è' - 233: 255, # 'é' - 234: 255, # 'ê' - 235: 255, # 'ë' - 236: 255, # 'ì' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ï' - 240: 255, # 'ð' - 241: 255, # 'ñ' - 242: 255, # 'ò' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ø' - 249: 255, # 'ù' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'þ' - 255: 255, # 'ÿ' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # 
'\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 36, # 'A' + 66: 38, # 'B' + 67: 45, # 'C' + 68: 33, # 'D' + 69: 49, # 'E' + 70: 60, # 'F' + 71: 58, # 'G' + 72: 39, # 'H' + 73: 42, # 'I' + 74: 62, # 'J' + 75: 59, # 'K' + 76: 57, # 'L' + 77: 44, # 'M' + 78: 41, # 'N' + 79: 56, # 'O' + 80: 50, # 'P' + 81: 89, # 'Q' + 82: 54, # 'R' + 83: 37, # 'S' + 84: 55, # 'T' + 85: 70, # 'U' + 86: 53, # 'V' + 87: 61, # 'W' + 88: 87, # 'X' + 89: 77, # 'Y' + 90: 65, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 4, # 'a' + 98: 20, # 'b' + 99: 18, # 'c' + 100: 9, # 'd' + 101: 2, # 'e' + 102: 28, # 'f' + 103: 12, # 'g' + 104: 14, # 'h' + 105: 5, # 'i' + 106: 22, # 'j' + 107: 17, # 'k' + 108: 11, # 'l' + 109: 15, # 'm' + 110: 3, # 'n' + 111: 8, # 'o' + 112: 19, # 'p' + 113: 74, # 'q' + 114: 6, # 'r' + 115: 10, # 's' + 116: 7, # 't' + 117: 16, # 'u' + 118: 13, # 'v' + 119: 21, # 'w' + 120: 66, # 'x' + 121: 43, # 'y' + 122: 26, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 253, # '€' + 129: 255, # None + 130: 253, # '‚' + 131: 255, # 'ƒ' + 132: 253, # '„' + 133: 253, # '…' + 134: 253, # '†' + 135: 253, # '‡' + 136: 255, # 'ˆ' + 137: 253, # '‰' + 138: 255, # 'Š' + 139: 253, # '‹' + 140: 255, # 'Œ' + 141: 255, # None + 142: 255, # 'Ž' + 143: 255, # None + 144: 255, # None + 145: 253, # '‘' + 146: 253, # '’' + 147: 253, # '“' + 148: 253, # '”' + 149: 
253, # '•' + 150: 253, # '–' + 151: 253, # '—' + 152: 253, # '˜' + 153: 253, # '™' + 154: 255, # 'š' + 155: 253, # '›' + 156: 255, # 'œ' + 157: 255, # None + 158: 255, # 'ž' + 159: 255, # 'Ÿ' + 160: 251, # '\xa0' + 161: 253, # '¡' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '¤' + 165: 253, # '¥' + 166: 253, # '¦' + 167: 253, # '§' + 168: 253, # '¨' + 169: 253, # '©' + 170: 255, # 'ª' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 253, # '¯' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 253, # '¸' + 185: 252, # '¹' + 186: 255, # 'º' + 187: 253, # '»' + 188: 252, # '¼' + 189: 252, # '½' + 190: 252, # '¾' + 191: 253, # '¿' + 192: 255, # 'À' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ã' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Æ' + 199: 255, # 'Ç' + 200: 255, # 'È' + 201: 255, # 'É' + 202: 255, # 'Ê' + 203: 255, # 'Ë' + 204: 255, # 'Ì' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ï' + 208: 255, # 'Ð' + 209: 255, # 'Ñ' + 210: 255, # 'Ò' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ø' + 217: 255, # 'Ù' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Þ' + 223: 255, # 'ß' + 224: 255, # 'à' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ã' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'æ' + 231: 255, # 'ç' + 232: 255, # 'è' + 233: 255, # 'é' + 234: 255, # 'ê' + 235: 255, # 'ë' + 236: 255, # 'ì' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ï' + 240: 255, # 'ð' + 241: 255, # 'ñ' + 242: 255, # 'ò' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ø' + 249: 255, # 'ù' + 250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'þ' + 255: 255, # 'ÿ' } -WINDOWS_1252_DUTCH_MODEL = 
SingleByteCharSetModel(charset_name='WINDOWS-1252', - language='Dutch', - char_to_order_map=WINDOWS_1252_DUTCH_CHAR_TO_ORDER, - language_model=DUTCH_LANG_MODEL, - typical_positive_ratio=0.9441397978316203, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz') - +WINDOWS_1252_DUTCH_MODEL = SingleByteCharSetModel( + charset_name="WINDOWS-1252", + language="Dutch", + char_to_order_map=WINDOWS_1252_DUTCH_CHAR_TO_ORDER, + language_model=DUTCH_LANG_MODEL, + typical_positive_ratio=0.9441397978316203, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", +) diff --git a/chardet/langenglishmodel.py b/chardet/langenglishmodel.py index 29fecc00..60e599fa 100644 --- a/chardet/langenglishmodel.py +++ b/chardet/langenglishmodel.py @@ -1,9 +1,7 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -6180,536 +6178,539 @@ # Character Mapping Table(s): ISO_8859_1_ENGLISH_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' 
- 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 30, # 'A' - 66: 37, # 'B' - 67: 31, # 'C' - 68: 46, # 'D' - 69: 52, # 'E' - 70: 48, # 'F' - 71: 49, # 'G' - 72: 43, # 'H' - 73: 36, # 'I' - 74: 61, # 'J' - 75: 66, # 'K' - 76: 51, # 'L' - 77: 35, # 'M' - 78: 50, # 'N' - 79: 60, # 'O' - 80: 38, # 'P' - 81: 76, # 'Q' - 82: 42, # 'R' - 83: 28, # 'S' - 84: 32, # 'T' - 85: 63, # 'U' - 86: 69, # 'V' - 87: 53, # 'W' - 88: 78, # 'X' - 89: 71, # 'Y' - 90: 74, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 3, # 'a' - 98: 22, # 'b' - 99: 13, # 'c' - 100: 12, # 'd' - 101: 2, # 'e' - 102: 16, # 'f' - 103: 18, # 'g' - 104: 10, # 'h' - 105: 5, # 'i' - 106: 65, # 'j' - 107: 27, # 'k' - 108: 11, # 'l' - 109: 15, # 'm' - 110: 6, # 'n' - 111: 7, # 'o' - 112: 17, # 'p' - 113: 68, # 'q' - 114: 8, # 'r' - 115: 9, # 's' - 116: 4, # 't' - 117: 14, # 'u' - 118: 24, # 'v' - 119: 20, # 'w' - 120: 45, # 'x' - 121: 19, # 'y' - 122: 62, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, # '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' 
- 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 253, # '¡' - 162: 253, # '¢' - 163: 253, # '£' - 164: 253, # '¤' - 165: 253, # '¥' - 166: 253, # '¦' - 167: 253, # '§' - 168: 253, # '¨' - 169: 253, # '©' - 170: 255, # 'ª' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 253, # '¯' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 252, # '³' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 253, # '¸' - 185: 252, # '¹' - 186: 255, # 'º' - 187: 253, # '»' - 188: 252, # '¼' - 189: 252, # '½' - 190: 252, # '¾' - 191: 253, # '¿' - 192: 255, # 'À' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ã' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Æ' - 199: 255, # 'Ç' - 200: 255, # 'È' - 201: 255, # 'É' - 202: 255, # 'Ê' - 203: 255, # 'Ë' - 204: 255, # 'Ì' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ï' - 208: 255, # 'Ð' - 209: 255, # 'Ñ' - 210: 255, # 'Ò' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ø' - 217: 255, # 'Ù' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Þ' - 223: 255, # 'ß' - 224: 255, # 'à' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ã' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'æ' - 231: 255, # 'ç' - 232: 255, # 'è' - 233: 255, # 'é' - 234: 255, # 'ê' - 235: 255, # 'ë' - 236: 255, # 'ì' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ï' - 240: 255, # 'ð' - 241: 255, # 'ñ' - 242: 255, # 'ò' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ø' - 249: 255, # 'ù' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'þ' - 255: 255, # 'ÿ' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # 
'\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 30, # 'A' + 66: 37, # 'B' + 67: 31, # 'C' + 68: 46, # 'D' + 69: 52, # 'E' + 70: 48, # 'F' + 71: 49, # 'G' + 72: 43, # 'H' + 73: 36, # 'I' + 74: 61, # 'J' + 75: 66, # 'K' + 76: 51, # 'L' + 77: 35, # 'M' + 78: 50, # 'N' + 79: 60, # 'O' + 80: 38, # 'P' + 81: 76, # 'Q' + 82: 42, # 'R' + 83: 28, # 'S' + 84: 32, # 'T' + 85: 63, # 'U' + 86: 69, # 'V' + 87: 53, # 'W' + 88: 78, # 'X' + 89: 71, # 'Y' + 90: 74, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 3, # 'a' + 98: 22, # 'b' + 99: 13, # 'c' + 100: 12, # 'd' + 101: 2, # 'e' + 102: 16, # 'f' + 103: 18, # 'g' + 104: 10, # 'h' + 105: 5, # 'i' + 106: 65, # 'j' + 107: 27, # 'k' + 108: 11, # 'l' + 109: 15, # 'm' + 110: 6, # 'n' + 111: 7, # 'o' + 112: 17, # 'p' + 113: 68, # 'q' + 114: 8, # 'r' + 115: 9, # 's' + 116: 4, # 't' + 117: 14, # 'u' + 118: 24, # 'v' + 119: 20, # 'w' + 120: 45, # 'x' + 121: 19, # 'y' + 122: 62, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 
251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # '\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 253, # '¡' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '¤' + 165: 253, # '¥' + 166: 253, # '¦' + 167: 253, # '§' + 168: 253, # '¨' + 169: 253, # '©' + 170: 255, # 'ª' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 253, # '¯' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 253, # '¸' + 185: 252, # '¹' + 186: 255, # 'º' + 187: 253, # '»' + 188: 252, # '¼' + 189: 252, # '½' + 190: 252, # '¾' + 191: 253, # '¿' + 192: 255, # 'À' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ã' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Æ' + 199: 255, # 'Ç' + 200: 255, # 'È' + 201: 255, # 'É' + 202: 255, # 'Ê' + 203: 255, # 'Ë' + 204: 255, # 'Ì' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ï' + 208: 255, # 'Ð' + 209: 255, # 'Ñ' + 210: 255, # 'Ò' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ø' + 217: 255, # 'Ù' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Þ' + 223: 255, # 'ß' + 224: 255, # 'à' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ã' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'æ' + 231: 255, # 'ç' + 232: 255, # 'è' + 233: 255, # 
'é' + 234: 255, # 'ê' + 235: 255, # 'ë' + 236: 255, # 'ì' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ï' + 240: 255, # 'ð' + 241: 255, # 'ñ' + 242: 255, # 'ò' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ø' + 249: 255, # 'ù' + 250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'þ' + 255: 255, # 'ÿ' } -ISO_8859_1_ENGLISH_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-1', - language='English', - char_to_order_map=ISO_8859_1_ENGLISH_CHAR_TO_ORDER, - language_model=ENGLISH_LANG_MODEL, - typical_positive_ratio=0.9450843068716352, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz') +ISO_8859_1_ENGLISH_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-1", + language="English", + char_to_order_map=ISO_8859_1_ENGLISH_CHAR_TO_ORDER, + language_model=ENGLISH_LANG_MODEL, + typical_positive_ratio=0.9450843068716352, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", +) WINDOWS_1252_ENGLISH_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' 
- 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 30, # 'A' - 66: 37, # 'B' - 67: 31, # 'C' - 68: 46, # 'D' - 69: 52, # 'E' - 70: 48, # 'F' - 71: 49, # 'G' - 72: 43, # 'H' - 73: 36, # 'I' - 74: 61, # 'J' - 75: 66, # 'K' - 76: 51, # 'L' - 77: 35, # 'M' - 78: 50, # 'N' - 79: 60, # 'O' - 80: 38, # 'P' - 81: 76, # 'Q' - 82: 42, # 'R' - 83: 28, # 'S' - 84: 32, # 'T' - 85: 63, # 'U' - 86: 69, # 'V' - 87: 53, # 'W' - 88: 78, # 'X' - 89: 71, # 'Y' - 90: 74, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 3, # 'a' - 98: 22, # 'b' - 99: 13, # 'c' - 100: 12, # 'd' - 101: 2, # 'e' - 102: 16, # 'f' - 103: 18, # 'g' - 104: 10, # 'h' - 105: 5, # 'i' - 106: 65, # 'j' - 107: 27, # 'k' - 108: 11, # 'l' - 109: 15, # 'm' - 110: 6, # 'n' - 111: 7, # 'o' - 112: 17, # 'p' - 113: 68, # 'q' - 114: 8, # 'r' - 115: 9, # 's' - 116: 4, # 't' - 117: 14, # 'u' - 118: 24, # 'v' - 119: 20, # 'w' - 120: 45, # 'x' - 121: 19, # 'y' - 122: 62, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 253, # '€' - 129: 255, # None - 130: 253, # '‚' - 131: 255, # 'ƒ' - 132: 253, # '„' - 133: 253, # '…' - 134: 253, # '†' - 135: 253, # '‡' - 136: 255, # 'ˆ' - 137: 253, # '‰' - 138: 255, # 'Š' - 139: 253, # '‹' - 140: 255, # 'Œ' - 141: 255, # None - 142: 255, # 'Ž' - 143: 255, # None - 144: 255, # None - 145: 253, # '‘' - 146: 253, # '’' - 147: 253, # '“' - 148: 253, # '”' - 149: 253, # '•' - 150: 253, # '–' - 151: 253, # '—' - 152: 253, # '˜' - 153: 253, # '™' - 154: 255, # 'š' - 155: 253, # '›' - 156: 255, # 'œ' - 157: 255, # None - 158: 255, # 'ž' - 159: 255, # 'Ÿ' - 160: 251, # '\xa0' - 161: 253, # '¡' - 162: 253, # '¢' - 163: 253, 
# '£' - 164: 253, # '¤' - 165: 253, # '¥' - 166: 253, # '¦' - 167: 253, # '§' - 168: 253, # '¨' - 169: 253, # '©' - 170: 255, # 'ª' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 253, # '¯' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 252, # '³' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 253, # '¸' - 185: 252, # '¹' - 186: 255, # 'º' - 187: 253, # '»' - 188: 252, # '¼' - 189: 252, # '½' - 190: 252, # '¾' - 191: 253, # '¿' - 192: 255, # 'À' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ã' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Æ' - 199: 255, # 'Ç' - 200: 255, # 'È' - 201: 255, # 'É' - 202: 255, # 'Ê' - 203: 255, # 'Ë' - 204: 255, # 'Ì' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ï' - 208: 255, # 'Ð' - 209: 255, # 'Ñ' - 210: 255, # 'Ò' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ø' - 217: 255, # 'Ù' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Þ' - 223: 255, # 'ß' - 224: 255, # 'à' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ã' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'æ' - 231: 255, # 'ç' - 232: 255, # 'è' - 233: 255, # 'é' - 234: 255, # 'ê' - 235: 255, # 'ë' - 236: 255, # 'ì' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ï' - 240: 255, # 'ð' - 241: 255, # 'ñ' - 242: 255, # 'ò' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ø' - 249: 255, # 'ù' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'þ' - 255: 255, # 'ÿ' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, 
# '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 30, # 'A' + 66: 37, # 'B' + 67: 31, # 'C' + 68: 46, # 'D' + 69: 52, # 'E' + 70: 48, # 'F' + 71: 49, # 'G' + 72: 43, # 'H' + 73: 36, # 'I' + 74: 61, # 'J' + 75: 66, # 'K' + 76: 51, # 'L' + 77: 35, # 'M' + 78: 50, # 'N' + 79: 60, # 'O' + 80: 38, # 'P' + 81: 76, # 'Q' + 82: 42, # 'R' + 83: 28, # 'S' + 84: 32, # 'T' + 85: 63, # 'U' + 86: 69, # 'V' + 87: 53, # 'W' + 88: 78, # 'X' + 89: 71, # 'Y' + 90: 74, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 3, # 'a' + 98: 22, # 'b' + 99: 13, # 'c' + 100: 12, # 'd' + 101: 2, # 'e' + 102: 16, # 'f' + 103: 18, # 'g' + 104: 10, # 'h' + 105: 5, # 'i' + 106: 65, # 'j' + 107: 27, # 'k' + 108: 11, # 'l' + 109: 15, # 'm' + 110: 6, # 'n' + 111: 7, # 'o' + 112: 17, # 'p' + 113: 68, # 'q' + 114: 8, # 'r' + 115: 9, # 's' + 116: 4, # 't' + 117: 14, # 'u' + 118: 24, # 'v' + 119: 20, # 'w' + 120: 45, # 'x' + 121: 19, # 'y' + 122: 62, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 253, # '€' + 129: 255, # None + 130: 253, # '‚' + 131: 255, # 'ƒ' + 132: 253, # '„' + 133: 253, # 
'…' + 134: 253, # '†' + 135: 253, # '‡' + 136: 255, # 'ˆ' + 137: 253, # '‰' + 138: 255, # 'Š' + 139: 253, # '‹' + 140: 255, # 'Œ' + 141: 255, # None + 142: 255, # 'Ž' + 143: 255, # None + 144: 255, # None + 145: 253, # '‘' + 146: 253, # '’' + 147: 253, # '“' + 148: 253, # '”' + 149: 253, # '•' + 150: 253, # '–' + 151: 253, # '—' + 152: 253, # '˜' + 153: 253, # '™' + 154: 255, # 'š' + 155: 253, # '›' + 156: 255, # 'œ' + 157: 255, # None + 158: 255, # 'ž' + 159: 255, # 'Ÿ' + 160: 251, # '\xa0' + 161: 253, # '¡' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '¤' + 165: 253, # '¥' + 166: 253, # '¦' + 167: 253, # '§' + 168: 253, # '¨' + 169: 253, # '©' + 170: 255, # 'ª' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 253, # '¯' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 253, # '¸' + 185: 252, # '¹' + 186: 255, # 'º' + 187: 253, # '»' + 188: 252, # '¼' + 189: 252, # '½' + 190: 252, # '¾' + 191: 253, # '¿' + 192: 255, # 'À' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ã' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Æ' + 199: 255, # 'Ç' + 200: 255, # 'È' + 201: 255, # 'É' + 202: 255, # 'Ê' + 203: 255, # 'Ë' + 204: 255, # 'Ì' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ï' + 208: 255, # 'Ð' + 209: 255, # 'Ñ' + 210: 255, # 'Ò' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ø' + 217: 255, # 'Ù' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Þ' + 223: 255, # 'ß' + 224: 255, # 'à' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ã' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'æ' + 231: 255, # 'ç' + 232: 255, # 'è' + 233: 255, # 'é' + 234: 255, # 'ê' + 235: 255, # 'ë' + 236: 255, # 'ì' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ï' + 240: 255, # 'ð' + 241: 255, # 'ñ' + 242: 255, # 'ò' + 243: 255, # 'ó' + 
244: 255, # 'ô' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ø' + 249: 255, # 'ù' + 250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'þ' + 255: 255, # 'ÿ' } -WINDOWS_1252_ENGLISH_MODEL = SingleByteCharSetModel(charset_name='WINDOWS-1252', - language='English', - char_to_order_map=WINDOWS_1252_ENGLISH_CHAR_TO_ORDER, - language_model=ENGLISH_LANG_MODEL, - typical_positive_ratio=0.9461281735386932, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz') - +WINDOWS_1252_ENGLISH_MODEL = SingleByteCharSetModel( + charset_name="WINDOWS-1252", + language="English", + char_to_order_map=WINDOWS_1252_ENGLISH_CHAR_TO_ORDER, + language_model=ENGLISH_LANG_MODEL, + typical_positive_ratio=0.9461281735386932, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", +) diff --git a/chardet/langesperantomodel.py b/chardet/langesperantomodel.py index df9cd82f..9969f05e 100644 --- a/chardet/langesperantomodel.py +++ b/chardet/langesperantomodel.py @@ -1,9 +1,7 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -34428,269 +34426,270 @@ # Character Mapping Table(s): ISO_8859_3_ESPERANTO_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' 
- 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, 
# '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' - 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 255, # 'Ħ' - 162: 253, # '˘' - 163: 253, # '£' - 164: 253, # '¤' - 165: 255, # None - 166: 255, # 'Ĥ' - 167: 253, # '§' - 168: 253, # '¨' - 169: 255, # 'İ' - 170: 255, # 'Ş' - 171: 255, # 'Ğ' - 172: 255, # 'Ĵ' - 173: 251, # '\xad' - 174: 255, # None - 175: 255, # 'Ż' - 176: 253, # '°' - 177: 255, # 'ħ' - 178: 252, # '²' - 179: 252, # '³' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 255, # 'ĥ' - 183: 253, # '·' - 184: 253, # '¸' - 185: 255, # 'ı' - 186: 255, # 'ş' - 187: 255, # 'ğ' - 188: 255, # 'ĵ' - 189: 252, # '½' - 190: 255, # None - 191: 255, # 'ż' - 192: 255, # 'À' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # None - 196: 255, # 'Ä' - 197: 255, # 'Ċ' - 198: 255, # 'Ĉ' - 199: 255, # 'Ç' - 200: 255, # 'È' - 201: 255, # 'É' - 202: 255, # 'Ê' - 203: 255, # 'Ë' - 204: 255, # 'Ì' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ï' - 208: 255, # None - 209: 255, # 'Ñ' - 210: 255, # 'Ò' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Ġ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ĝ' - 217: 255, # 'Ù' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'Ŭ' - 222: 255, # 'Ŝ' - 223: 255, # 'ß' - 224: 255, # 'à' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # None - 228: 255, # 'ä' - 229: 255, # 'ċ' - 230: 255, # 'ĉ' - 231: 255, # 'ç' - 232: 255, # 'è' - 233: 255, # 'é' - 234: 255, # 'ê' - 235: 255, # 'ë' - 236: 255, # 'ì' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ï' - 240: 255, # None - 241: 255, # 'ñ' - 242: 255, # 'ò' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'ġ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ĝ' - 249: 255, # 'ù' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 
255, # 'ŭ' - 254: 255, # 'ŝ' - 255: 253, # '˙' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' 
+ 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # '\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 255, # 'Ħ' + 162: 253, # '˘' + 163: 253, # '£' + 164: 253, # '¤' + 165: 255, # None + 166: 255, # 'Ĥ' + 167: 253, # '§' + 168: 253, # '¨' + 169: 255, # 'İ' + 170: 255, # 'Ş' + 
171: 255, # 'Ğ' + 172: 255, # 'Ĵ' + 173: 251, # '\xad' + 174: 255, # None + 175: 255, # 'Ż' + 176: 253, # '°' + 177: 255, # 'ħ' + 178: 252, # '²' + 179: 252, # '³' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 255, # 'ĥ' + 183: 253, # '·' + 184: 253, # '¸' + 185: 255, # 'ı' + 186: 255, # 'ş' + 187: 255, # 'ğ' + 188: 255, # 'ĵ' + 189: 252, # '½' + 190: 255, # None + 191: 255, # 'ż' + 192: 255, # 'À' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # None + 196: 255, # 'Ä' + 197: 255, # 'Ċ' + 198: 255, # 'Ĉ' + 199: 255, # 'Ç' + 200: 255, # 'È' + 201: 255, # 'É' + 202: 255, # 'Ê' + 203: 255, # 'Ë' + 204: 255, # 'Ì' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ï' + 208: 255, # None + 209: 255, # 'Ñ' + 210: 255, # 'Ò' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Ġ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ĝ' + 217: 255, # 'Ù' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'Ŭ' + 222: 255, # 'Ŝ' + 223: 255, # 'ß' + 224: 255, # 'à' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # None + 228: 255, # 'ä' + 229: 255, # 'ċ' + 230: 255, # 'ĉ' + 231: 255, # 'ç' + 232: 255, # 'è' + 233: 255, # 'é' + 234: 255, # 'ê' + 235: 255, # 'ë' + 236: 255, # 'ì' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ï' + 240: 255, # None + 241: 255, # 'ñ' + 242: 255, # 'ò' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'ġ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ĝ' + 249: 255, # 'ù' + 250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ŭ' + 254: 255, # 'ŝ' + 255: 253, # '˙' } -ISO_8859_3_ESPERANTO_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-3', - language='Esperanto', - char_to_order_map=ISO_8859_3_ESPERANTO_CHAR_TO_ORDER, - language_model=ESPERANTO_LANG_MODEL, - typical_positive_ratio=0.9757895606481384, - keep_ascii_letters=False, - alphabet='ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzĈĉĜĝĤĥĴĵŜŝŬŭ') - +ISO_8859_3_ESPERANTO_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-3", + language="Esperanto", + 
char_to_order_map=ISO_8859_3_ESPERANTO_CHAR_TO_ORDER, + language_model=ESPERANTO_LANG_MODEL, + typical_positive_ratio=0.9757895606481384, + keep_ascii_letters=False, + alphabet="ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzĈĉĜĝĤĥĴĵŜŝŬŭ", +) diff --git a/chardet/langestonianmodel.py b/chardet/langestonianmodel.py index 3d99b7bc..e6f70350 100644 --- a/chardet/langestonianmodel.py +++ b/chardet/langestonianmodel.py @@ -1,9 +1,7 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -10320,803 +10318,808 @@ # Character Mapping Table(s): ISO_8859_4_ESTONIAN_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' 
- 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, # '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' - 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 255, # 'Ą' - 162: 255, # 'ĸ' - 163: 255, # 'Ŗ' - 164: 253, # '¤' - 165: 255, # 'Ĩ' - 166: 255, # 'Ļ' - 167: 253, # '§' - 168: 253, # '¨' - 169: 255, # 'Š' - 170: 255, # 'Ē' - 171: 
255, # 'Ģ' - 172: 255, # 'Ŧ' - 173: 251, # '\xad' - 174: 255, # 'Ž' - 175: 253, # '¯' - 176: 253, # '°' - 177: 255, # 'ą' - 178: 253, # '˛' - 179: 255, # 'ŗ' - 180: 253, # '´' - 181: 255, # 'ĩ' - 182: 255, # 'ļ' - 183: 255, # 'ˇ' - 184: 253, # '¸' - 185: 255, # 'š' - 186: 255, # 'ē' - 187: 255, # 'ģ' - 188: 255, # 'ŧ' - 189: 255, # 'Ŋ' - 190: 255, # 'ž' - 191: 255, # 'ŋ' - 192: 255, # 'Ā' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ã' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Æ' - 199: 255, # 'Į' - 200: 255, # 'Č' - 201: 255, # 'É' - 202: 255, # 'Ę' - 203: 255, # 'Ë' - 204: 255, # 'Ė' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ī' - 208: 255, # 'Đ' - 209: 255, # 'Ņ' - 210: 255, # 'Ō' - 211: 255, # 'Ķ' - 212: 255, # 'Ô' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ø' - 217: 255, # 'Ų' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'Ũ' - 222: 255, # 'Ū' - 223: 255, # 'ß' - 224: 255, # 'ā' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ã' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'æ' - 231: 255, # 'į' - 232: 255, # 'č' - 233: 255, # 'é' - 234: 255, # 'ę' - 235: 255, # 'ë' - 236: 255, # 'ė' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ī' - 240: 255, # 'đ' - 241: 255, # 'ņ' - 242: 255, # 'ō' - 243: 255, # 'ķ' - 244: 255, # 'ô' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ø' - 249: 255, # 'ų' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ũ' - 254: 255, # 'ū' - 255: 253, # '˙' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 
251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # 
'\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 255, # 'Ą' + 162: 255, # 'ĸ' + 163: 255, # 'Ŗ' + 164: 253, # '¤' + 165: 255, # 'Ĩ' + 166: 255, # 'Ļ' + 167: 253, # '§' + 168: 253, # '¨' + 169: 255, # 'Š' + 170: 255, # 'Ē' + 171: 255, # 'Ģ' + 172: 255, # 'Ŧ' + 173: 251, # '\xad' + 174: 255, # 'Ž' + 175: 253, # '¯' + 176: 253, # '°' + 177: 255, # 'ą' + 178: 253, # '˛' + 179: 255, # 'ŗ' + 180: 253, # '´' + 181: 255, # 'ĩ' + 182: 255, # 'ļ' + 183: 255, # 'ˇ' + 184: 253, # '¸' + 185: 255, # 'š' + 186: 255, # 'ē' + 187: 255, # 'ģ' + 188: 255, # 'ŧ' + 189: 255, # 'Ŋ' + 190: 255, # 'ž' + 191: 255, # 'ŋ' + 192: 255, # 'Ā' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ã' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Æ' + 199: 255, # 'Į' + 200: 255, # 'Č' + 201: 255, # 'É' + 202: 255, # 'Ę' + 203: 255, # 'Ë' + 204: 255, # 'Ė' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ī' + 208: 255, # 'Đ' + 209: 255, # 'Ņ' + 210: 255, # 'Ō' + 211: 255, # 'Ķ' + 212: 255, # 'Ô' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ø' + 217: 255, # 'Ų' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'Ũ' + 222: 255, # 'Ū' + 223: 255, # 'ß' + 224: 255, # 'ā' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ã' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'æ' + 231: 255, # 'į' + 232: 255, # 'č' + 233: 255, # 'é' + 234: 255, # 'ę' + 235: 255, # 'ë' + 236: 255, # 'ė' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ī' + 240: 255, # 'đ' + 241: 255, # 'ņ' + 242: 255, # 'ō' + 243: 
255, # 'ķ' + 244: 255, # 'ô' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ø' + 249: 255, # 'ų' + 250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ũ' + 254: 255, # 'ū' + 255: 253, # '˙' } -ISO_8859_4_ESTONIAN_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-4', - language='Estonian', - char_to_order_map=ISO_8859_4_ESTONIAN_CHAR_TO_ORDER, - language_model=ESTONIAN_LANG_MODEL, - typical_positive_ratio=0.9397250672098478, - keep_ascii_letters=False, - alphabet='ABDEGHIJKLMNOPRSTUVabdeghijklmnoprstuvÄÕÖÜäõöü') +ISO_8859_4_ESTONIAN_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-4", + language="Estonian", + char_to_order_map=ISO_8859_4_ESTONIAN_CHAR_TO_ORDER, + language_model=ESTONIAN_LANG_MODEL, + typical_positive_ratio=0.9397250672098478, + keep_ascii_letters=False, + alphabet="ABDEGHIJKLMNOPRSTUVabdeghijklmnoprstuvÄÕÖÜäõöü", +) ISO_8859_13_ESTONIAN_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' 
- 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, # '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 
156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' - 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 253, # '”' - 162: 253, # '¢' - 163: 253, # '£' - 164: 253, # '¤' - 165: 253, # '„' - 166: 253, # '¦' - 167: 253, # '§' - 168: 255, # 'Ø' - 169: 253, # '©' - 170: 255, # 'Ŗ' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 255, # 'Æ' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 252, # '³' - 180: 253, # '“' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 255, # 'ø' - 185: 252, # '¹' - 186: 255, # 'ŗ' - 187: 253, # '»' - 188: 252, # '¼' - 189: 252, # '½' - 190: 252, # '¾' - 191: 255, # 'æ' - 192: 255, # 'Ą' - 193: 255, # 'Į' - 194: 255, # 'Ā' - 195: 255, # 'Ć' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Ę' - 199: 255, # 'Ē' - 200: 255, # 'Č' - 201: 255, # 'É' - 202: 255, # 'Ź' - 203: 255, # 'Ė' - 204: 255, # 'Ģ' - 205: 255, # 'Ķ' - 206: 255, # 'Ī' - 207: 255, # 'Ļ' - 208: 255, # 'Š' - 209: 255, # 'Ń' - 210: 255, # 'Ņ' - 211: 255, # 'Ó' - 212: 255, # 'Ō' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ų' - 217: 255, # 'Ł' - 218: 255, # 'Ś' - 219: 255, # 'Ū' - 220: 255, # 'Ü' - 221: 255, # 'Ż' - 222: 255, # 'Ž' - 223: 255, # 'ß' - 224: 255, # 'ą' - 225: 255, # 'į' - 226: 255, # 'ā' - 227: 255, # 'ć' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'ę' - 231: 255, # 'ē' - 232: 255, # 'č' - 233: 255, # 'é' - 234: 255, # 'ź' - 235: 255, # 'ė' - 236: 255, # 'ģ' - 237: 255, # 'ķ' - 238: 255, # 'ī' - 239: 255, # 'ļ' - 240: 255, # 'š' - 241: 255, # 'ń' - 242: 255, # 'ņ' - 243: 255, # 'ó' - 244: 255, # 'ō' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ų' - 249: 255, # 'ł' - 250: 255, # 'ś' - 251: 255, # 'ū' - 252: 255, # 'ü' - 253: 255, # 'ż' - 254: 255, # 'ž' - 255: 253, # '’' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # 
'\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 
122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # '\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 253, # '”' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '¤' + 165: 253, # '„' + 166: 253, # '¦' + 167: 253, # '§' + 168: 255, # 'Ø' + 169: 253, # '©' + 170: 255, # 'Ŗ' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 255, # 'Æ' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 253, # '“' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 255, # 'ø' + 185: 252, # '¹' + 186: 255, # 'ŗ' + 187: 253, # '»' + 188: 252, # '¼' + 189: 252, # '½' + 190: 252, # '¾' + 191: 255, # 'æ' + 192: 255, # 'Ą' + 193: 255, # 'Į' + 194: 255, # 'Ā' + 195: 255, # 'Ć' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Ę' + 199: 255, # 'Ē' + 200: 255, # 'Č' + 201: 255, # 'É' + 202: 255, # 'Ź' + 203: 255, # 'Ė' + 204: 255, # 'Ģ' + 205: 255, # 'Ķ' + 206: 255, # 'Ī' + 207: 255, # 'Ļ' + 208: 255, # 'Š' + 209: 255, # 'Ń' + 210: 255, # 'Ņ' + 211: 255, # 'Ó' + 212: 255, # 'Ō' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ų' + 217: 255, # 'Ł' + 218: 255, # 'Ś' + 219: 255, # 'Ū' + 220: 255, # 'Ü' + 221: 255, # 'Ż' + 222: 255, # 'Ž' + 223: 255, # 'ß' + 224: 255, # 'ą' + 225: 255, # 'į' + 226: 255, # 'ā' + 227: 
255, # 'ć' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'ę' + 231: 255, # 'ē' + 232: 255, # 'č' + 233: 255, # 'é' + 234: 255, # 'ź' + 235: 255, # 'ė' + 236: 255, # 'ģ' + 237: 255, # 'ķ' + 238: 255, # 'ī' + 239: 255, # 'ļ' + 240: 255, # 'š' + 241: 255, # 'ń' + 242: 255, # 'ņ' + 243: 255, # 'ó' + 244: 255, # 'ō' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ų' + 249: 255, # 'ł' + 250: 255, # 'ś' + 251: 255, # 'ū' + 252: 255, # 'ü' + 253: 255, # 'ż' + 254: 255, # 'ž' + 255: 253, # '’' } -ISO_8859_13_ESTONIAN_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-13', - language='Estonian', - char_to_order_map=ISO_8859_13_ESTONIAN_CHAR_TO_ORDER, - language_model=ESTONIAN_LANG_MODEL, - typical_positive_ratio=0.9398629524036475, - keep_ascii_letters=False, - alphabet='ABDEGHIJKLMNOPRSTUVabdeghijklmnoprstuvÄÕÖÜäõöü') +ISO_8859_13_ESTONIAN_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-13", + language="Estonian", + char_to_order_map=ISO_8859_13_ESTONIAN_CHAR_TO_ORDER, + language_model=ESTONIAN_LANG_MODEL, + typical_positive_ratio=0.9398629524036475, + keep_ascii_letters=False, + alphabet="ABDEGHIJKLMNOPRSTUVabdeghijklmnoprstuvÄÕÖÜäõöü", +) WINDOWS_1257_ESTONIAN_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' 
- 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 253, # '€' - 129: 255, # None - 130: 253, # '‚' - 131: 255, # None - 132: 253, # '„' - 133: 253, # '…' - 134: 253, # '†' - 135: 253, # '‡' - 136: 255, # None - 137: 253, # '‰' - 138: 255, # None - 139: 253, # '‹' - 140: 255, # None - 141: 253, # '¨' - 142: 255, # 'ˇ' - 143: 253, # '¸' - 144: 255, # None - 145: 253, # '‘' - 146: 253, # '’' - 147: 253, # '“' - 
148: 253, # '”' - 149: 253, # '•' - 150: 253, # '–' - 151: 253, # '—' - 152: 255, # None - 153: 253, # '™' - 154: 255, # None - 155: 253, # '›' - 156: 255, # None - 157: 253, # '¯' - 158: 253, # '˛' - 159: 255, # None - 160: 251, # '\xa0' - 161: 255, # None - 162: 253, # '¢' - 163: 253, # '£' - 164: 253, # '¤' - 165: 255, # None - 166: 253, # '¦' - 167: 253, # '§' - 168: 255, # 'Ø' - 169: 253, # '©' - 170: 255, # 'Ŗ' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 255, # 'Æ' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 252, # '³' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 255, # 'ø' - 185: 252, # '¹' - 186: 255, # 'ŗ' - 187: 253, # '»' - 188: 252, # '¼' - 189: 252, # '½' - 190: 252, # '¾' - 191: 255, # 'æ' - 192: 255, # 'Ą' - 193: 255, # 'Į' - 194: 255, # 'Ā' - 195: 255, # 'Ć' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Ę' - 199: 255, # 'Ē' - 200: 255, # 'Č' - 201: 255, # 'É' - 202: 255, # 'Ź' - 203: 255, # 'Ė' - 204: 255, # 'Ģ' - 205: 255, # 'Ķ' - 206: 255, # 'Ī' - 207: 255, # 'Ļ' - 208: 255, # 'Š' - 209: 255, # 'Ń' - 210: 255, # 'Ņ' - 211: 255, # 'Ó' - 212: 255, # 'Ō' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ų' - 217: 255, # 'Ł' - 218: 255, # 'Ś' - 219: 255, # 'Ū' - 220: 255, # 'Ü' - 221: 255, # 'Ż' - 222: 255, # 'Ž' - 223: 255, # 'ß' - 224: 255, # 'ą' - 225: 255, # 'į' - 226: 255, # 'ā' - 227: 255, # 'ć' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'ę' - 231: 255, # 'ē' - 232: 255, # 'č' - 233: 255, # 'é' - 234: 255, # 'ź' - 235: 255, # 'ė' - 236: 255, # 'ģ' - 237: 255, # 'ķ' - 238: 255, # 'ī' - 239: 255, # 'ļ' - 240: 255, # 'š' - 241: 255, # 'ń' - 242: 255, # 'ņ' - 243: 255, # 'ó' - 244: 255, # 'ō' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ų' - 249: 255, # 'ł' - 250: 255, # 'ś' - 251: 255, # 'ū' - 252: 255, # 'ü' - 253: 255, # 'ż' - 254: 255, # 'ž' - 255: 253, # '˙' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 
251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' 
+ 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 253, # '€' + 129: 255, # None + 130: 253, # '‚' + 131: 255, # None + 132: 253, # '„' + 133: 253, # '…' + 134: 253, # '†' + 135: 253, # '‡' + 136: 255, # None + 137: 253, # '‰' + 138: 255, # None + 139: 253, # '‹' + 140: 255, # None + 141: 253, # '¨' + 142: 255, # 'ˇ' + 143: 253, # '¸' + 144: 255, # None + 145: 253, # '‘' + 146: 253, # '’' + 147: 253, # '“' + 148: 253, # '”' + 149: 253, # '•' + 150: 253, # '–' + 151: 253, # '—' + 152: 255, # None + 153: 253, # '™' + 154: 255, # None + 155: 253, # '›' + 156: 255, # None + 157: 253, # '¯' + 158: 253, # '˛' + 159: 255, # None + 160: 251, # '\xa0' + 161: 255, # None + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '¤' + 165: 255, # None + 166: 253, # '¦' + 167: 253, # '§' + 168: 255, # 'Ø' + 169: 253, # '©' + 170: 255, # 'Ŗ' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 255, # 
'Æ' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 255, # 'ø' + 185: 252, # '¹' + 186: 255, # 'ŗ' + 187: 253, # '»' + 188: 252, # '¼' + 189: 252, # '½' + 190: 252, # '¾' + 191: 255, # 'æ' + 192: 255, # 'Ą' + 193: 255, # 'Į' + 194: 255, # 'Ā' + 195: 255, # 'Ć' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Ę' + 199: 255, # 'Ē' + 200: 255, # 'Č' + 201: 255, # 'É' + 202: 255, # 'Ź' + 203: 255, # 'Ė' + 204: 255, # 'Ģ' + 205: 255, # 'Ķ' + 206: 255, # 'Ī' + 207: 255, # 'Ļ' + 208: 255, # 'Š' + 209: 255, # 'Ń' + 210: 255, # 'Ņ' + 211: 255, # 'Ó' + 212: 255, # 'Ō' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ų' + 217: 255, # 'Ł' + 218: 255, # 'Ś' + 219: 255, # 'Ū' + 220: 255, # 'Ü' + 221: 255, # 'Ż' + 222: 255, # 'Ž' + 223: 255, # 'ß' + 224: 255, # 'ą' + 225: 255, # 'į' + 226: 255, # 'ā' + 227: 255, # 'ć' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'ę' + 231: 255, # 'ē' + 232: 255, # 'č' + 233: 255, # 'é' + 234: 255, # 'ź' + 235: 255, # 'ė' + 236: 255, # 'ģ' + 237: 255, # 'ķ' + 238: 255, # 'ī' + 239: 255, # 'ļ' + 240: 255, # 'š' + 241: 255, # 'ń' + 242: 255, # 'ņ' + 243: 255, # 'ó' + 244: 255, # 'ō' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ų' + 249: 255, # 'ł' + 250: 255, # 'ś' + 251: 255, # 'ū' + 252: 255, # 'ü' + 253: 255, # 'ż' + 254: 255, # 'ž' + 255: 253, # '˙' } -WINDOWS_1257_ESTONIAN_MODEL = SingleByteCharSetModel(charset_name='WINDOWS-1257', - language='Estonian', - char_to_order_map=WINDOWS_1257_ESTONIAN_CHAR_TO_ORDER, - language_model=ESTONIAN_LANG_MODEL, - typical_positive_ratio=0.9688719889045279, - keep_ascii_letters=False, - alphabet='ABDEGHIJKLMNOPRSTUVabdeghijklmnoprstuvÄÕÖÜäõöü') - +WINDOWS_1257_ESTONIAN_MODEL = SingleByteCharSetModel( + charset_name="WINDOWS-1257", + language="Estonian", + char_to_order_map=WINDOWS_1257_ESTONIAN_CHAR_TO_ORDER, + language_model=ESTONIAN_LANG_MODEL, + 
typical_positive_ratio=0.9688719889045279, + keep_ascii_letters=False, + alphabet="ABDEGHIJKLMNOPRSTUVabdeghijklmnoprstuvÄÕÖÜäõöü", +) diff --git a/chardet/langfinnishmodel.py b/chardet/langfinnishmodel.py index 2296b2c0..1503838b 100644 --- a/chardet/langfinnishmodel.py +++ b/chardet/langfinnishmodel.py @@ -1,9 +1,7 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -20610,803 +20608,808 @@ # Character Mapping Table(s): ISO_8859_1_FINNISH_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' 
- 64: 253, # '@' - 65: 38, # 'A' - 66: 53, # 'B' - 67: 59, # 'C' - 68: 63, # 'D' - 69: 52, # 'E' - 70: 62, # 'F' - 71: 60, # 'G' - 72: 44, # 'H' - 73: 54, # 'I' - 74: 57, # 'J' - 75: 35, # 'K' - 76: 49, # 'L' - 77: 37, # 'M' - 78: 56, # 'N' - 79: 61, # 'O' - 80: 40, # 'P' - 81: 85, # 'Q' - 82: 55, # 'R' - 83: 28, # 'S' - 84: 39, # 'T' - 85: 67, # 'U' - 86: 48, # 'V' - 87: 65, # 'W' - 88: 79, # 'X' - 89: 64, # 'Y' - 90: 76, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 2, # 'a' - 98: 41, # 'b' - 99: 43, # 'c' - 100: 21, # 'd' - 101: 6, # 'e' - 102: 51, # 'f' - 103: 30, # 'g' - 104: 17, # 'h' - 105: 3, # 'i' - 106: 16, # 'j' - 107: 11, # 'k' - 108: 8, # 'l' - 109: 14, # 'm' - 110: 4, # 'n' - 111: 9, # 'o' - 112: 18, # 'p' - 113: 77, # 'q' - 114: 12, # 'r' - 115: 7, # 's' - 116: 5, # 't' - 117: 10, # 'u' - 118: 15, # 'v' - 119: 66, # 'w' - 120: 71, # 'x' - 121: 19, # 'y' - 122: 69, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, # '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' - 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 253, # '¡' - 162: 253, # '¢' - 163: 253, # '£' - 164: 253, # '¤' - 165: 253, # '¥' - 166: 253, # '¦' - 167: 253, # '§' - 168: 253, # '¨' - 169: 253, # '©' - 170: 255, # 'ª' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 
253, # '®' - 175: 253, # '¯' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 252, # '³' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 253, # '¸' - 185: 252, # '¹' - 186: 255, # 'º' - 187: 253, # '»' - 188: 252, # '¼' - 189: 252, # '½' - 190: 252, # '¾' - 191: 253, # '¿' - 192: 255, # 'À' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ã' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Æ' - 199: 255, # 'Ç' - 200: 255, # 'È' - 201: 255, # 'É' - 202: 255, # 'Ê' - 203: 255, # 'Ë' - 204: 255, # 'Ì' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ï' - 208: 255, # 'Ð' - 209: 255, # 'Ñ' - 210: 255, # 'Ò' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ø' - 217: 255, # 'Ù' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Þ' - 223: 255, # 'ß' - 224: 255, # 'à' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ã' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'æ' - 231: 255, # 'ç' - 232: 255, # 'è' - 233: 255, # 'é' - 234: 255, # 'ê' - 235: 255, # 'ë' - 236: 255, # 'ì' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ï' - 240: 255, # 'ð' - 241: 255, # 'ñ' - 242: 255, # 'ò' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ø' - 249: 255, # 'ù' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'þ' - 255: 255, # 'ÿ' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 
251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 38, # 'A' + 66: 53, # 'B' + 67: 59, # 'C' + 68: 63, # 'D' + 69: 52, # 'E' + 70: 62, # 'F' + 71: 60, # 'G' + 72: 44, # 'H' + 73: 54, # 'I' + 74: 57, # 'J' + 75: 35, # 'K' + 76: 49, # 'L' + 77: 37, # 'M' + 78: 56, # 'N' + 79: 61, # 'O' + 80: 40, # 'P' + 81: 85, # 'Q' + 82: 55, # 'R' + 83: 28, # 'S' + 84: 39, # 'T' + 85: 67, # 'U' + 86: 48, # 'V' + 87: 65, # 'W' + 88: 79, # 'X' + 89: 64, # 'Y' + 90: 76, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 2, # 'a' + 98: 41, # 'b' + 99: 43, # 'c' + 100: 21, # 'd' + 101: 6, # 'e' + 102: 51, # 'f' + 103: 30, # 'g' + 104: 17, # 'h' + 105: 3, # 'i' + 106: 16, # 'j' + 107: 11, # 'k' + 108: 8, # 'l' + 109: 14, # 'm' + 110: 4, # 'n' + 111: 9, # 'o' + 112: 18, # 'p' + 113: 77, # 'q' + 114: 12, # 'r' + 115: 7, # 's' + 116: 5, # 't' + 117: 10, # 'u' + 118: 15, # 'v' + 119: 66, # 'w' + 120: 71, # 'x' + 121: 19, # 'y' + 122: 69, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # '\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 
251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 253, # '¡' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '¤' + 165: 253, # '¥' + 166: 253, # '¦' + 167: 253, # '§' + 168: 253, # '¨' + 169: 253, # '©' + 170: 255, # 'ª' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 253, # '¯' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 253, # '¸' + 185: 252, # '¹' + 186: 255, # 'º' + 187: 253, # '»' + 188: 252, # '¼' + 189: 252, # '½' + 190: 252, # '¾' + 191: 253, # '¿' + 192: 255, # 'À' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ã' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Æ' + 199: 255, # 'Ç' + 200: 255, # 'È' + 201: 255, # 'É' + 202: 255, # 'Ê' + 203: 255, # 'Ë' + 204: 255, # 'Ì' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ï' + 208: 255, # 'Ð' + 209: 255, # 'Ñ' + 210: 255, # 'Ò' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ø' + 217: 255, # 'Ù' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Þ' + 223: 255, # 'ß' + 224: 255, # 'à' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ã' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'æ' + 231: 255, # 'ç' + 232: 255, # 'è' + 233: 255, # 'é' + 234: 255, # 'ê' + 235: 255, # 'ë' + 236: 255, # 'ì' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ï' + 240: 255, # 'ð' + 241: 255, # 'ñ' + 242: 255, # 'ò' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ø' + 249: 255, # 'ù' + 
250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'þ' + 255: 255, # 'ÿ' } -ISO_8859_1_FINNISH_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-1', - language='Finnish', - char_to_order_map=ISO_8859_1_FINNISH_CHAR_TO_ORDER, - language_model=FINNISH_LANG_MODEL, - typical_positive_ratio=0.9426544751497806, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÄÅÖäåöŠšŽž') +ISO_8859_1_FINNISH_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-1", + language="Finnish", + char_to_order_map=ISO_8859_1_FINNISH_CHAR_TO_ORDER, + language_model=FINNISH_LANG_MODEL, + typical_positive_ratio=0.9426544751497806, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÄÅÖäåöŠšŽž", +) ISO_8859_15_FINNISH_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' 
- 64: 253, # '@' - 65: 38, # 'A' - 66: 53, # 'B' - 67: 59, # 'C' - 68: 63, # 'D' - 69: 52, # 'E' - 70: 62, # 'F' - 71: 60, # 'G' - 72: 44, # 'H' - 73: 54, # 'I' - 74: 57, # 'J' - 75: 35, # 'K' - 76: 49, # 'L' - 77: 37, # 'M' - 78: 56, # 'N' - 79: 61, # 'O' - 80: 40, # 'P' - 81: 85, # 'Q' - 82: 55, # 'R' - 83: 28, # 'S' - 84: 39, # 'T' - 85: 67, # 'U' - 86: 48, # 'V' - 87: 65, # 'W' - 88: 79, # 'X' - 89: 64, # 'Y' - 90: 76, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 2, # 'a' - 98: 41, # 'b' - 99: 43, # 'c' - 100: 21, # 'd' - 101: 6, # 'e' - 102: 51, # 'f' - 103: 30, # 'g' - 104: 17, # 'h' - 105: 3, # 'i' - 106: 16, # 'j' - 107: 11, # 'k' - 108: 8, # 'l' - 109: 14, # 'm' - 110: 4, # 'n' - 111: 9, # 'o' - 112: 18, # 'p' - 113: 77, # 'q' - 114: 12, # 'r' - 115: 7, # 's' - 116: 5, # 't' - 117: 10, # 'u' - 118: 15, # 'v' - 119: 66, # 'w' - 120: 71, # 'x' - 121: 19, # 'y' - 122: 69, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, # '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' - 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 253, # '¡' - 162: 253, # '¢' - 163: 253, # '£' - 164: 253, # '€' - 165: 253, # '¥' - 166: 255, # 'Š' - 167: 253, # '§' - 168: 255, # 'š' - 169: 253, # '©' - 170: 255, # 'ª' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 
253, # '®' - 175: 253, # '¯' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 252, # '³' - 180: 255, # 'Ž' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 255, # 'ž' - 185: 252, # '¹' - 186: 255, # 'º' - 187: 253, # '»' - 188: 255, # 'Œ' - 189: 255, # 'œ' - 190: 255, # 'Ÿ' - 191: 253, # '¿' - 192: 255, # 'À' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ã' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Æ' - 199: 255, # 'Ç' - 200: 255, # 'È' - 201: 255, # 'É' - 202: 255, # 'Ê' - 203: 255, # 'Ë' - 204: 255, # 'Ì' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ï' - 208: 255, # 'Ð' - 209: 255, # 'Ñ' - 210: 255, # 'Ò' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ø' - 217: 255, # 'Ù' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Þ' - 223: 255, # 'ß' - 224: 255, # 'à' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ã' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'æ' - 231: 255, # 'ç' - 232: 255, # 'è' - 233: 255, # 'é' - 234: 255, # 'ê' - 235: 255, # 'ë' - 236: 255, # 'ì' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ï' - 240: 255, # 'ð' - 241: 255, # 'ñ' - 242: 255, # 'ò' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ø' - 249: 255, # 'ù' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'þ' - 255: 255, # 'ÿ' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 
251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 38, # 'A' + 66: 53, # 'B' + 67: 59, # 'C' + 68: 63, # 'D' + 69: 52, # 'E' + 70: 62, # 'F' + 71: 60, # 'G' + 72: 44, # 'H' + 73: 54, # 'I' + 74: 57, # 'J' + 75: 35, # 'K' + 76: 49, # 'L' + 77: 37, # 'M' + 78: 56, # 'N' + 79: 61, # 'O' + 80: 40, # 'P' + 81: 85, # 'Q' + 82: 55, # 'R' + 83: 28, # 'S' + 84: 39, # 'T' + 85: 67, # 'U' + 86: 48, # 'V' + 87: 65, # 'W' + 88: 79, # 'X' + 89: 64, # 'Y' + 90: 76, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 2, # 'a' + 98: 41, # 'b' + 99: 43, # 'c' + 100: 21, # 'd' + 101: 6, # 'e' + 102: 51, # 'f' + 103: 30, # 'g' + 104: 17, # 'h' + 105: 3, # 'i' + 106: 16, # 'j' + 107: 11, # 'k' + 108: 8, # 'l' + 109: 14, # 'm' + 110: 4, # 'n' + 111: 9, # 'o' + 112: 18, # 'p' + 113: 77, # 'q' + 114: 12, # 'r' + 115: 7, # 's' + 116: 5, # 't' + 117: 10, # 'u' + 118: 15, # 'v' + 119: 66, # 'w' + 120: 71, # 'x' + 121: 19, # 'y' + 122: 69, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # '\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 
251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 253, # '¡' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '€' + 165: 253, # '¥' + 166: 255, # 'Š' + 167: 253, # '§' + 168: 255, # 'š' + 169: 253, # '©' + 170: 255, # 'ª' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 253, # '¯' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 255, # 'Ž' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 255, # 'ž' + 185: 252, # '¹' + 186: 255, # 'º' + 187: 253, # '»' + 188: 255, # 'Œ' + 189: 255, # 'œ' + 190: 255, # 'Ÿ' + 191: 253, # '¿' + 192: 255, # 'À' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ã' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Æ' + 199: 255, # 'Ç' + 200: 255, # 'È' + 201: 255, # 'É' + 202: 255, # 'Ê' + 203: 255, # 'Ë' + 204: 255, # 'Ì' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ï' + 208: 255, # 'Ð' + 209: 255, # 'Ñ' + 210: 255, # 'Ò' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ø' + 217: 255, # 'Ù' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Þ' + 223: 255, # 'ß' + 224: 255, # 'à' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ã' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'æ' + 231: 255, # 'ç' + 232: 255, # 'è' + 233: 255, # 'é' + 234: 255, # 'ê' + 235: 255, # 'ë' + 236: 255, # 'ì' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ï' + 240: 255, # 'ð' + 241: 255, # 'ñ' + 242: 255, # 'ò' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ø' + 249: 255, # 'ù' + 
250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'þ' + 255: 255, # 'ÿ' } -ISO_8859_15_FINNISH_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-15', - language='Finnish', - char_to_order_map=ISO_8859_15_FINNISH_CHAR_TO_ORDER, - language_model=FINNISH_LANG_MODEL, - typical_positive_ratio=0.9426544751497806, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÄÅÖäåöŠšŽž') +ISO_8859_15_FINNISH_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-15", + language="Finnish", + char_to_order_map=ISO_8859_15_FINNISH_CHAR_TO_ORDER, + language_model=FINNISH_LANG_MODEL, + typical_positive_ratio=0.9426544751497806, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÄÅÖäåöŠšŽž", +) WINDOWS_1252_FINNISH_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' 
- 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 38, # 'A' - 66: 53, # 'B' - 67: 59, # 'C' - 68: 63, # 'D' - 69: 52, # 'E' - 70: 62, # 'F' - 71: 60, # 'G' - 72: 44, # 'H' - 73: 54, # 'I' - 74: 57, # 'J' - 75: 35, # 'K' - 76: 49, # 'L' - 77: 37, # 'M' - 78: 56, # 'N' - 79: 61, # 'O' - 80: 40, # 'P' - 81: 85, # 'Q' - 82: 55, # 'R' - 83: 28, # 'S' - 84: 39, # 'T' - 85: 67, # 'U' - 86: 48, # 'V' - 87: 65, # 'W' - 88: 79, # 'X' - 89: 64, # 'Y' - 90: 76, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 2, # 'a' - 98: 41, # 'b' - 99: 43, # 'c' - 100: 21, # 'd' - 101: 6, # 'e' - 102: 51, # 'f' - 103: 30, # 'g' - 104: 17, # 'h' - 105: 3, # 'i' - 106: 16, # 'j' - 107: 11, # 'k' - 108: 8, # 'l' - 109: 14, # 'm' - 110: 4, # 'n' - 111: 9, # 'o' - 112: 18, # 'p' - 113: 77, # 'q' - 114: 12, # 'r' - 115: 7, # 's' - 116: 5, # 't' - 117: 10, # 'u' - 118: 15, # 'v' - 119: 66, # 'w' - 120: 71, # 'x' - 121: 19, # 'y' - 122: 69, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 253, # '€' - 129: 255, # None - 130: 253, # '‚' - 131: 255, # 'ƒ' - 132: 253, # '„' - 133: 253, # '…' - 134: 253, # '†' - 135: 253, # '‡' - 136: 255, # 'ˆ' - 137: 253, # '‰' - 138: 255, # 'Š' - 139: 253, # '‹' - 140: 255, # 'Œ' - 141: 255, # None - 142: 255, # 'Ž' - 143: 255, # None - 144: 255, # None - 145: 253, # '‘' - 146: 253, # '’' - 147: 253, # '“' - 148: 253, # '”' - 149: 253, # '•' - 150: 253, # '–' - 151: 253, # '—' - 152: 253, # '˜' - 153: 253, # '™' - 154: 255, # 'š' - 155: 253, # '›' - 156: 255, # 'œ' - 157: 255, # None - 158: 255, # 'ž' - 159: 255, # 'Ÿ' - 160: 251, # '\xa0' - 161: 253, # '¡' - 162: 253, # '¢' - 163: 253, 
# '£' - 164: 253, # '¤' - 165: 253, # '¥' - 166: 253, # '¦' - 167: 253, # '§' - 168: 253, # '¨' - 169: 253, # '©' - 170: 255, # 'ª' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 253, # '¯' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 252, # '³' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 253, # '¸' - 185: 252, # '¹' - 186: 255, # 'º' - 187: 253, # '»' - 188: 252, # '¼' - 189: 252, # '½' - 190: 252, # '¾' - 191: 253, # '¿' - 192: 255, # 'À' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ã' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Æ' - 199: 255, # 'Ç' - 200: 255, # 'È' - 201: 255, # 'É' - 202: 255, # 'Ê' - 203: 255, # 'Ë' - 204: 255, # 'Ì' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ï' - 208: 255, # 'Ð' - 209: 255, # 'Ñ' - 210: 255, # 'Ò' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ø' - 217: 255, # 'Ù' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Þ' - 223: 255, # 'ß' - 224: 255, # 'à' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ã' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'æ' - 231: 255, # 'ç' - 232: 255, # 'è' - 233: 255, # 'é' - 234: 255, # 'ê' - 235: 255, # 'ë' - 236: 255, # 'ì' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ï' - 240: 255, # 'ð' - 241: 255, # 'ñ' - 242: 255, # 'ò' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ø' - 249: 255, # 'ù' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'þ' - 255: 255, # 'ÿ' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, 
# '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 38, # 'A' + 66: 53, # 'B' + 67: 59, # 'C' + 68: 63, # 'D' + 69: 52, # 'E' + 70: 62, # 'F' + 71: 60, # 'G' + 72: 44, # 'H' + 73: 54, # 'I' + 74: 57, # 'J' + 75: 35, # 'K' + 76: 49, # 'L' + 77: 37, # 'M' + 78: 56, # 'N' + 79: 61, # 'O' + 80: 40, # 'P' + 81: 85, # 'Q' + 82: 55, # 'R' + 83: 28, # 'S' + 84: 39, # 'T' + 85: 67, # 'U' + 86: 48, # 'V' + 87: 65, # 'W' + 88: 79, # 'X' + 89: 64, # 'Y' + 90: 76, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 2, # 'a' + 98: 41, # 'b' + 99: 43, # 'c' + 100: 21, # 'd' + 101: 6, # 'e' + 102: 51, # 'f' + 103: 30, # 'g' + 104: 17, # 'h' + 105: 3, # 'i' + 106: 16, # 'j' + 107: 11, # 'k' + 108: 8, # 'l' + 109: 14, # 'm' + 110: 4, # 'n' + 111: 9, # 'o' + 112: 18, # 'p' + 113: 77, # 'q' + 114: 12, # 'r' + 115: 7, # 's' + 116: 5, # 't' + 117: 10, # 'u' + 118: 15, # 'v' + 119: 66, # 'w' + 120: 71, # 'x' + 121: 19, # 'y' + 122: 69, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 253, # '€' + 129: 255, # None + 130: 253, # '‚' + 131: 255, # 'ƒ' + 132: 253, # '„' + 133: 253, # 
'…' + 134: 253, # '†' + 135: 253, # '‡' + 136: 255, # 'ˆ' + 137: 253, # '‰' + 138: 255, # 'Š' + 139: 253, # '‹' + 140: 255, # 'Œ' + 141: 255, # None + 142: 255, # 'Ž' + 143: 255, # None + 144: 255, # None + 145: 253, # '‘' + 146: 253, # '’' + 147: 253, # '“' + 148: 253, # '”' + 149: 253, # '•' + 150: 253, # '–' + 151: 253, # '—' + 152: 253, # '˜' + 153: 253, # '™' + 154: 255, # 'š' + 155: 253, # '›' + 156: 255, # 'œ' + 157: 255, # None + 158: 255, # 'ž' + 159: 255, # 'Ÿ' + 160: 251, # '\xa0' + 161: 253, # '¡' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '¤' + 165: 253, # '¥' + 166: 253, # '¦' + 167: 253, # '§' + 168: 253, # '¨' + 169: 253, # '©' + 170: 255, # 'ª' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 253, # '¯' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 253, # '¸' + 185: 252, # '¹' + 186: 255, # 'º' + 187: 253, # '»' + 188: 252, # '¼' + 189: 252, # '½' + 190: 252, # '¾' + 191: 253, # '¿' + 192: 255, # 'À' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ã' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Æ' + 199: 255, # 'Ç' + 200: 255, # 'È' + 201: 255, # 'É' + 202: 255, # 'Ê' + 203: 255, # 'Ë' + 204: 255, # 'Ì' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ï' + 208: 255, # 'Ð' + 209: 255, # 'Ñ' + 210: 255, # 'Ò' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ø' + 217: 255, # 'Ù' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Þ' + 223: 255, # 'ß' + 224: 255, # 'à' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ã' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'æ' + 231: 255, # 'ç' + 232: 255, # 'è' + 233: 255, # 'é' + 234: 255, # 'ê' + 235: 255, # 'ë' + 236: 255, # 'ì' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ï' + 240: 255, # 'ð' + 241: 255, # 'ñ' + 242: 255, # 'ò' + 243: 255, # 'ó' + 
244: 255, # 'ô' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ø' + 249: 255, # 'ù' + 250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'þ' + 255: 255, # 'ÿ' } -WINDOWS_1252_FINNISH_MODEL = SingleByteCharSetModel(charset_name='WINDOWS-1252', - language='Finnish', - char_to_order_map=WINDOWS_1252_FINNISH_CHAR_TO_ORDER, - language_model=FINNISH_LANG_MODEL, - typical_positive_ratio=0.9472462781044202, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÄÅÖäåöŠšŽž') - +WINDOWS_1252_FINNISH_MODEL = SingleByteCharSetModel( + charset_name="WINDOWS-1252", + language="Finnish", + char_to_order_map=WINDOWS_1252_FINNISH_CHAR_TO_ORDER, + language_model=FINNISH_LANG_MODEL, + typical_positive_ratio=0.9472462781044202, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÄÅÖäåöŠšŽž", +) diff --git a/chardet/langfrenchmodel.py b/chardet/langfrenchmodel.py index 030645c0..8973c5e3 100644 --- a/chardet/langfrenchmodel.py +++ b/chardet/langfrenchmodel.py @@ -1,9 +1,7 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -62768,803 +62766,808 @@ # Character Mapping Table(s): ISO_8859_1_FRENCH_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' 
- 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 40, # 'A' - 66: 47, # 'B' - 67: 37, # 'C' - 68: 51, # 'D' - 69: 52, # 'E' - 70: 59, # 'F' - 71: 61, # 'G' - 72: 65, # 'H' - 73: 43, # 'I' - 74: 64, # 'J' - 75: 75, # 'K' - 76: 29, # 'L' - 77: 45, # 'M' - 78: 62, # 'N' - 79: 70, # 'O' - 80: 42, # 'P' - 81: 86, # 'Q' - 82: 55, # 'R' - 83: 41, # 'S' - 84: 60, # 'T' - 85: 71, # 'U' - 86: 69, # 'V' - 87: 77, # 'W' - 88: 79, # 'X' - 89: 87, # 'Y' - 90: 95, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 3, # 'a' - 98: 23, # 'b' - 99: 13, # 'c' - 100: 12, # 'd' - 101: 2, # 'e' - 102: 20, # 'f' - 103: 19, # 'g' - 104: 21, # 'h' - 105: 5, # 'i' - 106: 44, # 'j' - 107: 58, # 'k' - 108: 9, # 'l' - 109: 15, # 'm' - 110: 6, # 'n' - 111: 10, # 'o' - 112: 14, # 'p' - 113: 25, # 'q' - 114: 7, # 'r' - 115: 4, # 's' - 116: 8, # 't' - 117: 11, # 'u' - 118: 18, # 'v' - 119: 74, # 'w' - 120: 30, # 'x' - 121: 34, # 'y' - 122: 63, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, # '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 
251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' - 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 253, # '¡' - 162: 253, # '¢' - 163: 253, # '£' - 164: 253, # '¤' - 165: 253, # '¥' - 166: 253, # '¦' - 167: 253, # '§' - 168: 253, # '¨' - 169: 253, # '©' - 170: 255, # 'ª' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 253, # '¯' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 252, # '³' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 253, # '¸' - 185: 252, # '¹' - 186: 255, # 'º' - 187: 253, # '»' - 188: 252, # '¼' - 189: 252, # '½' - 190: 252, # '¾' - 191: 253, # '¿' - 192: 255, # 'À' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ã' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Æ' - 199: 255, # 'Ç' - 200: 255, # 'È' - 201: 255, # 'É' - 202: 255, # 'Ê' - 203: 255, # 'Ë' - 204: 255, # 'Ì' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ï' - 208: 255, # 'Ð' - 209: 255, # 'Ñ' - 210: 255, # 'Ò' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ø' - 217: 255, # 'Ù' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Þ' - 223: 255, # 'ß' - 224: 255, # 'à' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ã' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'æ' - 231: 255, # 'ç' - 232: 255, # 'è' - 233: 255, # 'é' - 234: 255, # 'ê' - 235: 255, # 'ë' - 236: 255, # 'ì' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ï' - 240: 255, # 'ð' - 241: 255, # 'ñ' - 242: 255, # 'ò' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ø' - 249: 255, # 'ù' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'þ' - 255: 255, # 'ÿ' + 0: 251, # '\x00' + 
1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' 
+ 64: 253, # '@' + 65: 40, # 'A' + 66: 47, # 'B' + 67: 37, # 'C' + 68: 51, # 'D' + 69: 52, # 'E' + 70: 59, # 'F' + 71: 61, # 'G' + 72: 65, # 'H' + 73: 43, # 'I' + 74: 64, # 'J' + 75: 75, # 'K' + 76: 29, # 'L' + 77: 45, # 'M' + 78: 62, # 'N' + 79: 70, # 'O' + 80: 42, # 'P' + 81: 86, # 'Q' + 82: 55, # 'R' + 83: 41, # 'S' + 84: 60, # 'T' + 85: 71, # 'U' + 86: 69, # 'V' + 87: 77, # 'W' + 88: 79, # 'X' + 89: 87, # 'Y' + 90: 95, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 3, # 'a' + 98: 23, # 'b' + 99: 13, # 'c' + 100: 12, # 'd' + 101: 2, # 'e' + 102: 20, # 'f' + 103: 19, # 'g' + 104: 21, # 'h' + 105: 5, # 'i' + 106: 44, # 'j' + 107: 58, # 'k' + 108: 9, # 'l' + 109: 15, # 'm' + 110: 6, # 'n' + 111: 10, # 'o' + 112: 14, # 'p' + 113: 25, # 'q' + 114: 7, # 'r' + 115: 4, # 's' + 116: 8, # 't' + 117: 11, # 'u' + 118: 18, # 'v' + 119: 74, # 'w' + 120: 30, # 'x' + 121: 34, # 'y' + 122: 63, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # '\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 253, # '¡' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '¤' + 165: 253, # '¥' + 166: 253, # '¦' + 167: 253, # '§' + 168: 253, # '¨' + 169: 253, # '©' + 170: 255, # 'ª' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 
253, # '®' + 175: 253, # '¯' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 253, # '¸' + 185: 252, # '¹' + 186: 255, # 'º' + 187: 253, # '»' + 188: 252, # '¼' + 189: 252, # '½' + 190: 252, # '¾' + 191: 253, # '¿' + 192: 255, # 'À' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ã' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Æ' + 199: 255, # 'Ç' + 200: 255, # 'È' + 201: 255, # 'É' + 202: 255, # 'Ê' + 203: 255, # 'Ë' + 204: 255, # 'Ì' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ï' + 208: 255, # 'Ð' + 209: 255, # 'Ñ' + 210: 255, # 'Ò' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ø' + 217: 255, # 'Ù' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Þ' + 223: 255, # 'ß' + 224: 255, # 'à' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ã' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'æ' + 231: 255, # 'ç' + 232: 255, # 'è' + 233: 255, # 'é' + 234: 255, # 'ê' + 235: 255, # 'ë' + 236: 255, # 'ì' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ï' + 240: 255, # 'ð' + 241: 255, # 'ñ' + 242: 255, # 'ò' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ø' + 249: 255, # 'ù' + 250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'þ' + 255: 255, # 'ÿ' } -ISO_8859_1_FRENCH_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-1', - language='French', - char_to_order_map=ISO_8859_1_FRENCH_CHAR_TO_ORDER, - language_model=FRENCH_LANG_MODEL, - typical_positive_ratio=0.9404241469586969, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÂÇÈÉÊÎÏÙÛàâçèéêîïùûŒœ') +ISO_8859_1_FRENCH_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-1", + language="French", + char_to_order_map=ISO_8859_1_FRENCH_CHAR_TO_ORDER, + 
language_model=FRENCH_LANG_MODEL, + typical_positive_ratio=0.9404241469586969, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÂÇÈÉÊÎÏÙÛàâçèéêîïùûŒœ", +) ISO_8859_15_FRENCH_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' 
- 64: 253, # '@' - 65: 40, # 'A' - 66: 47, # 'B' - 67: 37, # 'C' - 68: 51, # 'D' - 69: 52, # 'E' - 70: 59, # 'F' - 71: 61, # 'G' - 72: 65, # 'H' - 73: 43, # 'I' - 74: 64, # 'J' - 75: 75, # 'K' - 76: 29, # 'L' - 77: 45, # 'M' - 78: 62, # 'N' - 79: 70, # 'O' - 80: 42, # 'P' - 81: 86, # 'Q' - 82: 55, # 'R' - 83: 41, # 'S' - 84: 60, # 'T' - 85: 71, # 'U' - 86: 69, # 'V' - 87: 77, # 'W' - 88: 79, # 'X' - 89: 87, # 'Y' - 90: 95, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 3, # 'a' - 98: 23, # 'b' - 99: 13, # 'c' - 100: 12, # 'd' - 101: 2, # 'e' - 102: 20, # 'f' - 103: 19, # 'g' - 104: 21, # 'h' - 105: 5, # 'i' - 106: 44, # 'j' - 107: 58, # 'k' - 108: 9, # 'l' - 109: 15, # 'm' - 110: 6, # 'n' - 111: 10, # 'o' - 112: 14, # 'p' - 113: 25, # 'q' - 114: 7, # 'r' - 115: 4, # 's' - 116: 8, # 't' - 117: 11, # 'u' - 118: 18, # 'v' - 119: 74, # 'w' - 120: 30, # 'x' - 121: 34, # 'y' - 122: 63, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, # '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' - 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 253, # '¡' - 162: 253, # '¢' - 163: 253, # '£' - 164: 253, # '€' - 165: 253, # '¥' - 166: 255, # 'Š' - 167: 253, # '§' - 168: 255, # 'š' - 169: 253, # '©' - 170: 255, # 'ª' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 
253, # '®' - 175: 253, # '¯' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 252, # '³' - 180: 255, # 'Ž' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 255, # 'ž' - 185: 252, # '¹' - 186: 255, # 'º' - 187: 253, # '»' - 188: 255, # 'Œ' - 189: 255, # 'œ' - 190: 255, # 'Ÿ' - 191: 253, # '¿' - 192: 255, # 'À' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ã' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Æ' - 199: 255, # 'Ç' - 200: 255, # 'È' - 201: 255, # 'É' - 202: 255, # 'Ê' - 203: 255, # 'Ë' - 204: 255, # 'Ì' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ï' - 208: 255, # 'Ð' - 209: 255, # 'Ñ' - 210: 255, # 'Ò' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ø' - 217: 255, # 'Ù' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Þ' - 223: 255, # 'ß' - 224: 255, # 'à' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ã' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'æ' - 231: 255, # 'ç' - 232: 255, # 'è' - 233: 255, # 'é' - 234: 255, # 'ê' - 235: 255, # 'ë' - 236: 255, # 'ì' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ï' - 240: 255, # 'ð' - 241: 255, # 'ñ' - 242: 255, # 'ò' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ø' - 249: 255, # 'ù' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'þ' - 255: 255, # 'ÿ' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 
251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 40, # 'A' + 66: 47, # 'B' + 67: 37, # 'C' + 68: 51, # 'D' + 69: 52, # 'E' + 70: 59, # 'F' + 71: 61, # 'G' + 72: 65, # 'H' + 73: 43, # 'I' + 74: 64, # 'J' + 75: 75, # 'K' + 76: 29, # 'L' + 77: 45, # 'M' + 78: 62, # 'N' + 79: 70, # 'O' + 80: 42, # 'P' + 81: 86, # 'Q' + 82: 55, # 'R' + 83: 41, # 'S' + 84: 60, # 'T' + 85: 71, # 'U' + 86: 69, # 'V' + 87: 77, # 'W' + 88: 79, # 'X' + 89: 87, # 'Y' + 90: 95, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 3, # 'a' + 98: 23, # 'b' + 99: 13, # 'c' + 100: 12, # 'd' + 101: 2, # 'e' + 102: 20, # 'f' + 103: 19, # 'g' + 104: 21, # 'h' + 105: 5, # 'i' + 106: 44, # 'j' + 107: 58, # 'k' + 108: 9, # 'l' + 109: 15, # 'm' + 110: 6, # 'n' + 111: 10, # 'o' + 112: 14, # 'p' + 113: 25, # 'q' + 114: 7, # 'r' + 115: 4, # 's' + 116: 8, # 't' + 117: 11, # 'u' + 118: 18, # 'v' + 119: 74, # 'w' + 120: 30, # 'x' + 121: 34, # 'y' + 122: 63, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # '\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 
251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 253, # '¡' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '€' + 165: 253, # '¥' + 166: 255, # 'Š' + 167: 253, # '§' + 168: 255, # 'š' + 169: 253, # '©' + 170: 255, # 'ª' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 253, # '¯' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 255, # 'Ž' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 255, # 'ž' + 185: 252, # '¹' + 186: 255, # 'º' + 187: 253, # '»' + 188: 255, # 'Œ' + 189: 255, # 'œ' + 190: 255, # 'Ÿ' + 191: 253, # '¿' + 192: 255, # 'À' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ã' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Æ' + 199: 255, # 'Ç' + 200: 255, # 'È' + 201: 255, # 'É' + 202: 255, # 'Ê' + 203: 255, # 'Ë' + 204: 255, # 'Ì' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ï' + 208: 255, # 'Ð' + 209: 255, # 'Ñ' + 210: 255, # 'Ò' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ø' + 217: 255, # 'Ù' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Þ' + 223: 255, # 'ß' + 224: 255, # 'à' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ã' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'æ' + 231: 255, # 'ç' + 232: 255, # 'è' + 233: 255, # 'é' + 234: 255, # 'ê' + 235: 255, # 'ë' + 236: 255, # 'ì' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ï' + 240: 255, # 'ð' + 241: 255, # 'ñ' + 242: 255, # 'ò' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ø' + 249: 255, # 'ù' + 
250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'þ' + 255: 255, # 'ÿ' } -ISO_8859_15_FRENCH_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-15', - language='French', - char_to_order_map=ISO_8859_15_FRENCH_CHAR_TO_ORDER, - language_model=FRENCH_LANG_MODEL, - typical_positive_ratio=0.9404241469586969, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÂÇÈÉÊÎÏÙÛàâçèéêîïùûŒœ') +ISO_8859_15_FRENCH_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-15", + language="French", + char_to_order_map=ISO_8859_15_FRENCH_CHAR_TO_ORDER, + language_model=FRENCH_LANG_MODEL, + typical_positive_ratio=0.9404241469586969, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÂÇÈÉÊÎÏÙÛàâçèéêîïùûŒœ", +) WINDOWS_1252_FRENCH_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' 
- 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 40, # 'A' - 66: 47, # 'B' - 67: 37, # 'C' - 68: 51, # 'D' - 69: 52, # 'E' - 70: 59, # 'F' - 71: 61, # 'G' - 72: 65, # 'H' - 73: 43, # 'I' - 74: 64, # 'J' - 75: 75, # 'K' - 76: 29, # 'L' - 77: 45, # 'M' - 78: 62, # 'N' - 79: 70, # 'O' - 80: 42, # 'P' - 81: 86, # 'Q' - 82: 55, # 'R' - 83: 41, # 'S' - 84: 60, # 'T' - 85: 71, # 'U' - 86: 69, # 'V' - 87: 77, # 'W' - 88: 79, # 'X' - 89: 87, # 'Y' - 90: 95, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 3, # 'a' - 98: 23, # 'b' - 99: 13, # 'c' - 100: 12, # 'd' - 101: 2, # 'e' - 102: 20, # 'f' - 103: 19, # 'g' - 104: 21, # 'h' - 105: 5, # 'i' - 106: 44, # 'j' - 107: 58, # 'k' - 108: 9, # 'l' - 109: 15, # 'm' - 110: 6, # 'n' - 111: 10, # 'o' - 112: 14, # 'p' - 113: 25, # 'q' - 114: 7, # 'r' - 115: 4, # 's' - 116: 8, # 't' - 117: 11, # 'u' - 118: 18, # 'v' - 119: 74, # 'w' - 120: 30, # 'x' - 121: 34, # 'y' - 122: 63, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 253, # '€' - 129: 255, # None - 130: 253, # '‚' - 131: 255, # 'ƒ' - 132: 253, # '„' - 133: 253, # '…' - 134: 253, # '†' - 135: 253, # '‡' - 136: 255, # 'ˆ' - 137: 253, # '‰' - 138: 255, # 'Š' - 139: 253, # '‹' - 140: 255, # 'Œ' - 141: 255, # None - 142: 255, # 'Ž' - 143: 255, # None - 144: 255, # None - 145: 253, # '‘' - 146: 253, # '’' - 147: 253, # '“' - 148: 253, # '”' - 149: 253, # '•' - 150: 253, # '–' - 151: 253, # '—' - 152: 253, # '˜' - 153: 253, # '™' - 154: 255, # 'š' - 155: 253, # '›' - 156: 255, # 'œ' - 157: 255, # None - 158: 255, # 'ž' - 159: 255, # 'Ÿ' - 160: 251, # '\xa0' - 161: 253, # '¡' - 162: 253, # '¢' - 163: 253, 
# '£' - 164: 253, # '¤' - 165: 253, # '¥' - 166: 253, # '¦' - 167: 253, # '§' - 168: 253, # '¨' - 169: 253, # '©' - 170: 255, # 'ª' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 253, # '¯' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 252, # '³' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 253, # '¸' - 185: 252, # '¹' - 186: 255, # 'º' - 187: 253, # '»' - 188: 252, # '¼' - 189: 252, # '½' - 190: 252, # '¾' - 191: 253, # '¿' - 192: 255, # 'À' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ã' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Æ' - 199: 255, # 'Ç' - 200: 255, # 'È' - 201: 255, # 'É' - 202: 255, # 'Ê' - 203: 255, # 'Ë' - 204: 255, # 'Ì' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ï' - 208: 255, # 'Ð' - 209: 255, # 'Ñ' - 210: 255, # 'Ò' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ø' - 217: 255, # 'Ù' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Þ' - 223: 255, # 'ß' - 224: 255, # 'à' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ã' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'æ' - 231: 255, # 'ç' - 232: 255, # 'è' - 233: 255, # 'é' - 234: 255, # 'ê' - 235: 255, # 'ë' - 236: 255, # 'ì' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ï' - 240: 255, # 'ð' - 241: 255, # 'ñ' - 242: 255, # 'ò' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ø' - 249: 255, # 'ù' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'þ' - 255: 255, # 'ÿ' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, 
# '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 40, # 'A' + 66: 47, # 'B' + 67: 37, # 'C' + 68: 51, # 'D' + 69: 52, # 'E' + 70: 59, # 'F' + 71: 61, # 'G' + 72: 65, # 'H' + 73: 43, # 'I' + 74: 64, # 'J' + 75: 75, # 'K' + 76: 29, # 'L' + 77: 45, # 'M' + 78: 62, # 'N' + 79: 70, # 'O' + 80: 42, # 'P' + 81: 86, # 'Q' + 82: 55, # 'R' + 83: 41, # 'S' + 84: 60, # 'T' + 85: 71, # 'U' + 86: 69, # 'V' + 87: 77, # 'W' + 88: 79, # 'X' + 89: 87, # 'Y' + 90: 95, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 3, # 'a' + 98: 23, # 'b' + 99: 13, # 'c' + 100: 12, # 'd' + 101: 2, # 'e' + 102: 20, # 'f' + 103: 19, # 'g' + 104: 21, # 'h' + 105: 5, # 'i' + 106: 44, # 'j' + 107: 58, # 'k' + 108: 9, # 'l' + 109: 15, # 'm' + 110: 6, # 'n' + 111: 10, # 'o' + 112: 14, # 'p' + 113: 25, # 'q' + 114: 7, # 'r' + 115: 4, # 's' + 116: 8, # 't' + 117: 11, # 'u' + 118: 18, # 'v' + 119: 74, # 'w' + 120: 30, # 'x' + 121: 34, # 'y' + 122: 63, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 253, # '€' + 129: 255, # None + 130: 253, # '‚' + 131: 255, # 'ƒ' + 132: 253, # '„' + 133: 253, # 
'…' + 134: 253, # '†' + 135: 253, # '‡' + 136: 255, # 'ˆ' + 137: 253, # '‰' + 138: 255, # 'Š' + 139: 253, # '‹' + 140: 255, # 'Œ' + 141: 255, # None + 142: 255, # 'Ž' + 143: 255, # None + 144: 255, # None + 145: 253, # '‘' + 146: 253, # '’' + 147: 253, # '“' + 148: 253, # '”' + 149: 253, # '•' + 150: 253, # '–' + 151: 253, # '—' + 152: 253, # '˜' + 153: 253, # '™' + 154: 255, # 'š' + 155: 253, # '›' + 156: 255, # 'œ' + 157: 255, # None + 158: 255, # 'ž' + 159: 255, # 'Ÿ' + 160: 251, # '\xa0' + 161: 253, # '¡' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '¤' + 165: 253, # '¥' + 166: 253, # '¦' + 167: 253, # '§' + 168: 253, # '¨' + 169: 253, # '©' + 170: 255, # 'ª' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 253, # '¯' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 253, # '¸' + 185: 252, # '¹' + 186: 255, # 'º' + 187: 253, # '»' + 188: 252, # '¼' + 189: 252, # '½' + 190: 252, # '¾' + 191: 253, # '¿' + 192: 255, # 'À' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ã' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Æ' + 199: 255, # 'Ç' + 200: 255, # 'È' + 201: 255, # 'É' + 202: 255, # 'Ê' + 203: 255, # 'Ë' + 204: 255, # 'Ì' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ï' + 208: 255, # 'Ð' + 209: 255, # 'Ñ' + 210: 255, # 'Ò' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ø' + 217: 255, # 'Ù' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Þ' + 223: 255, # 'ß' + 224: 255, # 'à' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ã' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'æ' + 231: 255, # 'ç' + 232: 255, # 'è' + 233: 255, # 'é' + 234: 255, # 'ê' + 235: 255, # 'ë' + 236: 255, # 'ì' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ï' + 240: 255, # 'ð' + 241: 255, # 'ñ' + 242: 255, # 'ò' + 243: 255, # 'ó' + 
244: 255, # 'ô' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ø' + 249: 255, # 'ù' + 250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'þ' + 255: 255, # 'ÿ' } -WINDOWS_1252_FRENCH_MODEL = SingleByteCharSetModel(charset_name='WINDOWS-1252', - language='French', - char_to_order_map=WINDOWS_1252_FRENCH_CHAR_TO_ORDER, - language_model=FRENCH_LANG_MODEL, - typical_positive_ratio=0.9418355228457032, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÂÇÈÉÊÎÏÙÛàâçèéêîïùûŒœ') - +WINDOWS_1252_FRENCH_MODEL = SingleByteCharSetModel( + charset_name="WINDOWS-1252", + language="French", + char_to_order_map=WINDOWS_1252_FRENCH_CHAR_TO_ORDER, + language_model=FRENCH_LANG_MODEL, + typical_positive_ratio=0.9418355228457032, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÂÇÈÉÊÎÏÙÛàâçèéêîïùûŒœ", +) diff --git a/chardet/langgermanmodel.py b/chardet/langgermanmodel.py index c2fe8787..74dc4315 100644 --- a/chardet/langgermanmodel.py +++ b/chardet/langgermanmodel.py @@ -1,9 +1,7 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -7850,536 +7848,539 @@ # Character Mapping Table(s): ISO_8859_1_GERMAN_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 
33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 29, # 'A' - 66: 31, # 'B' - 67: 64, # 'C' - 68: 32, # 'D' - 69: 43, # 'E' - 70: 44, # 'F' - 71: 42, # 'G' - 72: 46, # 'H' - 73: 51, # 'I' - 74: 54, # 'J' - 75: 39, # 'K' - 76: 52, # 'L' - 77: 36, # 'M' - 78: 55, # 'N' - 79: 67, # 'O' - 80: 41, # 'P' - 81: 83, # 'Q' - 82: 45, # 'R' - 83: 24, # 'S' - 84: 56, # 'T' - 85: 65, # 'U' - 86: 57, # 'V' - 87: 47, # 'W' - 88: 87, # 'X' - 89: 84, # 'Y' - 90: 68, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 8, # 'a' - 98: 17, # 'b' - 99: 14, # 'c' - 100: 9, # 'd' - 101: 2, # 'e' - 102: 18, # 'f' - 103: 15, # 'g' - 104: 10, # 'h' - 105: 4, # 'i' - 106: 69, # 'j' - 107: 19, # 'k' - 108: 12, # 'l' - 109: 16, # 'm' - 110: 3, # 'n' - 111: 13, # 'o' - 112: 25, # 'p' - 113: 79, # 'q' - 114: 5, # 'r' - 115: 7, # 's' - 116: 6, # 't' - 117: 11, # 'u' - 118: 27, # 'v' - 119: 23, # 'w' - 120: 71, # 'x' - 121: 61, # 'y' - 122: 22, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, # '\x91' - 146: 251, # '\x92' - 147: 251, # 
'\x93' - 148: 251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' - 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 253, # '¡' - 162: 253, # '¢' - 163: 253, # '£' - 164: 253, # '¤' - 165: 253, # '¥' - 166: 253, # '¦' - 167: 253, # '§' - 168: 253, # '¨' - 169: 253, # '©' - 170: 255, # 'ª' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 253, # '¯' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 252, # '³' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 253, # '¸' - 185: 252, # '¹' - 186: 255, # 'º' - 187: 253, # '»' - 188: 252, # '¼' - 189: 252, # '½' - 190: 252, # '¾' - 191: 253, # '¿' - 192: 255, # 'À' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ã' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Æ' - 199: 255, # 'Ç' - 200: 255, # 'È' - 201: 255, # 'É' - 202: 255, # 'Ê' - 203: 255, # 'Ë' - 204: 255, # 'Ì' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ï' - 208: 255, # 'Ð' - 209: 255, # 'Ñ' - 210: 255, # 'Ò' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ø' - 217: 255, # 'Ù' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Þ' - 223: 255, # 'ß' - 224: 255, # 'à' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ã' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'æ' - 231: 255, # 'ç' - 232: 255, # 'è' - 233: 255, # 'é' - 234: 255, # 'ê' - 235: 255, # 'ë' - 236: 255, # 'ì' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ï' - 240: 255, # 'ð' - 241: 255, # 'ñ' - 242: 255, # 'ò' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ø' - 249: 255, # 'ù' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'þ' - 255: 255, # 'ÿ' + 0: 
251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' 
+ 64: 253, # '@' + 65: 29, # 'A' + 66: 31, # 'B' + 67: 64, # 'C' + 68: 32, # 'D' + 69: 43, # 'E' + 70: 44, # 'F' + 71: 42, # 'G' + 72: 46, # 'H' + 73: 51, # 'I' + 74: 54, # 'J' + 75: 39, # 'K' + 76: 52, # 'L' + 77: 36, # 'M' + 78: 55, # 'N' + 79: 67, # 'O' + 80: 41, # 'P' + 81: 83, # 'Q' + 82: 45, # 'R' + 83: 24, # 'S' + 84: 56, # 'T' + 85: 65, # 'U' + 86: 57, # 'V' + 87: 47, # 'W' + 88: 87, # 'X' + 89: 84, # 'Y' + 90: 68, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 8, # 'a' + 98: 17, # 'b' + 99: 14, # 'c' + 100: 9, # 'd' + 101: 2, # 'e' + 102: 18, # 'f' + 103: 15, # 'g' + 104: 10, # 'h' + 105: 4, # 'i' + 106: 69, # 'j' + 107: 19, # 'k' + 108: 12, # 'l' + 109: 16, # 'm' + 110: 3, # 'n' + 111: 13, # 'o' + 112: 25, # 'p' + 113: 79, # 'q' + 114: 5, # 'r' + 115: 7, # 's' + 116: 6, # 't' + 117: 11, # 'u' + 118: 27, # 'v' + 119: 23, # 'w' + 120: 71, # 'x' + 121: 61, # 'y' + 122: 22, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # '\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 253, # '¡' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '¤' + 165: 253, # '¥' + 166: 253, # '¦' + 167: 253, # '§' + 168: 253, # '¨' + 169: 253, # '©' + 170: 255, # 'ª' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 
253, # '®' + 175: 253, # '¯' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 253, # '¸' + 185: 252, # '¹' + 186: 255, # 'º' + 187: 253, # '»' + 188: 252, # '¼' + 189: 252, # '½' + 190: 252, # '¾' + 191: 253, # '¿' + 192: 255, # 'À' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ã' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Æ' + 199: 255, # 'Ç' + 200: 255, # 'È' + 201: 255, # 'É' + 202: 255, # 'Ê' + 203: 255, # 'Ë' + 204: 255, # 'Ì' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ï' + 208: 255, # 'Ð' + 209: 255, # 'Ñ' + 210: 255, # 'Ò' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ø' + 217: 255, # 'Ù' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Þ' + 223: 255, # 'ß' + 224: 255, # 'à' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ã' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'æ' + 231: 255, # 'ç' + 232: 255, # 'è' + 233: 255, # 'é' + 234: 255, # 'ê' + 235: 255, # 'ë' + 236: 255, # 'ì' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ï' + 240: 255, # 'ð' + 241: 255, # 'ñ' + 242: 255, # 'ò' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ø' + 249: 255, # 'ù' + 250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'þ' + 255: 255, # 'ÿ' } -ISO_8859_1_GERMAN_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-1', - language='German', - char_to_order_map=ISO_8859_1_GERMAN_CHAR_TO_ORDER, - language_model=GERMAN_LANG_MODEL, - typical_positive_ratio=0.9204238352804265, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÄÖÜßäöü') +ISO_8859_1_GERMAN_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-1", + language="German", + char_to_order_map=ISO_8859_1_GERMAN_CHAR_TO_ORDER, + language_model=GERMAN_LANG_MODEL, + 
typical_positive_ratio=0.9204238352804265, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÄÖÜßäöü", +) WINDOWS_1252_GERMAN_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' 
- 64: 253, # '@' - 65: 29, # 'A' - 66: 31, # 'B' - 67: 64, # 'C' - 68: 32, # 'D' - 69: 43, # 'E' - 70: 44, # 'F' - 71: 42, # 'G' - 72: 46, # 'H' - 73: 51, # 'I' - 74: 54, # 'J' - 75: 39, # 'K' - 76: 52, # 'L' - 77: 36, # 'M' - 78: 55, # 'N' - 79: 67, # 'O' - 80: 41, # 'P' - 81: 83, # 'Q' - 82: 45, # 'R' - 83: 24, # 'S' - 84: 56, # 'T' - 85: 65, # 'U' - 86: 57, # 'V' - 87: 47, # 'W' - 88: 87, # 'X' - 89: 84, # 'Y' - 90: 68, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 8, # 'a' - 98: 17, # 'b' - 99: 14, # 'c' - 100: 9, # 'd' - 101: 2, # 'e' - 102: 18, # 'f' - 103: 15, # 'g' - 104: 10, # 'h' - 105: 4, # 'i' - 106: 69, # 'j' - 107: 19, # 'k' - 108: 12, # 'l' - 109: 16, # 'm' - 110: 3, # 'n' - 111: 13, # 'o' - 112: 25, # 'p' - 113: 79, # 'q' - 114: 5, # 'r' - 115: 7, # 's' - 116: 6, # 't' - 117: 11, # 'u' - 118: 27, # 'v' - 119: 23, # 'w' - 120: 71, # 'x' - 121: 61, # 'y' - 122: 22, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 253, # '€' - 129: 255, # None - 130: 253, # '‚' - 131: 255, # 'ƒ' - 132: 253, # '„' - 133: 253, # '…' - 134: 253, # '†' - 135: 253, # '‡' - 136: 255, # 'ˆ' - 137: 253, # '‰' - 138: 255, # 'Š' - 139: 253, # '‹' - 140: 255, # 'Œ' - 141: 255, # None - 142: 255, # 'Ž' - 143: 255, # None - 144: 255, # None - 145: 253, # '‘' - 146: 253, # '’' - 147: 253, # '“' - 148: 253, # '”' - 149: 253, # '•' - 150: 253, # '–' - 151: 253, # '—' - 152: 253, # '˜' - 153: 253, # '™' - 154: 255, # 'š' - 155: 253, # '›' - 156: 255, # 'œ' - 157: 255, # None - 158: 255, # 'ž' - 159: 255, # 'Ÿ' - 160: 251, # '\xa0' - 161: 253, # '¡' - 162: 253, # '¢' - 163: 253, # '£' - 164: 253, # '¤' - 165: 253, # '¥' - 166: 253, # '¦' - 167: 253, # '§' - 168: 253, # '¨' - 169: 253, # '©' - 170: 255, # 'ª' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 253, # '¯' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 
252, # '³' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 253, # '¸' - 185: 252, # '¹' - 186: 255, # 'º' - 187: 253, # '»' - 188: 252, # '¼' - 189: 252, # '½' - 190: 252, # '¾' - 191: 253, # '¿' - 192: 255, # 'À' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ã' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Æ' - 199: 255, # 'Ç' - 200: 255, # 'È' - 201: 255, # 'É' - 202: 255, # 'Ê' - 203: 255, # 'Ë' - 204: 255, # 'Ì' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ï' - 208: 255, # 'Ð' - 209: 255, # 'Ñ' - 210: 255, # 'Ò' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ø' - 217: 255, # 'Ù' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Þ' - 223: 255, # 'ß' - 224: 255, # 'à' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ã' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'æ' - 231: 255, # 'ç' - 232: 255, # 'è' - 233: 255, # 'é' - 234: 255, # 'ê' - 235: 255, # 'ë' - 236: 255, # 'ì' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ï' - 240: 255, # 'ð' - 241: 255, # 'ñ' - 242: 255, # 'ò' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ø' - 249: 255, # 'ù' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'þ' - 255: 255, # 'ÿ' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # 
'\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 29, # 'A' + 66: 31, # 'B' + 67: 64, # 'C' + 68: 32, # 'D' + 69: 43, # 'E' + 70: 44, # 'F' + 71: 42, # 'G' + 72: 46, # 'H' + 73: 51, # 'I' + 74: 54, # 'J' + 75: 39, # 'K' + 76: 52, # 'L' + 77: 36, # 'M' + 78: 55, # 'N' + 79: 67, # 'O' + 80: 41, # 'P' + 81: 83, # 'Q' + 82: 45, # 'R' + 83: 24, # 'S' + 84: 56, # 'T' + 85: 65, # 'U' + 86: 57, # 'V' + 87: 47, # 'W' + 88: 87, # 'X' + 89: 84, # 'Y' + 90: 68, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 8, # 'a' + 98: 17, # 'b' + 99: 14, # 'c' + 100: 9, # 'd' + 101: 2, # 'e' + 102: 18, # 'f' + 103: 15, # 'g' + 104: 10, # 'h' + 105: 4, # 'i' + 106: 69, # 'j' + 107: 19, # 'k' + 108: 12, # 'l' + 109: 16, # 'm' + 110: 3, # 'n' + 111: 13, # 'o' + 112: 25, # 'p' + 113: 79, # 'q' + 114: 5, # 'r' + 115: 7, # 's' + 116: 6, # 't' + 117: 11, # 'u' + 118: 27, # 'v' + 119: 23, # 'w' + 120: 71, # 'x' + 121: 61, # 'y' + 122: 22, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 253, # '€' + 129: 255, # None + 130: 253, # '‚' + 131: 255, # 'ƒ' + 132: 253, # '„' + 133: 253, # '…' + 134: 253, # '†' + 135: 253, # '‡' + 136: 255, # 'ˆ' + 137: 253, # '‰' + 138: 255, # 'Š' + 139: 253, # '‹' + 140: 255, # 'Œ' + 141: 255, # None + 142: 255, # 'Ž' + 143: 255, # None + 144: 255, # None + 145: 253, # '‘' + 146: 253, # '’' + 147: 253, # '“' + 148: 253, # '”' + 149: 
253, # '•' + 150: 253, # '–' + 151: 253, # '—' + 152: 253, # '˜' + 153: 253, # '™' + 154: 255, # 'š' + 155: 253, # '›' + 156: 255, # 'œ' + 157: 255, # None + 158: 255, # 'ž' + 159: 255, # 'Ÿ' + 160: 251, # '\xa0' + 161: 253, # '¡' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '¤' + 165: 253, # '¥' + 166: 253, # '¦' + 167: 253, # '§' + 168: 253, # '¨' + 169: 253, # '©' + 170: 255, # 'ª' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 253, # '¯' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 253, # '¸' + 185: 252, # '¹' + 186: 255, # 'º' + 187: 253, # '»' + 188: 252, # '¼' + 189: 252, # '½' + 190: 252, # '¾' + 191: 253, # '¿' + 192: 255, # 'À' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ã' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Æ' + 199: 255, # 'Ç' + 200: 255, # 'È' + 201: 255, # 'É' + 202: 255, # 'Ê' + 203: 255, # 'Ë' + 204: 255, # 'Ì' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ï' + 208: 255, # 'Ð' + 209: 255, # 'Ñ' + 210: 255, # 'Ò' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ø' + 217: 255, # 'Ù' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Þ' + 223: 255, # 'ß' + 224: 255, # 'à' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ã' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'æ' + 231: 255, # 'ç' + 232: 255, # 'è' + 233: 255, # 'é' + 234: 255, # 'ê' + 235: 255, # 'ë' + 236: 255, # 'ì' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ï' + 240: 255, # 'ð' + 241: 255, # 'ñ' + 242: 255, # 'ò' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ø' + 249: 255, # 'ù' + 250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'þ' + 255: 255, # 'ÿ' } -WINDOWS_1252_GERMAN_MODEL = 
SingleByteCharSetModel(charset_name='WINDOWS-1252', - language='German', - char_to_order_map=WINDOWS_1252_GERMAN_CHAR_TO_ORDER, - language_model=GERMAN_LANG_MODEL, - typical_positive_ratio=0.9225082237211283, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÄÖÜßäöü') - +WINDOWS_1252_GERMAN_MODEL = SingleByteCharSetModel( + charset_name="WINDOWS-1252", + language="German", + char_to_order_map=WINDOWS_1252_GERMAN_CHAR_TO_ORDER, + language_model=GERMAN_LANG_MODEL, + typical_positive_ratio=0.9225082237211283, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÄÖÜßäöü", +) diff --git a/chardet/langitalianmodel.py b/chardet/langitalianmodel.py index 86525804..32e0f944 100644 --- a/chardet/langitalianmodel.py +++ b/chardet/langitalianmodel.py @@ -1,9 +1,7 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -62768,803 +62766,808 @@ # Character Mapping Table(s): ISO_8859_15_ITALIAN_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' 
- 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 31, # 'A' - 66: 43, # 'B' - 67: 29, # 'C' - 68: 49, # 'D' - 69: 59, # 'E' - 70: 48, # 'F' - 71: 42, # 'G' - 72: 68, # 'H' - 73: 27, # 'I' - 74: 74, # 'J' - 75: 75, # 'K' - 76: 32, # 'L' - 77: 35, # 'M' - 78: 46, # 'N' - 79: 66, # 'O' - 80: 36, # 'P' - 81: 77, # 'Q' - 82: 47, # 'R' - 83: 28, # 'S' - 84: 50, # 'T' - 85: 64, # 'U' - 86: 58, # 'V' - 87: 70, # 'W' - 88: 73, # 'X' - 89: 85, # 'Y' - 90: 84, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 4, # 'a' - 98: 21, # 'b' - 99: 11, # 'c' - 100: 12, # 'd' - 101: 3, # 'e' - 102: 19, # 'f' - 103: 16, # 'g' - 104: 22, # 'h' - 105: 2, # 'i' - 106: 78, # 'j' - 107: 62, # 'k' - 108: 7, # 'l' - 109: 15, # 'm' - 110: 6, # 'n' - 111: 5, # 'o' - 112: 14, # 'p' - 113: 34, # 'q' - 114: 9, # 'r' - 115: 10, # 's' - 116: 8, # 't' - 117: 13, # 'u' - 118: 17, # 'v' - 119: 69, # 'w' - 120: 76, # 'x' - 121: 63, # 'y' - 122: 20, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, # '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' 
- 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 253, # '¡' - 162: 253, # '¢' - 163: 253, # '£' - 164: 253, # '€' - 165: 253, # '¥' - 166: 255, # 'Š' - 167: 253, # '§' - 168: 255, # 'š' - 169: 253, # '©' - 170: 255, # 'ª' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 253, # '¯' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 252, # '³' - 180: 255, # 'Ž' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 255, # 'ž' - 185: 252, # '¹' - 186: 255, # 'º' - 187: 253, # '»' - 188: 255, # 'Œ' - 189: 255, # 'œ' - 190: 255, # 'Ÿ' - 191: 253, # '¿' - 192: 255, # 'À' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ã' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Æ' - 199: 255, # 'Ç' - 200: 255, # 'È' - 201: 255, # 'É' - 202: 255, # 'Ê' - 203: 255, # 'Ë' - 204: 255, # 'Ì' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ï' - 208: 255, # 'Ð' - 209: 255, # 'Ñ' - 210: 255, # 'Ò' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ø' - 217: 255, # 'Ù' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Þ' - 223: 255, # 'ß' - 224: 255, # 'à' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ã' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'æ' - 231: 255, # 'ç' - 232: 255, # 'è' - 233: 255, # 'é' - 234: 255, # 'ê' - 235: 255, # 'ë' - 236: 255, # 'ì' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ï' - 240: 255, # 'ð' - 241: 255, # 'ñ' - 242: 255, # 'ò' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ø' - 249: 255, # 'ù' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'þ' - 255: 255, # 'ÿ' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # 
'\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 31, # 'A' + 66: 43, # 'B' + 67: 29, # 'C' + 68: 49, # 'D' + 69: 59, # 'E' + 70: 48, # 'F' + 71: 42, # 'G' + 72: 68, # 'H' + 73: 27, # 'I' + 74: 74, # 'J' + 75: 75, # 'K' + 76: 32, # 'L' + 77: 35, # 'M' + 78: 46, # 'N' + 79: 66, # 'O' + 80: 36, # 'P' + 81: 77, # 'Q' + 82: 47, # 'R' + 83: 28, # 'S' + 84: 50, # 'T' + 85: 64, # 'U' + 86: 58, # 'V' + 87: 70, # 'W' + 88: 73, # 'X' + 89: 85, # 'Y' + 90: 84, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 4, # 'a' + 98: 21, # 'b' + 99: 11, # 'c' + 100: 12, # 'd' + 101: 3, # 'e' + 102: 19, # 'f' + 103: 16, # 'g' + 104: 22, # 'h' + 105: 2, # 'i' + 106: 78, # 'j' + 107: 62, # 'k' + 108: 7, # 'l' + 109: 15, # 'm' + 110: 6, # 'n' + 111: 5, # 'o' + 112: 14, # 'p' + 113: 34, # 'q' + 114: 9, # 'r' + 115: 10, # 's' + 116: 8, # 't' + 117: 13, # 'u' + 118: 17, # 'v' + 119: 69, # 'w' + 120: 76, # 'x' + 121: 63, # 'y' + 122: 20, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 
251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # '\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 253, # '¡' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '€' + 165: 253, # '¥' + 166: 255, # 'Š' + 167: 253, # '§' + 168: 255, # 'š' + 169: 253, # '©' + 170: 255, # 'ª' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 253, # '¯' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 255, # 'Ž' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 255, # 'ž' + 185: 252, # '¹' + 186: 255, # 'º' + 187: 253, # '»' + 188: 255, # 'Œ' + 189: 255, # 'œ' + 190: 255, # 'Ÿ' + 191: 253, # '¿' + 192: 255, # 'À' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ã' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Æ' + 199: 255, # 'Ç' + 200: 255, # 'È' + 201: 255, # 'É' + 202: 255, # 'Ê' + 203: 255, # 'Ë' + 204: 255, # 'Ì' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ï' + 208: 255, # 'Ð' + 209: 255, # 'Ñ' + 210: 255, # 'Ò' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ø' + 217: 255, # 'Ù' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Þ' + 223: 255, # 'ß' + 224: 255, # 'à' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ã' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'æ' + 231: 255, # 'ç' + 232: 255, # 'è' + 233: 255, # 
'é' + 234: 255, # 'ê' + 235: 255, # 'ë' + 236: 255, # 'ì' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ï' + 240: 255, # 'ð' + 241: 255, # 'ñ' + 242: 255, # 'ò' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ø' + 249: 255, # 'ù' + 250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'þ' + 255: 255, # 'ÿ' } -ISO_8859_15_ITALIAN_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-15', - language='Italian', - char_to_order_map=ISO_8859_15_ITALIAN_CHAR_TO_ORDER, - language_model=ITALIAN_LANG_MODEL, - typical_positive_ratio=0.9621814448642135, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÈÉÌÒÓÙàèéìòóù') +ISO_8859_15_ITALIAN_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-15", + language="Italian", + char_to_order_map=ISO_8859_15_ITALIAN_CHAR_TO_ORDER, + language_model=ITALIAN_LANG_MODEL, + typical_positive_ratio=0.9621814448642135, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÈÉÌÒÓÙàèéìòóù", +) ISO_8859_1_ITALIAN_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' 
- 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 31, # 'A' - 66: 43, # 'B' - 67: 29, # 'C' - 68: 49, # 'D' - 69: 59, # 'E' - 70: 48, # 'F' - 71: 42, # 'G' - 72: 68, # 'H' - 73: 27, # 'I' - 74: 74, # 'J' - 75: 75, # 'K' - 76: 32, # 'L' - 77: 35, # 'M' - 78: 46, # 'N' - 79: 66, # 'O' - 80: 36, # 'P' - 81: 77, # 'Q' - 82: 47, # 'R' - 83: 28, # 'S' - 84: 50, # 'T' - 85: 64, # 'U' - 86: 58, # 'V' - 87: 70, # 'W' - 88: 73, # 'X' - 89: 85, # 'Y' - 90: 84, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 4, # 'a' - 98: 21, # 'b' - 99: 11, # 'c' - 100: 12, # 'd' - 101: 3, # 'e' - 102: 19, # 'f' - 103: 16, # 'g' - 104: 22, # 'h' - 105: 2, # 'i' - 106: 78, # 'j' - 107: 62, # 'k' - 108: 7, # 'l' - 109: 15, # 'm' - 110: 6, # 'n' - 111: 5, # 'o' - 112: 14, # 'p' - 113: 34, # 'q' - 114: 9, # 'r' - 115: 10, # 's' - 116: 8, # 't' - 117: 13, # 'u' - 118: 17, # 'v' - 119: 69, # 'w' - 120: 76, # 'x' - 121: 63, # 'y' - 122: 20, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, # '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 
251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' - 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 253, # '¡' - 162: 253, # '¢' - 163: 253, # '£' - 164: 253, # '¤' - 165: 253, # '¥' - 166: 253, # '¦' - 167: 253, # '§' - 168: 253, # '¨' - 169: 253, # '©' - 170: 255, # 'ª' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 253, # '¯' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 252, # '³' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 253, # '¸' - 185: 252, # '¹' - 186: 255, # 'º' - 187: 253, # '»' - 188: 252, # '¼' - 189: 252, # '½' - 190: 252, # '¾' - 191: 253, # '¿' - 192: 255, # 'À' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ã' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Æ' - 199: 255, # 'Ç' - 200: 255, # 'È' - 201: 255, # 'É' - 202: 255, # 'Ê' - 203: 255, # 'Ë' - 204: 255, # 'Ì' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ï' - 208: 255, # 'Ð' - 209: 255, # 'Ñ' - 210: 255, # 'Ò' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ø' - 217: 255, # 'Ù' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Þ' - 223: 255, # 'ß' - 224: 255, # 'à' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ã' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'æ' - 231: 255, # 'ç' - 232: 255, # 'è' - 233: 255, # 'é' - 234: 255, # 'ê' - 235: 255, # 'ë' - 236: 255, # 'ì' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ï' - 240: 255, # 'ð' - 241: 255, # 'ñ' - 242: 255, # 'ò' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ø' - 249: 255, # 'ù' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'þ' - 255: 255, # 'ÿ' + 0: 251, # '\x00' + 
1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' 
+ 64: 253, # '@' + 65: 31, # 'A' + 66: 43, # 'B' + 67: 29, # 'C' + 68: 49, # 'D' + 69: 59, # 'E' + 70: 48, # 'F' + 71: 42, # 'G' + 72: 68, # 'H' + 73: 27, # 'I' + 74: 74, # 'J' + 75: 75, # 'K' + 76: 32, # 'L' + 77: 35, # 'M' + 78: 46, # 'N' + 79: 66, # 'O' + 80: 36, # 'P' + 81: 77, # 'Q' + 82: 47, # 'R' + 83: 28, # 'S' + 84: 50, # 'T' + 85: 64, # 'U' + 86: 58, # 'V' + 87: 70, # 'W' + 88: 73, # 'X' + 89: 85, # 'Y' + 90: 84, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 4, # 'a' + 98: 21, # 'b' + 99: 11, # 'c' + 100: 12, # 'd' + 101: 3, # 'e' + 102: 19, # 'f' + 103: 16, # 'g' + 104: 22, # 'h' + 105: 2, # 'i' + 106: 78, # 'j' + 107: 62, # 'k' + 108: 7, # 'l' + 109: 15, # 'm' + 110: 6, # 'n' + 111: 5, # 'o' + 112: 14, # 'p' + 113: 34, # 'q' + 114: 9, # 'r' + 115: 10, # 's' + 116: 8, # 't' + 117: 13, # 'u' + 118: 17, # 'v' + 119: 69, # 'w' + 120: 76, # 'x' + 121: 63, # 'y' + 122: 20, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # '\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 253, # '¡' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '¤' + 165: 253, # '¥' + 166: 253, # '¦' + 167: 253, # '§' + 168: 253, # '¨' + 169: 253, # '©' + 170: 255, # 'ª' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 
253, # '®' + 175: 253, # '¯' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 253, # '¸' + 185: 252, # '¹' + 186: 255, # 'º' + 187: 253, # '»' + 188: 252, # '¼' + 189: 252, # '½' + 190: 252, # '¾' + 191: 253, # '¿' + 192: 255, # 'À' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ã' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Æ' + 199: 255, # 'Ç' + 200: 255, # 'È' + 201: 255, # 'É' + 202: 255, # 'Ê' + 203: 255, # 'Ë' + 204: 255, # 'Ì' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ï' + 208: 255, # 'Ð' + 209: 255, # 'Ñ' + 210: 255, # 'Ò' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ø' + 217: 255, # 'Ù' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Þ' + 223: 255, # 'ß' + 224: 255, # 'à' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ã' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'æ' + 231: 255, # 'ç' + 232: 255, # 'è' + 233: 255, # 'é' + 234: 255, # 'ê' + 235: 255, # 'ë' + 236: 255, # 'ì' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ï' + 240: 255, # 'ð' + 241: 255, # 'ñ' + 242: 255, # 'ò' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ø' + 249: 255, # 'ù' + 250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'þ' + 255: 255, # 'ÿ' } -ISO_8859_1_ITALIAN_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-1', - language='Italian', - char_to_order_map=ISO_8859_1_ITALIAN_CHAR_TO_ORDER, - language_model=ITALIAN_LANG_MODEL, - typical_positive_ratio=0.9621814448642135, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÈÉÌÒÓÙàèéìòóù') +ISO_8859_1_ITALIAN_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-1", + language="Italian", + char_to_order_map=ISO_8859_1_ITALIAN_CHAR_TO_ORDER, + 
language_model=ITALIAN_LANG_MODEL, + typical_positive_ratio=0.9621814448642135, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÈÉÌÒÓÙàèéìòóù", +) WINDOWS_1252_ITALIAN_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' 
- 64: 253, # '@' - 65: 31, # 'A' - 66: 43, # 'B' - 67: 29, # 'C' - 68: 49, # 'D' - 69: 59, # 'E' - 70: 48, # 'F' - 71: 42, # 'G' - 72: 68, # 'H' - 73: 27, # 'I' - 74: 74, # 'J' - 75: 75, # 'K' - 76: 32, # 'L' - 77: 35, # 'M' - 78: 46, # 'N' - 79: 66, # 'O' - 80: 36, # 'P' - 81: 77, # 'Q' - 82: 47, # 'R' - 83: 28, # 'S' - 84: 50, # 'T' - 85: 64, # 'U' - 86: 58, # 'V' - 87: 70, # 'W' - 88: 73, # 'X' - 89: 85, # 'Y' - 90: 84, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 4, # 'a' - 98: 21, # 'b' - 99: 11, # 'c' - 100: 12, # 'd' - 101: 3, # 'e' - 102: 19, # 'f' - 103: 16, # 'g' - 104: 22, # 'h' - 105: 2, # 'i' - 106: 78, # 'j' - 107: 62, # 'k' - 108: 7, # 'l' - 109: 15, # 'm' - 110: 6, # 'n' - 111: 5, # 'o' - 112: 14, # 'p' - 113: 34, # 'q' - 114: 9, # 'r' - 115: 10, # 's' - 116: 8, # 't' - 117: 13, # 'u' - 118: 17, # 'v' - 119: 69, # 'w' - 120: 76, # 'x' - 121: 63, # 'y' - 122: 20, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 253, # '€' - 129: 255, # None - 130: 253, # '‚' - 131: 255, # 'ƒ' - 132: 253, # '„' - 133: 253, # '…' - 134: 253, # '†' - 135: 253, # '‡' - 136: 255, # 'ˆ' - 137: 253, # '‰' - 138: 255, # 'Š' - 139: 253, # '‹' - 140: 255, # 'Œ' - 141: 255, # None - 142: 255, # 'Ž' - 143: 255, # None - 144: 255, # None - 145: 253, # '‘' - 146: 253, # '’' - 147: 253, # '“' - 148: 253, # '”' - 149: 253, # '•' - 150: 253, # '–' - 151: 253, # '—' - 152: 253, # '˜' - 153: 253, # '™' - 154: 255, # 'š' - 155: 253, # '›' - 156: 255, # 'œ' - 157: 255, # None - 158: 255, # 'ž' - 159: 255, # 'Ÿ' - 160: 251, # '\xa0' - 161: 253, # '¡' - 162: 253, # '¢' - 163: 253, # '£' - 164: 253, # '¤' - 165: 253, # '¥' - 166: 253, # '¦' - 167: 253, # '§' - 168: 253, # '¨' - 169: 253, # '©' - 170: 255, # 'ª' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 253, # '¯' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 
252, # '³' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 253, # '¸' - 185: 252, # '¹' - 186: 255, # 'º' - 187: 253, # '»' - 188: 252, # '¼' - 189: 252, # '½' - 190: 252, # '¾' - 191: 253, # '¿' - 192: 255, # 'À' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ã' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Æ' - 199: 255, # 'Ç' - 200: 255, # 'È' - 201: 255, # 'É' - 202: 255, # 'Ê' - 203: 255, # 'Ë' - 204: 255, # 'Ì' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ï' - 208: 255, # 'Ð' - 209: 255, # 'Ñ' - 210: 255, # 'Ò' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ø' - 217: 255, # 'Ù' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Þ' - 223: 255, # 'ß' - 224: 255, # 'à' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ã' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'æ' - 231: 255, # 'ç' - 232: 255, # 'è' - 233: 255, # 'é' - 234: 255, # 'ê' - 235: 255, # 'ë' - 236: 255, # 'ì' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ï' - 240: 255, # 'ð' - 241: 255, # 'ñ' - 242: 255, # 'ò' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ø' - 249: 255, # 'ù' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'þ' - 255: 255, # 'ÿ' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # 
'\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 31, # 'A' + 66: 43, # 'B' + 67: 29, # 'C' + 68: 49, # 'D' + 69: 59, # 'E' + 70: 48, # 'F' + 71: 42, # 'G' + 72: 68, # 'H' + 73: 27, # 'I' + 74: 74, # 'J' + 75: 75, # 'K' + 76: 32, # 'L' + 77: 35, # 'M' + 78: 46, # 'N' + 79: 66, # 'O' + 80: 36, # 'P' + 81: 77, # 'Q' + 82: 47, # 'R' + 83: 28, # 'S' + 84: 50, # 'T' + 85: 64, # 'U' + 86: 58, # 'V' + 87: 70, # 'W' + 88: 73, # 'X' + 89: 85, # 'Y' + 90: 84, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 4, # 'a' + 98: 21, # 'b' + 99: 11, # 'c' + 100: 12, # 'd' + 101: 3, # 'e' + 102: 19, # 'f' + 103: 16, # 'g' + 104: 22, # 'h' + 105: 2, # 'i' + 106: 78, # 'j' + 107: 62, # 'k' + 108: 7, # 'l' + 109: 15, # 'm' + 110: 6, # 'n' + 111: 5, # 'o' + 112: 14, # 'p' + 113: 34, # 'q' + 114: 9, # 'r' + 115: 10, # 's' + 116: 8, # 't' + 117: 13, # 'u' + 118: 17, # 'v' + 119: 69, # 'w' + 120: 76, # 'x' + 121: 63, # 'y' + 122: 20, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 253, # '€' + 129: 255, # None + 130: 253, # '‚' + 131: 255, # 'ƒ' + 132: 253, # '„' + 133: 253, # '…' + 134: 253, # '†' + 135: 253, # '‡' + 136: 255, # 'ˆ' + 137: 253, # '‰' + 138: 255, # 'Š' + 139: 253, # '‹' + 140: 255, # 'Œ' + 141: 255, # None + 142: 255, # 'Ž' + 143: 255, # None + 144: 255, # None + 145: 253, # '‘' + 146: 253, # '’' + 147: 253, # '“' + 148: 253, # '”' + 149: 
253, # '•' + 150: 253, # '–' + 151: 253, # '—' + 152: 253, # '˜' + 153: 253, # '™' + 154: 255, # 'š' + 155: 253, # '›' + 156: 255, # 'œ' + 157: 255, # None + 158: 255, # 'ž' + 159: 255, # 'Ÿ' + 160: 251, # '\xa0' + 161: 253, # '¡' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '¤' + 165: 253, # '¥' + 166: 253, # '¦' + 167: 253, # '§' + 168: 253, # '¨' + 169: 253, # '©' + 170: 255, # 'ª' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 253, # '¯' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 253, # '¸' + 185: 252, # '¹' + 186: 255, # 'º' + 187: 253, # '»' + 188: 252, # '¼' + 189: 252, # '½' + 190: 252, # '¾' + 191: 253, # '¿' + 192: 255, # 'À' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ã' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Æ' + 199: 255, # 'Ç' + 200: 255, # 'È' + 201: 255, # 'É' + 202: 255, # 'Ê' + 203: 255, # 'Ë' + 204: 255, # 'Ì' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ï' + 208: 255, # 'Ð' + 209: 255, # 'Ñ' + 210: 255, # 'Ò' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ø' + 217: 255, # 'Ù' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Þ' + 223: 255, # 'ß' + 224: 255, # 'à' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ã' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'æ' + 231: 255, # 'ç' + 232: 255, # 'è' + 233: 255, # 'é' + 234: 255, # 'ê' + 235: 255, # 'ë' + 236: 255, # 'ì' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ï' + 240: 255, # 'ð' + 241: 255, # 'ñ' + 242: 255, # 'ò' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ø' + 249: 255, # 'ù' + 250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'þ' + 255: 255, # 'ÿ' } -WINDOWS_1252_ITALIAN_MODEL = 
SingleByteCharSetModel(charset_name='WINDOWS-1252', - language='Italian', - char_to_order_map=WINDOWS_1252_ITALIAN_CHAR_TO_ORDER, - language_model=ITALIAN_LANG_MODEL, - typical_positive_ratio=0.9624467183774188, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÈÉÌÒÓÙàèéìòóù') - +WINDOWS_1252_ITALIAN_MODEL = SingleByteCharSetModel( + charset_name="WINDOWS-1252", + language="Italian", + char_to_order_map=WINDOWS_1252_ITALIAN_CHAR_TO_ORDER, + language_model=ITALIAN_LANG_MODEL, + typical_positive_ratio=0.9624467183774188, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÈÉÌÒÓÙàèéìòóù", +) diff --git a/chardet/langlatvianmodel.py b/chardet/langlatvianmodel.py index c8fc24cc..80f55f27 100644 --- a/chardet/langlatvianmodel.py +++ b/chardet/langlatvianmodel.py @@ -1,9 +1,7 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -13358,803 +13356,808 @@ # Character Mapping Table(s): ISO_8859_13_LATVIAN_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' 
- 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, # '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 
156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' - 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 253, # '”' - 162: 253, # '¢' - 163: 253, # '£' - 164: 253, # '¤' - 165: 253, # '„' - 166: 253, # '¦' - 167: 253, # '§' - 168: 255, # 'Ø' - 169: 253, # '©' - 170: 255, # 'Ŗ' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 255, # 'Æ' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 252, # '³' - 180: 253, # '“' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 255, # 'ø' - 185: 252, # '¹' - 186: 255, # 'ŗ' - 187: 253, # '»' - 188: 252, # '¼' - 189: 252, # '½' - 190: 252, # '¾' - 191: 255, # 'æ' - 192: 255, # 'Ą' - 193: 255, # 'Į' - 194: 255, # 'Ā' - 195: 255, # 'Ć' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Ę' - 199: 255, # 'Ē' - 200: 255, # 'Č' - 201: 255, # 'É' - 202: 255, # 'Ź' - 203: 255, # 'Ė' - 204: 255, # 'Ģ' - 205: 255, # 'Ķ' - 206: 255, # 'Ī' - 207: 255, # 'Ļ' - 208: 255, # 'Š' - 209: 255, # 'Ń' - 210: 255, # 'Ņ' - 211: 255, # 'Ó' - 212: 255, # 'Ō' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ų' - 217: 255, # 'Ł' - 218: 255, # 'Ś' - 219: 255, # 'Ū' - 220: 255, # 'Ü' - 221: 255, # 'Ż' - 222: 255, # 'Ž' - 223: 255, # 'ß' - 224: 255, # 'ą' - 225: 255, # 'į' - 226: 255, # 'ā' - 227: 255, # 'ć' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'ę' - 231: 255, # 'ē' - 232: 255, # 'č' - 233: 255, # 'é' - 234: 255, # 'ź' - 235: 255, # 'ė' - 236: 255, # 'ģ' - 237: 255, # 'ķ' - 238: 255, # 'ī' - 239: 255, # 'ļ' - 240: 255, # 'š' - 241: 255, # 'ń' - 242: 255, # 'ņ' - 243: 255, # 'ó' - 244: 255, # 'ō' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ų' - 249: 255, # 'ł' - 250: 255, # 'ś' - 251: 255, # 'ū' - 252: 255, # 'ü' - 253: 255, # 'ż' - 254: 255, # 'ž' - 255: 253, # '’' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # 
'\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 
122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # '\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 253, # '”' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '¤' + 165: 253, # '„' + 166: 253, # '¦' + 167: 253, # '§' + 168: 255, # 'Ø' + 169: 253, # '©' + 170: 255, # 'Ŗ' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 255, # 'Æ' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 253, # '“' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 255, # 'ø' + 185: 252, # '¹' + 186: 255, # 'ŗ' + 187: 253, # '»' + 188: 252, # '¼' + 189: 252, # '½' + 190: 252, # '¾' + 191: 255, # 'æ' + 192: 255, # 'Ą' + 193: 255, # 'Į' + 194: 255, # 'Ā' + 195: 255, # 'Ć' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Ę' + 199: 255, # 'Ē' + 200: 255, # 'Č' + 201: 255, # 'É' + 202: 255, # 'Ź' + 203: 255, # 'Ė' + 204: 255, # 'Ģ' + 205: 255, # 'Ķ' + 206: 255, # 'Ī' + 207: 255, # 'Ļ' + 208: 255, # 'Š' + 209: 255, # 'Ń' + 210: 255, # 'Ņ' + 211: 255, # 'Ó' + 212: 255, # 'Ō' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ų' + 217: 255, # 'Ł' + 218: 255, # 'Ś' + 219: 255, # 'Ū' + 220: 255, # 'Ü' + 221: 255, # 'Ż' + 222: 255, # 'Ž' + 223: 255, # 'ß' + 224: 255, # 'ą' + 225: 255, # 'į' + 226: 255, # 'ā' + 227: 
255, # 'ć' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'ę' + 231: 255, # 'ē' + 232: 255, # 'č' + 233: 255, # 'é' + 234: 255, # 'ź' + 235: 255, # 'ė' + 236: 255, # 'ģ' + 237: 255, # 'ķ' + 238: 255, # 'ī' + 239: 255, # 'ļ' + 240: 255, # 'š' + 241: 255, # 'ń' + 242: 255, # 'ņ' + 243: 255, # 'ó' + 244: 255, # 'ō' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ų' + 249: 255, # 'ł' + 250: 255, # 'ś' + 251: 255, # 'ū' + 252: 255, # 'ü' + 253: 255, # 'ż' + 254: 255, # 'ž' + 255: 253, # '’' } -ISO_8859_13_LATVIAN_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-13', - language='Latvian', - char_to_order_map=ISO_8859_13_LATVIAN_CHAR_TO_ORDER, - language_model=LATVIAN_LANG_MODEL, - typical_positive_ratio=0.9488695969997288, - keep_ascii_letters=False, - alphabet='ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzĀāČčĒēĢģĪīĶķĻļŅņŠšŪūŽž') +ISO_8859_13_LATVIAN_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-13", + language="Latvian", + char_to_order_map=ISO_8859_13_LATVIAN_CHAR_TO_ORDER, + language_model=LATVIAN_LANG_MODEL, + typical_positive_ratio=0.9488695969997288, + keep_ascii_letters=False, + alphabet="ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzĀāČčĒēĢģĪīĶķĻļŅņŠšŪūŽž", +) WINDOWS_1257_LATVIAN_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' 
- 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 253, # '€' - 129: 255, # None - 130: 253, # '‚' - 131: 255, # None - 132: 253, # '„' - 133: 253, # '…' - 134: 253, # '†' - 135: 253, # '‡' - 136: 255, # None - 137: 253, # '‰' - 138: 255, # None - 139: 253, # '‹' - 140: 255, # None - 141: 253, # '¨' - 142: 255, # 'ˇ' - 143: 253, # '¸' - 144: 255, # None - 145: 253, # '‘' - 146: 253, # '’' - 147: 253, # '“' - 
148: 253, # '”' - 149: 253, # '•' - 150: 253, # '–' - 151: 253, # '—' - 152: 255, # None - 153: 253, # '™' - 154: 255, # None - 155: 253, # '›' - 156: 255, # None - 157: 253, # '¯' - 158: 253, # '˛' - 159: 255, # None - 160: 251, # '\xa0' - 161: 255, # None - 162: 253, # '¢' - 163: 253, # '£' - 164: 253, # '¤' - 165: 255, # None - 166: 253, # '¦' - 167: 253, # '§' - 168: 255, # 'Ø' - 169: 253, # '©' - 170: 255, # 'Ŗ' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 255, # 'Æ' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 252, # '³' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 255, # 'ø' - 185: 252, # '¹' - 186: 255, # 'ŗ' - 187: 253, # '»' - 188: 252, # '¼' - 189: 252, # '½' - 190: 252, # '¾' - 191: 255, # 'æ' - 192: 255, # 'Ą' - 193: 255, # 'Į' - 194: 255, # 'Ā' - 195: 255, # 'Ć' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Ę' - 199: 255, # 'Ē' - 200: 255, # 'Č' - 201: 255, # 'É' - 202: 255, # 'Ź' - 203: 255, # 'Ė' - 204: 255, # 'Ģ' - 205: 255, # 'Ķ' - 206: 255, # 'Ī' - 207: 255, # 'Ļ' - 208: 255, # 'Š' - 209: 255, # 'Ń' - 210: 255, # 'Ņ' - 211: 255, # 'Ó' - 212: 255, # 'Ō' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ų' - 217: 255, # 'Ł' - 218: 255, # 'Ś' - 219: 255, # 'Ū' - 220: 255, # 'Ü' - 221: 255, # 'Ż' - 222: 255, # 'Ž' - 223: 255, # 'ß' - 224: 255, # 'ą' - 225: 255, # 'į' - 226: 255, # 'ā' - 227: 255, # 'ć' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'ę' - 231: 255, # 'ē' - 232: 255, # 'č' - 233: 255, # 'é' - 234: 255, # 'ź' - 235: 255, # 'ė' - 236: 255, # 'ģ' - 237: 255, # 'ķ' - 238: 255, # 'ī' - 239: 255, # 'ļ' - 240: 255, # 'š' - 241: 255, # 'ń' - 242: 255, # 'ņ' - 243: 255, # 'ó' - 244: 255, # 'ō' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ų' - 249: 255, # 'ł' - 250: 255, # 'ś' - 251: 255, # 'ū' - 252: 255, # 'ü' - 253: 255, # 'ż' - 254: 255, # 'ž' - 255: 253, # '˙' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 
251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' 
+ 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 253, # '€' + 129: 255, # None + 130: 253, # '‚' + 131: 255, # None + 132: 253, # '„' + 133: 253, # '…' + 134: 253, # '†' + 135: 253, # '‡' + 136: 255, # None + 137: 253, # '‰' + 138: 255, # None + 139: 253, # '‹' + 140: 255, # None + 141: 253, # '¨' + 142: 255, # 'ˇ' + 143: 253, # '¸' + 144: 255, # None + 145: 253, # '‘' + 146: 253, # '’' + 147: 253, # '“' + 148: 253, # '”' + 149: 253, # '•' + 150: 253, # '–' + 151: 253, # '—' + 152: 255, # None + 153: 253, # '™' + 154: 255, # None + 155: 253, # '›' + 156: 255, # None + 157: 253, # '¯' + 158: 253, # '˛' + 159: 255, # None + 160: 251, # '\xa0' + 161: 255, # None + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '¤' + 165: 255, # None + 166: 253, # '¦' + 167: 253, # '§' + 168: 255, # 'Ø' + 169: 253, # '©' + 170: 255, # 'Ŗ' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 255, # 
'Æ' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 255, # 'ø' + 185: 252, # '¹' + 186: 255, # 'ŗ' + 187: 253, # '»' + 188: 252, # '¼' + 189: 252, # '½' + 190: 252, # '¾' + 191: 255, # 'æ' + 192: 255, # 'Ą' + 193: 255, # 'Į' + 194: 255, # 'Ā' + 195: 255, # 'Ć' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Ę' + 199: 255, # 'Ē' + 200: 255, # 'Č' + 201: 255, # 'É' + 202: 255, # 'Ź' + 203: 255, # 'Ė' + 204: 255, # 'Ģ' + 205: 255, # 'Ķ' + 206: 255, # 'Ī' + 207: 255, # 'Ļ' + 208: 255, # 'Š' + 209: 255, # 'Ń' + 210: 255, # 'Ņ' + 211: 255, # 'Ó' + 212: 255, # 'Ō' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ų' + 217: 255, # 'Ł' + 218: 255, # 'Ś' + 219: 255, # 'Ū' + 220: 255, # 'Ü' + 221: 255, # 'Ż' + 222: 255, # 'Ž' + 223: 255, # 'ß' + 224: 255, # 'ą' + 225: 255, # 'į' + 226: 255, # 'ā' + 227: 255, # 'ć' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'ę' + 231: 255, # 'ē' + 232: 255, # 'č' + 233: 255, # 'é' + 234: 255, # 'ź' + 235: 255, # 'ė' + 236: 255, # 'ģ' + 237: 255, # 'ķ' + 238: 255, # 'ī' + 239: 255, # 'ļ' + 240: 255, # 'š' + 241: 255, # 'ń' + 242: 255, # 'ņ' + 243: 255, # 'ó' + 244: 255, # 'ō' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ų' + 249: 255, # 'ł' + 250: 255, # 'ś' + 251: 255, # 'ū' + 252: 255, # 'ü' + 253: 255, # 'ż' + 254: 255, # 'ž' + 255: 253, # '˙' } -WINDOWS_1257_LATVIAN_MODEL = SingleByteCharSetModel(charset_name='WINDOWS-1257', - language='Latvian', - char_to_order_map=WINDOWS_1257_LATVIAN_CHAR_TO_ORDER, - language_model=LATVIAN_LANG_MODEL, - typical_positive_ratio=0.964335016785684, - keep_ascii_letters=False, - alphabet='ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzĀāČčĒēĢģĪīĶķĻļŅņŠšŪūŽž') +WINDOWS_1257_LATVIAN_MODEL = SingleByteCharSetModel( + charset_name="WINDOWS-1257", + language="Latvian", + char_to_order_map=WINDOWS_1257_LATVIAN_CHAR_TO_ORDER, + language_model=LATVIAN_LANG_MODEL, + 
typical_positive_ratio=0.964335016785684, + keep_ascii_letters=False, + alphabet="ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzĀāČčĒēĢģĪīĶķĻļŅņŠšŪūŽž", +) ISO_8859_4_LATVIAN_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' 
- 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, # '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' - 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 255, # 'Ą' - 162: 255, # 'ĸ' - 163: 255, # 'Ŗ' - 164: 253, # '¤' - 165: 255, # 'Ĩ' - 166: 255, # 'Ļ' - 167: 253, # '§' - 168: 253, # '¨' - 169: 255, # 'Š' - 170: 255, # 'Ē' - 171: 
255, # 'Ģ' - 172: 255, # 'Ŧ' - 173: 251, # '\xad' - 174: 255, # 'Ž' - 175: 253, # '¯' - 176: 253, # '°' - 177: 255, # 'ą' - 178: 253, # '˛' - 179: 255, # 'ŗ' - 180: 253, # '´' - 181: 255, # 'ĩ' - 182: 255, # 'ļ' - 183: 255, # 'ˇ' - 184: 253, # '¸' - 185: 255, # 'š' - 186: 255, # 'ē' - 187: 255, # 'ģ' - 188: 255, # 'ŧ' - 189: 255, # 'Ŋ' - 190: 255, # 'ž' - 191: 255, # 'ŋ' - 192: 255, # 'Ā' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ã' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Æ' - 199: 255, # 'Į' - 200: 255, # 'Č' - 201: 255, # 'É' - 202: 255, # 'Ę' - 203: 255, # 'Ë' - 204: 255, # 'Ė' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ī' - 208: 255, # 'Đ' - 209: 255, # 'Ņ' - 210: 255, # 'Ō' - 211: 255, # 'Ķ' - 212: 255, # 'Ô' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ø' - 217: 255, # 'Ų' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'Ũ' - 222: 255, # 'Ū' - 223: 255, # 'ß' - 224: 255, # 'ā' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ã' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'æ' - 231: 255, # 'į' - 232: 255, # 'č' - 233: 255, # 'é' - 234: 255, # 'ę' - 235: 255, # 'ë' - 236: 255, # 'ė' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ī' - 240: 255, # 'đ' - 241: 255, # 'ņ' - 242: 255, # 'ō' - 243: 255, # 'ķ' - 244: 255, # 'ô' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ø' - 249: 255, # 'ų' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ũ' - 254: 255, # 'ū' - 255: 253, # '˙' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 
251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # 
'\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 255, # 'Ą' + 162: 255, # 'ĸ' + 163: 255, # 'Ŗ' + 164: 253, # '¤' + 165: 255, # 'Ĩ' + 166: 255, # 'Ļ' + 167: 253, # '§' + 168: 253, # '¨' + 169: 255, # 'Š' + 170: 255, # 'Ē' + 171: 255, # 'Ģ' + 172: 255, # 'Ŧ' + 173: 251, # '\xad' + 174: 255, # 'Ž' + 175: 253, # '¯' + 176: 253, # '°' + 177: 255, # 'ą' + 178: 253, # '˛' + 179: 255, # 'ŗ' + 180: 253, # '´' + 181: 255, # 'ĩ' + 182: 255, # 'ļ' + 183: 255, # 'ˇ' + 184: 253, # '¸' + 185: 255, # 'š' + 186: 255, # 'ē' + 187: 255, # 'ģ' + 188: 255, # 'ŧ' + 189: 255, # 'Ŋ' + 190: 255, # 'ž' + 191: 255, # 'ŋ' + 192: 255, # 'Ā' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ã' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Æ' + 199: 255, # 'Į' + 200: 255, # 'Č' + 201: 255, # 'É' + 202: 255, # 'Ę' + 203: 255, # 'Ë' + 204: 255, # 'Ė' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ī' + 208: 255, # 'Đ' + 209: 255, # 'Ņ' + 210: 255, # 'Ō' + 211: 255, # 'Ķ' + 212: 255, # 'Ô' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ø' + 217: 255, # 'Ų' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'Ũ' + 222: 255, # 'Ū' + 223: 255, # 'ß' + 224: 255, # 'ā' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ã' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'æ' + 231: 255, # 'į' + 232: 255, # 'č' + 233: 255, # 'é' + 234: 255, # 'ę' + 235: 255, # 'ë' + 236: 255, # 'ė' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ī' + 240: 255, # 'đ' + 241: 255, # 'ņ' + 242: 255, # 'ō' + 243: 
255, # 'ķ' + 244: 255, # 'ô' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ø' + 249: 255, # 'ų' + 250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ũ' + 254: 255, # 'ū' + 255: 253, # '˙' } -ISO_8859_4_LATVIAN_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-4', - language='Latvian', - char_to_order_map=ISO_8859_4_LATVIAN_CHAR_TO_ORDER, - language_model=LATVIAN_LANG_MODEL, - typical_positive_ratio=0.9472937246956014, - keep_ascii_letters=False, - alphabet='ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzĀāČčĒēĢģĪīĶķĻļŅņŠšŪūŽž') - +ISO_8859_4_LATVIAN_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-4", + language="Latvian", + char_to_order_map=ISO_8859_4_LATVIAN_CHAR_TO_ORDER, + language_model=LATVIAN_LANG_MODEL, + typical_positive_ratio=0.9472937246956014, + keep_ascii_letters=False, + alphabet="ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzĀāČčĒēĢģĪīĶķĻļŅņŠšŪūŽž", +) diff --git a/chardet/langlithuanianmodel.py b/chardet/langlithuanianmodel.py index 975b62be..648a5bb9 100644 --- a/chardet/langlithuanianmodel.py +++ b/chardet/langlithuanianmodel.py @@ -1,9 +1,7 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -62768,803 +62766,808 @@ # Character Mapping Table(s): ISO_8859_13_LITHUANIAN_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 
32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 
144: 251, # '\x90' - 145: 251, # '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' - 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 253, # '”' - 162: 253, # '¢' - 163: 253, # '£' - 164: 253, # '¤' - 165: 253, # '„' - 166: 253, # '¦' - 167: 253, # '§' - 168: 255, # 'Ø' - 169: 253, # '©' - 170: 255, # 'Ŗ' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 255, # 'Æ' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 252, # '³' - 180: 253, # '“' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 255, # 'ø' - 185: 252, # '¹' - 186: 255, # 'ŗ' - 187: 253, # '»' - 188: 252, # '¼' - 189: 252, # '½' - 190: 252, # '¾' - 191: 255, # 'æ' - 192: 255, # 'Ą' - 193: 255, # 'Į' - 194: 255, # 'Ā' - 195: 255, # 'Ć' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Ę' - 199: 255, # 'Ē' - 200: 255, # 'Č' - 201: 255, # 'É' - 202: 255, # 'Ź' - 203: 255, # 'Ė' - 204: 255, # 'Ģ' - 205: 255, # 'Ķ' - 206: 255, # 'Ī' - 207: 255, # 'Ļ' - 208: 255, # 'Š' - 209: 255, # 'Ń' - 210: 255, # 'Ņ' - 211: 255, # 'Ó' - 212: 255, # 'Ō' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ų' - 217: 255, # 'Ł' - 218: 255, # 'Ś' - 219: 255, # 'Ū' - 220: 255, # 'Ü' - 221: 255, # 'Ż' - 222: 255, # 'Ž' - 223: 255, # 'ß' - 224: 255, # 'ą' - 225: 255, # 'į' - 226: 255, # 'ā' - 227: 255, # 'ć' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'ę' - 231: 255, # 'ē' - 232: 255, # 'č' - 233: 255, # 'é' - 234: 255, # 'ź' - 235: 255, # 'ė' - 236: 255, # 'ģ' - 237: 255, # 'ķ' - 238: 255, # 'ī' - 239: 255, # 'ļ' - 240: 255, # 'š' - 241: 255, # 'ń' - 242: 255, # 'ņ' - 243: 255, # 'ó' - 244: 255, # 'ō' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ų' - 249: 255, # 'ł' - 250: 255, # 'ś' - 251: 255, # 'ū' - 
252: 255, # 'ü' - 253: 255, # 'ż' - 254: 255, # 'ž' - 255: 253, # '’' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' 
+ 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # '\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 253, # '”' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '¤' + 165: 253, # '„' + 166: 253, # '¦' + 167: 253, # '§' + 168: 255, # 'Ø' + 169: 253, # '©' + 170: 255, # 'Ŗ' + 171: 
253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 255, # 'Æ' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 253, # '“' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 255, # 'ø' + 185: 252, # '¹' + 186: 255, # 'ŗ' + 187: 253, # '»' + 188: 252, # '¼' + 189: 252, # '½' + 190: 252, # '¾' + 191: 255, # 'æ' + 192: 255, # 'Ą' + 193: 255, # 'Į' + 194: 255, # 'Ā' + 195: 255, # 'Ć' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Ę' + 199: 255, # 'Ē' + 200: 255, # 'Č' + 201: 255, # 'É' + 202: 255, # 'Ź' + 203: 255, # 'Ė' + 204: 255, # 'Ģ' + 205: 255, # 'Ķ' + 206: 255, # 'Ī' + 207: 255, # 'Ļ' + 208: 255, # 'Š' + 209: 255, # 'Ń' + 210: 255, # 'Ņ' + 211: 255, # 'Ó' + 212: 255, # 'Ō' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ų' + 217: 255, # 'Ł' + 218: 255, # 'Ś' + 219: 255, # 'Ū' + 220: 255, # 'Ü' + 221: 255, # 'Ż' + 222: 255, # 'Ž' + 223: 255, # 'ß' + 224: 255, # 'ą' + 225: 255, # 'į' + 226: 255, # 'ā' + 227: 255, # 'ć' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'ę' + 231: 255, # 'ē' + 232: 255, # 'č' + 233: 255, # 'é' + 234: 255, # 'ź' + 235: 255, # 'ė' + 236: 255, # 'ģ' + 237: 255, # 'ķ' + 238: 255, # 'ī' + 239: 255, # 'ļ' + 240: 255, # 'š' + 241: 255, # 'ń' + 242: 255, # 'ņ' + 243: 255, # 'ó' + 244: 255, # 'ō' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ų' + 249: 255, # 'ł' + 250: 255, # 'ś' + 251: 255, # 'ū' + 252: 255, # 'ü' + 253: 255, # 'ż' + 254: 255, # 'ž' + 255: 253, # '’' } -ISO_8859_13_LITHUANIAN_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-13', - language='Lithuanian', - char_to_order_map=ISO_8859_13_LITHUANIAN_CHAR_TO_ORDER, - language_model=LITHUANIAN_LANG_MODEL, - typical_positive_ratio=0.9533930848699251, - keep_ascii_letters=False, - alphabet='ABCDEFGHIJKLMNOPRSTUVYZabcdefghijklmnoprstuvyzĄąČčĖėĘęĮįŠšŪūŲųŽž') +ISO_8859_13_LITHUANIAN_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-13", + language="Lithuanian", + 
char_to_order_map=ISO_8859_13_LITHUANIAN_CHAR_TO_ORDER, + language_model=LITHUANIAN_LANG_MODEL, + typical_positive_ratio=0.9533930848699251, + keep_ascii_letters=False, + alphabet="ABCDEFGHIJKLMNOPRSTUVYZabcdefghijklmnoprstuvyzĄąČčĖėĘęĮįŠšŪūŲųŽž", +) WINDOWS_1257_LITHUANIAN_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' 
- 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 253, # '€' - 129: 255, # None - 130: 253, # '‚' - 131: 255, # None - 132: 253, # '„' - 133: 253, # '…' - 134: 253, # '†' - 135: 253, # '‡' - 136: 255, # None - 137: 253, # '‰' - 138: 255, # None - 139: 253, # '‹' - 140: 255, # None - 141: 253, # '¨' - 142: 255, # 'ˇ' - 143: 253, # '¸' - 144: 255, # None - 145: 253, # '‘' - 146: 253, # '’' - 147: 253, # '“' - 148: 253, # '”' - 149: 253, # '•' - 150: 253, # '–' - 151: 253, # '—' - 152: 255, # None - 153: 253, # '™' - 154: 255, # None - 155: 253, # '›' - 156: 255, # None - 157: 253, # '¯' - 158: 253, # '˛' - 159: 255, # None - 160: 251, # '\xa0' - 161: 255, # None - 162: 253, # '¢' - 163: 253, # '£' - 164: 253, # '¤' - 165: 255, # None - 166: 253, # '¦' - 167: 253, # '§' - 168: 255, # 'Ø' - 169: 253, # '©' - 170: 255, # 'Ŗ' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 255, # 
'Æ' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 252, # '³' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 255, # 'ø' - 185: 252, # '¹' - 186: 255, # 'ŗ' - 187: 253, # '»' - 188: 252, # '¼' - 189: 252, # '½' - 190: 252, # '¾' - 191: 255, # 'æ' - 192: 255, # 'Ą' - 193: 255, # 'Į' - 194: 255, # 'Ā' - 195: 255, # 'Ć' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Ę' - 199: 255, # 'Ē' - 200: 255, # 'Č' - 201: 255, # 'É' - 202: 255, # 'Ź' - 203: 255, # 'Ė' - 204: 255, # 'Ģ' - 205: 255, # 'Ķ' - 206: 255, # 'Ī' - 207: 255, # 'Ļ' - 208: 255, # 'Š' - 209: 255, # 'Ń' - 210: 255, # 'Ņ' - 211: 255, # 'Ó' - 212: 255, # 'Ō' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ų' - 217: 255, # 'Ł' - 218: 255, # 'Ś' - 219: 255, # 'Ū' - 220: 255, # 'Ü' - 221: 255, # 'Ż' - 222: 255, # 'Ž' - 223: 255, # 'ß' - 224: 255, # 'ą' - 225: 255, # 'į' - 226: 255, # 'ā' - 227: 255, # 'ć' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'ę' - 231: 255, # 'ē' - 232: 255, # 'č' - 233: 255, # 'é' - 234: 255, # 'ź' - 235: 255, # 'ė' - 236: 255, # 'ģ' - 237: 255, # 'ķ' - 238: 255, # 'ī' - 239: 255, # 'ļ' - 240: 255, # 'š' - 241: 255, # 'ń' - 242: 255, # 'ņ' - 243: 255, # 'ó' - 244: 255, # 'ō' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ų' - 249: 255, # 'ł' - 250: 255, # 'ś' - 251: 255, # 'ū' - 252: 255, # 'ü' - 253: 255, # 'ż' - 254: 255, # 'ž' - 255: 253, # '˙' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, 
# '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 253, # '€' + 129: 255, # None + 130: 253, # '‚' + 131: 255, # None + 132: 253, # '„' + 133: 253, # '…' + 134: 253, # '†' + 135: 253, # '‡' + 136: 255, # None + 137: 253, # '‰' + 138: 255, # None + 139: 253, # '‹' + 140: 255, # None + 141: 253, # '¨' + 142: 
255, # 'ˇ' + 143: 253, # '¸' + 144: 255, # None + 145: 253, # '‘' + 146: 253, # '’' + 147: 253, # '“' + 148: 253, # '”' + 149: 253, # '•' + 150: 253, # '–' + 151: 253, # '—' + 152: 255, # None + 153: 253, # '™' + 154: 255, # None + 155: 253, # '›' + 156: 255, # None + 157: 253, # '¯' + 158: 253, # '˛' + 159: 255, # None + 160: 251, # '\xa0' + 161: 255, # None + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '¤' + 165: 255, # None + 166: 253, # '¦' + 167: 253, # '§' + 168: 255, # 'Ø' + 169: 253, # '©' + 170: 255, # 'Ŗ' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 255, # 'Æ' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 255, # 'ø' + 185: 252, # '¹' + 186: 255, # 'ŗ' + 187: 253, # '»' + 188: 252, # '¼' + 189: 252, # '½' + 190: 252, # '¾' + 191: 255, # 'æ' + 192: 255, # 'Ą' + 193: 255, # 'Į' + 194: 255, # 'Ā' + 195: 255, # 'Ć' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Ę' + 199: 255, # 'Ē' + 200: 255, # 'Č' + 201: 255, # 'É' + 202: 255, # 'Ź' + 203: 255, # 'Ė' + 204: 255, # 'Ģ' + 205: 255, # 'Ķ' + 206: 255, # 'Ī' + 207: 255, # 'Ļ' + 208: 255, # 'Š' + 209: 255, # 'Ń' + 210: 255, # 'Ņ' + 211: 255, # 'Ó' + 212: 255, # 'Ō' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ų' + 217: 255, # 'Ł' + 218: 255, # 'Ś' + 219: 255, # 'Ū' + 220: 255, # 'Ü' + 221: 255, # 'Ż' + 222: 255, # 'Ž' + 223: 255, # 'ß' + 224: 255, # 'ą' + 225: 255, # 'į' + 226: 255, # 'ā' + 227: 255, # 'ć' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'ę' + 231: 255, # 'ē' + 232: 255, # 'č' + 233: 255, # 'é' + 234: 255, # 'ź' + 235: 255, # 'ė' + 236: 255, # 'ģ' + 237: 255, # 'ķ' + 238: 255, # 'ī' + 239: 255, # 'ļ' + 240: 255, # 'š' + 241: 255, # 'ń' + 242: 255, # 'ņ' + 243: 255, # 'ó' + 244: 255, # 'ō' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ų' + 249: 255, # 'ł' + 250: 255, # 'ś' + 251: 255, # 'ū' + 252: 255, # 
'ü' + 253: 255, # 'ż' + 254: 255, # 'ž' + 255: 253, # '˙' } -WINDOWS_1257_LITHUANIAN_MODEL = SingleByteCharSetModel(charset_name='WINDOWS-1257', - language='Lithuanian', - char_to_order_map=WINDOWS_1257_LITHUANIAN_CHAR_TO_ORDER, - language_model=LITHUANIAN_LANG_MODEL, - typical_positive_ratio=0.9830473932003772, - keep_ascii_letters=False, - alphabet='ABCDEFGHIJKLMNOPRSTUVYZabcdefghijklmnoprstuvyzĄąČčĖėĘęĮįŠšŪūŲųŽž') +WINDOWS_1257_LITHUANIAN_MODEL = SingleByteCharSetModel( + charset_name="WINDOWS-1257", + language="Lithuanian", + char_to_order_map=WINDOWS_1257_LITHUANIAN_CHAR_TO_ORDER, + language_model=LITHUANIAN_LANG_MODEL, + typical_positive_ratio=0.9830473932003772, + keep_ascii_letters=False, + alphabet="ABCDEFGHIJKLMNOPRSTUVYZabcdefghijklmnoprstuvyzĄąČčĖėĘęĮįŠšŪūŲųŽž", +) ISO_8859_4_LITHUANIAN_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' 
- 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, # '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' - 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 255, # 'Ą' - 162: 255, # 'ĸ' - 163: 255, # 'Ŗ' - 164: 253, # '¤' - 165: 255, # 'Ĩ' - 166: 255, # 'Ļ' - 167: 253, # '§' - 168: 253, # '¨' - 169: 255, # 'Š' - 170: 255, # 'Ē' - 171: 
255, # 'Ģ' - 172: 255, # 'Ŧ' - 173: 251, # '\xad' - 174: 255, # 'Ž' - 175: 253, # '¯' - 176: 253, # '°' - 177: 255, # 'ą' - 178: 253, # '˛' - 179: 255, # 'ŗ' - 180: 253, # '´' - 181: 255, # 'ĩ' - 182: 255, # 'ļ' - 183: 255, # 'ˇ' - 184: 253, # '¸' - 185: 255, # 'š' - 186: 255, # 'ē' - 187: 255, # 'ģ' - 188: 255, # 'ŧ' - 189: 255, # 'Ŋ' - 190: 255, # 'ž' - 191: 255, # 'ŋ' - 192: 255, # 'Ā' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ã' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Æ' - 199: 255, # 'Į' - 200: 255, # 'Č' - 201: 255, # 'É' - 202: 255, # 'Ę' - 203: 255, # 'Ë' - 204: 255, # 'Ė' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ī' - 208: 255, # 'Đ' - 209: 255, # 'Ņ' - 210: 255, # 'Ō' - 211: 255, # 'Ķ' - 212: 255, # 'Ô' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ø' - 217: 255, # 'Ų' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'Ũ' - 222: 255, # 'Ū' - 223: 255, # 'ß' - 224: 255, # 'ā' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ã' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'æ' - 231: 255, # 'į' - 232: 255, # 'č' - 233: 255, # 'é' - 234: 255, # 'ę' - 235: 255, # 'ë' - 236: 255, # 'ė' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ī' - 240: 255, # 'đ' - 241: 255, # 'ņ' - 242: 255, # 'ō' - 243: 255, # 'ķ' - 244: 255, # 'ô' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ø' - 249: 255, # 'ų' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ũ' - 254: 255, # 'ū' - 255: 253, # '˙' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 
251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # 
'\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 255, # 'Ą' + 162: 255, # 'ĸ' + 163: 255, # 'Ŗ' + 164: 253, # '¤' + 165: 255, # 'Ĩ' + 166: 255, # 'Ļ' + 167: 253, # '§' + 168: 253, # '¨' + 169: 255, # 'Š' + 170: 255, # 'Ē' + 171: 255, # 'Ģ' + 172: 255, # 'Ŧ' + 173: 251, # '\xad' + 174: 255, # 'Ž' + 175: 253, # '¯' + 176: 253, # '°' + 177: 255, # 'ą' + 178: 253, # '˛' + 179: 255, # 'ŗ' + 180: 253, # '´' + 181: 255, # 'ĩ' + 182: 255, # 'ļ' + 183: 255, # 'ˇ' + 184: 253, # '¸' + 185: 255, # 'š' + 186: 255, # 'ē' + 187: 255, # 'ģ' + 188: 255, # 'ŧ' + 189: 255, # 'Ŋ' + 190: 255, # 'ž' + 191: 255, # 'ŋ' + 192: 255, # 'Ā' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ã' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Æ' + 199: 255, # 'Į' + 200: 255, # 'Č' + 201: 255, # 'É' + 202: 255, # 'Ę' + 203: 255, # 'Ë' + 204: 255, # 'Ė' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ī' + 208: 255, # 'Đ' + 209: 255, # 'Ņ' + 210: 255, # 'Ō' + 211: 255, # 'Ķ' + 212: 255, # 'Ô' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ø' + 217: 255, # 'Ų' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'Ũ' + 222: 255, # 'Ū' + 223: 255, # 'ß' + 224: 255, # 'ā' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ã' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'æ' + 231: 255, # 'į' + 232: 255, # 'č' + 233: 255, # 'é' + 234: 255, # 'ę' + 235: 255, # 'ë' + 236: 255, # 'ė' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ī' + 240: 255, # 'đ' + 241: 255, # 'ņ' + 242: 255, # 'ō' + 243: 
255, # 'ķ' + 244: 255, # 'ô' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ø' + 249: 255, # 'ų' + 250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ũ' + 254: 255, # 'ū' + 255: 253, # '˙' } -ISO_8859_4_LITHUANIAN_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-4', - language='Lithuanian', - char_to_order_map=ISO_8859_4_LITHUANIAN_CHAR_TO_ORDER, - language_model=LITHUANIAN_LANG_MODEL, - typical_positive_ratio=0.9372166116471733, - keep_ascii_letters=False, - alphabet='ABCDEFGHIJKLMNOPRSTUVYZabcdefghijklmnoprstuvyzĄąČčĖėĘęĮįŠšŪūŲųŽž') - +ISO_8859_4_LITHUANIAN_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-4", + language="Lithuanian", + char_to_order_map=ISO_8859_4_LITHUANIAN_CHAR_TO_ORDER, + language_model=LITHUANIAN_LANG_MODEL, + typical_positive_ratio=0.9372166116471733, + keep_ascii_letters=False, + alphabet="ABCDEFGHIJKLMNOPRSTUVYZabcdefghijklmnoprstuvyzĄąČčĖėĘęĮįŠšŪūŲųŽž", +) diff --git a/chardet/langmacedonianmodel.py b/chardet/langmacedonianmodel.py index 8ae11b2f..47814a82 100644 --- a/chardet/langmacedonianmodel.py +++ b/chardet/langmacedonianmodel.py @@ -1,9 +1,7 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -22368,1070 +22366,1077 @@ # Character Mapping Table(s): ISO_8859_5_MACEDONIAN_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 
31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 
143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, # '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' - 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 255, # 'Ё' - 162: 255, # 'Ђ' - 163: 255, # 'Ѓ' - 164: 255, # 'Є' - 165: 255, # 'Ѕ' - 166: 255, # 'І' - 167: 255, # 'Ї' - 168: 255, # 'Ј' - 169: 255, # 'Љ' - 170: 255, # 'Њ' - 171: 255, # 'Ћ' - 172: 255, # 'Ќ' - 173: 251, # '\xad' - 174: 255, # 'Ў' - 175: 255, # 'Џ' - 176: 255, # 'А' - 177: 255, # 'Б' - 178: 255, # 'В' - 179: 255, # 'Г' - 180: 255, # 'Д' - 181: 255, # 'Е' - 182: 255, # 'Ж' - 183: 255, # 'З' - 184: 255, # 'И' - 185: 255, # 'Й' - 186: 255, # 'К' - 187: 255, # 'Л' - 188: 255, # 'М' - 189: 255, # 'Н' - 190: 255, # 'О' - 191: 255, # 'П' - 192: 255, # 'Р' - 193: 255, # 'С' - 194: 255, # 'Т' - 195: 255, # 'У' - 196: 255, # 'Ф' - 197: 255, # 'Х' - 198: 255, # 'Ц' - 199: 255, # 'Ч' - 200: 255, # 'Ш' - 201: 255, # 'Щ' - 202: 255, # 'Ъ' - 203: 255, # 'Ы' - 204: 255, # 'Ь' - 205: 255, # 'Э' - 206: 255, # 'Ю' - 207: 255, # 'Я' - 208: 255, # 'а' - 209: 255, # 'б' - 210: 255, # 'в' - 211: 255, # 'г' - 212: 255, # 'д' - 213: 255, # 'е' - 214: 255, # 'ж' - 215: 255, # 'з' - 216: 255, # 'и' - 217: 255, # 'й' - 218: 255, # 'к' - 219: 255, # 'л' - 220: 255, # 'м' - 221: 255, # 'н' - 222: 255, # 'о' - 223: 255, # 'п' - 224: 255, # 'р' - 225: 255, # 'с' - 226: 255, # 'т' - 227: 255, # 'у' - 228: 255, # 'ф' - 229: 255, # 'х' - 230: 255, # 'ц' - 231: 255, # 'ч' - 232: 255, # 'ш' - 233: 255, # 'щ' - 234: 255, # 'ъ' - 235: 255, # 'ы' - 236: 255, # 'ь' - 237: 255, # 'э' - 238: 255, # 'ю' - 239: 255, # 'я' - 240: 253, # '№' - 241: 255, # 'ё' - 242: 255, # 'ђ' - 243: 255, # 'ѓ' - 244: 255, # 'є' - 245: 255, # 'ѕ' - 246: 255, # 'і' - 247: 255, # 'ї' - 248: 255, # 'ј' - 249: 255, # 'љ' - 250: 255, # 'њ' 
- 251: 255, # 'ћ' - 252: 255, # 'ќ' - 253: 253, # '§' - 254: 255, # 'ў' - 255: 255, # 'џ' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' 
+ 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # '\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 255, # 'Ё' + 162: 255, # 'Ђ' + 163: 255, # 'Ѓ' + 164: 255, # 'Є' + 165: 255, # 'Ѕ' + 166: 255, # 'І' + 167: 255, # 'Ї' + 168: 255, # 'Ј' + 169: 255, # 'Љ' + 170: 255, # 'Њ' + 171: 
255, # 'Ћ' + 172: 255, # 'Ќ' + 173: 251, # '\xad' + 174: 255, # 'Ў' + 175: 255, # 'Џ' + 176: 255, # 'А' + 177: 255, # 'Б' + 178: 255, # 'В' + 179: 255, # 'Г' + 180: 255, # 'Д' + 181: 255, # 'Е' + 182: 255, # 'Ж' + 183: 255, # 'З' + 184: 255, # 'И' + 185: 255, # 'Й' + 186: 255, # 'К' + 187: 255, # 'Л' + 188: 255, # 'М' + 189: 255, # 'Н' + 190: 255, # 'О' + 191: 255, # 'П' + 192: 255, # 'Р' + 193: 255, # 'С' + 194: 255, # 'Т' + 195: 255, # 'У' + 196: 255, # 'Ф' + 197: 255, # 'Х' + 198: 255, # 'Ц' + 199: 255, # 'Ч' + 200: 255, # 'Ш' + 201: 255, # 'Щ' + 202: 255, # 'Ъ' + 203: 255, # 'Ы' + 204: 255, # 'Ь' + 205: 255, # 'Э' + 206: 255, # 'Ю' + 207: 255, # 'Я' + 208: 255, # 'а' + 209: 255, # 'б' + 210: 255, # 'в' + 211: 255, # 'г' + 212: 255, # 'д' + 213: 255, # 'е' + 214: 255, # 'ж' + 215: 255, # 'з' + 216: 255, # 'и' + 217: 255, # 'й' + 218: 255, # 'к' + 219: 255, # 'л' + 220: 255, # 'м' + 221: 255, # 'н' + 222: 255, # 'о' + 223: 255, # 'п' + 224: 255, # 'р' + 225: 255, # 'с' + 226: 255, # 'т' + 227: 255, # 'у' + 228: 255, # 'ф' + 229: 255, # 'х' + 230: 255, # 'ц' + 231: 255, # 'ч' + 232: 255, # 'ш' + 233: 255, # 'щ' + 234: 255, # 'ъ' + 235: 255, # 'ы' + 236: 255, # 'ь' + 237: 255, # 'э' + 238: 255, # 'ю' + 239: 255, # 'я' + 240: 253, # '№' + 241: 255, # 'ё' + 242: 255, # 'ђ' + 243: 255, # 'ѓ' + 244: 255, # 'є' + 245: 255, # 'ѕ' + 246: 255, # 'і' + 247: 255, # 'ї' + 248: 255, # 'ј' + 249: 255, # 'љ' + 250: 255, # 'њ' + 251: 255, # 'ћ' + 252: 255, # 'ќ' + 253: 253, # '§' + 254: 255, # 'ў' + 255: 255, # 'џ' } -ISO_8859_5_MACEDONIAN_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-5', - language='Macedonian', - char_to_order_map=ISO_8859_5_MACEDONIAN_CHAR_TO_ORDER, - language_model=MACEDONIAN_LANG_MODEL, - typical_positive_ratio=0.9494194218689882, - keep_ascii_letters=False, - alphabet='ЃЅЈЉЊЌЏАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШабвгдежзиклмнопрстуфхцчшѓѕјљњќџ') +ISO_8859_5_MACEDONIAN_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-5", + language="Macedonian", + 
char_to_order_map=ISO_8859_5_MACEDONIAN_CHAR_TO_ORDER, + language_model=MACEDONIAN_LANG_MODEL, + typical_positive_ratio=0.9494194218689882, + keep_ascii_letters=False, + alphabet="ЃЅЈЉЊЌЏАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШабвгдежзиклмнопрстуфхцчшѓѕјљњќџ", +) WINDOWS_1251_MACEDONIAN_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' 
- 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 255, # 'Ђ' - 129: 255, # 'Ѓ' - 130: 253, # '‚' - 131: 255, # 'ѓ' - 132: 253, # '„' - 133: 253, # '…' - 134: 253, # '†' - 135: 253, # '‡' - 136: 253, # '€' - 137: 253, # '‰' - 138: 255, # 'Љ' - 139: 253, # '‹' - 140: 255, # 'Њ' - 141: 255, # 'Ќ' - 142: 255, # 'Ћ' - 143: 255, # 'Џ' - 144: 255, # 'ђ' - 145: 253, # '‘' - 146: 253, # '’' - 147: 253, # '“' - 148: 253, # '”' - 149: 253, # '•' - 150: 253, # '–' - 151: 253, # '—' - 152: 255, # None - 153: 253, # '™' - 154: 255, # 'љ' - 155: 253, # '›' - 156: 255, # 'њ' - 157: 255, # 'ќ' - 158: 255, # 'ћ' - 159: 255, # 'џ' - 160: 251, # '\xa0' - 161: 255, # 'Ў' - 162: 255, # 'ў' - 163: 255, # 'Ј' - 164: 253, # '¤' - 165: 255, # 'Ґ' - 166: 253, # '¦' - 167: 253, # '§' - 168: 255, # 'Ё' - 169: 253, # '©' - 170: 255, # 'Є' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 255, # 'Ї' - 176: 
253, # '°' - 177: 253, # '±' - 178: 255, # 'І' - 179: 255, # 'і' - 180: 255, # 'ґ' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 255, # 'ё' - 185: 253, # '№' - 186: 255, # 'є' - 187: 253, # '»' - 188: 255, # 'ј' - 189: 255, # 'Ѕ' - 190: 255, # 'ѕ' - 191: 255, # 'ї' - 192: 255, # 'А' - 193: 255, # 'Б' - 194: 255, # 'В' - 195: 255, # 'Г' - 196: 255, # 'Д' - 197: 255, # 'Е' - 198: 255, # 'Ж' - 199: 255, # 'З' - 200: 255, # 'И' - 201: 255, # 'Й' - 202: 255, # 'К' - 203: 255, # 'Л' - 204: 255, # 'М' - 205: 255, # 'Н' - 206: 255, # 'О' - 207: 255, # 'П' - 208: 255, # 'Р' - 209: 255, # 'С' - 210: 255, # 'Т' - 211: 255, # 'У' - 212: 255, # 'Ф' - 213: 255, # 'Х' - 214: 255, # 'Ц' - 215: 255, # 'Ч' - 216: 255, # 'Ш' - 217: 255, # 'Щ' - 218: 255, # 'Ъ' - 219: 255, # 'Ы' - 220: 255, # 'Ь' - 221: 255, # 'Э' - 222: 255, # 'Ю' - 223: 255, # 'Я' - 224: 255, # 'а' - 225: 255, # 'б' - 226: 255, # 'в' - 227: 255, # 'г' - 228: 255, # 'д' - 229: 255, # 'е' - 230: 255, # 'ж' - 231: 255, # 'з' - 232: 255, # 'и' - 233: 255, # 'й' - 234: 255, # 'к' - 235: 255, # 'л' - 236: 255, # 'м' - 237: 255, # 'н' - 238: 255, # 'о' - 239: 255, # 'п' - 240: 255, # 'р' - 241: 255, # 'с' - 242: 255, # 'т' - 243: 255, # 'у' - 244: 255, # 'ф' - 245: 255, # 'х' - 246: 255, # 'ц' - 247: 255, # 'ч' - 248: 255, # 'ш' - 249: 255, # 'щ' - 250: 255, # 'ъ' - 251: 255, # 'ы' - 252: 255, # 'ь' - 253: 255, # 'э' - 254: 255, # 'ю' - 255: 255, # 'я' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 
29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 255, # 'Ђ' + 129: 255, # 'Ѓ' + 130: 253, # '‚' + 131: 255, # 'ѓ' + 132: 253, # '„' + 133: 253, # '…' + 134: 253, # '†' + 135: 253, # '‡' + 136: 253, # '€' + 137: 253, # '‰' + 138: 255, # 'Љ' + 139: 253, # '‹' + 140: 255, # 'Њ' + 141: 255, # 'Ќ' + 142: 255, # 'Ћ' + 143: 
255, # 'Џ' + 144: 255, # 'ђ' + 145: 253, # '‘' + 146: 253, # '’' + 147: 253, # '“' + 148: 253, # '”' + 149: 253, # '•' + 150: 253, # '–' + 151: 253, # '—' + 152: 255, # None + 153: 253, # '™' + 154: 255, # 'љ' + 155: 253, # '›' + 156: 255, # 'њ' + 157: 255, # 'ќ' + 158: 255, # 'ћ' + 159: 255, # 'џ' + 160: 251, # '\xa0' + 161: 255, # 'Ў' + 162: 255, # 'ў' + 163: 255, # 'Ј' + 164: 253, # '¤' + 165: 255, # 'Ґ' + 166: 253, # '¦' + 167: 253, # '§' + 168: 255, # 'Ё' + 169: 253, # '©' + 170: 255, # 'Є' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 255, # 'Ї' + 176: 253, # '°' + 177: 253, # '±' + 178: 255, # 'І' + 179: 255, # 'і' + 180: 255, # 'ґ' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 255, # 'ё' + 185: 253, # '№' + 186: 255, # 'є' + 187: 253, # '»' + 188: 255, # 'ј' + 189: 255, # 'Ѕ' + 190: 255, # 'ѕ' + 191: 255, # 'ї' + 192: 255, # 'А' + 193: 255, # 'Б' + 194: 255, # 'В' + 195: 255, # 'Г' + 196: 255, # 'Д' + 197: 255, # 'Е' + 198: 255, # 'Ж' + 199: 255, # 'З' + 200: 255, # 'И' + 201: 255, # 'Й' + 202: 255, # 'К' + 203: 255, # 'Л' + 204: 255, # 'М' + 205: 255, # 'Н' + 206: 255, # 'О' + 207: 255, # 'П' + 208: 255, # 'Р' + 209: 255, # 'С' + 210: 255, # 'Т' + 211: 255, # 'У' + 212: 255, # 'Ф' + 213: 255, # 'Х' + 214: 255, # 'Ц' + 215: 255, # 'Ч' + 216: 255, # 'Ш' + 217: 255, # 'Щ' + 218: 255, # 'Ъ' + 219: 255, # 'Ы' + 220: 255, # 'Ь' + 221: 255, # 'Э' + 222: 255, # 'Ю' + 223: 255, # 'Я' + 224: 255, # 'а' + 225: 255, # 'б' + 226: 255, # 'в' + 227: 255, # 'г' + 228: 255, # 'д' + 229: 255, # 'е' + 230: 255, # 'ж' + 231: 255, # 'з' + 232: 255, # 'и' + 233: 255, # 'й' + 234: 255, # 'к' + 235: 255, # 'л' + 236: 255, # 'м' + 237: 255, # 'н' + 238: 255, # 'о' + 239: 255, # 'п' + 240: 255, # 'р' + 241: 255, # 'с' + 242: 255, # 'т' + 243: 255, # 'у' + 244: 255, # 'ф' + 245: 255, # 'х' + 246: 255, # 'ц' + 247: 255, # 'ч' + 248: 255, # 'ш' + 249: 255, # 'щ' + 250: 255, # 'ъ' + 251: 255, # 'ы' + 252: 255, # 'ь' + 253: 255, # 'э' + 
254: 255, # 'ю' + 255: 255, # 'я' } -WINDOWS_1251_MACEDONIAN_MODEL = SingleByteCharSetModel(charset_name='WINDOWS-1251', - language='Macedonian', - char_to_order_map=WINDOWS_1251_MACEDONIAN_CHAR_TO_ORDER, - language_model=MACEDONIAN_LANG_MODEL, - typical_positive_ratio=0.9514979037713523, - keep_ascii_letters=False, - alphabet='ЃЅЈЉЊЌЏАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШабвгдежзиклмнопрстуфхцчшѓѕјљњќџ') +WINDOWS_1251_MACEDONIAN_MODEL = SingleByteCharSetModel( + charset_name="WINDOWS-1251", + language="Macedonian", + char_to_order_map=WINDOWS_1251_MACEDONIAN_CHAR_TO_ORDER, + language_model=MACEDONIAN_LANG_MODEL, + typical_positive_ratio=0.9514979037713523, + keep_ascii_letters=False, + alphabet="ЃЅЈЉЊЌЏАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШабвгдежзиклмнопрстуфхцчшѓѕјљњќџ", +) MACCYRILLIC_MACEDONIAN_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' 
- 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 255, # 'А' - 129: 255, # 'Б' - 130: 255, # 'В' - 131: 255, # 'Г' - 132: 255, # 'Д' - 133: 255, # 'Е' - 134: 255, # 'Ж' - 135: 255, # 'З' - 136: 255, # 'И' - 137: 255, # 'Й' - 138: 255, # 'К' - 139: 255, # 'Л' - 140: 255, # 'М' - 141: 255, # 'Н' - 142: 255, # 'О' - 143: 255, # 'П' - 144: 255, # 'Р' - 145: 255, # 'С' - 146: 255, # 'Т' - 147: 255, # 'У' - 148: 255, # 'Ф' - 149: 255, # 'Х' - 150: 255, # 'Ц' - 151: 255, # 'Ч' - 152: 255, # 'Ш' - 153: 255, # 'Щ' - 154: 255, # 'Ъ' - 155: 255, # 'Ы' - 156: 255, # 'Ь' - 157: 255, # 'Э' - 158: 255, # 'Ю' - 159: 255, # 'Я' - 160: 253, # '†' - 161: 253, # '°' - 162: 255, # 'Ґ' - 163: 253, # '£' - 164: 253, # '§' - 165: 253, # '•' - 166: 253, # '¶' - 167: 255, # 'І' - 168: 253, # '®' - 169: 253, # '©' - 170: 253, # '™' - 171: 255, # 'Ђ' - 172: 255, # 'ђ' - 173: 253, # '≠' - 174: 255, # 'Ѓ' - 175: 255, # 'ѓ' - 176: 253, # 
'∞' - 177: 253, # '±' - 178: 253, # '≤' - 179: 253, # '≥' - 180: 255, # 'і' - 181: 255, # 'µ' - 182: 255, # 'ґ' - 183: 255, # 'Ј' - 184: 255, # 'Є' - 185: 255, # 'є' - 186: 255, # 'Ї' - 187: 255, # 'ї' - 188: 255, # 'Љ' - 189: 255, # 'љ' - 190: 255, # 'Њ' - 191: 255, # 'њ' - 192: 255, # 'ј' - 193: 255, # 'Ѕ' - 194: 253, # '¬' - 195: 253, # '√' - 196: 255, # 'ƒ' - 197: 253, # '≈' - 198: 253, # '∆' - 199: 253, # '«' - 200: 253, # '»' - 201: 253, # '…' - 202: 251, # '\xa0' - 203: 255, # 'Ћ' - 204: 255, # 'ћ' - 205: 255, # 'Ќ' - 206: 255, # 'ќ' - 207: 255, # 'ѕ' - 208: 253, # '–' - 209: 253, # '—' - 210: 253, # '“' - 211: 253, # '”' - 212: 253, # '‘' - 213: 253, # '’' - 214: 253, # '÷' - 215: 253, # '„' - 216: 255, # 'Ў' - 217: 255, # 'ў' - 218: 255, # 'Џ' - 219: 255, # 'џ' - 220: 253, # '№' - 221: 255, # 'Ё' - 222: 255, # 'ё' - 223: 255, # 'я' - 224: 255, # 'а' - 225: 255, # 'б' - 226: 255, # 'в' - 227: 255, # 'г' - 228: 255, # 'д' - 229: 255, # 'е' - 230: 255, # 'ж' - 231: 255, # 'з' - 232: 255, # 'и' - 233: 255, # 'й' - 234: 255, # 'к' - 235: 255, # 'л' - 236: 255, # 'м' - 237: 255, # 'н' - 238: 255, # 'о' - 239: 255, # 'п' - 240: 255, # 'р' - 241: 255, # 'с' - 242: 255, # 'т' - 243: 255, # 'у' - 244: 255, # 'ф' - 245: 255, # 'х' - 246: 255, # 'ц' - 247: 255, # 'ч' - 248: 255, # 'ш' - 249: 255, # 'щ' - 250: 255, # 'ъ' - 251: 255, # 'ы' - 252: 255, # 'ь' - 253: 255, # 'э' - 254: 255, # 'ю' - 255: 253, # '€' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 
251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 255, # 'А' + 129: 255, # 'Б' + 130: 255, # 'В' + 131: 255, # 'Г' + 132: 255, # 'Д' + 133: 255, # 'Е' + 134: 255, # 'Ж' + 135: 255, # 'З' + 136: 255, # 'И' + 137: 255, # 'Й' + 138: 255, # 'К' + 139: 255, # 'Л' + 140: 255, # 'М' + 141: 255, # 'Н' + 142: 255, # 'О' + 143: 255, 
# 'П' + 144: 255, # 'Р' + 145: 255, # 'С' + 146: 255, # 'Т' + 147: 255, # 'У' + 148: 255, # 'Ф' + 149: 255, # 'Х' + 150: 255, # 'Ц' + 151: 255, # 'Ч' + 152: 255, # 'Ш' + 153: 255, # 'Щ' + 154: 255, # 'Ъ' + 155: 255, # 'Ы' + 156: 255, # 'Ь' + 157: 255, # 'Э' + 158: 255, # 'Ю' + 159: 255, # 'Я' + 160: 253, # '†' + 161: 253, # '°' + 162: 255, # 'Ґ' + 163: 253, # '£' + 164: 253, # '§' + 165: 253, # '•' + 166: 253, # '¶' + 167: 255, # 'І' + 168: 253, # '®' + 169: 253, # '©' + 170: 253, # '™' + 171: 255, # 'Ђ' + 172: 255, # 'ђ' + 173: 253, # '≠' + 174: 255, # 'Ѓ' + 175: 255, # 'ѓ' + 176: 253, # '∞' + 177: 253, # '±' + 178: 253, # '≤' + 179: 253, # '≥' + 180: 255, # 'і' + 181: 255, # 'µ' + 182: 255, # 'ґ' + 183: 255, # 'Ј' + 184: 255, # 'Є' + 185: 255, # 'є' + 186: 255, # 'Ї' + 187: 255, # 'ї' + 188: 255, # 'Љ' + 189: 255, # 'љ' + 190: 255, # 'Њ' + 191: 255, # 'њ' + 192: 255, # 'ј' + 193: 255, # 'Ѕ' + 194: 253, # '¬' + 195: 253, # '√' + 196: 255, # 'ƒ' + 197: 253, # '≈' + 198: 253, # '∆' + 199: 253, # '«' + 200: 253, # '»' + 201: 253, # '…' + 202: 251, # '\xa0' + 203: 255, # 'Ћ' + 204: 255, # 'ћ' + 205: 255, # 'Ќ' + 206: 255, # 'ќ' + 207: 255, # 'ѕ' + 208: 253, # '–' + 209: 253, # '—' + 210: 253, # '“' + 211: 253, # '”' + 212: 253, # '‘' + 213: 253, # '’' + 214: 253, # '÷' + 215: 253, # '„' + 216: 255, # 'Ў' + 217: 255, # 'ў' + 218: 255, # 'Џ' + 219: 255, # 'џ' + 220: 253, # '№' + 221: 255, # 'Ё' + 222: 255, # 'ё' + 223: 255, # 'я' + 224: 255, # 'а' + 225: 255, # 'б' + 226: 255, # 'в' + 227: 255, # 'г' + 228: 255, # 'д' + 229: 255, # 'е' + 230: 255, # 'ж' + 231: 255, # 'з' + 232: 255, # 'и' + 233: 255, # 'й' + 234: 255, # 'к' + 235: 255, # 'л' + 236: 255, # 'м' + 237: 255, # 'н' + 238: 255, # 'о' + 239: 255, # 'п' + 240: 255, # 'р' + 241: 255, # 'с' + 242: 255, # 'т' + 243: 255, # 'у' + 244: 255, # 'ф' + 245: 255, # 'х' + 246: 255, # 'ц' + 247: 255, # 'ч' + 248: 255, # 'ш' + 249: 255, # 'щ' + 250: 255, # 'ъ' + 251: 255, # 'ы' + 252: 255, # 'ь' + 253: 255, # 'э' + 254: 
255, # 'ю' + 255: 253, # '€' } -MACCYRILLIC_MACEDONIAN_MODEL = SingleByteCharSetModel(charset_name='MacCyrillic', - language='Macedonian', - char_to_order_map=MACCYRILLIC_MACEDONIAN_CHAR_TO_ORDER, - language_model=MACEDONIAN_LANG_MODEL, - typical_positive_ratio=0.9514979037713523, - keep_ascii_letters=False, - alphabet='ЃЅЈЉЊЌЏАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШабвгдежзиклмнопрстуфхцчшѓѕјљњќџ') +MACCYRILLIC_MACEDONIAN_MODEL = SingleByteCharSetModel( + charset_name="MacCyrillic", + language="Macedonian", + char_to_order_map=MACCYRILLIC_MACEDONIAN_CHAR_TO_ORDER, + language_model=MACEDONIAN_LANG_MODEL, + typical_positive_ratio=0.9514979037713523, + keep_ascii_letters=False, + alphabet="ЃЅЈЉЊЌЏАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШабвгдежзиклмнопрстуфхцчшѓѕјљњќџ", +) IBM855_MACEDONIAN_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' 
- 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 255, # 'ђ' - 129: 255, # 'Ђ' - 130: 255, # 'ѓ' - 131: 255, # 'Ѓ' - 132: 255, # 'ё' - 133: 255, # 'Ё' - 134: 255, # 'є' - 135: 255, # 'Є' - 136: 255, # 'ѕ' - 137: 255, # 'Ѕ' - 138: 255, # 'і' - 139: 255, # 'І' - 140: 255, # 'ї' - 141: 255, # 'Ї' - 142: 255, # 'ј' - 143: 255, # 'Ј' - 144: 255, # 'љ' - 145: 255, # 'Љ' - 146: 255, # 'њ' - 147: 255, # 'Њ' - 148: 255, # 'ћ' - 149: 255, # 'Ћ' - 150: 255, # 'ќ' - 151: 255, # 'Ќ' - 152: 255, # 'ў' - 153: 255, # 'Ў' - 154: 255, # 'џ' - 155: 255, # 'Џ' - 156: 255, # 'ю' - 157: 255, # 'Ю' - 158: 255, # 'ъ' - 159: 255, # 'Ъ' - 160: 255, # 'а' - 161: 255, # 'А' - 162: 255, # 'б' - 163: 255, # 'Б' - 164: 255, # 'ц' - 165: 255, # 'Ц' - 166: 255, # 'д' - 167: 255, # 'Д' - 168: 255, # 'е' - 169: 255, # 'Е' - 170: 255, # 'ф' - 171: 255, # 'Ф' - 172: 255, # 'г' - 173: 255, # 'Г' - 174: 253, # '«' - 175: 253, # '»' - 176: 253, # 
'░' - 177: 253, # '▒' - 178: 253, # '▓' - 179: 253, # '│' - 180: 253, # '┤' - 181: 255, # 'х' - 182: 255, # 'Х' - 183: 255, # 'и' - 184: 255, # 'И' - 185: 253, # '╣' - 186: 253, # '║' - 187: 253, # '╗' - 188: 253, # '╝' - 189: 255, # 'й' - 190: 255, # 'Й' - 191: 253, # '┐' - 192: 253, # '└' - 193: 253, # '┴' - 194: 253, # '┬' - 195: 253, # '├' - 196: 253, # '─' - 197: 253, # '┼' - 198: 255, # 'к' - 199: 255, # 'К' - 200: 253, # '╚' - 201: 253, # '╔' - 202: 253, # '╩' - 203: 253, # '╦' - 204: 253, # '╠' - 205: 253, # '═' - 206: 253, # '╬' - 207: 253, # '¤' - 208: 255, # 'л' - 209: 255, # 'Л' - 210: 255, # 'м' - 211: 255, # 'М' - 212: 255, # 'н' - 213: 255, # 'Н' - 214: 255, # 'о' - 215: 255, # 'О' - 216: 255, # 'п' - 217: 253, # '┘' - 218: 253, # '┌' - 219: 253, # '█' - 220: 253, # '▄' - 221: 255, # 'П' - 222: 255, # 'я' - 223: 253, # '▀' - 224: 255, # 'Я' - 225: 255, # 'р' - 226: 255, # 'Р' - 227: 255, # 'с' - 228: 255, # 'С' - 229: 255, # 'т' - 230: 255, # 'Т' - 231: 255, # 'у' - 232: 255, # 'У' - 233: 255, # 'ж' - 234: 255, # 'Ж' - 235: 255, # 'в' - 236: 255, # 'В' - 237: 255, # 'ь' - 238: 255, # 'Ь' - 239: 253, # '№' - 240: 251, # '\xad' - 241: 255, # 'ы' - 242: 255, # 'Ы' - 243: 255, # 'з' - 244: 255, # 'З' - 245: 255, # 'ш' - 246: 255, # 'Ш' - 247: 255, # 'э' - 248: 255, # 'Э' - 249: 255, # 'щ' - 250: 255, # 'Щ' - 251: 255, # 'ч' - 252: 255, # 'Ч' - 253: 253, # '§' - 254: 253, # '■' - 255: 251, # '\xa0' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 
29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 255, # 'ђ' + 129: 255, # 'Ђ' + 130: 255, # 'ѓ' + 131: 255, # 'Ѓ' + 132: 255, # 'ё' + 133: 255, # 'Ё' + 134: 255, # 'є' + 135: 255, # 'Є' + 136: 255, # 'ѕ' + 137: 255, # 'Ѕ' + 138: 255, # 'і' + 139: 255, # 'І' + 140: 255, # 'ї' + 141: 255, # 'Ї' + 142: 255, # 'ј' + 143: 
255, # 'Ј' + 144: 255, # 'љ' + 145: 255, # 'Љ' + 146: 255, # 'њ' + 147: 255, # 'Њ' + 148: 255, # 'ћ' + 149: 255, # 'Ћ' + 150: 255, # 'ќ' + 151: 255, # 'Ќ' + 152: 255, # 'ў' + 153: 255, # 'Ў' + 154: 255, # 'џ' + 155: 255, # 'Џ' + 156: 255, # 'ю' + 157: 255, # 'Ю' + 158: 255, # 'ъ' + 159: 255, # 'Ъ' + 160: 255, # 'а' + 161: 255, # 'А' + 162: 255, # 'б' + 163: 255, # 'Б' + 164: 255, # 'ц' + 165: 255, # 'Ц' + 166: 255, # 'д' + 167: 255, # 'Д' + 168: 255, # 'е' + 169: 255, # 'Е' + 170: 255, # 'ф' + 171: 255, # 'Ф' + 172: 255, # 'г' + 173: 255, # 'Г' + 174: 253, # '«' + 175: 253, # '»' + 176: 253, # '░' + 177: 253, # '▒' + 178: 253, # '▓' + 179: 253, # '│' + 180: 253, # '┤' + 181: 255, # 'х' + 182: 255, # 'Х' + 183: 255, # 'и' + 184: 255, # 'И' + 185: 253, # '╣' + 186: 253, # '║' + 187: 253, # '╗' + 188: 253, # '╝' + 189: 255, # 'й' + 190: 255, # 'Й' + 191: 253, # '┐' + 192: 253, # '└' + 193: 253, # '┴' + 194: 253, # '┬' + 195: 253, # '├' + 196: 253, # '─' + 197: 253, # '┼' + 198: 255, # 'к' + 199: 255, # 'К' + 200: 253, # '╚' + 201: 253, # '╔' + 202: 253, # '╩' + 203: 253, # '╦' + 204: 253, # '╠' + 205: 253, # '═' + 206: 253, # '╬' + 207: 253, # '¤' + 208: 255, # 'л' + 209: 255, # 'Л' + 210: 255, # 'м' + 211: 255, # 'М' + 212: 255, # 'н' + 213: 255, # 'Н' + 214: 255, # 'о' + 215: 255, # 'О' + 216: 255, # 'п' + 217: 253, # '┘' + 218: 253, # '┌' + 219: 253, # '█' + 220: 253, # '▄' + 221: 255, # 'П' + 222: 255, # 'я' + 223: 253, # '▀' + 224: 255, # 'Я' + 225: 255, # 'р' + 226: 255, # 'Р' + 227: 255, # 'с' + 228: 255, # 'С' + 229: 255, # 'т' + 230: 255, # 'Т' + 231: 255, # 'у' + 232: 255, # 'У' + 233: 255, # 'ж' + 234: 255, # 'Ж' + 235: 255, # 'в' + 236: 255, # 'В' + 237: 255, # 'ь' + 238: 255, # 'Ь' + 239: 253, # '№' + 240: 251, # '\xad' + 241: 255, # 'ы' + 242: 255, # 'Ы' + 243: 255, # 'з' + 244: 255, # 'З' + 245: 255, # 'ш' + 246: 255, # 'Ш' + 247: 255, # 'э' + 248: 255, # 'Э' + 249: 255, # 'щ' + 250: 255, # 'Щ' + 251: 255, # 'ч' + 252: 255, # 'Ч' + 253: 253, # '§' + 
254: 253, # '■' + 255: 251, # '\xa0' } -IBM855_MACEDONIAN_MODEL = SingleByteCharSetModel(charset_name='IBM855', - language='Macedonian', - char_to_order_map=IBM855_MACEDONIAN_CHAR_TO_ORDER, - language_model=MACEDONIAN_LANG_MODEL, - typical_positive_ratio=0.9494194218689882, - keep_ascii_letters=False, - alphabet='ЃЅЈЉЊЌЏАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШабвгдежзиклмнопрстуфхцчшѓѕјљњќџ') - +IBM855_MACEDONIAN_MODEL = SingleByteCharSetModel( + charset_name="IBM855", + language="Macedonian", + char_to_order_map=IBM855_MACEDONIAN_CHAR_TO_ORDER, + language_model=MACEDONIAN_LANG_MODEL, + typical_positive_ratio=0.9494194218689882, + keep_ascii_letters=False, + alphabet="ЃЅЈЉЊЌЏАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШабвгдежзиклмнопрстуфхцчшѓѕјљњќџ", +) diff --git a/chardet/langpolishmodel.py b/chardet/langpolishmodel.py index 7aaa0701..7520e39e 100644 --- a/chardet/langpolishmodel.py +++ b/chardet/langpolishmodel.py @@ -1,9 +1,7 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -62768,536 +62766,539 @@ # Character Mapping Table(s): ISO_8859_2_POLISH_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' 
- 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, 
# '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' - 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 255, # 'Ą' - 162: 253, # '˘' - 163: 255, # 'Ł' - 164: 253, # '¤' - 165: 255, # 'Ľ' - 166: 255, # 'Ś' - 167: 253, # '§' - 168: 253, # '¨' - 169: 255, # 'Š' - 170: 255, # 'Ş' - 171: 255, # 'Ť' - 172: 255, # 'Ź' - 173: 251, # '\xad' - 174: 255, # 'Ž' - 175: 255, # 'Ż' - 176: 253, # '°' - 177: 255, # 'ą' - 178: 253, # '˛' - 179: 255, # 'ł' - 180: 253, # '´' - 181: 255, # 'ľ' - 182: 255, # 'ś' - 183: 255, # 'ˇ' - 184: 253, # '¸' - 185: 255, # 'š' - 186: 255, # 'ş' - 187: 255, # 'ť' - 188: 255, # 'ź' - 189: 253, # '˝' - 190: 255, # 'ž' - 191: 255, # 'ż' - 192: 255, # 'Ŕ' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ă' - 196: 255, # 'Ä' - 197: 255, # 'Ĺ' - 198: 255, # 'Ć' - 199: 255, # 'Ç' - 200: 255, # 'Č' - 201: 255, # 'É' - 202: 255, # 'Ę' - 203: 255, # 'Ë' - 204: 255, # 'Ě' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ď' - 208: 255, # 'Đ' - 209: 255, # 'Ń' - 210: 255, # 'Ň' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Ő' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ř' - 217: 255, # 'Ů' - 218: 255, # 'Ú' - 219: 255, # 'Ű' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Ţ' - 223: 255, # 'ß' - 224: 255, # 'ŕ' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ă' - 228: 255, # 'ä' - 229: 255, # 'ĺ' - 230: 255, # 'ć' - 231: 255, # 'ç' - 232: 255, # 'č' - 233: 255, # 'é' - 234: 255, # 'ę' - 235: 255, # 'ë' - 236: 255, # 'ě' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ď' - 240: 255, # 'đ' - 241: 255, # 'ń' - 242: 255, # 'ň' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'ő' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ř' - 249: 255, # 'ů' - 250: 255, # 'ú' - 251: 255, # 'ű' - 252: 255, # 'ü' - 253: 255, # 
'ý' - 254: 255, # 'ţ' - 255: 253, # '˙' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' 
+ 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # '\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 255, # 'Ą' + 162: 253, # '˘' + 163: 255, # 'Ł' + 164: 253, # '¤' + 165: 255, # 'Ľ' + 166: 255, # 'Ś' + 167: 253, # '§' + 168: 253, # '¨' + 169: 255, # 'Š' + 170: 255, # 'Ş' + 171: 
255, # 'Ť' + 172: 255, # 'Ź' + 173: 251, # '\xad' + 174: 255, # 'Ž' + 175: 255, # 'Ż' + 176: 253, # '°' + 177: 255, # 'ą' + 178: 253, # '˛' + 179: 255, # 'ł' + 180: 253, # '´' + 181: 255, # 'ľ' + 182: 255, # 'ś' + 183: 255, # 'ˇ' + 184: 253, # '¸' + 185: 255, # 'š' + 186: 255, # 'ş' + 187: 255, # 'ť' + 188: 255, # 'ź' + 189: 253, # '˝' + 190: 255, # 'ž' + 191: 255, # 'ż' + 192: 255, # 'Ŕ' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ă' + 196: 255, # 'Ä' + 197: 255, # 'Ĺ' + 198: 255, # 'Ć' + 199: 255, # 'Ç' + 200: 255, # 'Č' + 201: 255, # 'É' + 202: 255, # 'Ę' + 203: 255, # 'Ë' + 204: 255, # 'Ě' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ď' + 208: 255, # 'Đ' + 209: 255, # 'Ń' + 210: 255, # 'Ň' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Ő' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ř' + 217: 255, # 'Ů' + 218: 255, # 'Ú' + 219: 255, # 'Ű' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Ţ' + 223: 255, # 'ß' + 224: 255, # 'ŕ' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ă' + 228: 255, # 'ä' + 229: 255, # 'ĺ' + 230: 255, # 'ć' + 231: 255, # 'ç' + 232: 255, # 'č' + 233: 255, # 'é' + 234: 255, # 'ę' + 235: 255, # 'ë' + 236: 255, # 'ě' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ď' + 240: 255, # 'đ' + 241: 255, # 'ń' + 242: 255, # 'ň' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'ő' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ř' + 249: 255, # 'ů' + 250: 255, # 'ú' + 251: 255, # 'ű' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'ţ' + 255: 253, # '˙' } -ISO_8859_2_POLISH_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-2', - language='Polish', - char_to_order_map=ISO_8859_2_POLISH_CHAR_TO_ORDER, - language_model=POLISH_LANG_MODEL, - typical_positive_ratio=0.9570352460830012, - keep_ascii_letters=False, - alphabet='ABCDEFGHIJKLMNOPRSTUWYZabcdefghijklmnoprstuwyzÓóĄąĆćĘꣳŃńŚśŹźŻż') +ISO_8859_2_POLISH_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-2", + language="Polish", + 
char_to_order_map=ISO_8859_2_POLISH_CHAR_TO_ORDER, + language_model=POLISH_LANG_MODEL, + typical_positive_ratio=0.9570352460830012, + keep_ascii_letters=False, + alphabet="ABCDEFGHIJKLMNOPRSTUWYZabcdefghijklmnoprstuwyzÓóĄąĆćĘꣳŃńŚśŹźŻż", +) WINDOWS_1250_POLISH_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' 
- 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 253, # '€' - 129: 255, # None - 130: 253, # '‚' - 131: 255, # None - 132: 253, # '„' - 133: 253, # '…' - 134: 253, # '†' - 135: 253, # '‡' - 136: 255, # None - 137: 253, # '‰' - 138: 255, # 'Š' - 139: 253, # '‹' - 140: 255, # 'Ś' - 141: 255, # 'Ť' - 142: 255, # 'Ž' - 143: 255, # 'Ź' - 144: 255, # None - 145: 253, # '‘' - 146: 253, # '’' - 147: 253, # '“' - 148: 253, # '”' - 149: 253, # '•' - 150: 253, # '–' - 151: 253, # '—' - 152: 255, # None - 153: 253, # '™' - 154: 255, # 'š' - 155: 253, # '›' - 156: 255, # 'ś' - 157: 255, # 'ť' - 158: 255, # 'ž' - 159: 255, # 'ź' - 160: 251, # '\xa0' - 161: 255, # 'ˇ' - 162: 253, # '˘' - 163: 255, # 'Ł' - 164: 253, # '¤' - 165: 255, # 'Ą' - 166: 253, # '¦' - 167: 253, # '§' - 168: 253, # '¨' - 169: 253, # '©' - 170: 255, # 'Ş' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 255, # 'Ż' - 
176: 253, # '°' - 177: 253, # '±' - 178: 253, # '˛' - 179: 255, # 'ł' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 253, # '¸' - 185: 255, # 'ą' - 186: 255, # 'ş' - 187: 253, # '»' - 188: 255, # 'Ľ' - 189: 253, # '˝' - 190: 255, # 'ľ' - 191: 255, # 'ż' - 192: 255, # 'Ŕ' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ă' - 196: 255, # 'Ä' - 197: 255, # 'Ĺ' - 198: 255, # 'Ć' - 199: 255, # 'Ç' - 200: 255, # 'Č' - 201: 255, # 'É' - 202: 255, # 'Ę' - 203: 255, # 'Ë' - 204: 255, # 'Ě' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ď' - 208: 255, # 'Đ' - 209: 255, # 'Ń' - 210: 255, # 'Ň' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Ő' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ř' - 217: 255, # 'Ů' - 218: 255, # 'Ú' - 219: 255, # 'Ű' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Ţ' - 223: 255, # 'ß' - 224: 255, # 'ŕ' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ă' - 228: 255, # 'ä' - 229: 255, # 'ĺ' - 230: 255, # 'ć' - 231: 255, # 'ç' - 232: 255, # 'č' - 233: 255, # 'é' - 234: 255, # 'ę' - 235: 255, # 'ë' - 236: 255, # 'ě' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ď' - 240: 255, # 'đ' - 241: 255, # 'ń' - 242: 255, # 'ň' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'ő' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ř' - 249: 255, # 'ů' - 250: 255, # 'ú' - 251: 255, # 'ű' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'ţ' - 255: 253, # '˙' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # 
'\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 253, # '€' + 129: 255, # None + 130: 253, # '‚' + 131: 255, # None + 132: 253, # '„' + 133: 253, # '…' + 134: 253, # '†' + 135: 253, # '‡' + 136: 255, # None + 137: 253, # '‰' + 138: 255, # 'Š' + 139: 253, # '‹' + 140: 255, # 'Ś' + 141: 255, # 'Ť' + 142: 255, # 
'Ž' + 143: 255, # 'Ź' + 144: 255, # None + 145: 253, # '‘' + 146: 253, # '’' + 147: 253, # '“' + 148: 253, # '”' + 149: 253, # '•' + 150: 253, # '–' + 151: 253, # '—' + 152: 255, # None + 153: 253, # '™' + 154: 255, # 'š' + 155: 253, # '›' + 156: 255, # 'ś' + 157: 255, # 'ť' + 158: 255, # 'ž' + 159: 255, # 'ź' + 160: 251, # '\xa0' + 161: 255, # 'ˇ' + 162: 253, # '˘' + 163: 255, # 'Ł' + 164: 253, # '¤' + 165: 255, # 'Ą' + 166: 253, # '¦' + 167: 253, # '§' + 168: 253, # '¨' + 169: 253, # '©' + 170: 255, # 'Ş' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 255, # 'Ż' + 176: 253, # '°' + 177: 253, # '±' + 178: 253, # '˛' + 179: 255, # 'ł' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 253, # '¸' + 185: 255, # 'ą' + 186: 255, # 'ş' + 187: 253, # '»' + 188: 255, # 'Ľ' + 189: 253, # '˝' + 190: 255, # 'ľ' + 191: 255, # 'ż' + 192: 255, # 'Ŕ' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ă' + 196: 255, # 'Ä' + 197: 255, # 'Ĺ' + 198: 255, # 'Ć' + 199: 255, # 'Ç' + 200: 255, # 'Č' + 201: 255, # 'É' + 202: 255, # 'Ę' + 203: 255, # 'Ë' + 204: 255, # 'Ě' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ď' + 208: 255, # 'Đ' + 209: 255, # 'Ń' + 210: 255, # 'Ň' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Ő' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ř' + 217: 255, # 'Ů' + 218: 255, # 'Ú' + 219: 255, # 'Ű' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Ţ' + 223: 255, # 'ß' + 224: 255, # 'ŕ' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ă' + 228: 255, # 'ä' + 229: 255, # 'ĺ' + 230: 255, # 'ć' + 231: 255, # 'ç' + 232: 255, # 'č' + 233: 255, # 'é' + 234: 255, # 'ę' + 235: 255, # 'ë' + 236: 255, # 'ě' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ď' + 240: 255, # 'đ' + 241: 255, # 'ń' + 242: 255, # 'ň' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'ő' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ř' + 249: 255, # 'ů' + 250: 255, # 'ú' + 251: 255, # 'ű' + 252: 255, # 'ü' + 253: 
255, # 'ý' + 254: 255, # 'ţ' + 255: 253, # '˙' } -WINDOWS_1250_POLISH_MODEL = SingleByteCharSetModel(charset_name='WINDOWS-1250', - language='Polish', - char_to_order_map=WINDOWS_1250_POLISH_CHAR_TO_ORDER, - language_model=POLISH_LANG_MODEL, - typical_positive_ratio=0.9858793276028752, - keep_ascii_letters=False, - alphabet='ABCDEFGHIJKLMNOPRSTUWYZabcdefghijklmnoprstuwyzÓóĄąĆćĘꣳŃńŚśŹźŻż') - +WINDOWS_1250_POLISH_MODEL = SingleByteCharSetModel( + charset_name="WINDOWS-1250", + language="Polish", + char_to_order_map=WINDOWS_1250_POLISH_CHAR_TO_ORDER, + language_model=POLISH_LANG_MODEL, + typical_positive_ratio=0.9858793276028752, + keep_ascii_letters=False, + alphabet="ABCDEFGHIJKLMNOPRSTUWYZabcdefghijklmnoprstuwyzÓóĄąĆćĘꣳŃńŚśŹźŻż", +) diff --git a/chardet/langportuguesemodel.py b/chardet/langportuguesemodel.py index f8dcb17a..38584e74 100644 --- a/chardet/langportuguesemodel.py +++ b/chardet/langportuguesemodel.py @@ -1,9 +1,7 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -62768,803 +62766,808 @@ # Character Mapping Table(s): ISO_8859_1_PORTUGUESE_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' 
- 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 29, # 'A' - 66: 50, # 'B' - 67: 33, # 'C' - 68: 53, # 'D' - 69: 39, # 'E' - 70: 62, # 'F' - 71: 61, # 'G' - 72: 70, # 'H' - 73: 49, # 'I' - 74: 72, # 'J' - 75: 79, # 'K' - 76: 60, # 'L' - 77: 44, # 'M' - 78: 54, # 'N' - 79: 46, # 'O' - 80: 40, # 'P' - 81: 91, # 'Q' - 82: 52, # 'R' - 83: 38, # 'S' - 84: 55, # 'T' - 85: 71, # 'U' - 86: 74, # 'V' - 87: 77, # 'W' - 88: 83, # 'X' - 89: 92, # 'Y' - 90: 94, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 2, # 'a' - 98: 21, # 'b' - 99: 13, # 'c' - 100: 9, # 'd' - 101: 3, # 'e' - 102: 19, # 'f' - 103: 17, # 'g' - 104: 22, # 'h' - 105: 6, # 'i' - 106: 47, # 'j' - 107: 64, # 'k' - 108: 14, # 'l' - 109: 11, # 'm' - 110: 8, # 'n' - 111: 4, # 'o' - 112: 15, # 'p' - 113: 26, # 'q' - 114: 7, # 'r' - 115: 5, # 's' - 116: 10, # 't' - 117: 12, # 'u' - 118: 18, # 'v' - 119: 75, # 'w' - 120: 41, # 'x' - 121: 57, # 'y' - 122: 37, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, # '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 
251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' - 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 253, # '¡' - 162: 253, # '¢' - 163: 253, # '£' - 164: 253, # '¤' - 165: 253, # '¥' - 166: 253, # '¦' - 167: 253, # '§' - 168: 253, # '¨' - 169: 253, # '©' - 170: 255, # 'ª' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 253, # '¯' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 252, # '³' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 253, # '¸' - 185: 252, # '¹' - 186: 255, # 'º' - 187: 253, # '»' - 188: 252, # '¼' - 189: 252, # '½' - 190: 252, # '¾' - 191: 253, # '¿' - 192: 255, # 'À' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ã' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Æ' - 199: 255, # 'Ç' - 200: 255, # 'È' - 201: 255, # 'É' - 202: 255, # 'Ê' - 203: 255, # 'Ë' - 204: 255, # 'Ì' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ï' - 208: 255, # 'Ð' - 209: 255, # 'Ñ' - 210: 255, # 'Ò' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ø' - 217: 255, # 'Ù' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Þ' - 223: 255, # 'ß' - 224: 255, # 'à' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ã' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'æ' - 231: 255, # 'ç' - 232: 255, # 'è' - 233: 255, # 'é' - 234: 255, # 'ê' - 235: 255, # 'ë' - 236: 255, # 'ì' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ï' - 240: 255, # 'ð' - 241: 255, # 'ñ' - 242: 255, # 'ò' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ø' - 249: 255, # 'ù' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'þ' - 255: 255, # 'ÿ' + 0: 251, # '\x00' + 
1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' 
+ 64: 253, # '@' + 65: 29, # 'A' + 66: 50, # 'B' + 67: 33, # 'C' + 68: 53, # 'D' + 69: 39, # 'E' + 70: 62, # 'F' + 71: 61, # 'G' + 72: 70, # 'H' + 73: 49, # 'I' + 74: 72, # 'J' + 75: 79, # 'K' + 76: 60, # 'L' + 77: 44, # 'M' + 78: 54, # 'N' + 79: 46, # 'O' + 80: 40, # 'P' + 81: 91, # 'Q' + 82: 52, # 'R' + 83: 38, # 'S' + 84: 55, # 'T' + 85: 71, # 'U' + 86: 74, # 'V' + 87: 77, # 'W' + 88: 83, # 'X' + 89: 92, # 'Y' + 90: 94, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 2, # 'a' + 98: 21, # 'b' + 99: 13, # 'c' + 100: 9, # 'd' + 101: 3, # 'e' + 102: 19, # 'f' + 103: 17, # 'g' + 104: 22, # 'h' + 105: 6, # 'i' + 106: 47, # 'j' + 107: 64, # 'k' + 108: 14, # 'l' + 109: 11, # 'm' + 110: 8, # 'n' + 111: 4, # 'o' + 112: 15, # 'p' + 113: 26, # 'q' + 114: 7, # 'r' + 115: 5, # 's' + 116: 10, # 't' + 117: 12, # 'u' + 118: 18, # 'v' + 119: 75, # 'w' + 120: 41, # 'x' + 121: 57, # 'y' + 122: 37, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # '\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 253, # '¡' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '¤' + 165: 253, # '¥' + 166: 253, # '¦' + 167: 253, # '§' + 168: 253, # '¨' + 169: 253, # '©' + 170: 255, # 'ª' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 
253, # '®' + 175: 253, # '¯' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 253, # '¸' + 185: 252, # '¹' + 186: 255, # 'º' + 187: 253, # '»' + 188: 252, # '¼' + 189: 252, # '½' + 190: 252, # '¾' + 191: 253, # '¿' + 192: 255, # 'À' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ã' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Æ' + 199: 255, # 'Ç' + 200: 255, # 'È' + 201: 255, # 'É' + 202: 255, # 'Ê' + 203: 255, # 'Ë' + 204: 255, # 'Ì' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ï' + 208: 255, # 'Ð' + 209: 255, # 'Ñ' + 210: 255, # 'Ò' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ø' + 217: 255, # 'Ù' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Þ' + 223: 255, # 'ß' + 224: 255, # 'à' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ã' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'æ' + 231: 255, # 'ç' + 232: 255, # 'è' + 233: 255, # 'é' + 234: 255, # 'ê' + 235: 255, # 'ë' + 236: 255, # 'ì' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ï' + 240: 255, # 'ð' + 241: 255, # 'ñ' + 242: 255, # 'ò' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ø' + 249: 255, # 'ù' + 250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'þ' + 255: 255, # 'ÿ' } -ISO_8859_1_PORTUGUESE_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-1', - language='Portuguese', - char_to_order_map=ISO_8859_1_PORTUGUESE_CHAR_TO_ORDER, - language_model=PORTUGUESE_LANG_MODEL, - typical_positive_ratio=0.9365045721978503, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÁÂÃÇÉÊÍÓÔÕÚàáâãçéêíóôõú') +ISO_8859_1_PORTUGUESE_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-1", + language="Portuguese", + 
char_to_order_map=ISO_8859_1_PORTUGUESE_CHAR_TO_ORDER, + language_model=PORTUGUESE_LANG_MODEL, + typical_positive_ratio=0.9365045721978503, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÁÂÃÇÉÊÍÓÔÕÚàáâãçéêíóôõú", +) WINDOWS_1252_PORTUGUESE_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' 
- 64: 253, # '@' - 65: 29, # 'A' - 66: 50, # 'B' - 67: 33, # 'C' - 68: 53, # 'D' - 69: 39, # 'E' - 70: 62, # 'F' - 71: 61, # 'G' - 72: 70, # 'H' - 73: 49, # 'I' - 74: 72, # 'J' - 75: 79, # 'K' - 76: 60, # 'L' - 77: 44, # 'M' - 78: 54, # 'N' - 79: 46, # 'O' - 80: 40, # 'P' - 81: 91, # 'Q' - 82: 52, # 'R' - 83: 38, # 'S' - 84: 55, # 'T' - 85: 71, # 'U' - 86: 74, # 'V' - 87: 77, # 'W' - 88: 83, # 'X' - 89: 92, # 'Y' - 90: 94, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 2, # 'a' - 98: 21, # 'b' - 99: 13, # 'c' - 100: 9, # 'd' - 101: 3, # 'e' - 102: 19, # 'f' - 103: 17, # 'g' - 104: 22, # 'h' - 105: 6, # 'i' - 106: 47, # 'j' - 107: 64, # 'k' - 108: 14, # 'l' - 109: 11, # 'm' - 110: 8, # 'n' - 111: 4, # 'o' - 112: 15, # 'p' - 113: 26, # 'q' - 114: 7, # 'r' - 115: 5, # 's' - 116: 10, # 't' - 117: 12, # 'u' - 118: 18, # 'v' - 119: 75, # 'w' - 120: 41, # 'x' - 121: 57, # 'y' - 122: 37, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 253, # '€' - 129: 255, # None - 130: 253, # '‚' - 131: 255, # 'ƒ' - 132: 253, # '„' - 133: 253, # '…' - 134: 253, # '†' - 135: 253, # '‡' - 136: 255, # 'ˆ' - 137: 253, # '‰' - 138: 255, # 'Š' - 139: 253, # '‹' - 140: 255, # 'Œ' - 141: 255, # None - 142: 255, # 'Ž' - 143: 255, # None - 144: 255, # None - 145: 253, # '‘' - 146: 253, # '’' - 147: 253, # '“' - 148: 253, # '”' - 149: 253, # '•' - 150: 253, # '–' - 151: 253, # '—' - 152: 253, # '˜' - 153: 253, # '™' - 154: 255, # 'š' - 155: 253, # '›' - 156: 255, # 'œ' - 157: 255, # None - 158: 255, # 'ž' - 159: 255, # 'Ÿ' - 160: 251, # '\xa0' - 161: 253, # '¡' - 162: 253, # '¢' - 163: 253, # '£' - 164: 253, # '¤' - 165: 253, # '¥' - 166: 253, # '¦' - 167: 253, # '§' - 168: 253, # '¨' - 169: 253, # '©' - 170: 255, # 'ª' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 253, # '¯' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 
252, # '³' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 253, # '¸' - 185: 252, # '¹' - 186: 255, # 'º' - 187: 253, # '»' - 188: 252, # '¼' - 189: 252, # '½' - 190: 252, # '¾' - 191: 253, # '¿' - 192: 255, # 'À' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ã' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Æ' - 199: 255, # 'Ç' - 200: 255, # 'È' - 201: 255, # 'É' - 202: 255, # 'Ê' - 203: 255, # 'Ë' - 204: 255, # 'Ì' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ï' - 208: 255, # 'Ð' - 209: 255, # 'Ñ' - 210: 255, # 'Ò' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ø' - 217: 255, # 'Ù' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Þ' - 223: 255, # 'ß' - 224: 255, # 'à' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ã' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'æ' - 231: 255, # 'ç' - 232: 255, # 'è' - 233: 255, # 'é' - 234: 255, # 'ê' - 235: 255, # 'ë' - 236: 255, # 'ì' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ï' - 240: 255, # 'ð' - 241: 255, # 'ñ' - 242: 255, # 'ò' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ø' - 249: 255, # 'ù' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'þ' - 255: 255, # 'ÿ' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # 
'\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 29, # 'A' + 66: 50, # 'B' + 67: 33, # 'C' + 68: 53, # 'D' + 69: 39, # 'E' + 70: 62, # 'F' + 71: 61, # 'G' + 72: 70, # 'H' + 73: 49, # 'I' + 74: 72, # 'J' + 75: 79, # 'K' + 76: 60, # 'L' + 77: 44, # 'M' + 78: 54, # 'N' + 79: 46, # 'O' + 80: 40, # 'P' + 81: 91, # 'Q' + 82: 52, # 'R' + 83: 38, # 'S' + 84: 55, # 'T' + 85: 71, # 'U' + 86: 74, # 'V' + 87: 77, # 'W' + 88: 83, # 'X' + 89: 92, # 'Y' + 90: 94, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 2, # 'a' + 98: 21, # 'b' + 99: 13, # 'c' + 100: 9, # 'd' + 101: 3, # 'e' + 102: 19, # 'f' + 103: 17, # 'g' + 104: 22, # 'h' + 105: 6, # 'i' + 106: 47, # 'j' + 107: 64, # 'k' + 108: 14, # 'l' + 109: 11, # 'm' + 110: 8, # 'n' + 111: 4, # 'o' + 112: 15, # 'p' + 113: 26, # 'q' + 114: 7, # 'r' + 115: 5, # 's' + 116: 10, # 't' + 117: 12, # 'u' + 118: 18, # 'v' + 119: 75, # 'w' + 120: 41, # 'x' + 121: 57, # 'y' + 122: 37, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 253, # '€' + 129: 255, # None + 130: 253, # '‚' + 131: 255, # 'ƒ' + 132: 253, # '„' + 133: 253, # '…' + 134: 253, # '†' + 135: 253, # '‡' + 136: 255, # 'ˆ' + 137: 253, # '‰' + 138: 255, # 'Š' + 139: 253, # '‹' + 140: 255, # 'Œ' + 141: 255, # None + 142: 255, # 'Ž' + 143: 255, # None + 144: 255, # None + 145: 253, # '‘' + 146: 253, # '’' + 147: 253, # '“' + 148: 253, # '”' + 149: 
253, # '•' + 150: 253, # '–' + 151: 253, # '—' + 152: 253, # '˜' + 153: 253, # '™' + 154: 255, # 'š' + 155: 253, # '›' + 156: 255, # 'œ' + 157: 255, # None + 158: 255, # 'ž' + 159: 255, # 'Ÿ' + 160: 251, # '\xa0' + 161: 253, # '¡' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '¤' + 165: 253, # '¥' + 166: 253, # '¦' + 167: 253, # '§' + 168: 253, # '¨' + 169: 253, # '©' + 170: 255, # 'ª' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 253, # '¯' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 253, # '¸' + 185: 252, # '¹' + 186: 255, # 'º' + 187: 253, # '»' + 188: 252, # '¼' + 189: 252, # '½' + 190: 252, # '¾' + 191: 253, # '¿' + 192: 255, # 'À' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ã' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Æ' + 199: 255, # 'Ç' + 200: 255, # 'È' + 201: 255, # 'É' + 202: 255, # 'Ê' + 203: 255, # 'Ë' + 204: 255, # 'Ì' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ï' + 208: 255, # 'Ð' + 209: 255, # 'Ñ' + 210: 255, # 'Ò' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ø' + 217: 255, # 'Ù' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Þ' + 223: 255, # 'ß' + 224: 255, # 'à' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ã' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'æ' + 231: 255, # 'ç' + 232: 255, # 'è' + 233: 255, # 'é' + 234: 255, # 'ê' + 235: 255, # 'ë' + 236: 255, # 'ì' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ï' + 240: 255, # 'ð' + 241: 255, # 'ñ' + 242: 255, # 'ò' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ø' + 249: 255, # 'ù' + 250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'þ' + 255: 255, # 'ÿ' } -WINDOWS_1252_PORTUGUESE_MODEL = 
SingleByteCharSetModel(charset_name='WINDOWS-1252', - language='Portuguese', - char_to_order_map=WINDOWS_1252_PORTUGUESE_CHAR_TO_ORDER, - language_model=PORTUGUESE_LANG_MODEL, - typical_positive_ratio=0.9371242661509115, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÁÂÃÇÉÊÍÓÔÕÚàáâãçéêíóôõú') +WINDOWS_1252_PORTUGUESE_MODEL = SingleByteCharSetModel( + charset_name="WINDOWS-1252", + language="Portuguese", + char_to_order_map=WINDOWS_1252_PORTUGUESE_CHAR_TO_ORDER, + language_model=PORTUGUESE_LANG_MODEL, + typical_positive_ratio=0.9371242661509115, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÁÂÃÇÉÊÍÓÔÕÚàáâãçéêíóôõú", +) ISO_8859_15_PORTUGUESE_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' 
- 64: 253, # '@' - 65: 29, # 'A' - 66: 50, # 'B' - 67: 33, # 'C' - 68: 53, # 'D' - 69: 39, # 'E' - 70: 62, # 'F' - 71: 61, # 'G' - 72: 70, # 'H' - 73: 49, # 'I' - 74: 72, # 'J' - 75: 79, # 'K' - 76: 60, # 'L' - 77: 44, # 'M' - 78: 54, # 'N' - 79: 46, # 'O' - 80: 40, # 'P' - 81: 91, # 'Q' - 82: 52, # 'R' - 83: 38, # 'S' - 84: 55, # 'T' - 85: 71, # 'U' - 86: 74, # 'V' - 87: 77, # 'W' - 88: 83, # 'X' - 89: 92, # 'Y' - 90: 94, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 2, # 'a' - 98: 21, # 'b' - 99: 13, # 'c' - 100: 9, # 'd' - 101: 3, # 'e' - 102: 19, # 'f' - 103: 17, # 'g' - 104: 22, # 'h' - 105: 6, # 'i' - 106: 47, # 'j' - 107: 64, # 'k' - 108: 14, # 'l' - 109: 11, # 'm' - 110: 8, # 'n' - 111: 4, # 'o' - 112: 15, # 'p' - 113: 26, # 'q' - 114: 7, # 'r' - 115: 5, # 's' - 116: 10, # 't' - 117: 12, # 'u' - 118: 18, # 'v' - 119: 75, # 'w' - 120: 41, # 'x' - 121: 57, # 'y' - 122: 37, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, # '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' - 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 253, # '¡' - 162: 253, # '¢' - 163: 253, # '£' - 164: 253, # '€' - 165: 253, # '¥' - 166: 255, # 'Š' - 167: 253, # '§' - 168: 255, # 'š' - 169: 253, # '©' - 170: 255, # 'ª' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 
253, # '®' - 175: 253, # '¯' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 252, # '³' - 180: 255, # 'Ž' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 255, # 'ž' - 185: 252, # '¹' - 186: 255, # 'º' - 187: 253, # '»' - 188: 255, # 'Œ' - 189: 255, # 'œ' - 190: 255, # 'Ÿ' - 191: 253, # '¿' - 192: 255, # 'À' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ã' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Æ' - 199: 255, # 'Ç' - 200: 255, # 'È' - 201: 255, # 'É' - 202: 255, # 'Ê' - 203: 255, # 'Ë' - 204: 255, # 'Ì' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ï' - 208: 255, # 'Ð' - 209: 255, # 'Ñ' - 210: 255, # 'Ò' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ø' - 217: 255, # 'Ù' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Þ' - 223: 255, # 'ß' - 224: 255, # 'à' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ã' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'æ' - 231: 255, # 'ç' - 232: 255, # 'è' - 233: 255, # 'é' - 234: 255, # 'ê' - 235: 255, # 'ë' - 236: 255, # 'ì' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ï' - 240: 255, # 'ð' - 241: 255, # 'ñ' - 242: 255, # 'ò' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ø' - 249: 255, # 'ù' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'þ' - 255: 255, # 'ÿ' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 
251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 29, # 'A' + 66: 50, # 'B' + 67: 33, # 'C' + 68: 53, # 'D' + 69: 39, # 'E' + 70: 62, # 'F' + 71: 61, # 'G' + 72: 70, # 'H' + 73: 49, # 'I' + 74: 72, # 'J' + 75: 79, # 'K' + 76: 60, # 'L' + 77: 44, # 'M' + 78: 54, # 'N' + 79: 46, # 'O' + 80: 40, # 'P' + 81: 91, # 'Q' + 82: 52, # 'R' + 83: 38, # 'S' + 84: 55, # 'T' + 85: 71, # 'U' + 86: 74, # 'V' + 87: 77, # 'W' + 88: 83, # 'X' + 89: 92, # 'Y' + 90: 94, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 2, # 'a' + 98: 21, # 'b' + 99: 13, # 'c' + 100: 9, # 'd' + 101: 3, # 'e' + 102: 19, # 'f' + 103: 17, # 'g' + 104: 22, # 'h' + 105: 6, # 'i' + 106: 47, # 'j' + 107: 64, # 'k' + 108: 14, # 'l' + 109: 11, # 'm' + 110: 8, # 'n' + 111: 4, # 'o' + 112: 15, # 'p' + 113: 26, # 'q' + 114: 7, # 'r' + 115: 5, # 's' + 116: 10, # 't' + 117: 12, # 'u' + 118: 18, # 'v' + 119: 75, # 'w' + 120: 41, # 'x' + 121: 57, # 'y' + 122: 37, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # '\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 
251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 253, # '¡' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '€' + 165: 253, # '¥' + 166: 255, # 'Š' + 167: 253, # '§' + 168: 255, # 'š' + 169: 253, # '©' + 170: 255, # 'ª' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 253, # '¯' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 255, # 'Ž' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 255, # 'ž' + 185: 252, # '¹' + 186: 255, # 'º' + 187: 253, # '»' + 188: 255, # 'Œ' + 189: 255, # 'œ' + 190: 255, # 'Ÿ' + 191: 253, # '¿' + 192: 255, # 'À' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ã' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Æ' + 199: 255, # 'Ç' + 200: 255, # 'È' + 201: 255, # 'É' + 202: 255, # 'Ê' + 203: 255, # 'Ë' + 204: 255, # 'Ì' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ï' + 208: 255, # 'Ð' + 209: 255, # 'Ñ' + 210: 255, # 'Ò' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ø' + 217: 255, # 'Ù' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Þ' + 223: 255, # 'ß' + 224: 255, # 'à' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ã' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'æ' + 231: 255, # 'ç' + 232: 255, # 'è' + 233: 255, # 'é' + 234: 255, # 'ê' + 235: 255, # 'ë' + 236: 255, # 'ì' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ï' + 240: 255, # 'ð' + 241: 255, # 'ñ' + 242: 255, # 'ò' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ø' + 249: 255, # 'ù' + 
250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'þ' + 255: 255, # 'ÿ' } -ISO_8859_15_PORTUGUESE_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-15', - language='Portuguese', - char_to_order_map=ISO_8859_15_PORTUGUESE_CHAR_TO_ORDER, - language_model=PORTUGUESE_LANG_MODEL, - typical_positive_ratio=0.9365045721978503, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÁÂÃÇÉÊÍÓÔÕÚàáâãçéêíóôõú') - +ISO_8859_15_PORTUGUESE_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-15", + language="Portuguese", + char_to_order_map=ISO_8859_15_PORTUGUESE_CHAR_TO_ORDER, + language_model=PORTUGUESE_LANG_MODEL, + typical_positive_ratio=0.9365045721978503, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÁÂÃÇÉÊÍÓÔÕÚàáâãçéêíóôõú", +) diff --git a/chardet/langromanianmodel.py b/chardet/langromanianmodel.py index 262bab47..51b6e0ac 100644 --- a/chardet/langromanianmodel.py +++ b/chardet/langromanianmodel.py @@ -1,9 +1,7 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -62768,536 +62766,539 @@ # Character Mapping Table(s): ISO_8859_2_ROMANIAN_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' 
- 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 34, # 'A' - 66: 45, # 'B' - 67: 35, # 'C' - 68: 46, # 'D' - 69: 54, # 'E' - 70: 61, # 'F' - 71: 60, # 'G' - 72: 67, # 'H' - 73: 41, # 'I' - 74: 69, # 'J' - 75: 73, # 'K' - 76: 49, # 'L' - 77: 42, # 'M' - 78: 56, # 'N' - 79: 66, # 'O' - 80: 40, # 'P' - 81: 95, # 'Q' - 82: 43, # 'R' - 83: 36, # 'S' - 84: 57, # 'T' - 85: 63, # 'U' - 86: 65, # 'V' - 87: 74, # 'W' - 88: 77, # 'X' - 89: 85, # 'Y' - 90: 80, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 4, # 'a' - 98: 24, # 'b' - 99: 11, # 'c' - 100: 13, # 'd' - 101: 2, # 'e' - 102: 18, # 'f' - 103: 21, # 'g' - 104: 29, # 'h' - 105: 3, # 'i' - 106: 55, # 'j' - 107: 59, # 'k' - 108: 9, # 'l' - 109: 14, # 'm' - 110: 6, # 'n' - 111: 10, # 'o' - 112: 15, # 'p' - 113: 87, # 'q' - 114: 5, # 'r' - 115: 12, # 's' - 116: 7, # 't' - 117: 8, # 'u' - 118: 19, # 'v' - 119: 68, # 'w' - 120: 48, # 'x' - 121: 62, # 'y' - 122: 26, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, # '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 
251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' - 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 255, # 'Ą' - 162: 253, # '˘' - 163: 255, # 'Ł' - 164: 253, # '¤' - 165: 255, # 'Ľ' - 166: 255, # 'Ś' - 167: 253, # '§' - 168: 253, # '¨' - 169: 255, # 'Š' - 170: 255, # 'Ş' - 171: 255, # 'Ť' - 172: 255, # 'Ź' - 173: 251, # '\xad' - 174: 255, # 'Ž' - 175: 255, # 'Ż' - 176: 253, # '°' - 177: 255, # 'ą' - 178: 253, # '˛' - 179: 255, # 'ł' - 180: 253, # '´' - 181: 255, # 'ľ' - 182: 255, # 'ś' - 183: 255, # 'ˇ' - 184: 253, # '¸' - 185: 255, # 'š' - 186: 255, # 'ş' - 187: 255, # 'ť' - 188: 255, # 'ź' - 189: 253, # '˝' - 190: 255, # 'ž' - 191: 255, # 'ż' - 192: 255, # 'Ŕ' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ă' - 196: 255, # 'Ä' - 197: 255, # 'Ĺ' - 198: 255, # 'Ć' - 199: 255, # 'Ç' - 200: 255, # 'Č' - 201: 255, # 'É' - 202: 255, # 'Ę' - 203: 255, # 'Ë' - 204: 255, # 'Ě' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ď' - 208: 255, # 'Đ' - 209: 255, # 'Ń' - 210: 255, # 'Ň' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Ő' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ř' - 217: 255, # 'Ů' - 218: 255, # 'Ú' - 219: 255, # 'Ű' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Ţ' - 223: 255, # 'ß' - 224: 255, # 'ŕ' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ă' - 228: 255, # 'ä' - 229: 255, # 'ĺ' - 230: 255, # 'ć' - 231: 255, # 'ç' - 232: 255, # 'č' - 233: 255, # 'é' - 234: 255, # 'ę' - 235: 255, # 'ë' - 236: 255, # 'ě' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ď' - 240: 255, # 'đ' - 241: 255, # 'ń' - 242: 255, # 'ň' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'ő' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ř' - 249: 255, # 'ů' - 250: 255, # 'ú' - 251: 255, # 'ű' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'ţ' - 255: 253, # '˙' + 0: 251, # '\x00' + 
1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' 
+ 64: 253, # '@' + 65: 34, # 'A' + 66: 45, # 'B' + 67: 35, # 'C' + 68: 46, # 'D' + 69: 54, # 'E' + 70: 61, # 'F' + 71: 60, # 'G' + 72: 67, # 'H' + 73: 41, # 'I' + 74: 69, # 'J' + 75: 73, # 'K' + 76: 49, # 'L' + 77: 42, # 'M' + 78: 56, # 'N' + 79: 66, # 'O' + 80: 40, # 'P' + 81: 95, # 'Q' + 82: 43, # 'R' + 83: 36, # 'S' + 84: 57, # 'T' + 85: 63, # 'U' + 86: 65, # 'V' + 87: 74, # 'W' + 88: 77, # 'X' + 89: 85, # 'Y' + 90: 80, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 4, # 'a' + 98: 24, # 'b' + 99: 11, # 'c' + 100: 13, # 'd' + 101: 2, # 'e' + 102: 18, # 'f' + 103: 21, # 'g' + 104: 29, # 'h' + 105: 3, # 'i' + 106: 55, # 'j' + 107: 59, # 'k' + 108: 9, # 'l' + 109: 14, # 'm' + 110: 6, # 'n' + 111: 10, # 'o' + 112: 15, # 'p' + 113: 87, # 'q' + 114: 5, # 'r' + 115: 12, # 's' + 116: 7, # 't' + 117: 8, # 'u' + 118: 19, # 'v' + 119: 68, # 'w' + 120: 48, # 'x' + 121: 62, # 'y' + 122: 26, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # '\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 255, # 'Ą' + 162: 253, # '˘' + 163: 255, # 'Ł' + 164: 253, # '¤' + 165: 255, # 'Ľ' + 166: 255, # 'Ś' + 167: 253, # '§' + 168: 253, # '¨' + 169: 255, # 'Š' + 170: 255, # 'Ş' + 171: 255, # 'Ť' + 172: 255, # 'Ź' + 173: 251, # '\xad' + 174: 
255, # 'Ž' + 175: 255, # 'Ż' + 176: 253, # '°' + 177: 255, # 'ą' + 178: 253, # '˛' + 179: 255, # 'ł' + 180: 253, # '´' + 181: 255, # 'ľ' + 182: 255, # 'ś' + 183: 255, # 'ˇ' + 184: 253, # '¸' + 185: 255, # 'š' + 186: 255, # 'ş' + 187: 255, # 'ť' + 188: 255, # 'ź' + 189: 253, # '˝' + 190: 255, # 'ž' + 191: 255, # 'ż' + 192: 255, # 'Ŕ' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ă' + 196: 255, # 'Ä' + 197: 255, # 'Ĺ' + 198: 255, # 'Ć' + 199: 255, # 'Ç' + 200: 255, # 'Č' + 201: 255, # 'É' + 202: 255, # 'Ę' + 203: 255, # 'Ë' + 204: 255, # 'Ě' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ď' + 208: 255, # 'Đ' + 209: 255, # 'Ń' + 210: 255, # 'Ň' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Ő' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ř' + 217: 255, # 'Ů' + 218: 255, # 'Ú' + 219: 255, # 'Ű' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Ţ' + 223: 255, # 'ß' + 224: 255, # 'ŕ' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ă' + 228: 255, # 'ä' + 229: 255, # 'ĺ' + 230: 255, # 'ć' + 231: 255, # 'ç' + 232: 255, # 'č' + 233: 255, # 'é' + 234: 255, # 'ę' + 235: 255, # 'ë' + 236: 255, # 'ě' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ď' + 240: 255, # 'đ' + 241: 255, # 'ń' + 242: 255, # 'ň' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'ő' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ř' + 249: 255, # 'ů' + 250: 255, # 'ú' + 251: 255, # 'ű' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'ţ' + 255: 253, # '˙' } -ISO_8859_2_ROMANIAN_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-2', - language='Romanian', - char_to_order_map=ISO_8859_2_ROMANIAN_CHAR_TO_ORDER, - language_model=ROMANIAN_LANG_MODEL, - typical_positive_ratio=0.9110416658876097, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÂÎâîĂăȘșȚț') +ISO_8859_2_ROMANIAN_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-2", + language="Romanian", + char_to_order_map=ISO_8859_2_ROMANIAN_CHAR_TO_ORDER, + 
language_model=ROMANIAN_LANG_MODEL, + typical_positive_ratio=0.9110416658876097, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÂÎâîĂăȘșȚț", +) WINDOWS_1250_ROMANIAN_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' 
- 64: 253, # '@' - 65: 34, # 'A' - 66: 45, # 'B' - 67: 35, # 'C' - 68: 46, # 'D' - 69: 54, # 'E' - 70: 61, # 'F' - 71: 60, # 'G' - 72: 67, # 'H' - 73: 41, # 'I' - 74: 69, # 'J' - 75: 73, # 'K' - 76: 49, # 'L' - 77: 42, # 'M' - 78: 56, # 'N' - 79: 66, # 'O' - 80: 40, # 'P' - 81: 95, # 'Q' - 82: 43, # 'R' - 83: 36, # 'S' - 84: 57, # 'T' - 85: 63, # 'U' - 86: 65, # 'V' - 87: 74, # 'W' - 88: 77, # 'X' - 89: 85, # 'Y' - 90: 80, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 4, # 'a' - 98: 24, # 'b' - 99: 11, # 'c' - 100: 13, # 'd' - 101: 2, # 'e' - 102: 18, # 'f' - 103: 21, # 'g' - 104: 29, # 'h' - 105: 3, # 'i' - 106: 55, # 'j' - 107: 59, # 'k' - 108: 9, # 'l' - 109: 14, # 'm' - 110: 6, # 'n' - 111: 10, # 'o' - 112: 15, # 'p' - 113: 87, # 'q' - 114: 5, # 'r' - 115: 12, # 's' - 116: 7, # 't' - 117: 8, # 'u' - 118: 19, # 'v' - 119: 68, # 'w' - 120: 48, # 'x' - 121: 62, # 'y' - 122: 26, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 253, # '€' - 129: 255, # None - 130: 253, # '‚' - 131: 255, # None - 132: 253, # '„' - 133: 253, # '…' - 134: 253, # '†' - 135: 253, # '‡' - 136: 255, # None - 137: 253, # '‰' - 138: 255, # 'Š' - 139: 253, # '‹' - 140: 255, # 'Ś' - 141: 255, # 'Ť' - 142: 255, # 'Ž' - 143: 255, # 'Ź' - 144: 255, # None - 145: 253, # '‘' - 146: 253, # '’' - 147: 253, # '“' - 148: 253, # '”' - 149: 253, # '•' - 150: 253, # '–' - 151: 253, # '—' - 152: 255, # None - 153: 253, # '™' - 154: 255, # 'š' - 155: 253, # '›' - 156: 255, # 'ś' - 157: 255, # 'ť' - 158: 255, # 'ž' - 159: 255, # 'ź' - 160: 251, # '\xa0' - 161: 255, # 'ˇ' - 162: 253, # '˘' - 163: 255, # 'Ł' - 164: 253, # '¤' - 165: 255, # 'Ą' - 166: 253, # '¦' - 167: 253, # '§' - 168: 253, # '¨' - 169: 253, # '©' - 170: 255, # 'Ş' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 255, # 'Ż' - 176: 253, # '°' - 177: 253, # '±' - 178: 253, # '˛' - 179: 
255, # 'ł' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 253, # '¸' - 185: 255, # 'ą' - 186: 255, # 'ş' - 187: 253, # '»' - 188: 255, # 'Ľ' - 189: 253, # '˝' - 190: 255, # 'ľ' - 191: 255, # 'ż' - 192: 255, # 'Ŕ' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ă' - 196: 255, # 'Ä' - 197: 255, # 'Ĺ' - 198: 255, # 'Ć' - 199: 255, # 'Ç' - 200: 255, # 'Č' - 201: 255, # 'É' - 202: 255, # 'Ę' - 203: 255, # 'Ë' - 204: 255, # 'Ě' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ď' - 208: 255, # 'Đ' - 209: 255, # 'Ń' - 210: 255, # 'Ň' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Ő' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ř' - 217: 255, # 'Ů' - 218: 255, # 'Ú' - 219: 255, # 'Ű' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Ţ' - 223: 255, # 'ß' - 224: 255, # 'ŕ' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ă' - 228: 255, # 'ä' - 229: 255, # 'ĺ' - 230: 255, # 'ć' - 231: 255, # 'ç' - 232: 255, # 'č' - 233: 255, # 'é' - 234: 255, # 'ę' - 235: 255, # 'ë' - 236: 255, # 'ě' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ď' - 240: 255, # 'đ' - 241: 255, # 'ń' - 242: 255, # 'ň' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'ő' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ř' - 249: 255, # 'ů' - 250: 255, # 'ú' - 251: 255, # 'ű' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'ţ' - 255: 253, # '˙' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # 
'\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 34, # 'A' + 66: 45, # 'B' + 67: 35, # 'C' + 68: 46, # 'D' + 69: 54, # 'E' + 70: 61, # 'F' + 71: 60, # 'G' + 72: 67, # 'H' + 73: 41, # 'I' + 74: 69, # 'J' + 75: 73, # 'K' + 76: 49, # 'L' + 77: 42, # 'M' + 78: 56, # 'N' + 79: 66, # 'O' + 80: 40, # 'P' + 81: 95, # 'Q' + 82: 43, # 'R' + 83: 36, # 'S' + 84: 57, # 'T' + 85: 63, # 'U' + 86: 65, # 'V' + 87: 74, # 'W' + 88: 77, # 'X' + 89: 85, # 'Y' + 90: 80, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 4, # 'a' + 98: 24, # 'b' + 99: 11, # 'c' + 100: 13, # 'd' + 101: 2, # 'e' + 102: 18, # 'f' + 103: 21, # 'g' + 104: 29, # 'h' + 105: 3, # 'i' + 106: 55, # 'j' + 107: 59, # 'k' + 108: 9, # 'l' + 109: 14, # 'm' + 110: 6, # 'n' + 111: 10, # 'o' + 112: 15, # 'p' + 113: 87, # 'q' + 114: 5, # 'r' + 115: 12, # 's' + 116: 7, # 't' + 117: 8, # 'u' + 118: 19, # 'v' + 119: 68, # 'w' + 120: 48, # 'x' + 121: 62, # 'y' + 122: 26, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 253, # '€' + 129: 255, # None + 130: 253, # '‚' + 131: 255, # None + 132: 253, # '„' + 133: 253, # '…' + 134: 253, # '†' + 135: 253, # '‡' + 136: 255, # None + 137: 253, # '‰' + 138: 255, # 'Š' + 139: 253, # '‹' + 140: 255, # 'Ś' + 141: 255, # 'Ť' + 142: 255, # 'Ž' + 143: 255, # 'Ź' + 144: 255, # None + 145: 253, # '‘' + 146: 253, # '’' + 147: 253, # '“' + 148: 253, # '”' + 149: 
253, # '•' + 150: 253, # '–' + 151: 253, # '—' + 152: 255, # None + 153: 253, # '™' + 154: 255, # 'š' + 155: 253, # '›' + 156: 255, # 'ś' + 157: 255, # 'ť' + 158: 255, # 'ž' + 159: 255, # 'ź' + 160: 251, # '\xa0' + 161: 255, # 'ˇ' + 162: 253, # '˘' + 163: 255, # 'Ł' + 164: 253, # '¤' + 165: 255, # 'Ą' + 166: 253, # '¦' + 167: 253, # '§' + 168: 253, # '¨' + 169: 253, # '©' + 170: 255, # 'Ş' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 255, # 'Ż' + 176: 253, # '°' + 177: 253, # '±' + 178: 253, # '˛' + 179: 255, # 'ł' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 253, # '¸' + 185: 255, # 'ą' + 186: 255, # 'ş' + 187: 253, # '»' + 188: 255, # 'Ľ' + 189: 253, # '˝' + 190: 255, # 'ľ' + 191: 255, # 'ż' + 192: 255, # 'Ŕ' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ă' + 196: 255, # 'Ä' + 197: 255, # 'Ĺ' + 198: 255, # 'Ć' + 199: 255, # 'Ç' + 200: 255, # 'Č' + 201: 255, # 'É' + 202: 255, # 'Ę' + 203: 255, # 'Ë' + 204: 255, # 'Ě' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ď' + 208: 255, # 'Đ' + 209: 255, # 'Ń' + 210: 255, # 'Ň' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Ő' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ř' + 217: 255, # 'Ů' + 218: 255, # 'Ú' + 219: 255, # 'Ű' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Ţ' + 223: 255, # 'ß' + 224: 255, # 'ŕ' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ă' + 228: 255, # 'ä' + 229: 255, # 'ĺ' + 230: 255, # 'ć' + 231: 255, # 'ç' + 232: 255, # 'č' + 233: 255, # 'é' + 234: 255, # 'ę' + 235: 255, # 'ë' + 236: 255, # 'ě' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ď' + 240: 255, # 'đ' + 241: 255, # 'ń' + 242: 255, # 'ň' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'ő' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ř' + 249: 255, # 'ů' + 250: 255, # 'ú' + 251: 255, # 'ű' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'ţ' + 255: 253, # '˙' } -WINDOWS_1250_ROMANIAN_MODEL = 
SingleByteCharSetModel(charset_name='WINDOWS-1250', - language='Romanian', - char_to_order_map=WINDOWS_1250_ROMANIAN_CHAR_TO_ORDER, - language_model=ROMANIAN_LANG_MODEL, - typical_positive_ratio=0.9121161208271691, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÂÎâîĂăȘșȚț') - +WINDOWS_1250_ROMANIAN_MODEL = SingleByteCharSetModel( + charset_name="WINDOWS-1250", + language="Romanian", + char_to_order_map=WINDOWS_1250_ROMANIAN_CHAR_TO_ORDER, + language_model=ROMANIAN_LANG_MODEL, + typical_positive_ratio=0.9121161208271691, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÂÎâîĂăȘșȚț", +) diff --git a/chardet/langserbianmodel.py b/chardet/langserbianmodel.py index d15e85c0..5e854f07 100644 --- a/chardet/langserbianmodel.py +++ b/chardet/langserbianmodel.py @@ -1,9 +1,7 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -28410,1070 +28408,1077 @@ # Character Mapping Table(s): WINDOWS_1251_SERBIAN_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' 
- 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 101, # 'A' - 66: 106, # 'B' - 67: 99, # 'C' - 68: 122, # 'D' - 69: 125, # 'E' - 70: 129, # 'F' - 71: 130, # 'G' - 72: 116, # 'H' - 73: 90, # 'I' - 74: 141, # 'J' - 75: 136, # 'K' - 76: 124, # 'L' - 77: 105, # 'M' - 78: 110, # 'N' - 79: 118, # 'O' - 80: 102, # 'P' - 81: 168, # 'Q' - 82: 121, # 'R' - 83: 95, # 'S' - 84: 109, # 'T' - 85: 135, # 'U' - 86: 113, # 'V' - 87: 137, # 'W' - 88: 139, # 'X' - 89: 158, # 'Y' - 90: 154, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 30, # 'a' - 98: 92, # 'b' - 99: 81, # 'c' - 100: 64, # 'd' - 101: 31, # 'e' - 102: 97, # 'f' - 103: 86, # 'g' - 104: 87, # 'h' - 105: 33, # 'i' - 106: 75, # 'j' - 107: 74, # 'k' - 108: 55, # 'l' - 109: 66, # 'm' - 110: 42, # 'n' - 111: 39, # 'o' - 112: 78, # 'p' - 113: 150, # 'q' - 114: 43, # 'r' - 115: 49, # 's' - 116: 48, # 't' - 117: 58, # 'u' - 118: 79, # 'v' - 119: 120, # 'w' - 120: 143, # 'x' - 121: 98, # 'y' - 122: 93, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 255, # 'Ђ' - 129: 255, # 'Ѓ' - 130: 253, # '‚' - 131: 255, # 'ѓ' - 132: 253, # '„' - 133: 253, # '…' - 134: 253, # '†' - 135: 253, # '‡' - 136: 253, # '€' - 137: 253, # '‰' - 138: 255, # 'Љ' - 139: 253, # '‹' - 140: 255, # 'Њ' - 141: 255, # 'Ќ' - 142: 255, # 'Ћ' - 143: 255, # 'Џ' - 144: 255, # 'ђ' - 145: 253, # '‘' - 146: 253, # '’' - 147: 253, # '“' - 148: 253, # '”' - 149: 253, # '•' - 150: 253, # '–' - 151: 253, # '—' - 152: 255, # None - 153: 253, # '™' - 154: 255, # 'љ' - 155: 253, # '›' - 156: 255, # 'њ' - 157: 255, # 'ќ' - 158: 255, # 'ћ' - 159: 255, # 'џ' - 160: 251, # '\xa0' - 161: 255, # 'Ў' 
- 162: 255, # 'ў' - 163: 255, # 'Ј' - 164: 253, # '¤' - 165: 255, # 'Ґ' - 166: 253, # '¦' - 167: 253, # '§' - 168: 255, # 'Ё' - 169: 253, # '©' - 170: 255, # 'Є' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 255, # 'Ї' - 176: 253, # '°' - 177: 253, # '±' - 178: 255, # 'І' - 179: 255, # 'і' - 180: 255, # 'ґ' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 255, # 'ё' - 185: 253, # '№' - 186: 255, # 'є' - 187: 253, # '»' - 188: 255, # 'ј' - 189: 255, # 'Ѕ' - 190: 255, # 'ѕ' - 191: 255, # 'ї' - 192: 255, # 'А' - 193: 255, # 'Б' - 194: 255, # 'В' - 195: 255, # 'Г' - 196: 255, # 'Д' - 197: 255, # 'Е' - 198: 255, # 'Ж' - 199: 255, # 'З' - 200: 255, # 'И' - 201: 255, # 'Й' - 202: 255, # 'К' - 203: 255, # 'Л' - 204: 255, # 'М' - 205: 255, # 'Н' - 206: 255, # 'О' - 207: 255, # 'П' - 208: 255, # 'Р' - 209: 255, # 'С' - 210: 255, # 'Т' - 211: 255, # 'У' - 212: 255, # 'Ф' - 213: 255, # 'Х' - 214: 255, # 'Ц' - 215: 255, # 'Ч' - 216: 255, # 'Ш' - 217: 255, # 'Щ' - 218: 255, # 'Ъ' - 219: 255, # 'Ы' - 220: 255, # 'Ь' - 221: 255, # 'Э' - 222: 255, # 'Ю' - 223: 255, # 'Я' - 224: 255, # 'а' - 225: 255, # 'б' - 226: 255, # 'в' - 227: 255, # 'г' - 228: 255, # 'д' - 229: 255, # 'е' - 230: 255, # 'ж' - 231: 255, # 'з' - 232: 255, # 'и' - 233: 255, # 'й' - 234: 255, # 'к' - 235: 255, # 'л' - 236: 255, # 'м' - 237: 255, # 'н' - 238: 255, # 'о' - 239: 255, # 'п' - 240: 255, # 'р' - 241: 255, # 'с' - 242: 255, # 'т' - 243: 255, # 'у' - 244: 255, # 'ф' - 245: 255, # 'х' - 246: 255, # 'ц' - 247: 255, # 'ч' - 248: 255, # 'ш' - 249: 255, # 'щ' - 250: 255, # 'ъ' - 251: 255, # 'ы' - 252: 255, # 'ь' - 253: 255, # 'э' - 254: 255, # 'ю' - 255: 255, # 'я' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' 
+ 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 101, # 'A' + 66: 106, # 'B' + 67: 99, # 'C' + 68: 122, # 'D' + 69: 125, # 'E' + 70: 129, # 'F' + 71: 130, # 'G' + 72: 116, # 'H' + 73: 90, # 'I' + 74: 141, # 'J' + 75: 136, # 'K' + 76: 124, # 'L' + 77: 105, # 'M' + 78: 110, # 'N' + 79: 118, # 'O' + 80: 102, # 'P' + 81: 168, # 'Q' + 82: 121, # 'R' + 83: 95, # 'S' + 84: 109, # 'T' + 85: 135, # 'U' + 86: 113, # 'V' + 87: 137, # 'W' + 88: 139, # 'X' + 89: 158, # 'Y' + 90: 154, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 30, # 'a' + 98: 92, # 'b' + 99: 81, # 'c' + 100: 64, # 'd' + 101: 31, # 'e' + 102: 97, # 'f' + 103: 86, # 'g' + 104: 87, # 'h' + 105: 33, # 'i' + 106: 75, # 'j' + 107: 74, # 'k' + 108: 55, # 'l' + 109: 66, # 'm' + 110: 42, # 'n' + 111: 39, # 'o' + 112: 78, # 'p' + 113: 150, # 'q' + 114: 43, # 'r' + 115: 49, # 's' + 116: 48, # 't' + 117: 58, # 'u' + 118: 79, # 'v' + 119: 120, # 'w' + 120: 143, # 'x' + 121: 98, # 'y' + 122: 93, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 255, # 'Ђ' + 129: 255, # 'Ѓ' + 
130: 253, # '‚' + 131: 255, # 'ѓ' + 132: 253, # '„' + 133: 253, # '…' + 134: 253, # '†' + 135: 253, # '‡' + 136: 253, # '€' + 137: 253, # '‰' + 138: 255, # 'Љ' + 139: 253, # '‹' + 140: 255, # 'Њ' + 141: 255, # 'Ќ' + 142: 255, # 'Ћ' + 143: 255, # 'Џ' + 144: 255, # 'ђ' + 145: 253, # '‘' + 146: 253, # '’' + 147: 253, # '“' + 148: 253, # '”' + 149: 253, # '•' + 150: 253, # '–' + 151: 253, # '—' + 152: 255, # None + 153: 253, # '™' + 154: 255, # 'љ' + 155: 253, # '›' + 156: 255, # 'њ' + 157: 255, # 'ќ' + 158: 255, # 'ћ' + 159: 255, # 'џ' + 160: 251, # '\xa0' + 161: 255, # 'Ў' + 162: 255, # 'ў' + 163: 255, # 'Ј' + 164: 253, # '¤' + 165: 255, # 'Ґ' + 166: 253, # '¦' + 167: 253, # '§' + 168: 255, # 'Ё' + 169: 253, # '©' + 170: 255, # 'Є' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 255, # 'Ї' + 176: 253, # '°' + 177: 253, # '±' + 178: 255, # 'І' + 179: 255, # 'і' + 180: 255, # 'ґ' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 255, # 'ё' + 185: 253, # '№' + 186: 255, # 'є' + 187: 253, # '»' + 188: 255, # 'ј' + 189: 255, # 'Ѕ' + 190: 255, # 'ѕ' + 191: 255, # 'ї' + 192: 255, # 'А' + 193: 255, # 'Б' + 194: 255, # 'В' + 195: 255, # 'Г' + 196: 255, # 'Д' + 197: 255, # 'Е' + 198: 255, # 'Ж' + 199: 255, # 'З' + 200: 255, # 'И' + 201: 255, # 'Й' + 202: 255, # 'К' + 203: 255, # 'Л' + 204: 255, # 'М' + 205: 255, # 'Н' + 206: 255, # 'О' + 207: 255, # 'П' + 208: 255, # 'Р' + 209: 255, # 'С' + 210: 255, # 'Т' + 211: 255, # 'У' + 212: 255, # 'Ф' + 213: 255, # 'Х' + 214: 255, # 'Ц' + 215: 255, # 'Ч' + 216: 255, # 'Ш' + 217: 255, # 'Щ' + 218: 255, # 'Ъ' + 219: 255, # 'Ы' + 220: 255, # 'Ь' + 221: 255, # 'Э' + 222: 255, # 'Ю' + 223: 255, # 'Я' + 224: 255, # 'а' + 225: 255, # 'б' + 226: 255, # 'в' + 227: 255, # 'г' + 228: 255, # 'д' + 229: 255, # 'е' + 230: 255, # 'ж' + 231: 255, # 'з' + 232: 255, # 'и' + 233: 255, # 'й' + 234: 255, # 'к' + 235: 255, # 'л' + 236: 255, # 'м' + 237: 255, # 'н' + 238: 255, # 'о' + 239: 255, # 'п' + 240: 255, # 
'р' + 241: 255, # 'с' + 242: 255, # 'т' + 243: 255, # 'у' + 244: 255, # 'ф' + 245: 255, # 'х' + 246: 255, # 'ц' + 247: 255, # 'ч' + 248: 255, # 'ш' + 249: 255, # 'щ' + 250: 255, # 'ъ' + 251: 255, # 'ы' + 252: 255, # 'ь' + 253: 255, # 'э' + 254: 255, # 'ю' + 255: 255, # 'я' } -WINDOWS_1251_SERBIAN_MODEL = SingleByteCharSetModel(charset_name='WINDOWS-1251', - language='Serbian', - char_to_order_map=WINDOWS_1251_SERBIAN_CHAR_TO_ORDER, - language_model=SERBIAN_LANG_MODEL, - typical_positive_ratio=0.9087603009002065, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzЂЈЉЊЋЏАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШабвгдежзиклмнопрстуфхцчшђјљњћџ') +WINDOWS_1251_SERBIAN_MODEL = SingleByteCharSetModel( + charset_name="WINDOWS-1251", + language="Serbian", + char_to_order_map=WINDOWS_1251_SERBIAN_CHAR_TO_ORDER, + language_model=SERBIAN_LANG_MODEL, + typical_positive_ratio=0.9087603009002065, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzЂЈЉЊЋЏАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШабвгдежзиклмнопрстуфхцчшђјљњћџ", +) MACCYRILLIC_SERBIAN_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' 
- 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 101, # 'A' - 66: 106, # 'B' - 67: 99, # 'C' - 68: 122, # 'D' - 69: 125, # 'E' - 70: 129, # 'F' - 71: 130, # 'G' - 72: 116, # 'H' - 73: 90, # 'I' - 74: 141, # 'J' - 75: 136, # 'K' - 76: 124, # 'L' - 77: 105, # 'M' - 78: 110, # 'N' - 79: 118, # 'O' - 80: 102, # 'P' - 81: 168, # 'Q' - 82: 121, # 'R' - 83: 95, # 'S' - 84: 109, # 'T' - 85: 135, # 'U' - 86: 113, # 'V' - 87: 137, # 'W' - 88: 139, # 'X' - 89: 158, # 'Y' - 90: 154, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 30, # 'a' - 98: 92, # 'b' - 99: 81, # 'c' - 100: 64, # 'd' - 101: 31, # 'e' - 102: 97, # 'f' - 103: 86, # 'g' - 104: 87, # 'h' - 105: 33, # 'i' - 106: 75, # 'j' - 107: 74, # 'k' - 108: 55, # 'l' - 109: 66, # 'm' - 110: 42, # 'n' - 111: 39, # 'o' - 112: 78, # 'p' - 113: 150, # 'q' - 114: 43, # 'r' - 115: 49, # 's' - 116: 48, # 't' - 117: 58, # 'u' - 118: 79, # 'v' - 119: 120, # 'w' - 120: 143, # 'x' - 121: 98, # 'y' - 122: 93, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 255, # 'А' - 129: 255, # 'Б' - 130: 255, # 'В' - 131: 255, # 'Г' - 132: 255, # 'Д' - 133: 255, # 'Е' - 134: 255, # 'Ж' - 135: 255, # 'З' - 136: 255, # 'И' - 137: 255, # 'Й' - 138: 255, # 'К' - 139: 255, # 'Л' - 140: 255, # 'М' - 141: 255, # 'Н' - 142: 255, # 'О' - 143: 255, # 'П' - 144: 255, # 'Р' - 145: 255, # 'С' - 146: 255, # 'Т' - 147: 255, # 'У' - 148: 255, # 'Ф' - 149: 255, # 'Х' - 150: 255, # 'Ц' - 151: 255, # 'Ч' - 152: 255, # 'Ш' - 153: 255, # 'Щ' - 154: 255, # 'Ъ' - 155: 255, # 'Ы' - 156: 255, # 'Ь' - 157: 255, # 'Э' - 158: 255, # 'Ю' - 159: 255, # 'Я' - 160: 253, # '†' - 161: 253, # '°' - 
162: 255, # 'Ґ' - 163: 253, # '£' - 164: 253, # '§' - 165: 253, # '•' - 166: 253, # '¶' - 167: 255, # 'І' - 168: 253, # '®' - 169: 253, # '©' - 170: 253, # '™' - 171: 255, # 'Ђ' - 172: 255, # 'ђ' - 173: 253, # '≠' - 174: 255, # 'Ѓ' - 175: 255, # 'ѓ' - 176: 253, # '∞' - 177: 253, # '±' - 178: 253, # '≤' - 179: 253, # '≥' - 180: 255, # 'і' - 181: 255, # 'µ' - 182: 255, # 'ґ' - 183: 255, # 'Ј' - 184: 255, # 'Є' - 185: 255, # 'є' - 186: 255, # 'Ї' - 187: 255, # 'ї' - 188: 255, # 'Љ' - 189: 255, # 'љ' - 190: 255, # 'Њ' - 191: 255, # 'њ' - 192: 255, # 'ј' - 193: 255, # 'Ѕ' - 194: 253, # '¬' - 195: 253, # '√' - 196: 255, # 'ƒ' - 197: 253, # '≈' - 198: 253, # '∆' - 199: 253, # '«' - 200: 253, # '»' - 201: 253, # '…' - 202: 251, # '\xa0' - 203: 255, # 'Ћ' - 204: 255, # 'ћ' - 205: 255, # 'Ќ' - 206: 255, # 'ќ' - 207: 255, # 'ѕ' - 208: 253, # '–' - 209: 253, # '—' - 210: 253, # '“' - 211: 253, # '”' - 212: 253, # '‘' - 213: 253, # '’' - 214: 253, # '÷' - 215: 253, # '„' - 216: 255, # 'Ў' - 217: 255, # 'ў' - 218: 255, # 'Џ' - 219: 255, # 'џ' - 220: 253, # '№' - 221: 255, # 'Ё' - 222: 255, # 'ё' - 223: 255, # 'я' - 224: 255, # 'а' - 225: 255, # 'б' - 226: 255, # 'в' - 227: 255, # 'г' - 228: 255, # 'д' - 229: 255, # 'е' - 230: 255, # 'ж' - 231: 255, # 'з' - 232: 255, # 'и' - 233: 255, # 'й' - 234: 255, # 'к' - 235: 255, # 'л' - 236: 255, # 'м' - 237: 255, # 'н' - 238: 255, # 'о' - 239: 255, # 'п' - 240: 255, # 'р' - 241: 255, # 'с' - 242: 255, # 'т' - 243: 255, # 'у' - 244: 255, # 'ф' - 245: 255, # 'х' - 246: 255, # 'ц' - 247: 255, # 'ч' - 248: 255, # 'ш' - 249: 255, # 'щ' - 250: 255, # 'ъ' - 251: 255, # 'ы' - 252: 255, # 'ь' - 253: 255, # 'э' - 254: 255, # 'ю' - 255: 253, # '€' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 
16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 101, # 'A' + 66: 106, # 'B' + 67: 99, # 'C' + 68: 122, # 'D' + 69: 125, # 'E' + 70: 129, # 'F' + 71: 130, # 'G' + 72: 116, # 'H' + 73: 90, # 'I' + 74: 141, # 'J' + 75: 136, # 'K' + 76: 124, # 'L' + 77: 105, # 'M' + 78: 110, # 'N' + 79: 118, # 'O' + 80: 102, # 'P' + 81: 168, # 'Q' + 82: 121, # 'R' + 83: 95, # 'S' + 84: 109, # 'T' + 85: 135, # 'U' + 86: 113, # 'V' + 87: 137, # 'W' + 88: 139, # 'X' + 89: 158, # 'Y' + 90: 154, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 30, # 'a' + 98: 92, # 'b' + 99: 81, # 'c' + 100: 64, # 'd' + 101: 31, # 'e' + 102: 97, # 'f' + 103: 86, # 'g' + 104: 87, # 'h' + 105: 33, # 'i' + 106: 75, # 'j' + 107: 74, # 'k' + 108: 55, # 'l' + 109: 66, # 'm' + 110: 42, # 'n' + 111: 39, # 'o' + 112: 78, # 'p' + 113: 150, # 'q' + 114: 43, # 'r' + 115: 49, # 's' + 116: 48, # 't' + 117: 58, # 'u' + 118: 79, # 'v' + 119: 120, # 'w' + 120: 143, # 'x' + 121: 98, # 'y' + 122: 93, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 255, # 'А' + 129: 255, # 'Б' + 130: 
255, # 'В' + 131: 255, # 'Г' + 132: 255, # 'Д' + 133: 255, # 'Е' + 134: 255, # 'Ж' + 135: 255, # 'З' + 136: 255, # 'И' + 137: 255, # 'Й' + 138: 255, # 'К' + 139: 255, # 'Л' + 140: 255, # 'М' + 141: 255, # 'Н' + 142: 255, # 'О' + 143: 255, # 'П' + 144: 255, # 'Р' + 145: 255, # 'С' + 146: 255, # 'Т' + 147: 255, # 'У' + 148: 255, # 'Ф' + 149: 255, # 'Х' + 150: 255, # 'Ц' + 151: 255, # 'Ч' + 152: 255, # 'Ш' + 153: 255, # 'Щ' + 154: 255, # 'Ъ' + 155: 255, # 'Ы' + 156: 255, # 'Ь' + 157: 255, # 'Э' + 158: 255, # 'Ю' + 159: 255, # 'Я' + 160: 253, # '†' + 161: 253, # '°' + 162: 255, # 'Ґ' + 163: 253, # '£' + 164: 253, # '§' + 165: 253, # '•' + 166: 253, # '¶' + 167: 255, # 'І' + 168: 253, # '®' + 169: 253, # '©' + 170: 253, # '™' + 171: 255, # 'Ђ' + 172: 255, # 'ђ' + 173: 253, # '≠' + 174: 255, # 'Ѓ' + 175: 255, # 'ѓ' + 176: 253, # '∞' + 177: 253, # '±' + 178: 253, # '≤' + 179: 253, # '≥' + 180: 255, # 'і' + 181: 255, # 'µ' + 182: 255, # 'ґ' + 183: 255, # 'Ј' + 184: 255, # 'Є' + 185: 255, # 'є' + 186: 255, # 'Ї' + 187: 255, # 'ї' + 188: 255, # 'Љ' + 189: 255, # 'љ' + 190: 255, # 'Њ' + 191: 255, # 'њ' + 192: 255, # 'ј' + 193: 255, # 'Ѕ' + 194: 253, # '¬' + 195: 253, # '√' + 196: 255, # 'ƒ' + 197: 253, # '≈' + 198: 253, # '∆' + 199: 253, # '«' + 200: 253, # '»' + 201: 253, # '…' + 202: 251, # '\xa0' + 203: 255, # 'Ћ' + 204: 255, # 'ћ' + 205: 255, # 'Ќ' + 206: 255, # 'ќ' + 207: 255, # 'ѕ' + 208: 253, # '–' + 209: 253, # '—' + 210: 253, # '“' + 211: 253, # '”' + 212: 253, # '‘' + 213: 253, # '’' + 214: 253, # '÷' + 215: 253, # '„' + 216: 255, # 'Ў' + 217: 255, # 'ў' + 218: 255, # 'Џ' + 219: 255, # 'џ' + 220: 253, # '№' + 221: 255, # 'Ё' + 222: 255, # 'ё' + 223: 255, # 'я' + 224: 255, # 'а' + 225: 255, # 'б' + 226: 255, # 'в' + 227: 255, # 'г' + 228: 255, # 'д' + 229: 255, # 'е' + 230: 255, # 'ж' + 231: 255, # 'з' + 232: 255, # 'и' + 233: 255, # 'й' + 234: 255, # 'к' + 235: 255, # 'л' + 236: 255, # 'м' + 237: 255, # 'н' + 238: 255, # 'о' + 239: 255, # 'п' + 240: 255, # 'р' + 
241: 255, # 'с' + 242: 255, # 'т' + 243: 255, # 'у' + 244: 255, # 'ф' + 245: 255, # 'х' + 246: 255, # 'ц' + 247: 255, # 'ч' + 248: 255, # 'ш' + 249: 255, # 'щ' + 250: 255, # 'ъ' + 251: 255, # 'ы' + 252: 255, # 'ь' + 253: 255, # 'э' + 254: 255, # 'ю' + 255: 253, # '€' } -MACCYRILLIC_SERBIAN_MODEL = SingleByteCharSetModel(charset_name='MacCyrillic', - language='Serbian', - char_to_order_map=MACCYRILLIC_SERBIAN_CHAR_TO_ORDER, - language_model=SERBIAN_LANG_MODEL, - typical_positive_ratio=0.9087603009002065, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzЂЈЉЊЋЏАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШабвгдежзиклмнопрстуфхцчшђјљњћџ') +MACCYRILLIC_SERBIAN_MODEL = SingleByteCharSetModel( + charset_name="MacCyrillic", + language="Serbian", + char_to_order_map=MACCYRILLIC_SERBIAN_CHAR_TO_ORDER, + language_model=SERBIAN_LANG_MODEL, + typical_positive_ratio=0.9087603009002065, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzЂЈЉЊЋЏАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШабвгдежзиклмнопрстуфхцчшђјљњћџ", +) IBM855_SERBIAN_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' 
- 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 101, # 'A' - 66: 106, # 'B' - 67: 99, # 'C' - 68: 122, # 'D' - 69: 125, # 'E' - 70: 129, # 'F' - 71: 130, # 'G' - 72: 116, # 'H' - 73: 90, # 'I' - 74: 141, # 'J' - 75: 136, # 'K' - 76: 124, # 'L' - 77: 105, # 'M' - 78: 110, # 'N' - 79: 118, # 'O' - 80: 102, # 'P' - 81: 168, # 'Q' - 82: 121, # 'R' - 83: 95, # 'S' - 84: 109, # 'T' - 85: 135, # 'U' - 86: 113, # 'V' - 87: 137, # 'W' - 88: 139, # 'X' - 89: 158, # 'Y' - 90: 154, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 30, # 'a' - 98: 92, # 'b' - 99: 81, # 'c' - 100: 64, # 'd' - 101: 31, # 'e' - 102: 97, # 'f' - 103: 86, # 'g' - 104: 87, # 'h' - 105: 33, # 'i' - 106: 75, # 'j' - 107: 74, # 'k' - 108: 55, # 'l' - 109: 66, # 'm' - 110: 42, # 'n' - 111: 39, # 'o' - 112: 78, # 'p' - 113: 150, # 'q' - 114: 43, # 'r' - 115: 49, # 's' - 116: 48, # 't' - 117: 58, # 'u' - 118: 79, # 'v' - 119: 120, # 'w' - 120: 143, # 'x' - 121: 98, # 'y' - 122: 93, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 255, # 'ђ' - 129: 255, # 'Ђ' - 130: 255, # 'ѓ' - 131: 255, # 'Ѓ' - 132: 255, # 'ё' - 133: 255, # 'Ё' - 134: 255, # 'є' - 135: 255, # 'Є' - 136: 255, # 'ѕ' - 137: 255, # 'Ѕ' - 138: 255, # 'і' - 139: 255, # 'І' - 140: 255, # 'ї' - 141: 255, # 'Ї' - 142: 255, # 'ј' - 143: 255, # 'Ј' - 144: 255, # 'љ' - 145: 255, # 'Љ' - 146: 255, # 'њ' - 147: 255, # 'Њ' - 148: 255, # 'ћ' - 149: 255, # 'Ћ' - 150: 255, # 'ќ' - 151: 255, # 'Ќ' - 152: 255, # 'ў' - 153: 255, # 'Ў' - 154: 255, # 'џ' - 155: 255, # 'Џ' - 156: 255, # 'ю' - 157: 255, # 'Ю' - 158: 255, # 'ъ' - 159: 255, # 'Ъ' - 160: 255, # 'а' - 161: 255, # 'А' - 
162: 255, # 'б' - 163: 255, # 'Б' - 164: 255, # 'ц' - 165: 255, # 'Ц' - 166: 255, # 'д' - 167: 255, # 'Д' - 168: 255, # 'е' - 169: 255, # 'Е' - 170: 255, # 'ф' - 171: 255, # 'Ф' - 172: 255, # 'г' - 173: 255, # 'Г' - 174: 253, # '«' - 175: 253, # '»' - 176: 253, # '░' - 177: 253, # '▒' - 178: 253, # '▓' - 179: 253, # '│' - 180: 253, # '┤' - 181: 255, # 'х' - 182: 255, # 'Х' - 183: 255, # 'и' - 184: 255, # 'И' - 185: 253, # '╣' - 186: 253, # '║' - 187: 253, # '╗' - 188: 253, # '╝' - 189: 255, # 'й' - 190: 255, # 'Й' - 191: 253, # '┐' - 192: 253, # '└' - 193: 253, # '┴' - 194: 253, # '┬' - 195: 253, # '├' - 196: 253, # '─' - 197: 253, # '┼' - 198: 255, # 'к' - 199: 255, # 'К' - 200: 253, # '╚' - 201: 253, # '╔' - 202: 253, # '╩' - 203: 253, # '╦' - 204: 253, # '╠' - 205: 253, # '═' - 206: 253, # '╬' - 207: 253, # '¤' - 208: 255, # 'л' - 209: 255, # 'Л' - 210: 255, # 'м' - 211: 255, # 'М' - 212: 255, # 'н' - 213: 255, # 'Н' - 214: 255, # 'о' - 215: 255, # 'О' - 216: 255, # 'п' - 217: 253, # '┘' - 218: 253, # '┌' - 219: 253, # '█' - 220: 253, # '▄' - 221: 255, # 'П' - 222: 255, # 'я' - 223: 253, # '▀' - 224: 255, # 'Я' - 225: 255, # 'р' - 226: 255, # 'Р' - 227: 255, # 'с' - 228: 255, # 'С' - 229: 255, # 'т' - 230: 255, # 'Т' - 231: 255, # 'у' - 232: 255, # 'У' - 233: 255, # 'ж' - 234: 255, # 'Ж' - 235: 255, # 'в' - 236: 255, # 'В' - 237: 255, # 'ь' - 238: 255, # 'Ь' - 239: 253, # '№' - 240: 251, # '\xad' - 241: 255, # 'ы' - 242: 255, # 'Ы' - 243: 255, # 'з' - 244: 255, # 'З' - 245: 255, # 'ш' - 246: 255, # 'Ш' - 247: 255, # 'э' - 248: 255, # 'Э' - 249: 255, # 'щ' - 250: 255, # 'Щ' - 251: 255, # 'ч' - 252: 255, # 'Ч' - 253: 253, # '§' - 254: 253, # '■' - 255: 251, # '\xa0' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' 
+ 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 101, # 'A' + 66: 106, # 'B' + 67: 99, # 'C' + 68: 122, # 'D' + 69: 125, # 'E' + 70: 129, # 'F' + 71: 130, # 'G' + 72: 116, # 'H' + 73: 90, # 'I' + 74: 141, # 'J' + 75: 136, # 'K' + 76: 124, # 'L' + 77: 105, # 'M' + 78: 110, # 'N' + 79: 118, # 'O' + 80: 102, # 'P' + 81: 168, # 'Q' + 82: 121, # 'R' + 83: 95, # 'S' + 84: 109, # 'T' + 85: 135, # 'U' + 86: 113, # 'V' + 87: 137, # 'W' + 88: 139, # 'X' + 89: 158, # 'Y' + 90: 154, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 30, # 'a' + 98: 92, # 'b' + 99: 81, # 'c' + 100: 64, # 'd' + 101: 31, # 'e' + 102: 97, # 'f' + 103: 86, # 'g' + 104: 87, # 'h' + 105: 33, # 'i' + 106: 75, # 'j' + 107: 74, # 'k' + 108: 55, # 'l' + 109: 66, # 'm' + 110: 42, # 'n' + 111: 39, # 'o' + 112: 78, # 'p' + 113: 150, # 'q' + 114: 43, # 'r' + 115: 49, # 's' + 116: 48, # 't' + 117: 58, # 'u' + 118: 79, # 'v' + 119: 120, # 'w' + 120: 143, # 'x' + 121: 98, # 'y' + 122: 93, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 255, # 'ђ' + 129: 255, # 'Ђ' + 
130: 255, # 'ѓ' + 131: 255, # 'Ѓ' + 132: 255, # 'ё' + 133: 255, # 'Ё' + 134: 255, # 'є' + 135: 255, # 'Є' + 136: 255, # 'ѕ' + 137: 255, # 'Ѕ' + 138: 255, # 'і' + 139: 255, # 'І' + 140: 255, # 'ї' + 141: 255, # 'Ї' + 142: 255, # 'ј' + 143: 255, # 'Ј' + 144: 255, # 'љ' + 145: 255, # 'Љ' + 146: 255, # 'њ' + 147: 255, # 'Њ' + 148: 255, # 'ћ' + 149: 255, # 'Ћ' + 150: 255, # 'ќ' + 151: 255, # 'Ќ' + 152: 255, # 'ў' + 153: 255, # 'Ў' + 154: 255, # 'џ' + 155: 255, # 'Џ' + 156: 255, # 'ю' + 157: 255, # 'Ю' + 158: 255, # 'ъ' + 159: 255, # 'Ъ' + 160: 255, # 'а' + 161: 255, # 'А' + 162: 255, # 'б' + 163: 255, # 'Б' + 164: 255, # 'ц' + 165: 255, # 'Ц' + 166: 255, # 'д' + 167: 255, # 'Д' + 168: 255, # 'е' + 169: 255, # 'Е' + 170: 255, # 'ф' + 171: 255, # 'Ф' + 172: 255, # 'г' + 173: 255, # 'Г' + 174: 253, # '«' + 175: 253, # '»' + 176: 253, # '░' + 177: 253, # '▒' + 178: 253, # '▓' + 179: 253, # '│' + 180: 253, # '┤' + 181: 255, # 'х' + 182: 255, # 'Х' + 183: 255, # 'и' + 184: 255, # 'И' + 185: 253, # '╣' + 186: 253, # '║' + 187: 253, # '╗' + 188: 253, # '╝' + 189: 255, # 'й' + 190: 255, # 'Й' + 191: 253, # '┐' + 192: 253, # '└' + 193: 253, # '┴' + 194: 253, # '┬' + 195: 253, # '├' + 196: 253, # '─' + 197: 253, # '┼' + 198: 255, # 'к' + 199: 255, # 'К' + 200: 253, # '╚' + 201: 253, # '╔' + 202: 253, # '╩' + 203: 253, # '╦' + 204: 253, # '╠' + 205: 253, # '═' + 206: 253, # '╬' + 207: 253, # '¤' + 208: 255, # 'л' + 209: 255, # 'Л' + 210: 255, # 'м' + 211: 255, # 'М' + 212: 255, # 'н' + 213: 255, # 'Н' + 214: 255, # 'о' + 215: 255, # 'О' + 216: 255, # 'п' + 217: 253, # '┘' + 218: 253, # '┌' + 219: 253, # '█' + 220: 253, # '▄' + 221: 255, # 'П' + 222: 255, # 'я' + 223: 253, # '▀' + 224: 255, # 'Я' + 225: 255, # 'р' + 226: 255, # 'Р' + 227: 255, # 'с' + 228: 255, # 'С' + 229: 255, # 'т' + 230: 255, # 'Т' + 231: 255, # 'у' + 232: 255, # 'У' + 233: 255, # 'ж' + 234: 255, # 'Ж' + 235: 255, # 'в' + 236: 255, # 'В' + 237: 255, # 'ь' + 238: 255, # 'Ь' + 239: 253, # '№' + 240: 251, # '\xad' 
+ 241: 255, # 'ы' + 242: 255, # 'Ы' + 243: 255, # 'з' + 244: 255, # 'З' + 245: 255, # 'ш' + 246: 255, # 'Ш' + 247: 255, # 'э' + 248: 255, # 'Э' + 249: 255, # 'щ' + 250: 255, # 'Щ' + 251: 255, # 'ч' + 252: 255, # 'Ч' + 253: 253, # '§' + 254: 253, # '■' + 255: 251, # '\xa0' } -IBM855_SERBIAN_MODEL = SingleByteCharSetModel(charset_name='IBM855', - language='Serbian', - char_to_order_map=IBM855_SERBIAN_CHAR_TO_ORDER, - language_model=SERBIAN_LANG_MODEL, - typical_positive_ratio=0.9067233737727759, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzЂЈЉЊЋЏАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШабвгдежзиклмнопрстуфхцчшђјљњћџ') +IBM855_SERBIAN_MODEL = SingleByteCharSetModel( + charset_name="IBM855", + language="Serbian", + char_to_order_map=IBM855_SERBIAN_CHAR_TO_ORDER, + language_model=SERBIAN_LANG_MODEL, + typical_positive_ratio=0.9067233737727759, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzЂЈЉЊЋЏАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШабвгдежзиклмнопрстуфхцчшђјљњћџ", +) ISO_8859_5_SERBIAN_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' 
- 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 101, # 'A' - 66: 106, # 'B' - 67: 99, # 'C' - 68: 122, # 'D' - 69: 125, # 'E' - 70: 129, # 'F' - 71: 130, # 'G' - 72: 116, # 'H' - 73: 90, # 'I' - 74: 141, # 'J' - 75: 136, # 'K' - 76: 124, # 'L' - 77: 105, # 'M' - 78: 110, # 'N' - 79: 118, # 'O' - 80: 102, # 'P' - 81: 168, # 'Q' - 82: 121, # 'R' - 83: 95, # 'S' - 84: 109, # 'T' - 85: 135, # 'U' - 86: 113, # 'V' - 87: 137, # 'W' - 88: 139, # 'X' - 89: 158, # 'Y' - 90: 154, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 30, # 'a' - 98: 92, # 'b' - 99: 81, # 'c' - 100: 64, # 'd' - 101: 31, # 'e' - 102: 97, # 'f' - 103: 86, # 'g' - 104: 87, # 'h' - 105: 33, # 'i' - 106: 75, # 'j' - 107: 74, # 'k' - 108: 55, # 'l' - 109: 66, # 'm' - 110: 42, # 'n' - 111: 39, # 'o' - 112: 78, # 'p' - 113: 150, # 'q' - 114: 43, # 'r' - 115: 49, # 's' - 116: 48, # 't' - 117: 58, # 'u' - 118: 79, # 'v' - 119: 120, # 'w' - 120: 143, # 'x' - 121: 98, # 'y' - 122: 93, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, # '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 
251, # '\x9d' - 158: 251, # '\x9e' - 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 255, # 'Ё' - 162: 255, # 'Ђ' - 163: 255, # 'Ѓ' - 164: 255, # 'Є' - 165: 255, # 'Ѕ' - 166: 255, # 'І' - 167: 255, # 'Ї' - 168: 255, # 'Ј' - 169: 255, # 'Љ' - 170: 255, # 'Њ' - 171: 255, # 'Ћ' - 172: 255, # 'Ќ' - 173: 251, # '\xad' - 174: 255, # 'Ў' - 175: 255, # 'Џ' - 176: 255, # 'А' - 177: 255, # 'Б' - 178: 255, # 'В' - 179: 255, # 'Г' - 180: 255, # 'Д' - 181: 255, # 'Е' - 182: 255, # 'Ж' - 183: 255, # 'З' - 184: 255, # 'И' - 185: 255, # 'Й' - 186: 255, # 'К' - 187: 255, # 'Л' - 188: 255, # 'М' - 189: 255, # 'Н' - 190: 255, # 'О' - 191: 255, # 'П' - 192: 255, # 'Р' - 193: 255, # 'С' - 194: 255, # 'Т' - 195: 255, # 'У' - 196: 255, # 'Ф' - 197: 255, # 'Х' - 198: 255, # 'Ц' - 199: 255, # 'Ч' - 200: 255, # 'Ш' - 201: 255, # 'Щ' - 202: 255, # 'Ъ' - 203: 255, # 'Ы' - 204: 255, # 'Ь' - 205: 255, # 'Э' - 206: 255, # 'Ю' - 207: 255, # 'Я' - 208: 255, # 'а' - 209: 255, # 'б' - 210: 255, # 'в' - 211: 255, # 'г' - 212: 255, # 'д' - 213: 255, # 'е' - 214: 255, # 'ж' - 215: 255, # 'з' - 216: 255, # 'и' - 217: 255, # 'й' - 218: 255, # 'к' - 219: 255, # 'л' - 220: 255, # 'м' - 221: 255, # 'н' - 222: 255, # 'о' - 223: 255, # 'п' - 224: 255, # 'р' - 225: 255, # 'с' - 226: 255, # 'т' - 227: 255, # 'у' - 228: 255, # 'ф' - 229: 255, # 'х' - 230: 255, # 'ц' - 231: 255, # 'ч' - 232: 255, # 'ш' - 233: 255, # 'щ' - 234: 255, # 'ъ' - 235: 255, # 'ы' - 236: 255, # 'ь' - 237: 255, # 'э' - 238: 255, # 'ю' - 239: 255, # 'я' - 240: 253, # '№' - 241: 255, # 'ё' - 242: 255, # 'ђ' - 243: 255, # 'ѓ' - 244: 255, # 'є' - 245: 255, # 'ѕ' - 246: 255, # 'і' - 247: 255, # 'ї' - 248: 255, # 'ј' - 249: 255, # 'љ' - 250: 255, # 'њ' - 251: 255, # 'ћ' - 252: 255, # 'ќ' - 253: 253, # '§' - 254: 255, # 'ў' - 255: 255, # 'џ' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 
11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 101, # 'A' + 66: 106, # 'B' + 67: 99, # 'C' + 68: 122, # 'D' + 69: 125, # 'E' + 70: 129, # 'F' + 71: 130, # 'G' + 72: 116, # 'H' + 73: 90, # 'I' + 74: 141, # 'J' + 75: 136, # 'K' + 76: 124, # 'L' + 77: 105, # 'M' + 78: 110, # 'N' + 79: 118, # 'O' + 80: 102, # 'P' + 81: 168, # 'Q' + 82: 121, # 'R' + 83: 95, # 'S' + 84: 109, # 'T' + 85: 135, # 'U' + 86: 113, # 'V' + 87: 137, # 'W' + 88: 139, # 'X' + 89: 158, # 'Y' + 90: 154, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 30, # 'a' + 98: 92, # 'b' + 99: 81, # 'c' + 100: 64, # 'd' + 101: 31, # 'e' + 102: 97, # 'f' + 103: 86, # 'g' + 104: 87, # 'h' + 105: 33, # 'i' + 106: 75, # 'j' + 107: 74, # 'k' + 108: 55, # 'l' + 109: 66, # 'm' + 110: 42, # 'n' + 111: 39, # 'o' + 112: 78, # 'p' + 113: 150, # 'q' + 114: 43, # 'r' + 115: 49, # 's' + 116: 48, # 't' + 117: 58, # 'u' + 118: 79, # 'v' + 119: 120, # 'w' + 120: 143, # 'x' + 121: 98, # 'y' + 122: 93, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 
125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # '\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 255, # 'Ё' + 162: 255, # 'Ђ' + 163: 255, # 'Ѓ' + 164: 255, # 'Є' + 165: 255, # 'Ѕ' + 166: 255, # 'І' + 167: 255, # 'Ї' + 168: 255, # 'Ј' + 169: 255, # 'Љ' + 170: 255, # 'Њ' + 171: 255, # 'Ћ' + 172: 255, # 'Ќ' + 173: 251, # '\xad' + 174: 255, # 'Ў' + 175: 255, # 'Џ' + 176: 255, # 'А' + 177: 255, # 'Б' + 178: 255, # 'В' + 179: 255, # 'Г' + 180: 255, # 'Д' + 181: 255, # 'Е' + 182: 255, # 'Ж' + 183: 255, # 'З' + 184: 255, # 'И' + 185: 255, # 'Й' + 186: 255, # 'К' + 187: 255, # 'Л' + 188: 255, # 'М' + 189: 255, # 'Н' + 190: 255, # 'О' + 191: 255, # 'П' + 192: 255, # 'Р' + 193: 255, # 'С' + 194: 255, # 'Т' + 195: 255, # 'У' + 196: 255, # 'Ф' + 197: 255, # 'Х' + 198: 255, # 'Ц' + 199: 255, # 'Ч' + 200: 255, # 'Ш' + 201: 255, # 'Щ' + 202: 255, # 'Ъ' + 203: 255, # 'Ы' + 204: 255, # 'Ь' + 205: 255, # 'Э' + 206: 255, # 'Ю' + 207: 255, # 'Я' + 208: 255, # 'а' + 209: 255, # 'б' + 210: 255, # 'в' + 211: 255, # 'г' + 212: 255, # 'д' + 213: 255, # 'е' + 214: 255, # 'ж' + 215: 255, # 'з' + 216: 255, # 'и' + 217: 255, # 'й' + 218: 255, # 'к' + 219: 255, # 'л' + 220: 255, # 'м' + 221: 255, # 'н' + 222: 255, # 'о' + 223: 255, # 'п' + 224: 255, # 'р' + 225: 255, # 'с' + 226: 255, # 'т' + 227: 255, # 'у' + 228: 255, # 'ф' + 229: 255, # 'х' + 230: 
255, # 'ц' + 231: 255, # 'ч' + 232: 255, # 'ш' + 233: 255, # 'щ' + 234: 255, # 'ъ' + 235: 255, # 'ы' + 236: 255, # 'ь' + 237: 255, # 'э' + 238: 255, # 'ю' + 239: 255, # 'я' + 240: 253, # '№' + 241: 255, # 'ё' + 242: 255, # 'ђ' + 243: 255, # 'ѓ' + 244: 255, # 'є' + 245: 255, # 'ѕ' + 246: 255, # 'і' + 247: 255, # 'ї' + 248: 255, # 'ј' + 249: 255, # 'љ' + 250: 255, # 'њ' + 251: 255, # 'ћ' + 252: 255, # 'ќ' + 253: 253, # '§' + 254: 255, # 'ў' + 255: 255, # 'џ' } -ISO_8859_5_SERBIAN_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-5', - language='Serbian', - char_to_order_map=ISO_8859_5_SERBIAN_CHAR_TO_ORDER, - language_model=SERBIAN_LANG_MODEL, - typical_positive_ratio=0.9067233737727759, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzЂЈЉЊЋЏАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШабвгдежзиклмнопрстуфхцчшђјљњћџ') - +ISO_8859_5_SERBIAN_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-5", + language="Serbian", + char_to_order_map=ISO_8859_5_SERBIAN_CHAR_TO_ORDER, + language_model=SERBIAN_LANG_MODEL, + typical_positive_ratio=0.9067233737727759, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzЂЈЉЊЋЏАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШабвгдежзиклмнопрстуфхцчшђјљњћџ", +) diff --git a/chardet/langslovakmodel.py b/chardet/langslovakmodel.py index 6d5fa2f3..9a481730 100644 --- a/chardet/langslovakmodel.py +++ b/chardet/langslovakmodel.py @@ -1,9 +1,7 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -62768,536 +62766,539 @@ # Character Mapping Table(s): ISO_8859_2_SLOVAK_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' 
- 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 59, # 'A' - 66: 55, # 'B' - 67: 65, # 'C' - 68: 63, # 'D' - 69: 72, # 'E' - 70: 78, # 'F' - 71: 76, # 'G' - 72: 68, # 'H' - 73: 66, # 'I' - 74: 67, # 'J' - 75: 56, # 'K' - 76: 69, # 'L' - 77: 48, # 'M' - 78: 54, # 'N' - 79: 70, # 'O' - 80: 45, # 'P' - 81: 123, # 'Q' - 82: 60, # 'R' - 83: 41, # 'S' - 84: 62, # 'T' - 85: 85, # 'U' - 86: 47, # 'V' - 87: 86, # 'W' - 88: 105, # 'X' - 89: 100, # 'Y' - 90: 73, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 3, # 'a' - 98: 23, # 'b' - 99: 17, # 'c' - 100: 13, # 'd' - 101: 4, # 'e' - 102: 42, # 'f' - 103: 37, # 'g' - 104: 18, # 'h' - 105: 6, # 'i' - 106: 20, # 'j' - 107: 10, # 'k' - 108: 12, # 'l' - 109: 14, # 'm' - 110: 5, # 'n' - 111: 2, # 'o' - 112: 16, # 'p' - 113: 109, # 'q' - 114: 7, # 'r' - 115: 8, # 's' - 116: 9, # 't' - 117: 15, # 'u' - 118: 11, # 'v' - 119: 82, # 'w' - 120: 81, # 'x' - 121: 22, # 'y' - 122: 21, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 
251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, # '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' - 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 255, # 'Ą' - 162: 253, # '˘' - 163: 255, # 'Ł' - 164: 253, # '¤' - 165: 255, # 'Ľ' - 166: 255, # 'Ś' - 167: 253, # '§' - 168: 253, # '¨' - 169: 255, # 'Š' - 170: 255, # 'Ş' - 171: 255, # 'Ť' - 172: 255, # 'Ź' - 173: 251, # '\xad' - 174: 255, # 'Ž' - 175: 255, # 'Ż' - 176: 253, # '°' - 177: 255, # 'ą' - 178: 253, # '˛' - 179: 255, # 'ł' - 180: 253, # '´' - 181: 255, # 'ľ' - 182: 255, # 'ś' - 183: 255, # 'ˇ' - 184: 253, # '¸' - 185: 255, # 'š' - 186: 255, # 'ş' - 187: 255, # 'ť' - 188: 255, # 'ź' - 189: 253, # '˝' - 190: 255, # 'ž' - 191: 255, # 'ż' - 192: 255, # 'Ŕ' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ă' - 196: 255, # 'Ä' - 197: 255, # 'Ĺ' - 198: 255, # 'Ć' - 199: 255, # 'Ç' - 200: 255, # 'Č' - 201: 255, # 'É' - 202: 255, # 'Ę' - 203: 255, # 'Ë' - 204: 255, # 'Ě' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ď' - 208: 255, # 'Đ' - 209: 255, # 'Ń' - 210: 255, # 'Ň' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Ő' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ř' - 217: 255, # 'Ů' - 218: 255, # 'Ú' - 219: 255, # 'Ű' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Ţ' - 223: 255, # 'ß' - 224: 255, # 'ŕ' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ă' - 228: 255, # 'ä' - 229: 255, # 'ĺ' - 230: 255, # 'ć' - 231: 255, # 'ç' - 232: 255, # 'č' - 233: 255, # 'é' - 234: 255, # 'ę' - 235: 255, # 'ë' - 236: 255, # 'ě' - 237: 255, # 'í' - 238: 
255, # 'î' - 239: 255, # 'ď' - 240: 255, # 'đ' - 241: 255, # 'ń' - 242: 255, # 'ň' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'ő' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ř' - 249: 255, # 'ů' - 250: 255, # 'ú' - 251: 255, # 'ű' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'ţ' - 255: 253, # '˙' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' 
+ 64: 253, # '@' + 65: 59, # 'A' + 66: 55, # 'B' + 67: 65, # 'C' + 68: 63, # 'D' + 69: 72, # 'E' + 70: 78, # 'F' + 71: 76, # 'G' + 72: 68, # 'H' + 73: 66, # 'I' + 74: 67, # 'J' + 75: 56, # 'K' + 76: 69, # 'L' + 77: 48, # 'M' + 78: 54, # 'N' + 79: 70, # 'O' + 80: 45, # 'P' + 81: 123, # 'Q' + 82: 60, # 'R' + 83: 41, # 'S' + 84: 62, # 'T' + 85: 85, # 'U' + 86: 47, # 'V' + 87: 86, # 'W' + 88: 105, # 'X' + 89: 100, # 'Y' + 90: 73, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 3, # 'a' + 98: 23, # 'b' + 99: 17, # 'c' + 100: 13, # 'd' + 101: 4, # 'e' + 102: 42, # 'f' + 103: 37, # 'g' + 104: 18, # 'h' + 105: 6, # 'i' + 106: 20, # 'j' + 107: 10, # 'k' + 108: 12, # 'l' + 109: 14, # 'm' + 110: 5, # 'n' + 111: 2, # 'o' + 112: 16, # 'p' + 113: 109, # 'q' + 114: 7, # 'r' + 115: 8, # 's' + 116: 9, # 't' + 117: 15, # 'u' + 118: 11, # 'v' + 119: 82, # 'w' + 120: 81, # 'x' + 121: 22, # 'y' + 122: 21, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # '\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 255, # 'Ą' + 162: 253, # '˘' + 163: 255, # 'Ł' + 164: 253, # '¤' + 165: 255, # 'Ľ' + 166: 255, # 'Ś' + 167: 253, # '§' + 168: 253, # '¨' + 169: 255, # 'Š' + 170: 255, # 'Ş' + 171: 255, # 'Ť' + 172: 255, # 'Ź' + 173: 251, # '\xad' + 
174: 255, # 'Ž' + 175: 255, # 'Ż' + 176: 253, # '°' + 177: 255, # 'ą' + 178: 253, # '˛' + 179: 255, # 'ł' + 180: 253, # '´' + 181: 255, # 'ľ' + 182: 255, # 'ś' + 183: 255, # 'ˇ' + 184: 253, # '¸' + 185: 255, # 'š' + 186: 255, # 'ş' + 187: 255, # 'ť' + 188: 255, # 'ź' + 189: 253, # '˝' + 190: 255, # 'ž' + 191: 255, # 'ż' + 192: 255, # 'Ŕ' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ă' + 196: 255, # 'Ä' + 197: 255, # 'Ĺ' + 198: 255, # 'Ć' + 199: 255, # 'Ç' + 200: 255, # 'Č' + 201: 255, # 'É' + 202: 255, # 'Ę' + 203: 255, # 'Ë' + 204: 255, # 'Ě' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ď' + 208: 255, # 'Đ' + 209: 255, # 'Ń' + 210: 255, # 'Ň' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Ő' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ř' + 217: 255, # 'Ů' + 218: 255, # 'Ú' + 219: 255, # 'Ű' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Ţ' + 223: 255, # 'ß' + 224: 255, # 'ŕ' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ă' + 228: 255, # 'ä' + 229: 255, # 'ĺ' + 230: 255, # 'ć' + 231: 255, # 'ç' + 232: 255, # 'č' + 233: 255, # 'é' + 234: 255, # 'ę' + 235: 255, # 'ë' + 236: 255, # 'ě' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ď' + 240: 255, # 'đ' + 241: 255, # 'ń' + 242: 255, # 'ň' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'ő' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ř' + 249: 255, # 'ů' + 250: 255, # 'ú' + 251: 255, # 'ű' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'ţ' + 255: 253, # '˙' } -ISO_8859_2_SLOVAK_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-2', - language='Slovak', - char_to_order_map=ISO_8859_2_SLOVAK_CHAR_TO_ORDER, - language_model=SLOVAK_LANG_MODEL, - typical_positive_ratio=0.866986679434158, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÁÄÉÍÓÔÚÝáäéíóôúýČčĎďĹ弾ŇňŔ੹ŤťŽž') +ISO_8859_2_SLOVAK_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-2", + language="Slovak", + char_to_order_map=ISO_8859_2_SLOVAK_CHAR_TO_ORDER, + 
language_model=SLOVAK_LANG_MODEL, + typical_positive_ratio=0.866986679434158, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÁÄÉÍÓÔÚÝáäéíóôúýČčĎďĹ弾ŇňŔ੹ŤťŽž", +) WINDOWS_1250_SLOVAK_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' 
- 64: 253, # '@' - 65: 59, # 'A' - 66: 55, # 'B' - 67: 65, # 'C' - 68: 63, # 'D' - 69: 72, # 'E' - 70: 78, # 'F' - 71: 76, # 'G' - 72: 68, # 'H' - 73: 66, # 'I' - 74: 67, # 'J' - 75: 56, # 'K' - 76: 69, # 'L' - 77: 48, # 'M' - 78: 54, # 'N' - 79: 70, # 'O' - 80: 45, # 'P' - 81: 123, # 'Q' - 82: 60, # 'R' - 83: 41, # 'S' - 84: 62, # 'T' - 85: 85, # 'U' - 86: 47, # 'V' - 87: 86, # 'W' - 88: 105, # 'X' - 89: 100, # 'Y' - 90: 73, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 3, # 'a' - 98: 23, # 'b' - 99: 17, # 'c' - 100: 13, # 'd' - 101: 4, # 'e' - 102: 42, # 'f' - 103: 37, # 'g' - 104: 18, # 'h' - 105: 6, # 'i' - 106: 20, # 'j' - 107: 10, # 'k' - 108: 12, # 'l' - 109: 14, # 'm' - 110: 5, # 'n' - 111: 2, # 'o' - 112: 16, # 'p' - 113: 109, # 'q' - 114: 7, # 'r' - 115: 8, # 's' - 116: 9, # 't' - 117: 15, # 'u' - 118: 11, # 'v' - 119: 82, # 'w' - 120: 81, # 'x' - 121: 22, # 'y' - 122: 21, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 253, # '€' - 129: 255, # None - 130: 253, # '‚' - 131: 255, # None - 132: 253, # '„' - 133: 253, # '…' - 134: 253, # '†' - 135: 253, # '‡' - 136: 255, # None - 137: 253, # '‰' - 138: 255, # 'Š' - 139: 253, # '‹' - 140: 255, # 'Ś' - 141: 255, # 'Ť' - 142: 255, # 'Ž' - 143: 255, # 'Ź' - 144: 255, # None - 145: 253, # '‘' - 146: 253, # '’' - 147: 253, # '“' - 148: 253, # '”' - 149: 253, # '•' - 150: 253, # '–' - 151: 253, # '—' - 152: 255, # None - 153: 253, # '™' - 154: 255, # 'š' - 155: 253, # '›' - 156: 255, # 'ś' - 157: 255, # 'ť' - 158: 255, # 'ž' - 159: 255, # 'ź' - 160: 251, # '\xa0' - 161: 255, # 'ˇ' - 162: 253, # '˘' - 163: 255, # 'Ł' - 164: 253, # '¤' - 165: 255, # 'Ą' - 166: 253, # '¦' - 167: 253, # '§' - 168: 253, # '¨' - 169: 253, # '©' - 170: 255, # 'Ş' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 255, # 'Ż' - 176: 253, # '°' - 177: 253, # '±' - 178: 253, # '˛' - 179: 
255, # 'ł' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 253, # '¸' - 185: 255, # 'ą' - 186: 255, # 'ş' - 187: 253, # '»' - 188: 255, # 'Ľ' - 189: 253, # '˝' - 190: 255, # 'ľ' - 191: 255, # 'ż' - 192: 255, # 'Ŕ' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ă' - 196: 255, # 'Ä' - 197: 255, # 'Ĺ' - 198: 255, # 'Ć' - 199: 255, # 'Ç' - 200: 255, # 'Č' - 201: 255, # 'É' - 202: 255, # 'Ę' - 203: 255, # 'Ë' - 204: 255, # 'Ě' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ď' - 208: 255, # 'Đ' - 209: 255, # 'Ń' - 210: 255, # 'Ň' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Ő' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ř' - 217: 255, # 'Ů' - 218: 255, # 'Ú' - 219: 255, # 'Ű' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Ţ' - 223: 255, # 'ß' - 224: 255, # 'ŕ' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ă' - 228: 255, # 'ä' - 229: 255, # 'ĺ' - 230: 255, # 'ć' - 231: 255, # 'ç' - 232: 255, # 'č' - 233: 255, # 'é' - 234: 255, # 'ę' - 235: 255, # 'ë' - 236: 255, # 'ě' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ď' - 240: 255, # 'đ' - 241: 255, # 'ń' - 242: 255, # 'ň' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'ő' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ř' - 249: 255, # 'ů' - 250: 255, # 'ú' - 251: 255, # 'ű' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'ţ' - 255: 253, # '˙' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # 
'\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 59, # 'A' + 66: 55, # 'B' + 67: 65, # 'C' + 68: 63, # 'D' + 69: 72, # 'E' + 70: 78, # 'F' + 71: 76, # 'G' + 72: 68, # 'H' + 73: 66, # 'I' + 74: 67, # 'J' + 75: 56, # 'K' + 76: 69, # 'L' + 77: 48, # 'M' + 78: 54, # 'N' + 79: 70, # 'O' + 80: 45, # 'P' + 81: 123, # 'Q' + 82: 60, # 'R' + 83: 41, # 'S' + 84: 62, # 'T' + 85: 85, # 'U' + 86: 47, # 'V' + 87: 86, # 'W' + 88: 105, # 'X' + 89: 100, # 'Y' + 90: 73, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 3, # 'a' + 98: 23, # 'b' + 99: 17, # 'c' + 100: 13, # 'd' + 101: 4, # 'e' + 102: 42, # 'f' + 103: 37, # 'g' + 104: 18, # 'h' + 105: 6, # 'i' + 106: 20, # 'j' + 107: 10, # 'k' + 108: 12, # 'l' + 109: 14, # 'm' + 110: 5, # 'n' + 111: 2, # 'o' + 112: 16, # 'p' + 113: 109, # 'q' + 114: 7, # 'r' + 115: 8, # 's' + 116: 9, # 't' + 117: 15, # 'u' + 118: 11, # 'v' + 119: 82, # 'w' + 120: 81, # 'x' + 121: 22, # 'y' + 122: 21, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 253, # '€' + 129: 255, # None + 130: 253, # '‚' + 131: 255, # None + 132: 253, # '„' + 133: 253, # '…' + 134: 253, # '†' + 135: 253, # '‡' + 136: 255, # None + 137: 253, # '‰' + 138: 255, # 'Š' + 139: 253, # '‹' + 140: 255, # 'Ś' + 141: 255, # 'Ť' + 142: 255, # 'Ž' + 143: 255, # 'Ź' + 144: 255, # None + 145: 253, # '‘' + 146: 253, # '’' + 147: 253, # '“' + 148: 253, # '”' + 
149: 253, # '•' + 150: 253, # '–' + 151: 253, # '—' + 152: 255, # None + 153: 253, # '™' + 154: 255, # 'š' + 155: 253, # '›' + 156: 255, # 'ś' + 157: 255, # 'ť' + 158: 255, # 'ž' + 159: 255, # 'ź' + 160: 251, # '\xa0' + 161: 255, # 'ˇ' + 162: 253, # '˘' + 163: 255, # 'Ł' + 164: 253, # '¤' + 165: 255, # 'Ą' + 166: 253, # '¦' + 167: 253, # '§' + 168: 253, # '¨' + 169: 253, # '©' + 170: 255, # 'Ş' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 255, # 'Ż' + 176: 253, # '°' + 177: 253, # '±' + 178: 253, # '˛' + 179: 255, # 'ł' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 253, # '¸' + 185: 255, # 'ą' + 186: 255, # 'ş' + 187: 253, # '»' + 188: 255, # 'Ľ' + 189: 253, # '˝' + 190: 255, # 'ľ' + 191: 255, # 'ż' + 192: 255, # 'Ŕ' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ă' + 196: 255, # 'Ä' + 197: 255, # 'Ĺ' + 198: 255, # 'Ć' + 199: 255, # 'Ç' + 200: 255, # 'Č' + 201: 255, # 'É' + 202: 255, # 'Ę' + 203: 255, # 'Ë' + 204: 255, # 'Ě' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ď' + 208: 255, # 'Đ' + 209: 255, # 'Ń' + 210: 255, # 'Ň' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Ő' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ř' + 217: 255, # 'Ů' + 218: 255, # 'Ú' + 219: 255, # 'Ű' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Ţ' + 223: 255, # 'ß' + 224: 255, # 'ŕ' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ă' + 228: 255, # 'ä' + 229: 255, # 'ĺ' + 230: 255, # 'ć' + 231: 255, # 'ç' + 232: 255, # 'č' + 233: 255, # 'é' + 234: 255, # 'ę' + 235: 255, # 'ë' + 236: 255, # 'ě' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ď' + 240: 255, # 'đ' + 241: 255, # 'ń' + 242: 255, # 'ň' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'ő' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ř' + 249: 255, # 'ů' + 250: 255, # 'ú' + 251: 255, # 'ű' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'ţ' + 255: 253, # '˙' } -WINDOWS_1250_SLOVAK_MODEL = 
SingleByteCharSetModel(charset_name='WINDOWS-1250', - language='Slovak', - char_to_order_map=WINDOWS_1250_SLOVAK_CHAR_TO_ORDER, - language_model=SLOVAK_LANG_MODEL, - typical_positive_ratio=0.8743853122486286, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÁÄÉÍÓÔÚÝáäéíóôúýČčĎďĹ弾ŇňŔ੹ŤťŽž') - +WINDOWS_1250_SLOVAK_MODEL = SingleByteCharSetModel( + charset_name="WINDOWS-1250", + language="Slovak", + char_to_order_map=WINDOWS_1250_SLOVAK_CHAR_TO_ORDER, + language_model=SLOVAK_LANG_MODEL, + typical_positive_ratio=0.8743853122486286, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÁÄÉÍÓÔÚÝáäéíóôúýČčĎďĹ弾ŇňŔ੹ŤťŽž", +) diff --git a/chardet/langslovenemodel.py b/chardet/langslovenemodel.py index 410880d9..05096022 100644 --- a/chardet/langslovenemodel.py +++ b/chardet/langslovenemodel.py @@ -1,9 +1,7 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -1140,536 +1138,539 @@ # Character Mapping Table(s): ISO_8859_2_SLOVENE_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' 
- 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, 
# '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' - 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 255, # 'Ą' - 162: 253, # '˘' - 163: 255, # 'Ł' - 164: 253, # '¤' - 165: 255, # 'Ľ' - 166: 255, # 'Ś' - 167: 253, # '§' - 168: 253, # '¨' - 169: 255, # 'Š' - 170: 255, # 'Ş' - 171: 255, # 'Ť' - 172: 255, # 'Ź' - 173: 251, # '\xad' - 174: 255, # 'Ž' - 175: 255, # 'Ż' - 176: 253, # '°' - 177: 255, # 'ą' - 178: 253, # '˛' - 179: 255, # 'ł' - 180: 253, # '´' - 181: 255, # 'ľ' - 182: 255, # 'ś' - 183: 255, # 'ˇ' - 184: 253, # '¸' - 185: 255, # 'š' - 186: 255, # 'ş' - 187: 255, # 'ť' - 188: 255, # 'ź' - 189: 253, # '˝' - 190: 255, # 'ž' - 191: 255, # 'ż' - 192: 255, # 'Ŕ' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ă' - 196: 255, # 'Ä' - 197: 255, # 'Ĺ' - 198: 255, # 'Ć' - 199: 255, # 'Ç' - 200: 255, # 'Č' - 201: 255, # 'É' - 202: 255, # 'Ę' - 203: 255, # 'Ë' - 204: 255, # 'Ě' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ď' - 208: 255, # 'Đ' - 209: 255, # 'Ń' - 210: 255, # 'Ň' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Ő' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ř' - 217: 255, # 'Ů' - 218: 255, # 'Ú' - 219: 255, # 'Ű' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Ţ' - 223: 255, # 'ß' - 224: 255, # 'ŕ' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ă' - 228: 255, # 'ä' - 229: 255, # 'ĺ' - 230: 255, # 'ć' - 231: 255, # 'ç' - 232: 255, # 'č' - 233: 255, # 'é' - 234: 255, # 'ę' - 235: 255, # 'ë' - 236: 255, # 'ě' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ď' - 240: 255, # 'đ' - 241: 255, # 'ń' - 242: 255, # 'ň' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'ő' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ř' - 249: 255, # 'ů' - 250: 255, # 'ú' - 251: 255, # 'ű' - 252: 255, # 'ü' - 253: 255, # 
'ý' - 254: 255, # 'ţ' - 255: 253, # '˙' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' 
+ 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # '\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 255, # 'Ą' + 162: 253, # '˘' + 163: 255, # 'Ł' + 164: 253, # '¤' + 165: 255, # 'Ľ' + 166: 255, # 'Ś' + 167: 253, # '§' + 168: 253, # '¨' + 169: 255, # 'Š' + 170: 255, # 'Ş' + 171: 
255, # 'Ť' + 172: 255, # 'Ź' + 173: 251, # '\xad' + 174: 255, # 'Ž' + 175: 255, # 'Ż' + 176: 253, # '°' + 177: 255, # 'ą' + 178: 253, # '˛' + 179: 255, # 'ł' + 180: 253, # '´' + 181: 255, # 'ľ' + 182: 255, # 'ś' + 183: 255, # 'ˇ' + 184: 253, # '¸' + 185: 255, # 'š' + 186: 255, # 'ş' + 187: 255, # 'ť' + 188: 255, # 'ź' + 189: 253, # '˝' + 190: 255, # 'ž' + 191: 255, # 'ż' + 192: 255, # 'Ŕ' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ă' + 196: 255, # 'Ä' + 197: 255, # 'Ĺ' + 198: 255, # 'Ć' + 199: 255, # 'Ç' + 200: 255, # 'Č' + 201: 255, # 'É' + 202: 255, # 'Ę' + 203: 255, # 'Ë' + 204: 255, # 'Ě' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ď' + 208: 255, # 'Đ' + 209: 255, # 'Ń' + 210: 255, # 'Ň' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Ő' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ř' + 217: 255, # 'Ů' + 218: 255, # 'Ú' + 219: 255, # 'Ű' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Ţ' + 223: 255, # 'ß' + 224: 255, # 'ŕ' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ă' + 228: 255, # 'ä' + 229: 255, # 'ĺ' + 230: 255, # 'ć' + 231: 255, # 'ç' + 232: 255, # 'č' + 233: 255, # 'é' + 234: 255, # 'ę' + 235: 255, # 'ë' + 236: 255, # 'ě' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ď' + 240: 255, # 'đ' + 241: 255, # 'ń' + 242: 255, # 'ň' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'ő' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ř' + 249: 255, # 'ů' + 250: 255, # 'ú' + 251: 255, # 'ű' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'ţ' + 255: 253, # '˙' } -ISO_8859_2_SLOVENE_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-2', - language='Slovene', - char_to_order_map=ISO_8859_2_SLOVENE_CHAR_TO_ORDER, - language_model=SLOVENE_LANG_MODEL, - typical_positive_ratio=0.9720601208900729, - keep_ascii_letters=False, - alphabet='ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzČ芚Žž') +ISO_8859_2_SLOVENE_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-2", + language="Slovene", + 
char_to_order_map=ISO_8859_2_SLOVENE_CHAR_TO_ORDER, + language_model=SLOVENE_LANG_MODEL, + typical_positive_ratio=0.9720601208900729, + keep_ascii_letters=False, + alphabet="ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzČ芚Žž", +) WINDOWS_1250_SLOVENE_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' 
- 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 253, # '€' - 129: 255, # None - 130: 253, # '‚' - 131: 255, # None - 132: 253, # '„' - 133: 253, # '…' - 134: 253, # '†' - 135: 253, # '‡' - 136: 255, # None - 137: 253, # '‰' - 138: 255, # 'Š' - 139: 253, # '‹' - 140: 255, # 'Ś' - 141: 255, # 'Ť' - 142: 255, # 'Ž' - 143: 255, # 'Ź' - 144: 255, # None - 145: 253, # '‘' - 146: 253, # '’' - 147: 253, # '“' - 148: 253, # '”' - 149: 253, # '•' - 150: 253, # '–' - 151: 253, # '—' - 152: 255, # None - 153: 253, # '™' - 154: 255, # 'š' - 155: 253, # '›' - 156: 255, # 'ś' - 157: 255, # 'ť' - 158: 255, # 'ž' - 159: 255, # 'ź' - 160: 251, # '\xa0' - 161: 255, # 'ˇ' - 162: 253, # '˘' - 163: 255, # 'Ł' - 164: 253, # '¤' - 165: 255, # 'Ą' - 166: 253, # '¦' - 167: 253, # '§' - 168: 253, # '¨' - 169: 253, # '©' - 170: 255, # 'Ş' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 255, # 'Ż' - 
176: 253, # '°' - 177: 253, # '±' - 178: 253, # '˛' - 179: 255, # 'ł' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 253, # '¸' - 185: 255, # 'ą' - 186: 255, # 'ş' - 187: 253, # '»' - 188: 255, # 'Ľ' - 189: 253, # '˝' - 190: 255, # 'ľ' - 191: 255, # 'ż' - 192: 255, # 'Ŕ' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ă' - 196: 255, # 'Ä' - 197: 255, # 'Ĺ' - 198: 255, # 'Ć' - 199: 255, # 'Ç' - 200: 255, # 'Č' - 201: 255, # 'É' - 202: 255, # 'Ę' - 203: 255, # 'Ë' - 204: 255, # 'Ě' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ď' - 208: 255, # 'Đ' - 209: 255, # 'Ń' - 210: 255, # 'Ň' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Ő' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ř' - 217: 255, # 'Ů' - 218: 255, # 'Ú' - 219: 255, # 'Ű' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Ţ' - 223: 255, # 'ß' - 224: 255, # 'ŕ' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ă' - 228: 255, # 'ä' - 229: 255, # 'ĺ' - 230: 255, # 'ć' - 231: 255, # 'ç' - 232: 255, # 'č' - 233: 255, # 'é' - 234: 255, # 'ę' - 235: 255, # 'ë' - 236: 255, # 'ě' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ď' - 240: 255, # 'đ' - 241: 255, # 'ń' - 242: 255, # 'ň' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'ő' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ř' - 249: 255, # 'ů' - 250: 255, # 'ú' - 251: 255, # 'ű' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'ţ' - 255: 253, # '˙' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # 
'\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 253, # '€' + 129: 255, # None + 130: 253, # '‚' + 131: 255, # None + 132: 253, # '„' + 133: 253, # '…' + 134: 253, # '†' + 135: 253, # '‡' + 136: 255, # None + 137: 253, # '‰' + 138: 255, # 'Š' + 139: 253, # '‹' + 140: 255, # 'Ś' + 141: 255, # 'Ť' + 142: 255, # 
'Ž' + 143: 255, # 'Ź' + 144: 255, # None + 145: 253, # '‘' + 146: 253, # '’' + 147: 253, # '“' + 148: 253, # '”' + 149: 253, # '•' + 150: 253, # '–' + 151: 253, # '—' + 152: 255, # None + 153: 253, # '™' + 154: 255, # 'š' + 155: 253, # '›' + 156: 255, # 'ś' + 157: 255, # 'ť' + 158: 255, # 'ž' + 159: 255, # 'ź' + 160: 251, # '\xa0' + 161: 255, # 'ˇ' + 162: 253, # '˘' + 163: 255, # 'Ł' + 164: 253, # '¤' + 165: 255, # 'Ą' + 166: 253, # '¦' + 167: 253, # '§' + 168: 253, # '¨' + 169: 253, # '©' + 170: 255, # 'Ş' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 255, # 'Ż' + 176: 253, # '°' + 177: 253, # '±' + 178: 253, # '˛' + 179: 255, # 'ł' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 253, # '¸' + 185: 255, # 'ą' + 186: 255, # 'ş' + 187: 253, # '»' + 188: 255, # 'Ľ' + 189: 253, # '˝' + 190: 255, # 'ľ' + 191: 255, # 'ż' + 192: 255, # 'Ŕ' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ă' + 196: 255, # 'Ä' + 197: 255, # 'Ĺ' + 198: 255, # 'Ć' + 199: 255, # 'Ç' + 200: 255, # 'Č' + 201: 255, # 'É' + 202: 255, # 'Ę' + 203: 255, # 'Ë' + 204: 255, # 'Ě' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ď' + 208: 255, # 'Đ' + 209: 255, # 'Ń' + 210: 255, # 'Ň' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Ő' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ř' + 217: 255, # 'Ů' + 218: 255, # 'Ú' + 219: 255, # 'Ű' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Ţ' + 223: 255, # 'ß' + 224: 255, # 'ŕ' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ă' + 228: 255, # 'ä' + 229: 255, # 'ĺ' + 230: 255, # 'ć' + 231: 255, # 'ç' + 232: 255, # 'č' + 233: 255, # 'é' + 234: 255, # 'ę' + 235: 255, # 'ë' + 236: 255, # 'ě' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ď' + 240: 255, # 'đ' + 241: 255, # 'ń' + 242: 255, # 'ň' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'ő' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ř' + 249: 255, # 'ů' + 250: 255, # 'ú' + 251: 255, # 'ű' + 252: 255, # 'ü' + 253: 
255, # 'ý' + 254: 255, # 'ţ' + 255: 253, # '˙' } -WINDOWS_1250_SLOVENE_MODEL = SingleByteCharSetModel(charset_name='WINDOWS-1250', - language='Slovene', - char_to_order_map=WINDOWS_1250_SLOVENE_CHAR_TO_ORDER, - language_model=SLOVENE_LANG_MODEL, - typical_positive_ratio=0.9812181963993686, - keep_ascii_letters=False, - alphabet='ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzČ芚Žž') - +WINDOWS_1250_SLOVENE_MODEL = SingleByteCharSetModel( + charset_name="WINDOWS-1250", + language="Slovene", + char_to_order_map=WINDOWS_1250_SLOVENE_CHAR_TO_ORDER, + language_model=SLOVENE_LANG_MODEL, + typical_positive_ratio=0.9812181963993686, + keep_ascii_letters=False, + alphabet="ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzČ芚Žž", +) diff --git a/chardet/langspanishmodel.py b/chardet/langspanishmodel.py index 30b5346e..8299cef1 100644 --- a/chardet/langspanishmodel.py +++ b/chardet/langspanishmodel.py @@ -1,9 +1,7 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -61770,803 +61768,808 @@ # Character Mapping Table(s): WINDOWS_1252_SPANISH_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' 
- 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 35, # 'A' - 66: 49, # 'B' - 67: 32, # 'C' - 68: 54, # 'D' - 69: 29, # 'E' - 70: 60, # 'F' - 71: 56, # 'G' - 72: 66, # 'H' - 73: 47, # 'I' - 74: 67, # 'J' - 75: 75, # 'K' - 76: 39, # 'L' - 77: 43, # 'M' - 78: 65, # 'N' - 79: 70, # 'O' - 80: 44, # 'P' - 81: 85, # 'Q' - 82: 52, # 'R' - 83: 38, # 'S' - 84: 53, # 'T' - 85: 69, # 'U' - 86: 68, # 'V' - 87: 72, # 'W' - 88: 76, # 'X' - 89: 79, # 'Y' - 90: 84, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 3, # 'a' - 98: 17, # 'b' - 99: 12, # 'c' - 100: 10, # 'd' - 101: 2, # 'e' - 102: 23, # 'f' - 103: 18, # 'g' - 104: 24, # 'h' - 105: 7, # 'i' - 106: 37, # 'j' - 107: 63, # 'k' - 108: 9, # 'l' - 109: 14, # 'm' - 110: 5, # 'n' - 111: 4, # 'o' - 112: 15, # 'p' - 113: 27, # 'q' - 114: 8, # 'r' - 115: 6, # 's' - 116: 11, # 't' - 117: 13, # 'u' - 118: 21, # 'v' - 119: 73, # 'w' - 120: 50, # 'x' - 121: 20, # 'y' - 122: 30, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 253, # '€' - 129: 255, # None - 130: 253, # '‚' - 131: 255, # 'ƒ' - 132: 253, # '„' - 133: 253, # '…' - 134: 253, # '†' - 135: 253, # '‡' - 136: 255, # 'ˆ' - 137: 253, # '‰' - 138: 255, # 'Š' - 139: 253, # '‹' - 140: 255, # 'Œ' - 141: 255, # None - 142: 255, # 'Ž' - 143: 255, # None - 144: 255, # None - 145: 253, # '‘' - 146: 253, # '’' - 147: 253, # '“' - 148: 253, # '”' - 149: 253, # '•' - 150: 253, # '–' - 151: 253, 
# '—' - 152: 253, # '˜' - 153: 253, # '™' - 154: 255, # 'š' - 155: 253, # '›' - 156: 255, # 'œ' - 157: 255, # None - 158: 255, # 'ž' - 159: 255, # 'Ÿ' - 160: 251, # '\xa0' - 161: 253, # '¡' - 162: 253, # '¢' - 163: 253, # '£' - 164: 253, # '¤' - 165: 253, # '¥' - 166: 253, # '¦' - 167: 253, # '§' - 168: 253, # '¨' - 169: 253, # '©' - 170: 255, # 'ª' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 253, # '¯' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 252, # '³' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 253, # '¸' - 185: 252, # '¹' - 186: 255, # 'º' - 187: 253, # '»' - 188: 252, # '¼' - 189: 252, # '½' - 190: 252, # '¾' - 191: 253, # '¿' - 192: 255, # 'À' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ã' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Æ' - 199: 255, # 'Ç' - 200: 255, # 'È' - 201: 255, # 'É' - 202: 255, # 'Ê' - 203: 255, # 'Ë' - 204: 255, # 'Ì' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ï' - 208: 255, # 'Ð' - 209: 255, # 'Ñ' - 210: 255, # 'Ò' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ø' - 217: 255, # 'Ù' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Þ' - 223: 255, # 'ß' - 224: 255, # 'à' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ã' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'æ' - 231: 255, # 'ç' - 232: 255, # 'è' - 233: 255, # 'é' - 234: 255, # 'ê' - 235: 255, # 'ë' - 236: 255, # 'ì' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ï' - 240: 255, # 'ð' - 241: 255, # 'ñ' - 242: 255, # 'ò' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ø' - 249: 255, # 'ù' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'þ' - 255: 255, # 'ÿ' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' 
+ 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' 
+ 64: 253, # '@' + 65: 35, # 'A' + 66: 49, # 'B' + 67: 32, # 'C' + 68: 54, # 'D' + 69: 29, # 'E' + 70: 60, # 'F' + 71: 56, # 'G' + 72: 66, # 'H' + 73: 47, # 'I' + 74: 67, # 'J' + 75: 75, # 'K' + 76: 39, # 'L' + 77: 43, # 'M' + 78: 65, # 'N' + 79: 70, # 'O' + 80: 44, # 'P' + 81: 85, # 'Q' + 82: 52, # 'R' + 83: 38, # 'S' + 84: 53, # 'T' + 85: 69, # 'U' + 86: 68, # 'V' + 87: 72, # 'W' + 88: 76, # 'X' + 89: 79, # 'Y' + 90: 84, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 3, # 'a' + 98: 17, # 'b' + 99: 12, # 'c' + 100: 10, # 'd' + 101: 2, # 'e' + 102: 23, # 'f' + 103: 18, # 'g' + 104: 24, # 'h' + 105: 7, # 'i' + 106: 37, # 'j' + 107: 63, # 'k' + 108: 9, # 'l' + 109: 14, # 'm' + 110: 5, # 'n' + 111: 4, # 'o' + 112: 15, # 'p' + 113: 27, # 'q' + 114: 8, # 'r' + 115: 6, # 's' + 116: 11, # 't' + 117: 13, # 'u' + 118: 21, # 'v' + 119: 73, # 'w' + 120: 50, # 'x' + 121: 20, # 'y' + 122: 30, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 253, # '€' + 129: 255, # None + 130: 253, # '‚' + 131: 255, # 'ƒ' + 132: 253, # '„' + 133: 253, # '…' + 134: 253, # '†' + 135: 253, # '‡' + 136: 255, # 'ˆ' + 137: 253, # '‰' + 138: 255, # 'Š' + 139: 253, # '‹' + 140: 255, # 'Œ' + 141: 255, # None + 142: 255, # 'Ž' + 143: 255, # None + 144: 255, # None + 145: 253, # '‘' + 146: 253, # '’' + 147: 253, # '“' + 148: 253, # '”' + 149: 253, # '•' + 150: 253, # '–' + 151: 253, # '—' + 152: 253, # '˜' + 153: 253, # '™' + 154: 255, # 'š' + 155: 253, # '›' + 156: 255, # 'œ' + 157: 255, # None + 158: 255, # 'ž' + 159: 255, # 'Ÿ' + 160: 251, # '\xa0' + 161: 253, # '¡' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '¤' + 165: 253, # '¥' + 166: 253, # '¦' + 167: 253, # '§' + 168: 253, # '¨' + 169: 253, # '©' + 170: 255, # 'ª' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 253, # '¯' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 
252, # '³' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 253, # '¸' + 185: 252, # '¹' + 186: 255, # 'º' + 187: 253, # '»' + 188: 252, # '¼' + 189: 252, # '½' + 190: 252, # '¾' + 191: 253, # '¿' + 192: 255, # 'À' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ã' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Æ' + 199: 255, # 'Ç' + 200: 255, # 'È' + 201: 255, # 'É' + 202: 255, # 'Ê' + 203: 255, # 'Ë' + 204: 255, # 'Ì' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ï' + 208: 255, # 'Ð' + 209: 255, # 'Ñ' + 210: 255, # 'Ò' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ø' + 217: 255, # 'Ù' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Þ' + 223: 255, # 'ß' + 224: 255, # 'à' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ã' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'æ' + 231: 255, # 'ç' + 232: 255, # 'è' + 233: 255, # 'é' + 234: 255, # 'ê' + 235: 255, # 'ë' + 236: 255, # 'ì' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ï' + 240: 255, # 'ð' + 241: 255, # 'ñ' + 242: 255, # 'ò' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ø' + 249: 255, # 'ù' + 250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'þ' + 255: 255, # 'ÿ' } -WINDOWS_1252_SPANISH_MODEL = SingleByteCharSetModel(charset_name='WINDOWS-1252', - language='Spanish', - char_to_order_map=WINDOWS_1252_SPANISH_CHAR_TO_ORDER, - language_model=SPANISH_LANG_MODEL, - typical_positive_ratio=0.9481229543552133, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÁÉÍÑÓÚÜáéíñóúü') +WINDOWS_1252_SPANISH_MODEL = SingleByteCharSetModel( + charset_name="WINDOWS-1252", + language="Spanish", + char_to_order_map=WINDOWS_1252_SPANISH_CHAR_TO_ORDER, + language_model=SPANISH_LANG_MODEL, + typical_positive_ratio=0.9481229543552133, + 
keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÁÉÍÑÓÚÜáéíñóúü", +) ISO_8859_1_SPANISH_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' 
- 64: 253, # '@' - 65: 35, # 'A' - 66: 49, # 'B' - 67: 32, # 'C' - 68: 54, # 'D' - 69: 29, # 'E' - 70: 60, # 'F' - 71: 56, # 'G' - 72: 66, # 'H' - 73: 47, # 'I' - 74: 67, # 'J' - 75: 75, # 'K' - 76: 39, # 'L' - 77: 43, # 'M' - 78: 65, # 'N' - 79: 70, # 'O' - 80: 44, # 'P' - 81: 85, # 'Q' - 82: 52, # 'R' - 83: 38, # 'S' - 84: 53, # 'T' - 85: 69, # 'U' - 86: 68, # 'V' - 87: 72, # 'W' - 88: 76, # 'X' - 89: 79, # 'Y' - 90: 84, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 3, # 'a' - 98: 17, # 'b' - 99: 12, # 'c' - 100: 10, # 'd' - 101: 2, # 'e' - 102: 23, # 'f' - 103: 18, # 'g' - 104: 24, # 'h' - 105: 7, # 'i' - 106: 37, # 'j' - 107: 63, # 'k' - 108: 9, # 'l' - 109: 14, # 'm' - 110: 5, # 'n' - 111: 4, # 'o' - 112: 15, # 'p' - 113: 27, # 'q' - 114: 8, # 'r' - 115: 6, # 's' - 116: 11, # 't' - 117: 13, # 'u' - 118: 21, # 'v' - 119: 73, # 'w' - 120: 50, # 'x' - 121: 20, # 'y' - 122: 30, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, # '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' - 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 253, # '¡' - 162: 253, # '¢' - 163: 253, # '£' - 164: 253, # '¤' - 165: 253, # '¥' - 166: 253, # '¦' - 167: 253, # '§' - 168: 253, # '¨' - 169: 253, # '©' - 170: 255, # 'ª' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 
253, # '®' - 175: 253, # '¯' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 252, # '³' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 253, # '¸' - 185: 252, # '¹' - 186: 255, # 'º' - 187: 253, # '»' - 188: 252, # '¼' - 189: 252, # '½' - 190: 252, # '¾' - 191: 253, # '¿' - 192: 255, # 'À' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ã' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Æ' - 199: 255, # 'Ç' - 200: 255, # 'È' - 201: 255, # 'É' - 202: 255, # 'Ê' - 203: 255, # 'Ë' - 204: 255, # 'Ì' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ï' - 208: 255, # 'Ð' - 209: 255, # 'Ñ' - 210: 255, # 'Ò' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ø' - 217: 255, # 'Ù' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Þ' - 223: 255, # 'ß' - 224: 255, # 'à' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ã' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'æ' - 231: 255, # 'ç' - 232: 255, # 'è' - 233: 255, # 'é' - 234: 255, # 'ê' - 235: 255, # 'ë' - 236: 255, # 'ì' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ï' - 240: 255, # 'ð' - 241: 255, # 'ñ' - 242: 255, # 'ò' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ø' - 249: 255, # 'ù' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'þ' - 255: 255, # 'ÿ' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 
251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 35, # 'A' + 66: 49, # 'B' + 67: 32, # 'C' + 68: 54, # 'D' + 69: 29, # 'E' + 70: 60, # 'F' + 71: 56, # 'G' + 72: 66, # 'H' + 73: 47, # 'I' + 74: 67, # 'J' + 75: 75, # 'K' + 76: 39, # 'L' + 77: 43, # 'M' + 78: 65, # 'N' + 79: 70, # 'O' + 80: 44, # 'P' + 81: 85, # 'Q' + 82: 52, # 'R' + 83: 38, # 'S' + 84: 53, # 'T' + 85: 69, # 'U' + 86: 68, # 'V' + 87: 72, # 'W' + 88: 76, # 'X' + 89: 79, # 'Y' + 90: 84, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 3, # 'a' + 98: 17, # 'b' + 99: 12, # 'c' + 100: 10, # 'd' + 101: 2, # 'e' + 102: 23, # 'f' + 103: 18, # 'g' + 104: 24, # 'h' + 105: 7, # 'i' + 106: 37, # 'j' + 107: 63, # 'k' + 108: 9, # 'l' + 109: 14, # 'm' + 110: 5, # 'n' + 111: 4, # 'o' + 112: 15, # 'p' + 113: 27, # 'q' + 114: 8, # 'r' + 115: 6, # 's' + 116: 11, # 't' + 117: 13, # 'u' + 118: 21, # 'v' + 119: 73, # 'w' + 120: 50, # 'x' + 121: 20, # 'y' + 122: 30, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # '\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 
251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 253, # '¡' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '¤' + 165: 253, # '¥' + 166: 253, # '¦' + 167: 253, # '§' + 168: 253, # '¨' + 169: 253, # '©' + 170: 255, # 'ª' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 253, # '¯' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 253, # '¸' + 185: 252, # '¹' + 186: 255, # 'º' + 187: 253, # '»' + 188: 252, # '¼' + 189: 252, # '½' + 190: 252, # '¾' + 191: 253, # '¿' + 192: 255, # 'À' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ã' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Æ' + 199: 255, # 'Ç' + 200: 255, # 'È' + 201: 255, # 'É' + 202: 255, # 'Ê' + 203: 255, # 'Ë' + 204: 255, # 'Ì' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ï' + 208: 255, # 'Ð' + 209: 255, # 'Ñ' + 210: 255, # 'Ò' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ø' + 217: 255, # 'Ù' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Þ' + 223: 255, # 'ß' + 224: 255, # 'à' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ã' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'æ' + 231: 255, # 'ç' + 232: 255, # 'è' + 233: 255, # 'é' + 234: 255, # 'ê' + 235: 255, # 'ë' + 236: 255, # 'ì' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ï' + 240: 255, # 'ð' + 241: 255, # 'ñ' + 242: 255, # 'ò' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ø' + 249: 255, # 'ù' + 
250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'þ' + 255: 255, # 'ÿ' } -ISO_8859_1_SPANISH_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-1', - language='Spanish', - char_to_order_map=ISO_8859_1_SPANISH_CHAR_TO_ORDER, - language_model=SPANISH_LANG_MODEL, - typical_positive_ratio=0.9481229543552133, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÁÉÍÑÓÚÜáéíñóúü') +ISO_8859_1_SPANISH_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-1", + language="Spanish", + char_to_order_map=ISO_8859_1_SPANISH_CHAR_TO_ORDER, + language_model=SPANISH_LANG_MODEL, + typical_positive_ratio=0.9481229543552133, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÁÉÍÑÓÚÜáéíñóúü", +) ISO_8859_15_SPANISH_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' 
- 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 35, # 'A' - 66: 49, # 'B' - 67: 32, # 'C' - 68: 54, # 'D' - 69: 29, # 'E' - 70: 60, # 'F' - 71: 56, # 'G' - 72: 66, # 'H' - 73: 47, # 'I' - 74: 67, # 'J' - 75: 75, # 'K' - 76: 39, # 'L' - 77: 43, # 'M' - 78: 65, # 'N' - 79: 70, # 'O' - 80: 44, # 'P' - 81: 85, # 'Q' - 82: 52, # 'R' - 83: 38, # 'S' - 84: 53, # 'T' - 85: 69, # 'U' - 86: 68, # 'V' - 87: 72, # 'W' - 88: 76, # 'X' - 89: 79, # 'Y' - 90: 84, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 3, # 'a' - 98: 17, # 'b' - 99: 12, # 'c' - 100: 10, # 'd' - 101: 2, # 'e' - 102: 23, # 'f' - 103: 18, # 'g' - 104: 24, # 'h' - 105: 7, # 'i' - 106: 37, # 'j' - 107: 63, # 'k' - 108: 9, # 'l' - 109: 14, # 'm' - 110: 5, # 'n' - 111: 4, # 'o' - 112: 15, # 'p' - 113: 27, # 'q' - 114: 8, # 'r' - 115: 6, # 's' - 116: 11, # 't' - 117: 13, # 'u' - 118: 21, # 'v' - 119: 73, # 'w' - 120: 50, # 'x' - 121: 20, # 'y' - 122: 30, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, # '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' 
- 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 253, # '¡' - 162: 253, # '¢' - 163: 253, # '£' - 164: 253, # '€' - 165: 253, # '¥' - 166: 255, # 'Š' - 167: 253, # '§' - 168: 255, # 'š' - 169: 253, # '©' - 170: 255, # 'ª' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 253, # '¯' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 252, # '³' - 180: 255, # 'Ž' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 255, # 'ž' - 185: 252, # '¹' - 186: 255, # 'º' - 187: 253, # '»' - 188: 255, # 'Œ' - 189: 255, # 'œ' - 190: 255, # 'Ÿ' - 191: 253, # '¿' - 192: 255, # 'À' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ã' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Æ' - 199: 255, # 'Ç' - 200: 255, # 'È' - 201: 255, # 'É' - 202: 255, # 'Ê' - 203: 255, # 'Ë' - 204: 255, # 'Ì' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ï' - 208: 255, # 'Ð' - 209: 255, # 'Ñ' - 210: 255, # 'Ò' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ø' - 217: 255, # 'Ù' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'Ý' - 222: 255, # 'Þ' - 223: 255, # 'ß' - 224: 255, # 'à' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ã' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'æ' - 231: 255, # 'ç' - 232: 255, # 'è' - 233: 255, # 'é' - 234: 255, # 'ê' - 235: 255, # 'ë' - 236: 255, # 'ì' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ï' - 240: 255, # 'ð' - 241: 255, # 'ñ' - 242: 255, # 'ò' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ø' - 249: 255, # 'ù' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ý' - 254: 255, # 'þ' - 255: 255, # 'ÿ' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # 
'\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 35, # 'A' + 66: 49, # 'B' + 67: 32, # 'C' + 68: 54, # 'D' + 69: 29, # 'E' + 70: 60, # 'F' + 71: 56, # 'G' + 72: 66, # 'H' + 73: 47, # 'I' + 74: 67, # 'J' + 75: 75, # 'K' + 76: 39, # 'L' + 77: 43, # 'M' + 78: 65, # 'N' + 79: 70, # 'O' + 80: 44, # 'P' + 81: 85, # 'Q' + 82: 52, # 'R' + 83: 38, # 'S' + 84: 53, # 'T' + 85: 69, # 'U' + 86: 68, # 'V' + 87: 72, # 'W' + 88: 76, # 'X' + 89: 79, # 'Y' + 90: 84, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 3, # 'a' + 98: 17, # 'b' + 99: 12, # 'c' + 100: 10, # 'd' + 101: 2, # 'e' + 102: 23, # 'f' + 103: 18, # 'g' + 104: 24, # 'h' + 105: 7, # 'i' + 106: 37, # 'j' + 107: 63, # 'k' + 108: 9, # 'l' + 109: 14, # 'm' + 110: 5, # 'n' + 111: 4, # 'o' + 112: 15, # 'p' + 113: 27, # 'q' + 114: 8, # 'r' + 115: 6, # 's' + 116: 11, # 't' + 117: 13, # 'u' + 118: 21, # 'v' + 119: 73, # 'w' + 120: 50, # 'x' + 121: 20, # 'y' + 122: 30, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 
251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # '\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 253, # '¡' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '€' + 165: 253, # '¥' + 166: 255, # 'Š' + 167: 253, # '§' + 168: 255, # 'š' + 169: 253, # '©' + 170: 255, # 'ª' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 253, # '¯' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 255, # 'Ž' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 255, # 'ž' + 185: 252, # '¹' + 186: 255, # 'º' + 187: 253, # '»' + 188: 255, # 'Œ' + 189: 255, # 'œ' + 190: 255, # 'Ÿ' + 191: 253, # '¿' + 192: 255, # 'À' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ã' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Æ' + 199: 255, # 'Ç' + 200: 255, # 'È' + 201: 255, # 'É' + 202: 255, # 'Ê' + 203: 255, # 'Ë' + 204: 255, # 'Ì' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ï' + 208: 255, # 'Ð' + 209: 255, # 'Ñ' + 210: 255, # 'Ò' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ø' + 217: 255, # 'Ù' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'Ý' + 222: 255, # 'Þ' + 223: 255, # 'ß' + 224: 255, # 'à' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ã' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'æ' + 231: 255, # 'ç' + 232: 255, # 'è' + 233: 255, # 
'é' + 234: 255, # 'ê' + 235: 255, # 'ë' + 236: 255, # 'ì' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ï' + 240: 255, # 'ð' + 241: 255, # 'ñ' + 242: 255, # 'ò' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ø' + 249: 255, # 'ù' + 250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ý' + 254: 255, # 'þ' + 255: 255, # 'ÿ' } -ISO_8859_15_SPANISH_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-15', - language='Spanish', - char_to_order_map=ISO_8859_15_SPANISH_CHAR_TO_ORDER, - language_model=SPANISH_LANG_MODEL, - typical_positive_ratio=0.9481229543552133, - keep_ascii_letters=True, - alphabet='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÁÉÍÑÓÚÜáéíñóúü') - +ISO_8859_15_SPANISH_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-15", + language="Spanish", + char_to_order_map=ISO_8859_15_SPANISH_CHAR_TO_ORDER, + language_model=SPANISH_LANG_MODEL, + typical_positive_ratio=0.9481229543552133, + keep_ascii_letters=True, + alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÁÉÍÑÓÚÜáéíñóúü", +) diff --git a/chardet/langturkishmodel.py b/chardet/langturkishmodel.py index d03d4f92..d83c1167 100644 --- a/chardet/langturkishmodel.py +++ b/chardet/langturkishmodel.py @@ -62764,802 +62764,808 @@ # Character Mapping Table(s): WINDOWS_1254_TURKISH_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 
32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 253, # '€' - 129: 255, # None - 130: 253, # '‚' - 131: 255, # 'ƒ' - 132: 253, # '„' - 133: 253, # '…' - 134: 253, # '†' - 135: 253, # '‡' - 136: 255, # 'ˆ' - 137: 253, # '‰' - 138: 255, # 'Š' - 139: 253, # '‹' - 140: 255, # 'Œ' - 141: 255, # None - 142: 255, # None - 143: 255, # None - 144: 255, # None - 145: 253, # '‘' - 146: 
253, # '’' - 147: 253, # '“' - 148: 253, # '”' - 149: 253, # '•' - 150: 253, # '–' - 151: 253, # '—' - 152: 253, # '˜' - 153: 253, # '™' - 154: 255, # 'š' - 155: 253, # '›' - 156: 255, # 'œ' - 157: 255, # None - 158: 255, # None - 159: 255, # 'Ÿ' - 160: 251, # '\xa0' - 161: 253, # '¡' - 162: 253, # '¢' - 163: 253, # '£' - 164: 253, # '¤' - 165: 253, # '¥' - 166: 253, # '¦' - 167: 253, # '§' - 168: 253, # '¨' - 169: 253, # '©' - 170: 255, # 'ª' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 253, # '¯' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 252, # '³' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 253, # '¸' - 185: 252, # '¹' - 186: 255, # 'º' - 187: 253, # '»' - 188: 252, # '¼' - 189: 252, # '½' - 190: 252, # '¾' - 191: 253, # '¿' - 192: 255, # 'À' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ã' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Æ' - 199: 255, # 'Ç' - 200: 255, # 'È' - 201: 255, # 'É' - 202: 255, # 'Ê' - 203: 255, # 'Ë' - 204: 255, # 'Ì' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ï' - 208: 255, # 'Ğ' - 209: 255, # 'Ñ' - 210: 255, # 'Ò' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ø' - 217: 255, # 'Ù' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'İ' - 222: 255, # 'Ş' - 223: 255, # 'ß' - 224: 255, # 'à' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ã' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'æ' - 231: 255, # 'ç' - 232: 255, # 'è' - 233: 255, # 'é' - 234: 255, # 'ê' - 235: 255, # 'ë' - 236: 255, # 'ì' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ï' - 240: 255, # 'ğ' - 241: 255, # 'ñ' - 242: 255, # 'ò' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ø' - 249: 255, # 'ù' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ı' - 254: 255, # 'ş' - 255: 255, # 'ÿ' + 0: 251, # '\x00' 
+ 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' 
+ 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 253, # '€' + 129: 255, # None + 130: 253, # '‚' + 131: 255, # 'ƒ' + 132: 253, # '„' + 133: 253, # '…' + 134: 253, # '†' + 135: 253, # '‡' + 136: 255, # 'ˆ' + 137: 253, # '‰' + 138: 255, # 'Š' + 139: 253, # '‹' + 140: 255, # 'Œ' + 141: 255, # None + 142: 255, # None + 143: 255, # None + 144: 255, # None + 145: 253, # '‘' + 146: 253, # '’' + 147: 253, # '“' + 148: 253, # '”' + 149: 253, # '•' + 150: 253, # '–' + 151: 253, # '—' + 152: 253, # '˜' + 153: 253, # '™' + 154: 255, # 'š' + 155: 253, # '›' + 156: 255, # 'œ' + 157: 255, # None + 158: 255, # None + 159: 255, # 'Ÿ' + 160: 251, # '\xa0' + 161: 253, # '¡' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '¤' + 165: 253, # '¥' + 166: 253, # '¦' + 167: 253, # '§' + 168: 253, # '¨' + 169: 253, # '©' + 170: 255, # 'ª' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 253, # '¯' + 
176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 253, # '¸' + 185: 252, # '¹' + 186: 255, # 'º' + 187: 253, # '»' + 188: 252, # '¼' + 189: 252, # '½' + 190: 252, # '¾' + 191: 253, # '¿' + 192: 255, # 'À' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ã' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Æ' + 199: 255, # 'Ç' + 200: 255, # 'È' + 201: 255, # 'É' + 202: 255, # 'Ê' + 203: 255, # 'Ë' + 204: 255, # 'Ì' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ï' + 208: 255, # 'Ğ' + 209: 255, # 'Ñ' + 210: 255, # 'Ò' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ø' + 217: 255, # 'Ù' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'İ' + 222: 255, # 'Ş' + 223: 255, # 'ß' + 224: 255, # 'à' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ã' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'æ' + 231: 255, # 'ç' + 232: 255, # 'è' + 233: 255, # 'é' + 234: 255, # 'ê' + 235: 255, # 'ë' + 236: 255, # 'ì' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ï' + 240: 255, # 'ğ' + 241: 255, # 'ñ' + 242: 255, # 'ò' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ø' + 249: 255, # 'ù' + 250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ı' + 254: 255, # 'ş' + 255: 255, # 'ÿ' } -WINDOWS_1254_TURKISH_MODEL = SingleByteCharSetModel(charset_name='WINDOWS-1254', - language='Turkish', - char_to_order_map=WINDOWS_1254_TURKISH_CHAR_TO_ORDER, - language_model=TURKISH_LANG_MODEL, - typical_positive_ratio=0.9820526981498936, - keep_ascii_letters=False, - alphabet='ABCDEFGHIJKLMNOPRSTUVYZabcdefghijklmnoprstuvyzÂÇÎÖÛÜâçîöûüĞğİıŞş') +WINDOWS_1254_TURKISH_MODEL = SingleByteCharSetModel( + charset_name="WINDOWS-1254", + language="Turkish", + char_to_order_map=WINDOWS_1254_TURKISH_CHAR_TO_ORDER, + language_model=TURKISH_LANG_MODEL, + 
typical_positive_ratio=0.9820526981498936, + keep_ascii_letters=False, + alphabet="ABCDEFGHIJKLMNOPRSTUVYZabcdefghijklmnoprstuvyzÂÇÎÖÛÜâçîöûüĞğİıŞş", +) ISO_8859_9_TURKISH_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' 
- 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, # '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' - 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 253, # '¡' - 162: 253, # '¢' - 163: 253, # '£' - 164: 253, # '¤' - 165: 253, # '¥' - 166: 253, # '¦' - 167: 253, # '§' - 168: 253, # '¨' - 169: 253, # '©' - 170: 255, # 'ª' - 171: 
253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 253, # '¯' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 252, # '³' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 253, # '¸' - 185: 252, # '¹' - 186: 255, # 'º' - 187: 253, # '»' - 188: 252, # '¼' - 189: 252, # '½' - 190: 252, # '¾' - 191: 253, # '¿' - 192: 255, # 'À' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ã' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Æ' - 199: 255, # 'Ç' - 200: 255, # 'È' - 201: 255, # 'É' - 202: 255, # 'Ê' - 203: 255, # 'Ë' - 204: 255, # 'Ì' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ï' - 208: 255, # 'Ğ' - 209: 255, # 'Ñ' - 210: 255, # 'Ò' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Õ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ø' - 217: 255, # 'Ù' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'İ' - 222: 255, # 'Ş' - 223: 255, # 'ß' - 224: 255, # 'à' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ã' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'æ' - 231: 255, # 'ç' - 232: 255, # 'è' - 233: 255, # 'é' - 234: 255, # 'ê' - 235: 255, # 'ë' - 236: 255, # 'ì' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ï' - 240: 255, # 'ğ' - 241: 255, # 'ñ' - 242: 255, # 'ò' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'õ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ø' - 249: 255, # 'ù' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ı' - 254: 255, # 'ş' - 255: 255, # 'ÿ' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 
251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # 
'\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 253, # '¡' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '¤' + 165: 253, # '¥' + 166: 253, # '¦' + 167: 253, # '§' + 168: 253, # '¨' + 169: 253, # '©' + 170: 255, # 'ª' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 253, # '¯' + 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 253, # '¸' + 185: 252, # '¹' + 186: 255, # 'º' + 187: 253, # '»' + 188: 252, # '¼' + 189: 252, # '½' + 190: 252, # '¾' + 191: 253, # '¿' + 192: 255, # 'À' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ã' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Æ' + 199: 255, # 'Ç' + 200: 255, # 'È' + 201: 255, # 'É' + 202: 255, # 'Ê' + 203: 255, # 'Ë' + 204: 255, # 'Ì' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ï' + 208: 255, # 'Ğ' + 209: 255, # 'Ñ' + 210: 255, # 'Ò' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Õ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ø' + 217: 255, # 'Ù' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'İ' + 222: 255, # 'Ş' + 223: 255, # 'ß' + 224: 255, # 'à' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ã' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'æ' + 231: 255, # 'ç' + 232: 255, # 'è' + 233: 255, # 'é' + 234: 255, # 'ê' + 235: 255, # 'ë' + 236: 255, # 'ì' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ï' + 240: 255, # 'ğ' + 241: 255, # 'ñ' + 242: 255, # 'ò' + 243: 
255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'õ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ø' + 249: 255, # 'ù' + 250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ı' + 254: 255, # 'ş' + 255: 255, # 'ÿ' } -ISO_8859_9_TURKISH_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-9', - language='Turkish', - char_to_order_map=ISO_8859_9_TURKISH_CHAR_TO_ORDER, - language_model=TURKISH_LANG_MODEL, - typical_positive_ratio=0.9739854915929828, - keep_ascii_letters=False, - alphabet='ABCDEFGHIJKLMNOPRSTUVYZabcdefghijklmnoprstuvyzÂÇÎÖÛÜâçîöûüĞğİıŞş') +ISO_8859_9_TURKISH_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-9", + language="Turkish", + char_to_order_map=ISO_8859_9_TURKISH_CHAR_TO_ORDER, + language_model=TURKISH_LANG_MODEL, + typical_positive_ratio=0.9739854915929828, + keep_ascii_letters=False, + alphabet="ABCDEFGHIJKLMNOPRSTUVYZabcdefghijklmnoprstuvyzÂÇÎÖÛÜâçîöûüĞğİıŞş", +) ISO_8859_3_TURKISH_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' 
- 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 251, # '\x80' - 129: 251, # '\x81' - 130: 251, # '\x82' - 131: 251, # '\x83' - 132: 251, # '\x84' - 133: 251, # '\x85' - 134: 251, # '\x86' - 135: 251, # '\x87' - 136: 251, # '\x88' - 137: 251, # '\x89' - 138: 251, # '\x8a' - 139: 251, # '\x8b' - 140: 251, # '\x8c' - 141: 251, # '\x8d' - 142: 251, # '\x8e' - 143: 251, # '\x8f' - 144: 251, # '\x90' - 145: 251, # '\x91' - 146: 251, # '\x92' - 147: 251, # '\x93' - 148: 251, # '\x94' - 149: 251, # '\x95' - 150: 251, # '\x96' - 151: 251, # '\x97' - 152: 251, # '\x98' - 153: 251, # '\x99' - 154: 251, # '\x9a' - 155: 251, # '\x9b' - 
156: 251, # '\x9c' - 157: 251, # '\x9d' - 158: 251, # '\x9e' - 159: 251, # '\x9f' - 160: 251, # '\xa0' - 161: 255, # 'Ħ' - 162: 253, # '˘' - 163: 253, # '£' - 164: 253, # '¤' - 165: 255, # None - 166: 255, # 'Ĥ' - 167: 253, # '§' - 168: 253, # '¨' - 169: 255, # 'İ' - 170: 255, # 'Ş' - 171: 255, # 'Ğ' - 172: 255, # 'Ĵ' - 173: 251, # '\xad' - 174: 255, # None - 175: 255, # 'Ż' - 176: 253, # '°' - 177: 255, # 'ħ' - 178: 252, # '²' - 179: 252, # '³' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 255, # 'ĥ' - 183: 253, # '·' - 184: 253, # '¸' - 185: 255, # 'ı' - 186: 255, # 'ş' - 187: 255, # 'ğ' - 188: 255, # 'ĵ' - 189: 252, # '½' - 190: 255, # None - 191: 255, # 'ż' - 192: 255, # 'À' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # None - 196: 255, # 'Ä' - 197: 255, # 'Ċ' - 198: 255, # 'Ĉ' - 199: 255, # 'Ç' - 200: 255, # 'È' - 201: 255, # 'É' - 202: 255, # 'Ê' - 203: 255, # 'Ë' - 204: 255, # 'Ì' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ï' - 208: 255, # None - 209: 255, # 'Ñ' - 210: 255, # 'Ò' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Ġ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ĝ' - 217: 255, # 'Ù' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'Ŭ' - 222: 255, # 'Ŝ' - 223: 255, # 'ß' - 224: 255, # 'à' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # None - 228: 255, # 'ä' - 229: 255, # 'ċ' - 230: 255, # 'ĉ' - 231: 255, # 'ç' - 232: 255, # 'è' - 233: 255, # 'é' - 234: 255, # 'ê' - 235: 255, # 'ë' - 236: 255, # 'ì' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ï' - 240: 255, # None - 241: 255, # 'ñ' - 242: 255, # 'ò' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'ġ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ĝ' - 249: 255, # 'ù' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ŭ' - 254: 255, # 'ŝ' - 255: 253, # '˙' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 
251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' + 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 
'y' + 122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 251, # '\x80' + 129: 251, # '\x81' + 130: 251, # '\x82' + 131: 251, # '\x83' + 132: 251, # '\x84' + 133: 251, # '\x85' + 134: 251, # '\x86' + 135: 251, # '\x87' + 136: 251, # '\x88' + 137: 251, # '\x89' + 138: 251, # '\x8a' + 139: 251, # '\x8b' + 140: 251, # '\x8c' + 141: 251, # '\x8d' + 142: 251, # '\x8e' + 143: 251, # '\x8f' + 144: 251, # '\x90' + 145: 251, # '\x91' + 146: 251, # '\x92' + 147: 251, # '\x93' + 148: 251, # '\x94' + 149: 251, # '\x95' + 150: 251, # '\x96' + 151: 251, # '\x97' + 152: 251, # '\x98' + 153: 251, # '\x99' + 154: 251, # '\x9a' + 155: 251, # '\x9b' + 156: 251, # '\x9c' + 157: 251, # '\x9d' + 158: 251, # '\x9e' + 159: 251, # '\x9f' + 160: 251, # '\xa0' + 161: 255, # 'Ħ' + 162: 253, # '˘' + 163: 253, # '£' + 164: 253, # '¤' + 165: 255, # None + 166: 255, # 'Ĥ' + 167: 253, # '§' + 168: 253, # '¨' + 169: 255, # 'İ' + 170: 255, # 'Ş' + 171: 255, # 'Ğ' + 172: 255, # 'Ĵ' + 173: 251, # '\xad' + 174: 255, # None + 175: 255, # 'Ż' + 176: 253, # '°' + 177: 255, # 'ħ' + 178: 252, # '²' + 179: 252, # '³' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 255, # 'ĥ' + 183: 253, # '·' + 184: 253, # '¸' + 185: 255, # 'ı' + 186: 255, # 'ş' + 187: 255, # 'ğ' + 188: 255, # 'ĵ' + 189: 252, # '½' + 190: 255, # None + 191: 255, # 'ż' + 192: 255, # 'À' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # None + 196: 255, # 'Ä' + 197: 255, # 'Ċ' + 198: 255, # 'Ĉ' + 199: 255, # 'Ç' + 200: 255, # 'È' + 201: 255, # 'É' + 202: 255, # 'Ê' + 203: 255, # 'Ë' + 204: 255, # 'Ì' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ï' + 208: 255, # None + 209: 255, # 'Ñ' + 210: 255, # 'Ò' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Ġ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ĝ' + 217: 255, # 'Ù' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'Ŭ' + 222: 255, # 'Ŝ' + 223: 255, # 'ß' + 224: 255, # 'à' + 225: 255, # 'á' + 226: 255, # 
'â' + 227: 255, # None + 228: 255, # 'ä' + 229: 255, # 'ċ' + 230: 255, # 'ĉ' + 231: 255, # 'ç' + 232: 255, # 'è' + 233: 255, # 'é' + 234: 255, # 'ê' + 235: 255, # 'ë' + 236: 255, # 'ì' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ï' + 240: 255, # None + 241: 255, # 'ñ' + 242: 255, # 'ò' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'ġ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ĝ' + 249: 255, # 'ù' + 250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ŭ' + 254: 255, # 'ŝ' + 255: 253, # '˙' } -ISO_8859_3_TURKISH_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-3', - language='Turkish', - char_to_order_map=ISO_8859_3_TURKISH_CHAR_TO_ORDER, - language_model=TURKISH_LANG_MODEL, - typical_positive_ratio=0.9739854915929828, - keep_ascii_letters=False, - alphabet='ABCDEFGHIJKLMNOPRSTUVYZabcdefghijklmnoprstuvyzÂÇÎÖÛÜâçîöûüĞğİıŞş') +ISO_8859_3_TURKISH_MODEL = SingleByteCharSetModel( + charset_name="ISO-8859-3", + language="Turkish", + char_to_order_map=ISO_8859_3_TURKISH_CHAR_TO_ORDER, + language_model=TURKISH_LANG_MODEL, + typical_positive_ratio=0.9739854915929828, + keep_ascii_letters=False, + alphabet="ABCDEFGHIJKLMNOPRSTUVYZabcdefghijklmnoprstuvyzÂÇÎÖÛÜâçîöûüĞğİıŞş", +) diff --git a/chardet/langvietnamesemodel.py b/chardet/langvietnamesemodel.py index dcf1b7ef..f17a3ea5 100644 --- a/chardet/langvietnamesemodel.py +++ b/chardet/langvietnamesemodel.py @@ -1,9 +1,7 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel - # 3: Positive # 2: Likely # 1: Unlikely @@ -62768,269 +62766,270 @@ # Character Mapping Table(s): WINDOWS_1258_VIETNAMESE_CHAR_TO_ORDER = { - 0: 251, # '\x00' - 1: 251, # '\x01' - 2: 251, # '\x02' - 3: 251, # '\x03' - 4: 251, # '\x04' - 5: 251, # '\x05' - 6: 251, # '\x06' - 7: 251, # '\x07' - 8: 251, # '\x08' - 9: 251, # '\t' - 10: 254, # '\n' - 11: 251, # '\x0b' - 12: 251, # '\x0c' - 13: 254, # '\r' - 14: 251, # '\x0e' - 15: 251, # '\x0f' - 16: 251, # '\x10' - 17: 
251, # '\x11' - 18: 251, # '\x12' - 19: 251, # '\x13' - 20: 251, # '\x14' - 21: 251, # '\x15' - 22: 251, # '\x16' - 23: 251, # '\x17' - 24: 251, # '\x18' - 25: 251, # '\x19' - 26: 251, # '\x1a' - 27: 251, # '\x1b' - 28: 251, # '\x1c' - 29: 251, # '\x1d' - 30: 251, # '\x1e' - 31: 251, # '\x1f' - 32: 251, # ' ' - 33: 253, # '!' - 34: 253, # '"' - 35: 253, # '#' - 36: 253, # '$' - 37: 253, # '%' - 38: 253, # '&' - 39: 253, # "'" - 40: 253, # '(' - 41: 253, # ')' - 42: 253, # '*' - 43: 253, # '+' - 44: 253, # ',' - 45: 253, # '-' - 46: 253, # '.' - 47: 253, # '/' - 48: 252, # '0' - 49: 252, # '1' - 50: 252, # '2' - 51: 252, # '3' - 52: 252, # '4' - 53: 252, # '5' - 54: 252, # '6' - 55: 252, # '7' - 56: 252, # '8' - 57: 252, # '9' - 58: 253, # ':' - 59: 253, # ';' - 60: 253, # '<' - 61: 253, # '=' - 62: 253, # '>' - 63: 253, # '?' - 64: 253, # '@' - 65: 255, # 'A' - 66: 255, # 'B' - 67: 255, # 'C' - 68: 255, # 'D' - 69: 255, # 'E' - 70: 255, # 'F' - 71: 255, # 'G' - 72: 255, # 'H' - 73: 255, # 'I' - 74: 255, # 'J' - 75: 255, # 'K' - 76: 255, # 'L' - 77: 255, # 'M' - 78: 255, # 'N' - 79: 255, # 'O' - 80: 255, # 'P' - 81: 255, # 'Q' - 82: 255, # 'R' - 83: 255, # 'S' - 84: 255, # 'T' - 85: 255, # 'U' - 86: 255, # 'V' - 87: 255, # 'W' - 88: 255, # 'X' - 89: 255, # 'Y' - 90: 255, # 'Z' - 91: 253, # '[' - 92: 253, # '\\' - 93: 253, # ']' - 94: 253, # '^' - 95: 253, # '_' - 96: 253, # '`' - 97: 255, # 'a' - 98: 255, # 'b' - 99: 255, # 'c' - 100: 255, # 'd' - 101: 255, # 'e' - 102: 255, # 'f' - 103: 255, # 'g' - 104: 255, # 'h' - 105: 255, # 'i' - 106: 255, # 'j' - 107: 255, # 'k' - 108: 255, # 'l' - 109: 255, # 'm' - 110: 255, # 'n' - 111: 255, # 'o' - 112: 255, # 'p' - 113: 255, # 'q' - 114: 255, # 'r' - 115: 255, # 's' - 116: 255, # 't' - 117: 255, # 'u' - 118: 255, # 'v' - 119: 255, # 'w' - 120: 255, # 'x' - 121: 255, # 'y' - 122: 255, # 'z' - 123: 253, # '{' - 124: 253, # '|' - 125: 253, # '}' - 126: 253, # '~' - 127: 251, # '\x7f' - 128: 253, # '€' - 129: 255, # None - 
130: 253, # '‚' - 131: 255, # 'ƒ' - 132: 253, # '„' - 133: 253, # '…' - 134: 253, # '†' - 135: 253, # '‡' - 136: 255, # 'ˆ' - 137: 253, # '‰' - 138: 255, # None - 139: 253, # '‹' - 140: 255, # 'Œ' - 141: 255, # None - 142: 255, # None - 143: 255, # None - 144: 255, # None - 145: 253, # '‘' - 146: 253, # '’' - 147: 253, # '“' - 148: 253, # '”' - 149: 253, # '•' - 150: 253, # '–' - 151: 253, # '—' - 152: 253, # '˜' - 153: 253, # '™' - 154: 255, # None - 155: 253, # '›' - 156: 255, # 'œ' - 157: 255, # None - 158: 255, # None - 159: 255, # 'Ÿ' - 160: 251, # '\xa0' - 161: 253, # '¡' - 162: 253, # '¢' - 163: 253, # '£' - 164: 253, # '¤' - 165: 253, # '¥' - 166: 253, # '¦' - 167: 253, # '§' - 168: 253, # '¨' - 169: 253, # '©' - 170: 255, # 'ª' - 171: 253, # '«' - 172: 253, # '¬' - 173: 251, # '\xad' - 174: 253, # '®' - 175: 253, # '¯' - 176: 253, # '°' - 177: 253, # '±' - 178: 252, # '²' - 179: 252, # '³' - 180: 253, # '´' - 181: 255, # 'µ' - 182: 253, # '¶' - 183: 253, # '·' - 184: 253, # '¸' - 185: 252, # '¹' - 186: 255, # 'º' - 187: 253, # '»' - 188: 252, # '¼' - 189: 252, # '½' - 190: 252, # '¾' - 191: 253, # '¿' - 192: 255, # 'À' - 193: 255, # 'Á' - 194: 255, # 'Â' - 195: 255, # 'Ă' - 196: 255, # 'Ä' - 197: 255, # 'Å' - 198: 255, # 'Æ' - 199: 255, # 'Ç' - 200: 255, # 'È' - 201: 255, # 'É' - 202: 255, # 'Ê' - 203: 255, # 'Ë' - 204: 253, # '̀' - 205: 255, # 'Í' - 206: 255, # 'Î' - 207: 255, # 'Ï' - 208: 255, # 'Đ' - 209: 255, # 'Ñ' - 210: 253, # '̉' - 211: 255, # 'Ó' - 212: 255, # 'Ô' - 213: 255, # 'Ơ' - 214: 255, # 'Ö' - 215: 253, # '×' - 216: 255, # 'Ø' - 217: 255, # 'Ù' - 218: 255, # 'Ú' - 219: 255, # 'Û' - 220: 255, # 'Ü' - 221: 255, # 'Ư' - 222: 253, # '̃' - 223: 255, # 'ß' - 224: 255, # 'à' - 225: 255, # 'á' - 226: 255, # 'â' - 227: 255, # 'ă' - 228: 255, # 'ä' - 229: 255, # 'å' - 230: 255, # 'æ' - 231: 255, # 'ç' - 232: 255, # 'è' - 233: 255, # 'é' - 234: 255, # 'ê' - 235: 255, # 'ë' - 236: 253, # '́' - 237: 255, # 'í' - 238: 255, # 'î' - 239: 255, # 'ï' - 240: 
255, # 'đ' - 241: 255, # 'ñ' - 242: 253, # '̣' - 243: 255, # 'ó' - 244: 255, # 'ô' - 245: 255, # 'ơ' - 246: 255, # 'ö' - 247: 253, # '÷' - 248: 255, # 'ø' - 249: 255, # 'ù' - 250: 255, # 'ú' - 251: 255, # 'û' - 252: 255, # 'ü' - 253: 255, # 'ư' - 254: 253, # '₫' - 255: 255, # 'ÿ' + 0: 251, # '\x00' + 1: 251, # '\x01' + 2: 251, # '\x02' + 3: 251, # '\x03' + 4: 251, # '\x04' + 5: 251, # '\x05' + 6: 251, # '\x06' + 7: 251, # '\x07' + 8: 251, # '\x08' + 9: 251, # '\t' + 10: 254, # '\n' + 11: 251, # '\x0b' + 12: 251, # '\x0c' + 13: 254, # '\r' + 14: 251, # '\x0e' + 15: 251, # '\x0f' + 16: 251, # '\x10' + 17: 251, # '\x11' + 18: 251, # '\x12' + 19: 251, # '\x13' + 20: 251, # '\x14' + 21: 251, # '\x15' + 22: 251, # '\x16' + 23: 251, # '\x17' + 24: 251, # '\x18' + 25: 251, # '\x19' + 26: 251, # '\x1a' + 27: 251, # '\x1b' + 28: 251, # '\x1c' + 29: 251, # '\x1d' + 30: 251, # '\x1e' + 31: 251, # '\x1f' + 32: 251, # ' ' + 33: 253, # '!' + 34: 253, # '"' + 35: 253, # '#' + 36: 253, # '$' + 37: 253, # '%' + 38: 253, # '&' + 39: 253, # "'" + 40: 253, # '(' + 41: 253, # ')' + 42: 253, # '*' + 43: 253, # '+' + 44: 253, # ',' + 45: 253, # '-' + 46: 253, # '.' + 47: 253, # '/' + 48: 252, # '0' + 49: 252, # '1' + 50: 252, # '2' + 51: 252, # '3' + 52: 252, # '4' + 53: 252, # '5' + 54: 252, # '6' + 55: 252, # '7' + 56: 252, # '8' + 57: 252, # '9' + 58: 253, # ':' + 59: 253, # ';' + 60: 253, # '<' + 61: 253, # '=' + 62: 253, # '>' + 63: 253, # '?' 
+ 64: 253, # '@' + 65: 255, # 'A' + 66: 255, # 'B' + 67: 255, # 'C' + 68: 255, # 'D' + 69: 255, # 'E' + 70: 255, # 'F' + 71: 255, # 'G' + 72: 255, # 'H' + 73: 255, # 'I' + 74: 255, # 'J' + 75: 255, # 'K' + 76: 255, # 'L' + 77: 255, # 'M' + 78: 255, # 'N' + 79: 255, # 'O' + 80: 255, # 'P' + 81: 255, # 'Q' + 82: 255, # 'R' + 83: 255, # 'S' + 84: 255, # 'T' + 85: 255, # 'U' + 86: 255, # 'V' + 87: 255, # 'W' + 88: 255, # 'X' + 89: 255, # 'Y' + 90: 255, # 'Z' + 91: 253, # '[' + 92: 253, # '\\' + 93: 253, # ']' + 94: 253, # '^' + 95: 253, # '_' + 96: 253, # '`' + 97: 255, # 'a' + 98: 255, # 'b' + 99: 255, # 'c' + 100: 255, # 'd' + 101: 255, # 'e' + 102: 255, # 'f' + 103: 255, # 'g' + 104: 255, # 'h' + 105: 255, # 'i' + 106: 255, # 'j' + 107: 255, # 'k' + 108: 255, # 'l' + 109: 255, # 'm' + 110: 255, # 'n' + 111: 255, # 'o' + 112: 255, # 'p' + 113: 255, # 'q' + 114: 255, # 'r' + 115: 255, # 's' + 116: 255, # 't' + 117: 255, # 'u' + 118: 255, # 'v' + 119: 255, # 'w' + 120: 255, # 'x' + 121: 255, # 'y' + 122: 255, # 'z' + 123: 253, # '{' + 124: 253, # '|' + 125: 253, # '}' + 126: 253, # '~' + 127: 251, # '\x7f' + 128: 253, # '€' + 129: 255, # None + 130: 253, # '‚' + 131: 255, # 'ƒ' + 132: 253, # '„' + 133: 253, # '…' + 134: 253, # '†' + 135: 253, # '‡' + 136: 255, # 'ˆ' + 137: 253, # '‰' + 138: 255, # None + 139: 253, # '‹' + 140: 255, # 'Œ' + 141: 255, # None + 142: 255, # None + 143: 255, # None + 144: 255, # None + 145: 253, # '‘' + 146: 253, # '’' + 147: 253, # '“' + 148: 253, # '”' + 149: 253, # '•' + 150: 253, # '–' + 151: 253, # '—' + 152: 253, # '˜' + 153: 253, # '™' + 154: 255, # None + 155: 253, # '›' + 156: 255, # 'œ' + 157: 255, # None + 158: 255, # None + 159: 255, # 'Ÿ' + 160: 251, # '\xa0' + 161: 253, # '¡' + 162: 253, # '¢' + 163: 253, # '£' + 164: 253, # '¤' + 165: 253, # '¥' + 166: 253, # '¦' + 167: 253, # '§' + 168: 253, # '¨' + 169: 253, # '©' + 170: 255, # 'ª' + 171: 253, # '«' + 172: 253, # '¬' + 173: 251, # '\xad' + 174: 253, # '®' + 175: 253, # '¯' 
+ 176: 253, # '°' + 177: 253, # '±' + 178: 252, # '²' + 179: 252, # '³' + 180: 253, # '´' + 181: 255, # 'µ' + 182: 253, # '¶' + 183: 253, # '·' + 184: 253, # '¸' + 185: 252, # '¹' + 186: 255, # 'º' + 187: 253, # '»' + 188: 252, # '¼' + 189: 252, # '½' + 190: 252, # '¾' + 191: 253, # '¿' + 192: 255, # 'À' + 193: 255, # 'Á' + 194: 255, # 'Â' + 195: 255, # 'Ă' + 196: 255, # 'Ä' + 197: 255, # 'Å' + 198: 255, # 'Æ' + 199: 255, # 'Ç' + 200: 255, # 'È' + 201: 255, # 'É' + 202: 255, # 'Ê' + 203: 255, # 'Ë' + 204: 253, # '̀' + 205: 255, # 'Í' + 206: 255, # 'Î' + 207: 255, # 'Ï' + 208: 255, # 'Đ' + 209: 255, # 'Ñ' + 210: 253, # '̉' + 211: 255, # 'Ó' + 212: 255, # 'Ô' + 213: 255, # 'Ơ' + 214: 255, # 'Ö' + 215: 253, # '×' + 216: 255, # 'Ø' + 217: 255, # 'Ù' + 218: 255, # 'Ú' + 219: 255, # 'Û' + 220: 255, # 'Ü' + 221: 255, # 'Ư' + 222: 253, # '̃' + 223: 255, # 'ß' + 224: 255, # 'à' + 225: 255, # 'á' + 226: 255, # 'â' + 227: 255, # 'ă' + 228: 255, # 'ä' + 229: 255, # 'å' + 230: 255, # 'æ' + 231: 255, # 'ç' + 232: 255, # 'è' + 233: 255, # 'é' + 234: 255, # 'ê' + 235: 255, # 'ë' + 236: 253, # '́' + 237: 255, # 'í' + 238: 255, # 'î' + 239: 255, # 'ï' + 240: 255, # 'đ' + 241: 255, # 'ñ' + 242: 253, # '̣' + 243: 255, # 'ó' + 244: 255, # 'ô' + 245: 255, # 'ơ' + 246: 255, # 'ö' + 247: 253, # '÷' + 248: 255, # 'ø' + 249: 255, # 'ù' + 250: 255, # 'ú' + 251: 255, # 'û' + 252: 255, # 'ü' + 253: 255, # 'ư' + 254: 253, # '₫' + 255: 255, # 'ÿ' } -WINDOWS_1258_VIETNAMESE_MODEL = SingleByteCharSetModel(charset_name='WINDOWS-1258', - language='Vietnamese', - char_to_order_map=WINDOWS_1258_VIETNAMESE_CHAR_TO_ORDER, - language_model=VIETNAMESE_LANG_MODEL, - typical_positive_ratio=0.5195965593221449, - keep_ascii_letters=False, - alphabet='ABCDEGHIKLMNOPQRSTUVXYabcdeghiklmnopqrstuvxyÂÊÔâêôĂăĐđƠơƯư') - +WINDOWS_1258_VIETNAMESE_MODEL = SingleByteCharSetModel( + charset_name="WINDOWS-1258", + language="Vietnamese", + char_to_order_map=WINDOWS_1258_VIETNAMESE_CHAR_TO_ORDER, + 
language_model=VIETNAMESE_LANG_MODEL, + typical_positive_ratio=0.5195965593221449, + keep_ascii_letters=False, + alphabet="ABCDEGHIKLMNOPQRSTUVXYabcdeghiklmnopqrstuvxyÂÊÔâêôĂăĐđƠơƯư", +) diff --git a/chardet/sbcsgroupprober.py b/chardet/sbcsgroupprober.py index 520e60e0..a31e22f3 100644 --- a/chardet/sbcsgroupprober.py +++ b/chardet/sbcsgroupprober.py @@ -29,8 +29,8 @@ from .charsetgroupprober import CharSetGroupProber from .hebrewprober import HebrewProber from .langarabicmodel import ( - CP864_ARABIC_MODEL, CP720_ARABIC_MODEL, + CP864_ARABIC_MODEL, ISO_8859_6_ARABIC_MODEL, WINDOWS_1256_ARABIC_MODEL, ) diff --git a/create_language_model.py b/create_language_model.py index a9c83c93..f6cfdf42 100755 --- a/create_language_model.py +++ b/create_language_model.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- ######################## BEGIN LICENSE BLOCK ######################## # Contributor(s): @@ -27,33 +26,29 @@ Create a language model for single byte character encoding detection based on the given file(s). 
""" -from __future__ import absolute_import, print_function - import os import re import sys import unicodedata -from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter +from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser from collections import Counter, defaultdict from functools import partial -from io import open from multiprocessing import Pool from operator import itemgetter from string import ascii_letters try: from mediawiki import MediaWiki + HAVE_WIKIPEDIA = True except: HAVE_WIKIPEDIA = False from chardet import __version__ -from chardet.compat import iteritems -from chardet.metadata.languages import LANGUAGES from chardet.enums import CharacterCategory, SequenceLikelihood +from chardet.metadata.languages import LANGUAGES from chardet.sbcharsetprober import SingleByteCharSetModel - # Turn ascii_letters into a set to make other ops easier ascii_letters = set(ascii_letters) @@ -63,37 +58,39 @@ def normalize_name(charset_name): # Title case to start charset_name = charset_name.upper() # Underscores instead of hyphens - charset_name = charset_name.replace('-', '_') + charset_name = charset_name.replace("-", "_") return charset_name -def unicode_to_category(unicode_char, char_ranks, keep_ascii_letters=False, - alphabet=None): - """Convert a Unicode character to categories used by SingleByteCharSetProber - """ + +def unicode_to_category( + unicode_char, char_ranks, keep_ascii_letters=False, alphabet=None +): + """Convert a Unicode character to categories used by SingleByteCharSetProber""" if alphabet is None: alphabet = set() valid_letters = (alphabet | ascii_letters) if keep_ascii_letters else alphabet unicode_cat = unicodedata.category(unicode_char) - if unicode_cat.startswith('N'): + if unicode_cat.startswith("N"): ret_val = CharacterCategory.DIGIT # Valid letters have their category set to their order/rank - elif unicode_cat.startswith('L'): + elif unicode_cat.startswith("L"): if unicode_char in valid_letters: ret_val = 
char_ranks.get(unicode_char, CharacterCategory.UNDEFINED) else: ret_val = CharacterCategory.UNDEFINED - elif unicode_char in ('\r', '\n'): + elif unicode_char in ("\r", "\n"): ret_val = CharacterCategory.LINE_BREAK # Punctuation, Symbols, and Marks are all symbols as far as we care - elif unicode_cat.startswith(('P', 'S', 'M')): + elif unicode_cat.startswith(("P", "S", "M")): ret_val = CharacterCategory.SYMBOL else: ret_val = CharacterCategory.CONTROL return ret_val -def get_charset_mappings(charset_name, char_ranks, keep_ascii_letters=False, - alphabet=None): +def get_charset_mappings( + charset_name, char_ranks, keep_ascii_letters=False, alphabet=None +): """Returns `charset_categories` & `charset_code_points` mappings for charset `charset_categories` maps from bytes in charset/encoding to categories @@ -115,10 +112,12 @@ def get_charset_mappings(charset_name, char_ranks, keep_ascii_letters=False, char = bytes(bytearray((byte_hex,))) try: unicode_char = char.decode(charset_name) - char_cat = unicode_to_category(unicode_char, - char_ranks, - keep_ascii_letters=keep_ascii_letters, - alphabet=alphabet) + char_cat = unicode_to_category( + unicode_char, + char_ranks, + keep_ascii_letters=keep_ascii_letters, + alphabet=alphabet, + ) charset_code_points[unicode_char] = char except UnicodeDecodeError: char_cat = CharacterCategory.UNDEFINED @@ -129,13 +128,20 @@ def get_charset_mappings(charset_name, char_ranks, keep_ascii_letters=False, def gen_input_lines(input_paths, input_encoding): """Yield decoded lines from files in input_paths""" for input_path in input_paths: - with open(input_path, 'r', encoding=input_encoding) as input_file: - for line in input_file: - yield line - - -def gen_wiki_lines(titles, language, max_depth, max_pages=None, depth=0, - visited_pages=None, skipped_pages=None, wikipedia=None): + with open(input_path, encoding=input_encoding) as input_file: + yield from input_file + + +def gen_wiki_lines( + titles, + language, + max_depth, + max_pages=None, 
+ depth=0, + visited_pages=None, + skipped_pages=None, + wikipedia=None, +): """Generate lines from Wikipedia articles, starting with titles. Will crawl at most `max_depth` deep in the page hierarchy. @@ -147,9 +153,11 @@ def gen_wiki_lines(titles, language, max_depth, max_pages=None, depth=0, skipped_pages = set() if not titles or depth > max_depth or len(visited_pages) > max_pages: - print('Visited {} pages: {} ({} skipped)'.format(language, - len(visited_pages), - len(skipped_pages))) + print( + "Visited {} pages: {} ({} skipped)".format( + language, len(visited_pages), len(skipped_pages) + ) + ) return # Visit all pages in titles and add their links to next_titles @@ -157,10 +165,12 @@ def gen_wiki_lines(titles, language, max_depth, max_pages=None, depth=0, for title in titles: if title in visited_pages or title in skipped_pages: continue - print('Visited {} pages: {} ({} skipped)'.format(language, - len(visited_pages), - len(skipped_pages)), - end='\r') + print( + "Visited {} pages: {} ({} skipped)".format( + language, len(visited_pages), len(skipped_pages) + ), + end="\r", + ) sys.stdout.flush() if len(visited_pages) == max_pages: break @@ -168,21 +178,20 @@ def gen_wiki_lines(titles, language, max_depth, max_pages=None, depth=0, page = wikipedia.page(title, auto_suggest=False) # Remove Wikipedia markup (this is inside of try block because it # does an implicit request to Wikipedia to get the content) - content = re.sub(r'(=+) *([^=]+) *\1', r'\2', page.content) + content = re.sub(r"(=+) *([^=]+) *\1", r"\2", page.content) # Clean up repeated whitespace, since that could skew model - content = re.sub(r'(\s)\1+', r'\1', content) + content = re.sub(r"(\s)\1+", r"\1", content) except: if depth > 0: skipped_pages.add(title) continue else: - print('Failed to visit start page:') + print("Failed to visit start page:") raise visited_pages.add(title) - for line in content.splitlines(True): - yield line + yield from content.splitlines(True) # Sometimes things go wrong 
when extracting the links try: @@ -191,16 +200,21 @@ def gen_wiki_lines(titles, language, max_depth, max_pages=None, depth=0, continue # Recursive generators are fun - for line in gen_wiki_lines(next_titles, language, max_depth, - depth=depth + 1, - visited_pages=visited_pages, - skipped_pages=skipped_pages, - max_pages=max_pages, - wikipedia=wikipedia): - yield line + yield from gen_wiki_lines( + next_titles, + language, + max_depth, + depth=depth + 1, + visited_pages=visited_pages, + skipped_pages=skipped_pages, + max_pages=max_pages, + wikipedia=wikipedia, + ) -def calc_ngram_freqs(input_generator, alphabet, keep_ascii_letters, save_training_data, training_path): +def calc_ngram_freqs( + input_generator, alphabet, keep_ascii_letters, save_training_data, training_path +): """Create a language model with the likelihoods of all bigrams in input. This LM is based on Unicode code point frequencies and not encoded character @@ -216,17 +230,17 @@ def calc_ngram_freqs(input_generator, alphabet, keep_ascii_letters, save_trainin num_bigrams = 0 size_in_bytes = 0 if save_training_data: - training_output_file = open(training_path, 'w', encoding='utf-8') + training_output_file = open(training_path, "w", encoding="utf-8") # Calculate unfiltered frequencies for line in input_generator: prev_char = None # Normalize so that combining and non-combining forms are # counted as the same, and because this is meant for single-byte # encodings, which don't support combining forms - line = unicodedata.normalize('NFC', line) + line = unicodedata.normalize("NFC", line) if save_training_data: print(line, file=training_output_file) - size_in_bytes += len(line.encode('utf-8')) + size_in_bytes += len(line.encode("utf-8")) for unicode_char in line: # Skip ASCII letters if we're supposed to if not keep_ascii_letters and unicode_char in ascii_letters: @@ -242,23 +256,21 @@ def calc_ngram_freqs(input_generator, alphabet, keep_ascii_letters, save_trainin num_tokens = sum(char_freqs.values()) 
print( ( - '\nUnique character types in training data: {:,}\n' - 'Number of character tokens in training data: {:,}\n' - 'Size of training data in bytes: {:,}' + "\nUnique character types in training data: {:,}\n" + "Number of character tokens in training data: {:,}\n" + "Size of training data in bytes: {:,}" ).format( - len(char_freqs), - num_tokens, - size_in_bytes, + len(char_freqs), num_tokens, size_in_bytes, ) ) - min_alpha_freq = min(freq for unicode_char, freq in char_freqs.items() - if unicode_char in alphabet) + min_alpha_freq = min( + freq for unicode_char, freq in char_freqs.items() if unicode_char in alphabet + ) # Filter language model down to only those within sample size - for rank, (unicode_char, freq) in enumerate(sorted(list(char_freqs.items()), - key=itemgetter(1), - reverse=True), - 1): + for rank, (unicode_char, freq) in enumerate( + sorted(list(char_freqs.items()), key=itemgetter(1), reverse=True), 1 + ): if rank >= CharacterCategory.CONTROL or freq < min_alpha_freq: del char_freqs[unicode_char] language_model.pop(unicode_char, None) @@ -300,18 +312,25 @@ def collapse_language_model_freqs(language_model, sequence_count_threshold): def flatten_language_model(language_model): """Yield items from model as (count, first_char, second_char) tuples""" - for first_char, sub_dict in iteritems(language_model): - for second_char, count in iteritems(sub_dict): + for first_char, sub_dict in language_model.items(): + for second_char, count in sub_dict.items(): yield count, first_char, second_char -def generate_sbcs_model(charset_name, language, language_model, num_bigrams, - char_ranks, keep_ascii_letters, alphabet): +def generate_sbcs_model( + charset_name, + language, + language_model, + num_bigrams, + char_ranks, + keep_ascii_letters, + alphabet, +): """Create a SingleByteCharSetModel object representing the charset.""" # Setup tables necessary for computing transition frequencies for model - char_to_order, charset_code_points = 
get_charset_mappings(charset_name, - char_ranks, - keep_ascii_letters) + char_to_order, charset_code_points = get_charset_mappings( + charset_name, char_ranks, keep_ascii_letters + ) # Calculate positive ratio for charset by counting positive likelihood # bigrams where both characters are in charset @@ -320,254 +339,317 @@ def generate_sbcs_model(charset_name, language, language_model, num_bigrams, # Collapse bigram frequencies to SequenceLikelihood categories for rank, (count, first_char, second_char) in enumerate(sorted_lm, 1): - if rank <= 512 and (first_char in charset_code_points and - second_char in charset_code_points): + if rank <= 512 and ( + first_char in charset_code_points and second_char in charset_code_points + ): pos_count += count pos_ratio = (pos_count / num_bigrams) if num_bigrams else 0 - curr_model = SingleByteCharSetModel(charset_name=charset_name, - language=language, - char_to_order_map=char_to_order, - # language_model is filled in later - language_model=None, - typical_positive_ratio=pos_ratio, - keep_ascii_letters=keep_ascii_letters, - alphabet=alphabet) + curr_model = SingleByteCharSetModel( + charset_name=charset_name, + language=language, + char_to_order_map=char_to_order, + # language_model is filled in later + language_model=None, + typical_positive_ratio=pos_ratio, + keep_ascii_letters=keep_ascii_letters, + alphabet=alphabet, + ) return curr_model def print_char_to_order(var_name, order_map, charset_name, output_file): - print('{} = {{'.format(var_name), file=output_file) - for char, order in sorted(iteritems(order_map)): + print(f"{var_name} = {{", file=output_file) + for char, order in sorted(order_map.items()): char_bytes = bytes(bytearray((char,))) try: unicode_char = char_bytes.decode(charset_name) except UnicodeError: unicode_char = None - print(' {!r}: {!r}, # {!r}'.format(char, order, unicode_char), - file=output_file) - print('}\n', file=output_file) + print( + f" {char!r}: {order!r}, # {unicode_char!r}", file=output_file, + 
) + print("}\n", file=output_file) def print_language_model(var_name, language_model, output_file, char_ranks): - print('# 3: Positive\n' - '# 2: Likely\n' - '# 1: Unlikely\n' - '# 0: Negative\n', - file=output_file) - print('{} = {{'.format(var_name), file=output_file) - for first_char, sub_dict in sorted(iteritems(language_model)): + print( + "# 3: Positive\n" "# 2: Likely\n" "# 1: Unlikely\n" "# 0: Negative\n", + file=output_file, + ) + print(f"{var_name} = {{", file=output_file) + for first_char, sub_dict in sorted(language_model.items()): # Skip empty sub_dicts if not sub_dict: continue - print(' {!r}: {{ # {!r}'.format(char_ranks[first_char], first_char), - file=output_file) - for second_char, likelihood in sorted(iteritems(sub_dict)): - print(' {!r}: {!r}, # {!r}'.format(char_ranks[second_char], - likelihood, - second_char), - file=output_file) - print(' },', file=output_file) - print('}\n', file=output_file) - - -def train_model_for_lang(language, depth=None, input_encoding=None, - input_paths=None, sequence_count_threshold=None, - max_pages=None): + print( + " {!r}: {{ # {!r}".format(char_ranks[first_char], first_char), + file=output_file, + ) + for second_char, likelihood in sorted(sub_dict.items()): + print( + " {!r}: {!r}, # {!r}".format( + char_ranks[second_char], likelihood, second_char + ), + file=output_file, + ) + print(" },", file=output_file) + print("}\n", file=output_file) + + +def train_model_for_lang( + language, + depth=None, + input_encoding=None, + input_paths=None, + sequence_count_threshold=None, + max_pages=None, +): """Train a SingleByteCharSetModel for the given language and settings""" # Validate language language = language.title() lang_metadata = LANGUAGES.get(language) if not lang_metadata: - raise ValueError('Unknown language: {}. If you are adding a model for a' - ' new language, you must first update metadata/' - 'languages.py'.format(language)) + raise ValueError( + "Unknown language: {}. 
If you are adding a model for a" + " new language, you must first update metadata/" + "languages.py".format(language) + ) print( - '\n{}\n----------------------------------------------------------------\n' - 'Keep ASCII Letters: {}\n' - 'Alphabet: {}\n' - 'Unlikely Sequence Count Threshold: {}\n'.format( + "\n{}\n----------------------------------------------------------------\n" + "Keep ASCII Letters: {}\n" + "Alphabet: {}\n" + "Unlikely Sequence Count Threshold: {}\n".format( language, lang_metadata.use_ascii, lang_metadata.alphabet, - sequence_count_threshold - ) + + sequence_count_threshold, + ) + + # Do this before other branch to increase chance that this header gets # spit out together when using multiprocessing ( - 'Input Encoding: {}'.format(input_encoding) if input_paths - else 'Wikipedia Depth: {}'.format(depth) + f"Input Encoding: {input_encoding}" + if input_paths + else f"Wikipedia Depth: {depth}" ) ) # See if we're doing file-based or wiki-based training if input_paths: # Check that files are big enough before doing anything else - data_size = sum(os.path.getsize(input_path) - for input_path in input_paths) + data_size = sum(os.path.getsize(input_path) for input_path in input_paths) if data_size < 10000000: - raise ValueError('Input files must be at least 10MB to train a ' - 'decent model. You only provided {} bytes.' - .format(data_size)) + raise ValueError( + "Input files must be at least 10MB to train a " + "decent model. 
You only provided {} bytes.".format(data_size) + ) input_gen = gen_input_lines(input_paths, input_encoding) - data_size_str = '{} bytes of'.format(data_size) + data_size_str = f"{data_size} bytes of" else: if not HAVE_WIKIPEDIA: - raise ValueError('The pymediawiki Python package could not be ' - 'imported, so you must either specify input files ' - 'to use for training, or install it with pip.') + raise ValueError( + "The pymediawiki Python package could not be " + "imported, so you must either specify input files " + "to use for training, or install it with pip." + ) wikipedia = MediaWiki(lang=lang_metadata.iso_code) - input_gen = gen_wiki_lines(lang_metadata.wiki_start_pages, - lang_metadata.iso_code, depth, - max_pages=max_pages, - wikipedia=wikipedia) - data_size_str = 'Wikipedia' - - print('\nCreating character frequency tables for {} from {} training data' - .format(language, data_size_str)) + input_gen = gen_wiki_lines( + lang_metadata.wiki_start_pages, + lang_metadata.iso_code, + depth, + max_pages=max_pages, + wikipedia=wikipedia, + ) + data_size_str = "Wikipedia" + + print( + "\nCreating character frequency tables for {} from {} training data".format( + language, data_size_str + ) + ) sys.stdout.flush() - char_ranks, language_model, num_bigrams = calc_ngram_freqs(input_gen, - lang_metadata.alphabet, - lang_metadata.use_ascii, - not input_paths, - 'wiki_{}.txt'.format(language)) + char_ranks, language_model, num_bigrams = calc_ngram_freqs( + input_gen, + lang_metadata.alphabet, + lang_metadata.use_ascii, + not input_paths, + f"wiki_{language}.txt", + ) # Create char-to-order maps (aka char-to-rank dicts) charset_models = {} for charset_name in lang_metadata.charsets: - print('Creating charset model for {}'.format(charset_name)) + print(f"Creating charset model for {charset_name}") sys.stdout.flush() - charset_models[charset_name] = generate_sbcs_model(charset_name, - language, - language_model, - num_bigrams, - char_ranks, - lang_metadata.use_ascii, - 
lang_metadata.alphabet) + charset_models[charset_name] = generate_sbcs_model( + charset_name, + language, + language_model, + num_bigrams, + char_ranks, + lang_metadata.use_ascii, + lang_metadata.alphabet, + ) # Collapse language model freqs to SequenceLikelihood values after # calculating positive ratio for each charset collapse_language_model_freqs(language_model, sequence_count_threshold) # Write output files - print('Writing output file for {}\n\n'.format(language)) + print(f"Writing output file for {language}\n\n") sys.stdout.flush() - with open('lang{}model.py'.format(language.lower()), 'w') as output_file: + with open(f"lang{language.lower()}model.py", "w") as output_file: upper_lang = language.upper() # print header to set encoding - print('#!/usr/bin/env python\n' - '# -*- coding: utf-8 -*-\n\n' - 'from chardet.sbcharsetprober import SingleByteCharSetModel\n\n', - file=output_file) + print( + "from chardet.sbcharsetprober import SingleByteCharSetModel\n\n", + file=output_file, + ) - lm_name = '{}_LANG_MODEL'.format(upper_lang) + lm_name = f"{upper_lang}_LANG_MODEL" print_language_model(lm_name, language_model, output_file, char_ranks) - print('# 255: Undefined characters that did not exist in training text\n' - '# 254: Carriage/Return\n' - '# 253: symbol (punctuation) that does not belong to word\n' - '# 252: 0 - 9\n' - '# 251: Control characters\n\n' - '# Character Mapping Table(s):', - file=output_file) - for charset_name, sbcs_model in iteritems(charset_models): + print( + "# 255: Undefined characters that did not exist in training text\n" + "# 254: Carriage/Return\n" + "# 253: symbol (punctuation) that does not belong to word\n" + "# 252: 0 - 9\n" + "# 251: Control characters\n\n" + "# Character Mapping Table(s):", + file=output_file, + ) + for charset_name, sbcs_model in charset_models.items(): normal_name = normalize_name(charset_name) - char_to_order_name = ('{}_{}_CHAR_TO_ORDER'.format(normal_name, - upper_lang)) - 
def main():
    """Parse the command line and train a model for each requested language.

    Languages are trained in a ``multiprocessing`` pool when both
    ``--parallel_langs`` and the number of requested languages are greater
    than one; otherwise they are trained one after another in this process.

    Raises:
        ValueError: If input files are given together with more than one
            language, or if no input files are given and the pymediawiki
            package could not be imported.
        Exception: Any error raised while training a language is propagated,
            including errors raised inside pool workers.
    """
    parser = ArgumentParser(
        description=__doc__, formatter_class=ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "language",
        help=(
            "The name of the language the input documents are in. Also the "
            "name of the language the generated model will detect. If no "
            "language is specified, models for all languages known to "
            "chardet will be trained."
        ),
        nargs="*",
        default=sorted(LANGUAGES.keys()),
    )
    parser.add_argument(
        "-d",
        "--depth",
        help="Maximum depth to crawl Wikipedia articles for training data.",
        type=int,
        default=2,
    )
    parser.add_argument(
        "-e",
        "--input_encoding",
        help=(
            "Encoding the input files are in. Does not need to match "
            "CHARSET_NAME."
        ),
        default="UTF-8",
    )
    parser.add_argument(
        "-i",
        "--input_files",
        help=(
            "File to use to train language model. If no files are "
            "specified, will crawl Wikipedia for training data."
        ),
        nargs="*",
        dest="input_paths",
    )
    parser.add_argument(
        "-m",
        "--max_pages",
        help="Maximum number of Wikipedia pages to crawl per language.",
        type=int,
        default=20000,
    )
    parser.add_argument(
        "-p",
        "--parallel_langs",
        help="Number of languages models to train at once.",
        type=int,
        default=8,
    )
    parser.add_argument(
        "-t",
        "--sequence_count_threshold",
        help=(
            "Minimum number of times a particular two-character sequence "
            "must have occurred in the input files in order to be "
            "considered unlikely (instead of illegal)."
        ),
        type=int,
        default=3,
    )
    parser.add_argument("--version", action="version", version=__version__)
    args = parser.parse_args()

    # Make sure we aren't trying to do anything weird: explicit input files
    # cannot be combined with several languages in one run.
    if len(args.language) > 1 and args.input_paths:
        raise ValueError(
            "Specifying input paths is not valid when training"
            " models for multiple languages at the same time. "
            " This only works for Wikipedia training."
        )
    # With no input files the training data is crawled from Wikipedia, which
    # requires the optional pymediawiki package.
    if not HAVE_WIKIPEDIA and not args.input_paths:
        raise ValueError(
            "The pymediawiki Python package could not be "
            "imported, so you must either specify input files "
            "to use for training, or install it with pip."
        )

    # Bind the shared options once so the parallel and sequential paths are
    # guaranteed to train with identical settings.
    train = partial(
        train_model_for_lang,
        depth=args.depth,
        input_encoding=args.input_encoding,
        input_paths=args.input_paths,
        sequence_count_threshold=args.sequence_count_threshold,
        max_pages=args.max_pages,
    )

    # Only create multiprocessing pool if doing things in parallel, otherwise
    # it's harder to debug
    if args.parallel_langs > 1 and len(args.language) > 1:
        pool = Pool(args.parallel_langs)
        pending = pool.map_async(train, args.language)
        pool.close()
        pool.join()
        # map_async() only stores a worker's exception on the AsyncResult;
        # without get() a failed language would go completely unnoticed.
        # All work has finished by now, so this returns or raises at once.
        pending.get()
    else:
        for language in args.language:
            train(language)


if __name__ == "__main__":
    main()