Skip to content

Commit

Permalink
Add language property to probers (#108)
Browse files Browse the repository at this point in the history
  • Loading branch information
dan-blanchard committed Apr 10, 2017
1 parent 93b7c80 commit 9ce79eb
Show file tree
Hide file tree
Showing 24 changed files with 147 additions and 57 deletions.
4 changes: 4 additions & 0 deletions chardet/big5prober.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,7 @@ def __init__(self):
@property
def charset_name(self):
    """Return the charset name reported by this prober: "Big5"."""
    return "Big5"

@property
def language(self):
    """Return the language reported by this prober: "Chinese"."""
    return "Chinese"
10 changes: 9 additions & 1 deletion chardet/charsetgroupprober.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,14 @@ def charset_name(self):
return None
return self._best_guess_prober.charset_name

@property
def language(self):
    """Return the language of the best-guess prober, or None if there is none.

    When no best guess has been recorded yet, get_confidence() is called
    first so that it gets a chance to select one.
    """
    if not self._best_guess_prober:
        # get_confidence() records the winning prober in _best_guess_prober.
        self.get_confidence()
        if not self._best_guess_prober:
            return None
    return self._best_guess_prober.language

def feed(self, byte_str):
for prober in self.probers:
if not prober:
Expand Down Expand Up @@ -89,7 +97,7 @@ def get_confidence(self):
self.logger.debug('%s not active', prober.charset_name)
continue
conf = prober.get_confidence()
self.logger.debug('%s confidence = %s', prober.charset_name, conf)
self.logger.debug('%s %s confidence = %s', prober.charset_name, prober.language, conf)
if best_conf < conf:
best_conf = conf
self._best_guess_prober = prober
Expand Down
4 changes: 4 additions & 0 deletions chardet/codingstatemachine.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,7 @@ def get_current_charlen(self):

def get_coding_state_machine(self):
    """Return the 'name' entry of this state machine's model."""
    return self._model['name']

@property
def language(self):
    """Return the 'language' entry of this state machine's model.

    Raises KeyError if the model has no 'language' key.
    """
    return self._model['language']
4 changes: 4 additions & 0 deletions chardet/cp949prober.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,7 @@ def __init__(self):
@property
def charset_name(self):
    """Return the charset name reported by this prober: "CP949"."""
    return "CP949"

@property
def language(self):
    """Return the language reported by this prober: "Korean"."""
    return "Korean"
7 changes: 7 additions & 0 deletions chardet/escprober.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def __init__(self, lang_filter=None):
self.coding_sm.append(CodingStateMachine(ISO2022KR_SM_MODEL))
self.active_sm_count = None
self._detected_charset = None
self._detected_language = None
self._state = None
self.reset()

Expand All @@ -63,11 +64,16 @@ def reset(self):
coding_sm.reset()
self.active_sm_count = len(self.coding_sm)
self._detected_charset = None
self._detected_language = None

@property
def charset_name(self):
    """Return the detected charset name, or None if nothing was detected yet."""
    return self._detected_charset

@property
def language(self):
    """Return the detected language, or None if nothing was detected yet."""
    return self._detected_language

def get_confidence(self):
if self._detected_charset:
return 0.99
Expand All @@ -89,6 +95,7 @@ def feed(self, byte_str):
elif coding_state == MachineState.ITS_ME:
self._state = ProbingState.FOUND_IT
self._detected_charset = coding_sm.get_coding_state_machine()
self._detected_language = coding_sm.language
return self.state

return self.state
12 changes: 8 additions & 4 deletions chardet/escsm.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,8 @@
'class_factor': 6,
'state_table': HZ_ST,
'char_len_table': HZ_CHAR_LEN_TABLE,
'name': "HZ-GB-2312"}
'name': "HZ-GB-2312",
'language': 'Chinese'}

ISO2022CN_CLS = (
2,0,0,0,0,0,0,0, # 00 - 07
Expand Down Expand Up @@ -131,7 +132,8 @@
'class_factor': 9,
'state_table': ISO2022CN_ST,
'char_len_table': ISO2022CN_CHAR_LEN_TABLE,
'name': "ISO-2022-CN"}
'name': "ISO-2022-CN",
'language': 'Chinese'}

ISO2022JP_CLS = (
2,0,0,0,0,0,0,0, # 00 - 07
Expand Down Expand Up @@ -186,7 +188,8 @@
'class_factor': 10,
'state_table': ISO2022JP_ST,
'char_len_table': ISO2022JP_CHAR_LEN_TABLE,
'name': "ISO-2022-JP"}
'name': "ISO-2022-JP",
'language': 'Japanese'}

ISO2022KR_CLS = (
2,0,0,0,0,0,0,0, # 00 - 07
Expand Down Expand Up @@ -237,6 +240,7 @@
'class_factor': 6,
'state_table': ISO2022KR_ST,
'char_len_table': ISO2022KR_CHAR_LEN_TABLE,
'name': "ISO-2022-KR"}
'name': "ISO-2022-KR",
'language': 'Korean'}


8 changes: 6 additions & 2 deletions chardet/eucjpprober.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,17 @@ def reset(self):
def charset_name(self):
return "EUC-JP"

@property
def language(self):
    """Return the language reported by this prober: "Japanese"."""
    return "Japanese"

def feed(self, byte_str):
for i in range(len(byte_str)):
# PY3K: byte_str is a byte array, so byte_str[i] is an int, not a byte
coding_state = self.coding_sm.next_state(byte_str[i])
if coding_state == MachineState.ERROR:
self.logger.debug('%s prober hit error at byte %s',
self.charset_name, i)
self.logger.debug('%s %s prober hit error at byte %s',
self.charset_name, self.language, i)
self._state = ProbingState.NOT_ME
break
elif coding_state == MachineState.ITS_ME:
Expand Down
4 changes: 4 additions & 0 deletions chardet/euckrprober.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,7 @@ def __init__(self):
@property
def charset_name(self):
    """Return the charset name reported by this prober: "EUC-KR"."""
    return "EUC-KR"

@property
def language(self):
    """Return the language reported by this prober: "Korean"."""
    return "Korean"
4 changes: 4 additions & 0 deletions chardet/euctwprober.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,7 @@ def __init__(self):
@property
def charset_name(self):
    """Return the charset name reported by this prober: "EUC-TW"."""
    return "EUC-TW"

@property
def language(self):
    """Return the language value reported by this prober: "Taiwan"."""
    return "Taiwan"
4 changes: 4 additions & 0 deletions chardet/gb2312prober.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,7 @@ def __init__(self):
@property
def charset_name(self):
    """Return the charset name reported by this prober: "GB2312"."""
    return "GB2312"

@property
def language(self):
    """Return the language reported by this prober: "Chinese"."""
    return "Chinese"
4 changes: 4 additions & 0 deletions chardet/hebrewprober.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,10 @@ def charset_name(self):
# Logical.
return self.LOGICAL_HEBREW_NAME

@property
def language(self):
    """Return the language reported by this prober: 'Hebrew'."""
    return 'Hebrew'

@property
def state(self):
# Remain active as long as any of the model probers are active.
Expand Down
9 changes: 4 additions & 5 deletions chardet/langbulgarianmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,16 +214,15 @@
'precedence_matrix': BulgarianLangModel,
'typical_positive_ratio': 0.969392,
'keep_english_letter': False,
'charset_name': "ISO-8859-5"
'charset_name': "ISO-8859-5",
'language': 'Bulgarian',
}

Win1251BulgarianModel = {
'char_to_order_map': win1251BulgarianCharToOrderMap,
'precedence_matrix': BulgarianLangModel,
'typical_positive_ratio': 0.969392,
'keep_english_letter': False,
'charset_name': "windows-1251"
'charset_name': "windows-1251",
'language': 'Bulgarian',
}



20 changes: 12 additions & 8 deletions chardet/langcyrillicmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,47 +283,51 @@
'precedence_matrix': RussianLangModel,
'typical_positive_ratio': 0.976601,
'keep_english_letter': False,
'charset_name': "KOI8-R"
'charset_name': "KOI8-R",
'language': 'Russian',
}

Win1251CyrillicModel = {
'char_to_order_map': win1251_char_to_order_map,
'precedence_matrix': RussianLangModel,
'typical_positive_ratio': 0.976601,
'keep_english_letter': False,
'charset_name': "windows-1251"
'charset_name': "windows-1251",
'language': 'Russian',
}

Latin5CyrillicModel = {
'char_to_order_map': latin5_char_to_order_map,
'precedence_matrix': RussianLangModel,
'typical_positive_ratio': 0.976601,
'keep_english_letter': False,
'charset_name': "ISO-8859-5"
'charset_name': "ISO-8859-5",
'language': 'Russian',
}

MacCyrillicModel = {
'char_to_order_map': macCyrillic_char_to_order_map,
'precedence_matrix': RussianLangModel,
'typical_positive_ratio': 0.976601,
'keep_english_letter': False,
'charset_name': "MacCyrillic"
'charset_name': "MacCyrillic",
'language': 'Russian',
}

Ibm866Model = {
'char_to_order_map': IBM866_char_to_order_map,
'precedence_matrix': RussianLangModel,
'typical_positive_ratio': 0.976601,
'keep_english_letter': False,
'charset_name': "IBM866"
'charset_name': "IBM866",
'language': 'Russian',
}

Ibm855Model = {
'char_to_order_map': IBM855_char_to_order_map,
'precedence_matrix': RussianLangModel,
'typical_positive_ratio': 0.976601,
'keep_english_letter': False,
'charset_name': "IBM855"
'charset_name': "IBM855",
'language': 'Russian',
}


8 changes: 4 additions & 4 deletions chardet/langgreekmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,15 +211,15 @@
'precedence_matrix': GreekLangModel,
'typical_positive_ratio': 0.982851,
'keep_english_letter': False,
'charset_name': "ISO-8859-7"
'charset_name': "ISO-8859-7",
'language': 'Greek',
}

Win1253GreekModel = {
'char_to_order_map': win1253_char_to_order_map,
'precedence_matrix': GreekLangModel,
'typical_positive_ratio': 0.982851,
'keep_english_letter': False,
'charset_name': "windows-1253"
'charset_name': "windows-1253",
'language': 'Greek',
}


5 changes: 2 additions & 3 deletions chardet/langhebrewmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,6 @@
'precedence_matrix': HEBREW_LANG_MODEL,
'typical_positive_ratio': 0.984004,
'keep_english_letter': False,
'charset_name': "windows-1255"
'charset_name': "windows-1255",
'language': 'Hebrew',
}


8 changes: 4 additions & 4 deletions chardet/langhungarianmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,15 +211,15 @@
'precedence_matrix': HungarianLangModel,
'typical_positive_ratio': 0.947368,
'keep_english_letter': True,
'charset_name': "ISO-8859-2"
'charset_name': "ISO-8859-2",
'language': 'Hungarian',
}

Win1250HungarianModel = {
'char_to_order_map': win1250HungarianCharToOrderMap,
'precedence_matrix': HungarianLangModel,
'typical_positive_ratio': 0.947368,
'keep_english_letter': True,
'charset_name': "windows-1250"
'charset_name': "windows-1250",
'language': 'Hungarian',
}


5 changes: 2 additions & 3 deletions chardet/langthaimodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,6 @@
'precedence_matrix': ThaiLangModel,
'typical_positive_ratio': 0.926386,
'keep_english_letter': False,
'charset_name': "TIS-620"
'charset_name': "TIS-620",
'language': 'Thai',
}


3 changes: 2 additions & 1 deletion chardet/langturkishmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,5 +188,6 @@
'precedence_matrix': TurkishLangModel,
'typical_positive_ratio': 0.970290,
'keep_english_letter': True,
'charset_name': "ISO-8859-9"
'charset_name': "ISO-8859-9",
'language': 'Turkish',
}
4 changes: 4 additions & 0 deletions chardet/latin1prober.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,10 @@ def reset(self):
def charset_name(self):
return "ISO-8859-1"

@property
def language(self):
    """Return an empty string: this prober reports no specific language."""
    return ""

def feed(self, byte_str):
byte_str = self.filter_with_english_letters(byte_str)
for c in byte_str:
Expand Down
12 changes: 8 additions & 4 deletions chardet/mbcharsetprober.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,18 @@ def reset(self):

@property
def charset_name(self):
    """Report the charset name; not implemented at this level.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError

@property
def language(self):
    """Report the language; not implemented at this level.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError

def feed(self, byte_str):
for i in range(len(byte_str)):
coding_state = self.coding_sm.next_state(byte_str[i])
if coding_state == MachineState.ERROR:
self.logger.debug('%s prober hit error at byte %s',
self.charset_name, i)
self.logger.debug('%s %s prober hit error at byte %s',
self.charset_name, self.language, i)
self._state = ProbingState.NOT_ME
break
elif coding_state == MachineState.ITS_ME:
Expand All @@ -72,7 +76,7 @@ def feed(self, byte_str):
self.distribution_analyzer.feed(self._last_char, char_len)
else:
self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
char_len)
char_len)

self._last_char[0] = byte_str[-1]

Expand Down
9 changes: 7 additions & 2 deletions chardet/sbcharsetprober.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,6 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

from collections import namedtuple

from .charsetprober import CharSetProber
from .enums import CharacterCategory, ProbingState, SequenceLikelihood

Expand Down Expand Up @@ -69,6 +67,13 @@ def charset_name(self):
else:
return self._model['charset_name']

@property
def language(self):
    """Return the language for this prober.

    When a name prober is attached, its language is returned; otherwise
    the model's 'language' entry is used (None if the model has none).
    """
    if self._name_prober:
        return self._name_prober.language
    else:
        # .get() tolerates models that carry no 'language' key.
        return self._model.get('language')

def feed(self, byte_str):
if not self._model['keep_english_letter']:
byte_str = self.filter_international_words(byte_str)
Expand Down

0 comments on commit 9ce79eb

Please sign in to comment.