<a href="https://colab.research.google.com/github/entelecheia/ekorpkit-config/blob/main/notebooks/preprocessor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Compose an ekorpkit config

In [19]:
# %%capture
# %pip install ekorpkit[transformers,tokenize]
# Disabled setup cell: uncomment the two lines above to install ekorpkit with
# its `transformers` and `tokenize` extras into the kernel's environment while
# capturing the installer output. `%%capture` is a cell magic, so it has to
# remain the first line of the cell when enabled.

In [1]:
from ekorpkit import eKonf

## Instantiating the MeCab tokenizer

In [2]:
# Compose the config for the `preprocessor/tokenizer=mecab` group, pretty-print
# it for inspection, and instantiate the object the config describes.
config_group = "preprocessor/tokenizer=mecab"
cfg = eKonf.compose(config_group=config_group)
eKonf.pprint(cfg)
mecab = eKonf.instantiate(cfg)

{'_target_': 'ekorpkit.preprocessors.tokenizer.MecabTokenizer',
 'extract': {'no_space_for_non_nouns': False,
             'noun_postags': ['NNG', 'NNP', 'XSN', 'SL', 'XR', 'NNB', 'NR'],
             'postags': None,
             'stop_postags': ['SP'],
             'stopwords': None,
             'stopwords_path': None},
 'mecab': {'backend': 'mecab-python3', 'userdic_path': None, 'verbose': False},
 'normalize': None,
 'tokenize': {'concat_surface_and_pos': True,
              'flatten': True,
              'include_whitespace_token': True,
              'lowercase': False,
              'punct_postags': ['SF', 'SP', 'SSO', 'SSC', 'SY'],
              'tokenize_each_word': False,
              'userdic_path': None,
              'wordpieces_prefix': '##'},
 'tokenize_article': {'return_type': 'str', 'sentence_separator': '\\n'},
 'verbose': False}


[ekorpkit]: Initializing mecab...


In [4]:
# Sample sentence mixing Hangul, Latin acronyms, and Hanja.
text = "IMF가 推定한 우리나라의 GDP갭률은 今年에도 소폭의 마이너스(−)를 持續하고 있다."

# Tokenize through the explicit method ...
tokens = mecab.tokenize(text)
print(tokens)

# ... and by calling the tokenizer object directly; in the recorded run both
# calls printed the same token list.
called_tokens = mecab(text)
print(called_tokens)

['IMF/SL', '가/JKS', ' /SP', '推定/NNG', '한/XSA+ETM', ' /SP', '우리나라/NNG', '의/JKG', ' /SP', 'GDP/SL', '갭/NNG', '률/XSN', '은/JX', ' /SP', '今年/NNG', '에/JKB', '도/JX', ' /SP', '소폭/NNG', '의/JKG', ' /SP', '마이너스/NNG', '(/SSO', '−)/SY', '를/JKO', ' /SP', '持續/NNG', '하/XSV', '고/EC', ' /SP', '있/VX', '다/EF', './SF']
['IMF/SL', '가/JKS', ' /SP', '推定/NNG', '한/XSA+ETM', ' /SP', '우리나라/NNG', '의/JKG', ' /SP', 'GDP/SL', '갭/NNG', '률/XSN', '은/JX', ' /SP', '今年/NNG', '에/JKB', '도/JX', ' /SP', '소폭/NNG', '의/JKG', ' /SP', '마이너스/NNG', '(/SSO', '−)/SY', '를/JKO', ' /SP', '持續/NNG', '하/XSV', '고/EC', ' /SP', '있/VX', '다/EF', './SF']


In [4]:
# Compose the config for the `preprocessor/normalizer=formal_ko` group,
# pretty-print it, and instantiate the normalizer it describes.
config_group = "preprocessor/normalizer=formal_ko"
cfg_norm = eKonf.compose(config_group=config_group)
eKonf.pprint(cfg_norm)
norm = eKonf.instantiate(cfg_norm)

{'_target_': 'ekorpkit.preprocessors.normalizer.Normalizer',
 'ftfy': {'decode_inconsistent_utf8': True,
          'fix_c1_controls': True,
          'fix_character_width': True,
          'fix_encoding': True,
          'fix_latin_ligatures': True,
          'fix_line_breaks': True,
          'fix_surrogates': True,
          'max_decode_length': 1000000,
          'normalization': 'NFKC',
          'remove_control_chars': True,
          'remove_terminal_escapes': True,
          'replace_lossy_sequences': True,
          'restore_byte_a0': True,
          'uncurl_quotes': True,
          'unescape_html': True},
 'hanja2hangle': True,
 'num_repeats': 2,
 'spaces': {'collapse_whitespaces': True,
            'fix_whitespaces': True,
            'num_spaces_for_tab': 4,
            'replace_tabs': True,
            'strip': True},
 'special_characters': {'fix_ellipsis': True,
                        'fix_emoticons': False,
                        'fix_hyphens': True,
                   

In [5]:
# Run the normalizer on the sample sentence; the bare trailing expression is
# what the cell displays as its output.
normalized_text = norm(text)
normalized_text

'IMF가 추정한 우리나라의 GDP갭률은 금년에도 소폭의 마이너스(-)를 지속하고 있다.'

In [6]:
# Rebuild the tokenizer from a fresh config, this time passing the already
# instantiated normalizer to instantiate() through the `normalize` keyword,
# then tokenize the sample sentence with it.
config_group = "preprocessor/tokenizer=mecab"
cfg = eKonf.compose(config_group=config_group)
mecab = eKonf.instantiate(cfg, normalize=norm)

tokens = mecab.tokenize(text)
print(tokens)

[ekorpkit]: Initializing mecab...


['IMF/SL', '가/JKS', ' /SP', '추정/NNG', '한/XSA+ETM', ' /SP', '우리나라/NNG', '의/JKG', ' /SP', 'GDP/SL', '갭/NNG', '률/XSN', '은/JX', ' /SP', '금년/NNG', '에/JKB', '도/JX', ' /SP', '소폭/NNG', '의/JKG', ' /SP', '마이너스/NNG', '(/SSO', '-)/SY', '를/JKO', ' /SP', '지속/NNG', '하/XSV', '고/EC', ' /SP', '있/VX', '다/EF', './SF']


In [7]:
# Alternative wiring: place the normalizer *config* (not the instance) under
# the tokenizer config's `normalize` key before instantiating.
config_group = "preprocessor/tokenizer=mecab"
cfg = eKonf.compose(config_group=config_group)
cfg["normalize"] = cfg_norm
eKonf.pprint(cfg)

# Build the tokenizer from the combined config and tokenize the sample sentence.
mecab = eKonf.instantiate(cfg)
tokens = mecab.tokenize(text)
print(tokens)

{'_target_': 'ekorpkit.preprocessors.tokenizer.MecabTokenizer',
 'extract': {'no_space_for_non_nouns': False,
             'noun_postags': ['NNG', 'NNP', 'XSN', 'SL', 'XR', 'NNB', 'NR'],
             'postags': None,
             'stop_postags': ['SP'],
             'stopwords': None,
             'stopwords_path': None},
 'mecab': {'backend': 'mecab-python3', 'userdic_path': None, 'verbose': False},
 'normalize': {'_target_': 'ekorpkit.preprocessors.normalizer.Normalizer',
               'ftfy': {'decode_inconsistent_utf8': True,
                        'fix_c1_controls': True,
                        'fix_character_width': True,
                        'fix_encoding': True,
                        'fix_latin_ligatures': True,
                        'fix_line_breaks': True,
                        'fix_surrogates': True,
                        'max_decode_length': 1000000,
                        'normalization': 'NFKC',
                        'remove_control_chars': True,
        

[ekorpkit]: Initializing mecab...


['IMF/SL', '가/JKS', ' /SP', '추정/NNG', '한/XSA+ETM', ' /SP', '우리나라/NNG', '의/JKG', ' /SP', 'GDP/SL', '갭/NNG', '률/XSN', '은/JX', ' /SP', '금년/NNG', '에/JKB', '도/JX', ' /SP', '소폭/NNG', '의/JKG', ' /SP', '마이너스/NNG', '(/SSO', '-)/SY', '를/JKO', ' /SP', '지속/NNG', '하/XSV', '고/EC', ' /SP', '있/VX', '다/EF', './SF']


In [8]:
# Print the result of the tokenizer's nouns() call on the sample sentence.
nouns = mecab.nouns(text)
print(nouns)

['IMF', '추정', '우리나라', 'GDP', '갭', '률', '금년', '소폭', '마이너스', '지속']


In [9]:
# Print the result of the tokenizer's morphs() call on the sample sentence.
morphs = mecab.morphs(text)
print(morphs)

['IMF', '가', '추정', '한', '우리나라', '의', 'GDP', '갭', '률', '은', '금년', '에', '도', '소폭', '의', '마이너스', '(', '-)', '를', '지속', '하', '고', '있', '다', '.']


In [10]:
# Switch to a second sample sentence. This rebinds `text`, and the following
# cell tokenizes the same sentence with a different tokenizer config.
text = "금통위는 통화신용정책과 한국은행의 운영에 관한 의결권을 행사한다."
print(mecab.tokenize(text))

['금/MAJ', '통/MAG', '위/NNG', '는/JX', ' /SP', '통화/NNG', '신용/NNG', '정책/NNG', '과/JC', ' /SP', '한국은행/NNP', '의/JKG', ' /SP', '운영/NNG', '에/JKB', ' /SP', '관한/VV+ETM', ' /SP', '의결/NNG', '권/XSN', '을/JKO', ' /SP', '행사/NNG', '한다/XSV+EF', './SF']


In [11]:
# Compose the `preprocessor/tokenizer=mecab_econ` config, pretty-print it, and
# rebuild the tokenizer from it. In the recorded run the printed config carries
# a `userdic_path` under `mecab`, unlike the plain `mecab` config shown earlier.
config_group = "preprocessor/tokenizer=mecab_econ"
cfg = eKonf.compose(config_group=config_group)
eKonf.pprint(cfg)
mecab = eKonf.instantiate(cfg)

# Tokenize the current sample sentence with the rebuilt tokenizer.
tokens = mecab.tokenize(text)
print(tokens)

{'_target_': 'ekorpkit.preprocessors.tokenizer.MecabTokenizer',
 'extract': {'no_space_for_non_nouns': False,
             'noun_postags': ['NNG', 'NNP', 'XSN', 'SL', 'XR', 'NNB', 'NR'],
             'postags': None,
             'stop_postags': ['SP'],
             'stopwords': None,
             'stopwords_path': None},
 'mecab': {'backend': 'mecab-python3',
           'userdic_path': '/workspace/projects/ekorpkit/ekorpkit/resources/dictionaries/mecab/ekon_v1.dic',
           'verbose': False},
 'normalize': None,
 'tokenize': {'concat_surface_and_pos': True,
              'flatten': True,
              'include_whitespace_token': True,
              'lowercase': False,
              'punct_postags': ['SF', 'SP', 'SSO', 'SSC', 'SY'],
              'tokenize_each_word': False,
              'userdic_path': None,
              'wordpieces_prefix': '##'},
 'tokenize_article': {'return_type': 'str', 'sentence_separator': '\\n'},
 'verbose': False}


[ekorpkit]: Initializing mecab...


['금통위/NNP', '는/JX', ' /SP', '통화/NNG', '신용/NNG', '정책/NNG', '과/JC', ' /SP', '한국은행/NNP', '의/JKG', ' /SP', '운영/NNG', '에/JKB', ' /SP', '관한/VV+ETM', ' /SP', '의결권/NNP', '을/JKO', ' /SP', '행사/NNG', '한다/XSV+EF', './SF']
