# Compose an ekorpkit config

In [1]:
# import ekorpkit.ekonf as eKonf
from ekorpkit import eKonf
from pprint import pprint

In [2]:
# Compose the default ekorpkit config (no config group is selected).
# The printed type shows that compose() hands back a plain dict here.
cfg = eKonf.compose()
print('Config type:', type(cfg))
pprint(cfg)


Config type: <class 'dict'>
{'_target_': 'ekorpkit.cli.about',
 'about': {'app': {'author': 'entelecheia',
                   'description': 'This package provides corpus management '
                                  'tools such as extraction, trasformation, '
                                  'and tokenization.',
                   'name': 'ekorpkit'}},
 'app_name': 'ekorpkit',
 'corpus': {'_target_': 'ekorpkit.corpora.loader.Corpora',
            'autoload': True,
            'column_info': {'data': None,
                            'keys': None,
                            'meta': None,
                            'timestamp': {'format': None,
                                          'key': None,
                                          'params': None}},
            'data_dir': './workspace/data/datasets/corpus/ekorpkit',
            'filetype': None,
            'metadata_dir': './workspace/data/datasets/corpus/ekorpkit',
            'name': None,
            'segment_separator'

In [3]:
# Compose only the `corpus` config group twice: first with return_as_dict=True
# (a plain dict), then with return_as_dict=False (an OmegaConf DictConfig),
# printing the type and the content of each result.
# The label of the second pass starts with a newline to separate the two dumps.
for as_dict, label in ((True, 'Config type:'), (False, '\nConfig type:')):
    cfg = eKonf.compose(config_group='corpus', return_as_dict=as_dict)
    print(label, type(cfg))
    pprint(cfg)

Config type: <class 'dict'>
{'_target_': 'ekorpkit.corpora.loader.Corpora',
 'autoload': True,
 'column_info': {'data': None,
                 'keys': None,
                 'meta': None,
                 'timestamp': {'format': None, 'key': None, 'params': None}},
 'data_dir': './workspace/data/datasets/corpus/ekorpkit',
 'filetype': None,
 'metadata_dir': './workspace/data/datasets/corpus/ekorpkit',
 'name': None,
 'segment_separator': '\\n\\n',
 'sentence_separator': '\\n',
 'use_name_as_subdir': True,
 'verbose': False}

Config type: <class 'omegaconf.dictconfig.DictConfig'>
{'name': '${oc.select:.builtin.name, null}', 'data_dir': '${dir.corpus}', 'filetype': '${oc.select:.builtin.filetype, null}', 'autoload': True, 'verbose': '${oc.select:..verbose, true}', 'segment_separator': "${oc.select:.builtin.segment_separator, '\\n\\n'}", 'sentence_separator': "${oc.select:.builtin.sentence_separator, '\\n'}", 'column_info': {'keys': '${oc.select:..builtin.column_info.keys, null}', 'data':

In [4]:
# Select one option (`mecab`) of the `preprocessor/tokenizer` config group
# and show the composed result.
cfg = eKonf.compose(config_group='preprocessor/tokenizer=mecab')
print(type(cfg))
pprint(cfg)

<class 'dict'>
{'_target_': 'ekorpkit.preprocessors.tokenizer.MecabTokenizer',
 'concat_token_and_pos': True,
 'exclude_pos': ['SP'],
 'flatten': True,
 'include_whitespace_token': True,
 'lowercase': False,
 'mecab': {'backend': 'mecab-python3', 'userdic_path': None, 'verbose': False},
 'no_space_for_non_nouns': False,
 'normalize': None,
 'noun_pos': ['NNG', 'NNP', 'XSN', 'SL', 'XR', 'NNB', 'NR'],
 'punct_pos': ['SF', 'SP', 'SSO', 'SSC', 'SY'],
 'sentence_separator': '\\n',
 'stopwords_path': None,
 'tokenize_each_word': False,
 'userdic_path': None,
 'verbose': False,
 'wordpieces_prefix': '##'}


In [5]:
# Convert the composed dict into an OmegaConf DictConfig (see the printed type).
# NOTE: this rebinds `cfg`, so the following cell receives the converted object.
cfg = eKonf.to_config(cfg)
print(type(cfg))

<class 'omegaconf.dictconfig.DictConfig'>


In [6]:
# Render the config as YAML text and print it.
print(eKonf.to_yaml(cfg))

_target_: ekorpkit.preprocessors.tokenizer.MecabTokenizer
normalize: null
lowercase: false
tokenize_each_word: false
wordpieces_prefix: '##'
punct_pos:
- SF
- SP
- SSO
- SSC
- SY
noun_pos:
- NNG
- NNP
- XSN
- SL
- XR
- NNB
- NR
exclude_pos:
- SP
no_space_for_non_nouns: false
flatten: true
concat_token_and_pos: true
include_whitespace_token: true
userdic_path: null
stopwords_path: null
sentence_separator: \n
verbose: false
mecab:
  userdic_path: null
  backend: mecab-python3
  verbose: false



## Instantiating objects with an ekorpkit config

In [7]:
# Compose the mecab tokenizer config, then build the tokenizer object from it.
cfg = eKonf.compose(config_group='preprocessor/tokenizer=mecab')
mecab = eKonf.instantiate(cfg)

[ekorpkit]: Initializing mecab...


In [8]:
# Sample Korean sentence that mixes Hangul with Hanja words (推定, 今年, 持續).
text = 'IMF가 推定한 우리나라의 GDP갭률은 今年에도 소폭의 마이너스(−)를 持續하고 있다.'
# Tokenize the raw sentence; no normalizer is attached to this tokenizer yet.
mecab.tokenize(text)

'IMF/SL 가/JKS  /SP 推定/NNG 한/XSA+ETM  /SP 우리나라/NNG 의/JKG  /SP GDP/SL 갭/NNG 률/XSN 은/JX  /SP 今年/NNG 에/JKB 도/JX  /SP 소폭/NNG 의/JKG  /SP 마이너스/NNG (/SSO −)/SY 를/JKO  /SP 持續/NNG 하/XSV 고/EC  /SP 있/VX 다/EF ./SF'

In [9]:
# Compose the `formal_ko` normalizer config and build the normalizer from it.
# `cfg_norm` and `norm` are both reused by later cells.
cfg_norm = eKonf.compose(config_group='preprocessor/normalizer=formal_ko')
norm = eKonf.instantiate(cfg_norm)

In [10]:
# Call the normalizer on the sample sentence; in the displayed result the
# Hanja words appear rewritten in Hangul.
norm(text)

'IMF가 추정한 우리나라의 GDP갭률은 금년에도 소폭의 마이너스(-)를 지속하고 있다.'

In [11]:
# Rebuild the tokenizer, this time handing the normalizer's `normalize`
# callable to instantiate() as a keyword override, then tokenize the sample.
cfg = eKonf.compose(config_group='preprocessor/tokenizer=mecab')
mecab = eKonf.instantiate(cfg, normalize=norm.normalize)
mecab.tokenize(text)

[ekorpkit]: Initializing mecab...


'IMF/SL 가/JKS  /SP 추정/NNG 한/XSA+ETM  /SP 우리나라/NNG 의/JKG  /SP GDP/SL 갭/NNG 률/XSN 은/JX  /SP 금년/NNG 에/JKB 도/JX  /SP 소폭/NNG 의/JKG  /SP 마이너스/NNG (/SSO -)/SY 를/JKO  /SP 지속/NNG 하/XSV 고/EC  /SP 있/VX 다/EF ./SF'

In [12]:
# Alternative wiring: place the normalizer *config* under the tokenizer
# config's `normalize` key before instantiating. The displayed tokens match
# the ones produced when the normalizer callable was passed as a keyword.
cfg = eKonf.compose(config_group='preprocessor/tokenizer=mecab')
cfg['normalize'] = cfg_norm
mecab = eKonf.instantiate(cfg)
mecab.tokenize(text)

[ekorpkit]: Initializing mecab...


'IMF/SL 가/JKS  /SP 추정/NNG 한/XSA+ETM  /SP 우리나라/NNG 의/JKG  /SP GDP/SL 갭/NNG 률/XSN 은/JX  /SP 금년/NNG 에/JKB 도/JX  /SP 소폭/NNG 의/JKG  /SP 마이너스/NNG (/SSO -)/SY 를/JKO  /SP 지속/NNG 하/XSV 고/EC  /SP 있/VX 다/EF ./SF'

In [13]:
# Compose the `pysbd` segmenter config and build the segmenter object from it.
config_group = 'preprocessor/segmenter=pysbd'
cfg_seg = eKonf.compose(config_group=config_group)
seg = eKonf.instantiate(cfg_seg)

In [14]:
# English sample paragraph containing decimals and abbreviations followed by
# periods ("2.5 mm", "0.2.", "1 h."). NOTE: this rebinds `text`, replacing the
# Korean sample used by the earlier cells.
text = "For strains harboring the pYV plasmid and Yop-encoding plasmids, bacteria were grown with aeration at 26 °C overnight in broth supplemented with 2.5 mm CaCl2 and 100 μg/ml ampicillin and then subcultured and grown at 26 °C until A600 of 0.2. At this point, the cultures were shifted to 37 °C and aerated for 1 h. A multiplicity of infection of 50:1 was used for YPIII(p-) incubations, and a multiplicity of infection of 25:1 was used for other derivatives. For the pYopE-expressing plasmid, 0.1 mm isopropyl-β-d-thiogalactopyranoside was supplemented during infection to induce YopE expression."
# Call the segmenter on the paragraph; the displayed result is a list of sentences.
seg(text)

['For strains harboring the pYV plasmid and Yop-encoding plasmids, bacteria were grown with aeration at 26 °C overnight in broth supplemented with 2.5 mm CaCl2 and 100 μg/ml ampicillin and then subcultured and grown at 26 °C until A600 of 0.2. ',
 'At this point, the cultures were shifted to 37 °C and aerated for 1 h. ',
 'A multiplicity of infection of 50:1 was used for YPIII(p-) incubations, and a multiplicity of infection of 25:1 was used for other derivatives. ',
 'For the pYopE-expressing plasmid, 0.1 mm isopropyl-β-d-thiogalactopyranoside was supplemented during infection to induce YopE expression.']