# Lab 1: Preparing Wikipedia Corpora

## Install the ekorpkit library along with its dependencies

In [None]:
%pip install --pre ekorpkit[wiki,fetch]

In [2]:
from ekorpkit import eKonf

# Use INFO as the ekorpkit log level and report the installed version.
eKonf.setLogger("INFO")
print("version:", eKonf.__version__)

# Detect the runtime once; both flags remain available to later cells.
is_notebook, is_colab = eKonf.is_notebook(), eKonf.is_colab()
print(f"is notebook? {is_notebook}")
print(f"is colab? {is_colab}")

# Google Drive is mounted only when running on Colab.
if is_colab:
    eKonf.mount_google_drive()

# Keep this call last so the cell displays its return value.
eKonf.set_workspace(
    project="ekorpkit-book",
    workspace="/content/drive/MyDrive/workspace/",
)

INFO:ekorpkit.utils.notebook:shell type: ZMQInteractiveShell
INFO:ekorpkit.utils.notebook:Google Colab not detected.
INFO:ekorpkit.base:Setting EKORPKIT_WORKSPACE_ROOT to /content/drive/MyDrive/workspace/
INFO:ekorpkit.base:Setting EKORPKIT_PROJECT to ekorpkit-book
INFO:ekorpkit.base:Loaded .env from /workspace/projects/ekorpkit-book/config/.env
INFO:ekorpkit.base:Loaded .env from /workspace/projects/ekorpkit-book/config/.env


version: 0.1.40.post0.dev2
is notebook? True
is colab? False


('/content/drive/MyDrive/workspace', 'ekorpkit-book')

## Build corpora with the ekorpkit configs

### Wikipedia Dump

- The first step is to download the Wikipedia dump. 
- The dump is a collection of all Wikipedia articles in XML format. 
- The dump for English is available at https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2. 
- The dump is about 20.3 GB in size.
- For other languages, change `en` to the language code of your choice.
- For the detailed list of language codes, see https://meta.wikimedia.org/wiki/List_of_Wikipedias.

#### Fetch the dump and extract it to the `data` directory 

```python
from ekorpkit.io.fetch.loader.wiki import Wiki

wiki = Wiki(lang="ko", output_dir="data")
wiki.download_dump()
wiki.extract_wiki()
```

The example above fetches the Korean (`ko`) dump; to download the dump for another language, pass its language code as `lang`.

### Build the Korean Wikipedia Corpus

In [3]:
# Start from the fetcher config for Wikipedia dumps.
wiki_cfg = eKonf.compose("io/fetcher=wiki")

# Identify the target wiki before reading any path from the config.
wiki_cfg.name = "kowiki"
wiki_cfg.lang = "ko"

# Runtime options: no automatic loading, no forced re-download, 50 workers.
wiki_cfg.verbose = True
wiki_cfg.num_workers = 50
wiki_cfg.force_download = False
wiki_cfg.autoload = False

# Extracted articles go into an "extracted" folder under the dump directory.
# NOTE(review): dump_dir presumably depends on `name`, so this must stay after the name is set.
wiki_cfg.output_dir = "{}/extracted".format(wiki_cfg.dump.dump_dir)

# Print the resulting configuration for inspection.
eKonf.print(wiki_cfg)

{'_name_': 'fetcher',
 '_target_': 'ekorpkit.io.fetch.loader.wiki.Wiki',
 'auto': {'load': False},
 'autoload': False,
 'compress': False,
 'dump': {'_target_': 'web_download',
          'dump_dir': '/content/drive/MyDrive/workspace/.cache/corpus/kowiki',
          'dump_file': 'kowiki.xml.bz2',
          'url': 'https://dumps.wikimedia.org/kowiki/latest/kowiki-latest-pages-articles.xml.bz2'},
 'extract': {'_target_': 'extract_wiki'},
 'force': {'download': False},
 'force_download': False,
 'lang': 'ko',
 'limit': -1,
 'name': 'kowiki',
 'num_workers': 50,
 'output_dir': '/content/drive/MyDrive/workspace/.cache/corpus/kowiki/extracted',
 'output_file': None,
 'path': {'cached_path': None,
          'columns': None,
          'concat_data': False,
          'data_columns': None,
          'data_dir': '/content/drive/MyDrive/workspace/data/kowiki',
          'data_file': None,
          'filetype': '',
          'name': 'kowiki',
          'output': {'base_dir': '/content/drive/MyDrive/

In [8]:
from ekorpkit.io.fetch.loader.wiki import Wiki

# Convert the fetcher config to a plain dict and pass its entries to Wiki as keyword arguments.
args = eKonf.to_dict(wiki_cfg)
wiki = Wiki(**args)
# Fetch the dump file, then extract the articles from it (progress is logged below).
wiki.download_dump()
wiki.extract_wiki()


[kowiki.xml.bz2] download kowiki.xml.bz2: 0.00B [00:00, ?B/s]

INFO: Preprocessing '/content/drive/MyDrive/workspace/.cache/corpus/kowiki/kowiki.xml.bz2' to collect template definitions: this may take some time.
INFO: Preprocessed 100000 pages
INFO: Preprocessed 200000 pages
INFO: Preprocessed 300000 pages
INFO: Preprocessed 400000 pages
INFO: Preprocessed 500000 pages
INFO: Preprocessed 600000 pages
INFO: Preprocessed 700000 pages
INFO: Preprocessed 800000 pages
INFO: Preprocessed 900000 pages
INFO: Preprocessed 1000000 pages
INFO: Preprocessed 1100000 pages
INFO: Preprocessed 1200000 pages
INFO: Preprocessed 1300000 pages
INFO: Preprocessed 1400000 pages
INFO: Preprocessed 1500000 pages
INFO: Preprocessed 1600000 pages
INFO: Preprocessed 1700000 pages
INFO: Loaded 60850 templates in 214.3s
INFO: Starting page extraction from /content/drive/MyDrive/workspace/.cache/corpus/kowiki/kowiki.xml.bz2.
INFO: Using 50 extract processes.
INFO: Extracted 100000 articles (2809.9 art/s)
INFO: Extracted 200000 articles (4194.9 art/s)
INFO: Extracted 300000 art

Extracted kowiki from dump file /content/drive/MyDrive/workspace/.cache/corpus/kowiki/kowiki.xml.bz2


INFO: Finished 50-process extraction of 1339049 articles in 247.2s (5417.6 art/s)


#### Build the corpus by parsing the Wikipedia dump

- The next step is to parse the Wikipedia dump and build the corpus.
- The extracted Wikipedia dump is in JSON Lines format: each line is a single JSON object describing one article.


In [13]:
# Collect every path found beneath the extraction output directory.
files = eKonf.get_filepaths("**/*", wiki_cfg.output_dir)
print("Number of files:", len(files))

# Preview the first ten paths as the cell's displayed value.
files[:10]

INFO:ekorpkit.io.file:Processing [1628] files from ['**/*']


Number of files: 1628


['/content/drive/MyDrive/workspace/.cache/corpus/kowiki/extracted/AH/wiki_23',
 '/content/drive/MyDrive/workspace/.cache/corpus/kowiki/extracted/AH/wiki_67',
 '/content/drive/MyDrive/workspace/.cache/corpus/kowiki/extracted/AH/wiki_05',
 '/content/drive/MyDrive/workspace/.cache/corpus/kowiki/extracted/AH/wiki_20',
 '/content/drive/MyDrive/workspace/.cache/corpus/kowiki/extracted/AH/wiki_40',
 '/content/drive/MyDrive/workspace/.cache/corpus/kowiki/extracted/AH/wiki_92',
 '/content/drive/MyDrive/workspace/.cache/corpus/kowiki/extracted/AH/wiki_42',
 '/content/drive/MyDrive/workspace/.cache/corpus/kowiki/extracted/AH/wiki_14',
 '/content/drive/MyDrive/workspace/.cache/corpus/kowiki/extracted/AH/wiki_28',
 '/content/drive/MyDrive/workspace/.cache/corpus/kowiki/extracted/AH/wiki_24']

Check the first few lines of the extracted dump.

In [11]:
# Print the beginning of the first extracted file, read as UTF-8 text.
# NOTE(review): `head=200` presumably limits how much of the file is read — confirm the unit.
print(eKonf.read(files[0], mode="r", encoding="utf-8", head=200))

{"id": "634327", "revid": "414775", "url": "https://ko.wikipedia.org/wiki?curid=634327", "title": "\uc131\uc774\uc131", "text": "\uc131\uc774\uc131(\u6210\u4ee5\u6027, 1595\ub144(\uc120\uc870 28\ub144


In [7]:
# Compose the built-in Korean Wikipedia corpus config.
cfg = eKonf.compose("corpus/builtin=kowiki")

# General build options.
cfg.num_workers = 50
cfg.verbose = True

# Reuse the fetcher configured earlier and load from its extraction output.
cfg.io.fetcher = wiki_cfg
cfg.io.loader.data_dir = wiki_cfg.output_dir

# Instantiating the config runs the corpus build (see the log output of this cell).
db = eKonf.instantiate(cfg)

INFO:ekorpkit.base:Loaded .env from /workspace/projects/ekorpkit-book/config/.env
INFO:ekorpkit.utils.notebook:shell type: ZMQInteractiveShell
INFO:ekorpkit.base:setting environment variable CACHED_PATH_CACHE_ROOT to /content/drive/MyDrive/workspace/.cache/cached_path
INFO:ekorpkit.base:setting environment variable KMP_DUPLICATE_LIB_OK to TRUE
INFO:ekorpkit.base:instantiating ekorpkit.datasets.build.DatasetBuilder...
INFO:ekorpkit.base:instantiating ekorpkit.io.fetch.loader.wiki.Wiki...
INFO:ekorpkit.base:instantiating ekorpkit.info.stat.SummaryInfo...
INFO:ekorpkit.info.stat:Loading info file: /content/drive/MyDrive/workspace/data/datasets/corpus/kowiki/info-kowiki.yaml
INFO:ekorpkit.base:instantiating ekorpkit.io.load.data.load_data...
INFO:ekorpkit.io.file:Processing [1628] files from ['**/*']
INFO:ekorpkit.io.load.data:Starting multiprocessing with 50 processes at load_data


{'category': 'formal',
 'column_info': {'columns': {'id': 'id',
                             'merge_meta_on': 'id',
                             'text': 'text',
                             'timestamp': None},
                 'data': {'id': 'int', 'text': 'str'},
                 'datetime': {'columns': None,
                              'format': None,
                              'rcParams': None},
                 'meta': {'curid': 'str',
                          'id': 'int',
                          'title': 'str',
                          'url': 'str'},
                 'segment_separator': '\\n\\n',
                 'sentence_separator': '\\n',
                 'timestamp': {'format': None, 'key': None, 'rcParams': None}},
 'description': '위키백과, 우리 모두의 백과사전',
 'fullname': 'Korean Wikipedia Corpus',
 'homepage': 'https://ko.wikipedia.org',
 'lang': 'ko',
 'license': 'CC Attribution / Share-Alike 3.0',
 'name': 'kowiki',
 'version': '1.0.0'}


::load_data():   0%|          | 0/1628 [00:00<?, ?it/s]

{'curid': '634327', 'url': 'https://ko.wikipedia.org/wiki?curid=634327', 'title': '성이성', 'text': "성이성(成以性, 1595년(선조 28년) ∼ 1664년(현종 5년))은 조선 후기의 문신이자 유학자, 청백리이다. 자(字)는 여습(汝習)이고 호는 계서(溪西)이다. 본관은 창녕(昌寧). 춘향전의 실제 주인공으로 춘향전의 주인공인 몽룡은 원래 성몽룡이었다. 남원부사와 승정원승지를 지낸 성안의의 아들이다.\n강직한 간관이자 청백리이다. 그의 직계 후손들은 춘향전에 나온 '금준미주 천인혈'이 그가 실제로 지은 한시라고 주장한다. 호서 암행어사와 호남 암행어사로 활동, 감찰하며 부패 수령들을 봉고파직시켰다. 이것 역시 춘향전의 소재가 된다. 학맥으로는 김굉필의 손제자이자 그의 학맥을 계승한 강복성(康復誠)의 문인이다. 경상북도 출신.\n생애.\n생애 초반.\n출생과 가계.\n성이성은 경상북도 봉화군 물야면 가평리 태생으로 아버지는 창녕 성씨로 승정원승지와 군수를 지낸 성안의(成安義)이고, 어머니는 예안 김씨로 증(贈) 호조 참판에 추증(追贈)된 김계선의 딸이다.\n그는 어려서부터 그는 학업에 열중하여 13세때 그가 쓴 글을 우연히 정경세(鄭經世)가 보게 되었다. 정경세는 그의 글을 읽고 장차 크게 될 인물이라 하였다.\n수학과 남원 생활.\n어려서부터 공부를 게을리하지 않고 학문에 더욱 증진하여 조경남의 문하에서 수학하다가 뒤에 강복성(康復誠)의 문인이 되었다. 강복성은 사림의 학통인 길재-김숙자-김종직-김굉필(金宏弼)-조광조-이연경(李延慶)의 학통을 계승한 학자였다.\n1607년(선조 40) 남원부사로 부임한 아버지 성안의를 따라 갔다가 그곳에서 만난 기생과의 일화가 후일 춘향전의 주 뼈대가 되었다. 그러나 아버지 성안의가 참의로 발령되면서 기생 춘향과는 이별하게 된다. 이때 시중에는 성이성과 춘향을 소재로 한 춘향전이 희극과 인형극, 만담 등으로 확산되었는데, 양반가의 자제의 스캔들이라 하여

INFO:ekorpkit.datasets.build: >> elapsed time to load and parse data: 0:00:10.173167
INFO:ekorpkit.datasets.build:
Transforming dataframe with pipeline: ['reset_index', 'save_metadata']
INFO:ekorpkit.pipelines.pipe:Applying pipeline: OrderedDict([('reset_index', 'reset_index'), ('save_metadata', 'save_metadata')])
INFO:ekorpkit.base:Applying pipe: functools.partial(<function reset_index at 0x7fce771d3ca0>)
INFO:ekorpkit.pipelines.pipe:Resetting index: {'_func_': {'_partial_': True, '_target_': 'ekorpkit.pipelines.pipe.reset_index'}, 'index_column_name': 'id', 'drop_index': False, 'verbose': True}
INFO:ekorpkit.base:Applying pipe: functools.partial(<function save_metadata at 0x7fce771d35e0>)
INFO:ekorpkit.pipelines.pipe:Saving metadata: {'_func_': {'_partial_': True, '_target_': 'ekorpkit.pipelines.pipe.save_metadata'}, 'path': {'root': '/content/drive/MyDrive/workspace/data/ekorpkit-book', 'name': 'ekorpkit-book', 'cached_path': None, 'filetype': None, 'verbose': True, 'data_dir': '/co

    curid                                         url title  \
0  634327  https://ko.wikipedia.org/wiki?curid=634327   성이성   
1  634328  https://ko.wikipedia.org/wiki?curid=634328    누타   
2  634329  https://ko.wikipedia.org/wiki?curid=634329  공중그네   
3  634331  https://ko.wikipedia.org/wiki?curid=634331   성몽룡   
4  634332  https://ko.wikipedia.org/wiki?curid=634332    계서   

                                                text  split filename  
0  성이성(成以性, 1595년(선조 28년) ∼ 1664년(현종 5년))은 조선 후기의...  train  wiki_23  
1  누타(ぬた)는 잘게 썬 생선이나 조개를 파, 채소, 미역과 함께 초된장으로 무친 요...  train  wiki_23  
2                         공중그네(空中-)는 서커스의 기술 중 하나이다.  train  wiki_23  
3                                                     train  wiki_23  
4                                                     train  wiki_23  
(1339048, 6)
   id   curid                                         url title  \
0   0  634327  https://ko.wikipedia.org/wiki?curid=634327   성이성   
1   1  634328  https://ko.wikipedia.org/wiki?cur

INFO:ekorpkit.io.file: >> elapsed time to save data: 0:00:05.303806
INFO:ekorpkit.io.file:Saving dataframe to /content/drive/MyDrive/workspace/data/datasets/corpus/kowiki/kowiki-train.parquet
INFO:ekorpkit.io.file: >> elapsed time to save data: 0:01:05.529620
INFO:ekorpkit.info.stat:Initializing statistics for split: train with stats: {'name': 'train', 'dataset_name': 'kowiki', 'data_file': 'kowiki-train.parquet', 'meta_file': 'meta-kowiki-train.parquet'}
INFO:ekorpkit.base:Using batcher with minibatch size: 1000
INFO:ekorpkit.utils.batch.batcher: backend: joblib  minibatch_size: 1000  procs: 230  input_split: False  merge_output: True  len(data): 1339048 len(args): 5


apply len_bytes to num_bytes:   0%|          | 0/1340 [00:00<?, ?it/s]

INFO:ekorpkit.base:Using batcher with minibatch size: 1000
INFO:ekorpkit.utils.batch.batcher: backend: joblib  minibatch_size: 1000  procs: 230  input_split: False  merge_output: True  len(data): 1339048 len(args): 5


apply len_sents to num_sents:   0%|          | 0/1340 [00:00<?, ?it/s]

INFO:ekorpkit.info.stat: >> elapsed time to calculate statistics before processing: 0:00:32.387224
INFO:ekorpkit.info.stat: >> updated splits: {'train': {'name': 'train', 'dataset_name': 'kowiki', 'data_file': 'kowiki-train.parquet', 'meta_file': 'meta-kowiki-train.parquet', 'num_docs_before_processing': 1339048, 'num_bytes_before_processing': 801994255, 'num_sents': 3829874}}
INFO:ekorpkit.datasets.build:
Processing dataframe with pipeline: ['normalize', 'segment', 'filter_length', 'drop_duplicates', 'save_samples']
INFO:ekorpkit.pipelines.pipe:Applying pipeline: OrderedDict([('normalize', 'normalize'), ('segment', 'segment'), ('filter_length', 'filter_length'), ('drop_duplicates', 'drop_duplicates'), ('save_samples', 'save_samples')])
INFO:ekorpkit.base:Applying pipe: functools.partial(<function normalize at 0x7fce771d3ee0>)
INFO:ekorpkit.pipelines.pipe:instantiating normalizer
INFO:ekorpkit.base:Using batcher with minibatch size: 1000
INFO:ekorpkit.utils.batch.batcher: backend: jobl

Normalizing column: text:   0%|          | 0/1340 [00:00<?, ?it/s]

INFO:ekorpkit.pipelines.pipe: >> elapsed time to normalize: 0:00:18.489442
INFO:ekorpkit.base:Applying pipe: functools.partial(<function segment at 0x7fce771d30d0>)
INFO:ekorpkit.pipelines.pipe:instantiating segmenter
INFO:ekorpkit.base:instantiating ekorpkit.preprocessors.segmenter.KSSSegmenter...
INFO:ekorpkit.base:Using batcher with minibatch size: 1000
INFO:ekorpkit.utils.batch.batcher: backend: joblib  minibatch_size: 1000  procs: 50  input_split: False  merge_output: True  len(data): 1339048 len(args): 5


Splitting column: text:   0%|          | 0/1340 [00:00<?, ?it/s]

INFO:ekorpkit.pipelines.pipe: >> elapsed time to segment: 0:26:43.238374
INFO:ekorpkit.base:Applying pipe: functools.partial(<function filter_length at 0x7fce771d3280>, len_bytes={'_partial_': True, '_target_': 'ekorpkit.utils.func.len_bytes'}, len_words={'_partial_': True, '_target_': 'ekorpkit.utils.func.len_words'})
INFO:ekorpkit.pipelines.pipe:Filtering by length: {'_func_': {'_partial_': True, '_target_': 'ekorpkit.pipelines.pipe.filter_length', 'len_bytes': {'_partial_': True, '_target_': 'ekorpkit.utils.func.len_bytes'}, 'len_words': {'_partial_': True, '_target_': 'ekorpkit.utils.func.len_words'}}, 'apply_to': 'text', 'min_length': 30, 'max_length': None, 'len_func': 'len_bytes', 'len_column': 'num_bytes', 'add_len_column': True, 'verbose': True, 'use_batcher': True}
INFO:ekorpkit.base:Using batcher with minibatch size: 1000
INFO:ekorpkit.utils.batch.batcher: backend: joblib  minibatch_size: 1000  procs: 50  input_split: False  merge_output: True  len(data): 1339048 len(args): 

Calculating length:   0%|          | 0/1340 [00:00<?, ?it/s]

INFO:ekorpkit.pipelines.pipe:removed 736936 of 1339048 documents with length < 30
INFO:ekorpkit.pipelines.pipe: >> elapsed time to filter length: 0:00:03.079006
INFO:ekorpkit.base:Applying pipe: functools.partial(<function drop_duplicates at 0x7fce771d34c0>)
INFO:ekorpkit.pipelines.pipe:Dropping duplicates: {'_func_': {'_partial_': True, '_target_': 'ekorpkit.pipelines.pipe.drop_duplicates'}, 'apply_to': 'text', 'verbose': True}
INFO:ekorpkit.pipelines.pipe:601641 documents after dropping 471 duplicates from [['text']]
INFO:ekorpkit.pipelines.pipe: >> elapsed time to drop duplicates: 0:00:01.811704
INFO:ekorpkit.base:Applying pipe: functools.partial(<function save_samples at 0x7fce771d3790>)
INFO:ekorpkit.pipelines.pipe:Saving samples: {'_func_': {'_partial_': True, '_target_': 'ekorpkit.pipelines.pipe.save_samples'}, 'path': {'root': '/content/drive/MyDrive/workspace/data/ekorpkit-book', 'name': 'ekorpkit-book', 'cached_path': None, 'filetype': '', 'verbose': True, 'data_dir': '/conte

----------------------------------------------------------------------------------------------------

text: 
《그랜드 점프》(, )는 슈에이샤가 발행하는 일본의 소년 만화 잡지이다.

----------------------------------------------------------------------------------------------------
text: 
레이크파크()는 다음과 같은 뜻이 있다.

----------------------------------------------------------------------------------------------------


apply len_bytes to num_bytes:   0%|          | 0/602 [00:00<?, ?it/s]

INFO:ekorpkit.base:Using batcher with minibatch size: 1000
INFO:ekorpkit.utils.batch.batcher: backend: joblib  minibatch_size: 1000  procs: 50  input_split: False  merge_output: True  len(data): 601641 len(args): 5


apply len_wospc to num_bytes_wospc:   0%|          | 0/602 [00:00<?, ?it/s]

INFO:ekorpkit.base:Using batcher with minibatch size: 1000
INFO:ekorpkit.utils.batch.batcher: backend: joblib  minibatch_size: 1000  procs: 50  input_split: False  merge_output: True  len(data): 601641 len(args): 5


apply len_words to num_words:   0%|          | 0/602 [00:00<?, ?it/s]

INFO:ekorpkit.base:Using batcher with minibatch size: 1000
INFO:ekorpkit.utils.batch.batcher: backend: joblib  minibatch_size: 1000  procs: 50  input_split: False  merge_output: True  len(data): 601641 len(args): 5


apply len_sents to num_sents:   0%|          | 0/602 [00:00<?, ?it/s]

INFO:ekorpkit.base:Using batcher with minibatch size: 1000
INFO:ekorpkit.utils.batch.batcher: backend: joblib  minibatch_size: 1000  procs: 50  input_split: False  merge_output: True  len(data): 601641 len(args): 5


apply len_segments to num_segments:   0%|          | 0/602 [00:00<?, ?it/s]

INFO:ekorpkit.info.stat: >> elapsed time to calculate statistics: 0:00:07.233255
INFO:ekorpkit.info.stat:Saving updated info file: /content/drive/MyDrive/workspace/data/datasets/corpus/kowiki/info-kowiki.yaml
INFO:ekorpkit.datasets.build:
Corpus [kowiki] is built to [/content/drive/MyDrive/workspace/data/datasets/corpus/kowiki] from [/content/drive/MyDrive/workspace/data/archive/datasets/source/kowiki]


{'category': 'formal',
 'column_info': {'columns': {'id': 'id',
                             'merge_meta_on': 'id',
                             'text': 'text',
                             'timestamp': None},
                 'data': {'id': 'int', 'text': 'str'},
                 'datetime': {'columns': None,
                              'format': None,
                              'rcParams': None},
                 'meta': {'curid': 'str',
                          'id': 'int',
                          'title': 'str',
                          'url': 'str'},
                 'segment_separator': '\\n\\n',
                 'sentence_separator': '\\n',
                 'timestamp': {'format': None, 'key': None, 'rcParams': None}},
 'data_files': {'train': 'kowiki-train.parquet'},
 'data_files_modified': '2022-10-29 06:30:41',
 'description': '위키백과, 우리 모두의 백과사전',
 'fullname': 'Korean Wikipedia Corpus',
 'homepage': 'https://ko.wikipedia.org',
 'info_updated': '2022-10-29 06:58:28',


### Build the English Wikipedia Corpus

In [8]:
# Compose the built-in English Wikipedia corpus config.
cfg = eKonf.compose("corpus/builtin=enwiki")

# Build options: 50 workers, verbose logging.
cfg.num_workers = 50
cfg.verbose = True

# Instantiating the config starts the download and preprocessing logged below.
db = eKonf.instantiate(cfg)

INFO:ekorpkit.base:instantiating ekorpkit.datasets.build.DatasetBuilder...
INFO:ekorpkit.base:instantiating ekorpkit.io.fetch.loader.wiki.Wiki...


[enwiki.xml.bz2] download enwiki.xml.bz2: 0.00B [00:00, ?B/s]

INFO: Preprocessing '/content/drive/MyDrive/workspace/.cache/corpus/enwiki/enwiki.xml.bz2' to collect template definitions: this may take some time.
INFO: Preprocessed 100000 pages


## Load the corpus

In [2]:
# Compose a config that holds several corpora at once.
cfg = eKonf.compose("corpus=corpora")

# Look for the corpora under ../data and load them on instantiation.
cfg.data_dir = "../data"
cfg.auto.load = True
cfg.name = ["bok_minutes", "fomc_minutes"]

# Instantiate, then print the summary listing the loaded corpora.
crps = eKonf.instantiate(cfg)
print(crps)

Corpora
----------
bok_minutes
fomc_minutes



In [3]:
# Display the data table of the 'bok_minutes' corpus (columns: id, text, split).
crps['bok_minutes'].data

Unnamed: 0_level_0,id,text,split
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,Economic Situation\n일부 위원은 관련부서에서 지난 3\/4분기 중 ...,train


In [4]:
# Display the data table of the 'fomc_minutes' corpus (columns: id, text, content_type, split).
crps['fomc_minutes'].data

Unnamed: 0_level_0,id,text,content_type,split
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,A meeting of the Federal Open Market Committee...,fomc_minutes,train
1,1,A meeting of the Federal Open Market Committee...,fomc_minutes,train
2,2,A meeting of the Federal Open Market Committee...,fomc_minutes,train
3,3,A meeting of the Federal Open Market Committee...,fomc_minutes,train
4,4,A meeting of the Federal Open Market Committee...,fomc_minutes,train


In [5]:
# Combine the loaded corpora; the next cell shows the combined table with a `corpus` column.
crps.concat_corpora()

In [6]:
# Display the combined data table; each row is labelled with its source corpus.
crps.data

Unnamed: 0,id,text,split,corpus,content_type
0,0,Economic Situation\n일부 위원은 관련부서에서 지난 3\/4분기 중 ...,train,bok_minutes,
1,0,A meeting of the Federal Open Market Committee...,train,fomc_minutes,fomc_minutes
2,1,A meeting of the Federal Open Market Committee...,train,fomc_minutes,fomc_minutes
3,2,A meeting of the Federal Open Market Committee...,train,fomc_minutes,fomc_minutes
4,3,A meeting of the Federal Open Market Committee...,train,fomc_minutes,fomc_minutes
5,4,A meeting of the Federal Open Market Committee...,train,fomc_minutes,fomc_minutes


In [7]:
# Display the combined metadata table; columns a corpus does not provide are left empty.
crps.metadata

Unnamed: 0,id,mdate,rdate,filename,split,corpus,date,speaker,title
0,0,2018-11-30 10:00:00,2018-12-18 16:00:00,BOK_20181130_20181218,train,bok_minutes,,,
1,0,,,,train,fomc_minutes,1993-02-03,Alan Greenspan,FOMC Meeting Minutes
2,1,,,,train,fomc_minutes,1993-03-23,Alan Greenspan,FOMC Meeting Minutes
3,2,,,,train,fomc_minutes,1993-05-18,Alan Greenspan,FOMC Meeting Minutes
4,3,,,,train,fomc_minutes,1993-07-07,Alan Greenspan,FOMC Meeting Minutes
5,4,,,,train,fomc_minutes,1993-08-17,Alan Greenspan,FOMC Meeting Minutes


## Instantiating a corpus

In [8]:
# Compose the config for a single corpus.
cfg = eKonf.compose("corpus")

# Use the 'mdate' column as the timestamp key and read 'bok_minutes' from ../data.
cfg.column_info.timestamp.key = "mdate"
cfg.data_dir = "../data"
cfg.name = "bok_minutes"

# Instantiate, then print the one-line corpus summary.
crps = eKonf.instantiate(cfg)
print(crps)

Corpus : bok_minutes


In [9]:
# Display the corpus data table (columns: id, text, split).
crps.data

Unnamed: 0_level_0,id,text,split
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,Economic Situation\n일부 위원은 관련부서에서 지난 3\/4분기 중 ...,train


In [10]:
# Display the corpus metadata table (columns: id, mdate, rdate, filename, split).
crps.metadata

Unnamed: 0,id,mdate,rdate,filename,split
0,0,2018-11-30 10:00:00,2018-12-18 16:00:00,BOK_20181130_20181218,train


In [11]:
# Print the corpus's column descriptors: the id column, the id columns, the text column,
# and the column lists of the data and metadata tables.
print(crps.ID, crps.IDs, crps.TEXT, crps.DATA, crps.METADATA)

id ['id', 'split'] text ['id', 'text', 'split'] ['id', 'mdate', 'rdate', 'filename', 'split']


In [12]:
# Merge the metadata into the data table, then display it; the output now also
# carries the mdate, rdate and filename columns.
crps.merge_metadata()
crps.data

Unnamed: 0,id,text,split,mdate,rdate,filename
0,0,Economic Situation\n일부 위원은 관련부서에서 지난 3\/4분기 중 ...,train,2018-11-30 10:00:00,2018-12-18 16:00:00,BOK_20181130_20181218


In [14]:
# Select 'mdate' as the timestamp key, then load the timestamp column.
crps.COLUMN.TIMESTAMP_INFO.key = 'mdate'
crps.load_timestamp()
# The displayed table gains a `timestamp` column holding the mdate values.
crps.data

Unnamed: 0,id,text,split,mdate,rdate,filename,timestamp
0,0,Economic Situation\n일부 위원은 관련부서에서 지난 3\/4분기 중 ...,train,2018-11-30 10:00:00,2018-12-18 16:00:00,BOK_20181130_20181218,2018-11-30 10:00:00


In [15]:
# Pretty-print the corpus info dictionary (category, column_info, data_files, ...).
eKonf.pprint(crps.INFO)

{'category': 'formal',
 'column_info': {'_keys_': {'dataset': 'dataset',
                            'id': 'id',
                            'split': 'split',
                            'text': 'text',
                            'timestamp': 'timestamp'},
                 'columns': {'id': 'id',
                             'merge_meta_on': 'id',
                             'text': 'text',
                             'timestamp': None},
                 'data': {'id': 'int', 'text': 'str'},
                 'datetime': {'columns': None,
                              'format': None,
                              'rcParams': None},
                 'meta': {'filename': 'str',
                          'id': 'int',
                          'mdate': 'str',
                          'rdate': 'str'},
                 'segment_separator': '\\n\\n',
                 'sentence_separator': '\\n',
                 'timestamp': {'format': None, 'key': None, 'rcParams': None}},
 'data_files': 