# Lab 1: Preparing Data for Language Modeling

## Install the ekorpkit library along with its dependencies

In [None]:
# Install ekorpkit with its `wiki` and `fetch` extras; `--pre` lets pip pick pre-release versions.
%pip install --pre ekorpkit[wiki,fetch]

In [1]:
from ekorpkit import eKonf

# Set the ekorpkit log level to INFO and report the installed version.
eKonf.setLogger("INFO")
print("version:", eKonf.__version__)

# Detect the runtime; Google Drive is mounted only when Colab is detected.
is_notebook = eKonf.is_notebook()
is_colab = eKonf.is_colab()
print("is notebook?", is_notebook)
print("is colab?", is_colab)
if is_colab:
    eKonf.mount_google_drive()
# NOTE(review): the workspace root is a Google Drive path, yet it is set even when
# Colab is not detected (and Drive was therefore not mounted by this cell) —
# confirm this is intended for local runs.
eKonf.set_workspace(workspace="/content/drive/MyDrive/workspace/", project="ekorpkit-book")

# Show the environment variables as seen by ekorpkit.
print("environment variables:")
eKonf.print(eKonf.env().dict())

INFO:ekorpkit.utils.notebook:shell type: ZMQInteractiveShell
INFO:ekorpkit.utils.notebook:Google Colab not detected.
INFO:ekorpkit.base:Setting EKORPKIT_WORKSPACE_ROOT to /content/drive/MyDrive/workspace/
INFO:ekorpkit.base:Setting EKORPKIT_PROJECT to ekorpkit-book
INFO:ekorpkit.base:Loaded .env from /workspace/projects/ekorpkit-book/config/.env
INFO:ekorpkit.base:Loaded .env from /workspace/projects/ekorpkit-book/config/.env


version: 0.1.40.post0.dev2
is notebook? True
is colab? False
environment variables:
{'CUDA_DEVICE_ORDER': None,
 'CUDA_VISIBLE_DEVICES': None,
 'EKORPKIT_CONFIG_DIR': '/workspace/projects/ekorpkit-book/config',
 'EKORPKIT_DATA_DIR': None,
 'EKORPKIT_LOG_LEVEL': 'INFO',
 'EKORPKIT_PROJECT': 'ekorpkit-book',
 'EKORPKIT_WORKSPACE_ROOT': '/content/drive/MyDrive/workspace',
 'KMP_DUPLICATE_LIB_OK': 'TRUE',
 'NUM_WORKERS': 230}


## Build corpora with the ekorpkit configs

### Wikipedia Dump

- The first step is to download the Wikipedia dump. 
- The dump is a collection of all Wikipedia articles in XML format. 
- The dump for English is available at https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2. 
- The dump is about 20.3 GB in size.
- For other languages, replace `en` in the URL (both occurrences of `enwiki`) with the language code of your choice, e.g. `kowiki` for Korean.
- For a detailed list of language codes, see https://meta.wikimedia.org/wiki/List_of_Wikipedias.

#### Fetch the dump and extract it to the `data` directory 

```python
from ekorpkit.io.fetch.loader.wiki import Wiki

wiki = Wiki(lang="ko", output_dir="data")
wiki.download_dump()
wiki.extract_wiki()
```


The example above fetches the Korean (`ko`) dump; following the same steps with a different `lang` code, you can download the dump for other languages.

In [7]:
# Compose the Wikipedia fetcher config and point it at the Korean dump.
wiki_cfg = eKonf.compose('io/fetcher=wiki')
wiki_cfg.lang = "ko"
wiki_cfg.name = "kowiki"
# Put the extracted output in an `extracted` folder inside the dump directory.
wiki_cfg.output_dir = f"{wiki_cfg.dump.dump_dir}/extracted"
# Presumably defers download/extraction until they are called explicitly — TODO confirm.
wiki_cfg.autoload = False
# NOTE(review): presumably re-downloads the dump even when a copy already exists,
# which makes re-running the notebook expensive — confirm this is wanted.
wiki_cfg.force_download = True
# Number of extraction processes (the extraction log reports "Using 50 extract processes").
wiki_cfg.num_workers = 50
wiki_cfg.verbose = True

# Display the resolved configuration.
eKonf.print(wiki_cfg)

{'_name_': 'fetcher',
 '_target_': 'ekorpkit.io.fetch.loader.wiki.Wiki',
 'auto': {'load': False},
 'autoload': False,
 'compress': False,
 'dump': {'_target_': 'web_download',
          'dump_dir': '/content/drive/MyDrive/workspace/.cache/corpus/kowiki',
          'dump_file': 'kowiki.xml.bz2',
          'url': 'https://dumps.wikimedia.org/kowiki/latest/kowiki-latest-pages-articles.xml.bz2'},
 'extract': {'_target_': 'extract_wiki'},
 'force': {'download': True},
 'force_download': True,
 'lang': 'ko',
 'limit': -1,
 'name': 'kowiki',
 'num_workers': 50,
 'output_dir': '/content/drive/MyDrive/workspace/.cache/corpus/kowiki/extracted',
 'output_file': None,
 'path': {'cached_path': None,
          'columns': None,
          'concat_data': False,
          'data_columns': None,
          'data_dir': '/content/drive/MyDrive/workspace/data/kowiki',
          'data_file': None,
          'filetype': '',
          'name': 'kowiki',
          'output': {'base_dir': '/content/drive/MyDrive/wo

In [8]:
from ekorpkit.io.fetch.loader.wiki import Wiki

# Convert the composed config to a dict and pass its entries to Wiki as keyword arguments.
args = eKonf.to_dict(wiki_cfg)
wiki = Wiki(**args)
# Download the dump, then extract its articles. Both steps are long-running:
# the recorded run took several minutes for template loading and extraction.
wiki.download_dump()
wiki.extract_wiki()


[kowiki.xml.bz2] download kowiki.xml.bz2: 0.00B [00:00, ?B/s]

INFO: Preprocessing '/content/drive/MyDrive/workspace/.cache/corpus/kowiki/kowiki.xml.bz2' to collect template definitions: this may take some time.
INFO: Preprocessed 100000 pages
INFO: Preprocessed 200000 pages
INFO: Preprocessed 300000 pages
INFO: Preprocessed 400000 pages
INFO: Preprocessed 500000 pages
INFO: Preprocessed 600000 pages
INFO: Preprocessed 700000 pages
INFO: Preprocessed 800000 pages
INFO: Preprocessed 900000 pages
INFO: Preprocessed 1000000 pages
INFO: Preprocessed 1100000 pages
INFO: Preprocessed 1200000 pages
INFO: Preprocessed 1300000 pages
INFO: Preprocessed 1400000 pages
INFO: Preprocessed 1500000 pages
INFO: Preprocessed 1600000 pages
INFO: Preprocessed 1700000 pages
INFO: Loaded 60850 templates in 214.3s
INFO: Starting page extraction from /content/drive/MyDrive/workspace/.cache/corpus/kowiki/kowiki.xml.bz2.
INFO: Using 50 extract processes.
INFO: Extracted 100000 articles (2809.9 art/s)
INFO: Extracted 200000 articles (4194.9 art/s)
INFO: Extracted 300000 art

Extracted kowiki from dump file /content/drive/MyDrive/workspace/.cache/corpus/kowiki/kowiki.xml.bz2


INFO: Finished 50-process extraction of 1339049 articles in 247.2s (5417.6 art/s)


#### Build the corpus by parsing the Wikipedia dump

- The next step is to parse the Wikipedia dump and build the corpus.
- The extracted Wikipedia dump is stored in JSON Lines format, a line-delimited JSON format.

In [4]:
# Compose the built-in `kowiki` corpus config, with its data directory set to ../data/kowiki.
cfg = eKonf.compose('corpus/builtin=kowiki')
cfg.data_dir = '../data/kowiki'
cfg.verbose = True
# eKonf.print(cfg)
# NOTE(review): in the saved output this instantiation fails. It tries to download
# `kowiki.xml.bz` (not `.bz2`), WikiExtractor raises UnicodeDecodeError, and the
# builder then raises KeyError("['curid', 'url', 'title'] not in index").
# Resolve this before relying on the cell.
db = eKonf.instantiate(cfg)

INFO:ekorpkit.base:instantiating ekorpkit.datasets.build.DatasetBuilder...
INFO:ekorpkit.base:instantiating ekorpkit.io.fetch.loader.wiki.Wiki...


[kowiki.xml.bz] download kowiki.xml.bz: 0.00B [00:00, ?B/s]

Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/opt/conda/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/opt/conda/lib/python3.8/site-packages/wikiextractor/WikiExtractor.py", line 645, in <module>
    main()
  File "/opt/conda/lib/python3.8/site-packages/wikiextractor/WikiExtractor.py", line 640, in main
    process_dump(input_file, args.templates, output_path, file_size,
  File "/opt/conda/lib/python3.8/site-packages/wikiextractor/WikiExtractor.py", line 301, in process_dump
    for line in input:
  File "/opt/conda/lib/python3.8/codecs.py", line 322, in decode
    (result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position 16: invalid start byte
INFO:ekorpkit.base:instantiating ekorpkit.info.stat.SummaryInfo...
INFO:ekorpkit.info.stat:Loading info file: 

Extracted kowiki from dump file /workspace/.cache/corpus/kowiki/kowiki.xml.bz
{'category': 'formal',
 'column_info': {'columns': {'id': 'id',
                             'merge_meta_on': 'id',
                             'text': 'text',
                             'timestamp': None},
                 'data': {'id': 'int', 'text': 'str'},
                 'datetime': {'columns': None,
                              'format': None,
                              'rcParams': None},
                 'meta': {'curid': 'str',
                          'id': 'int',
                          'title': 'str',
                          'url': 'str'},
                 'segment_separator': '\\n\\n',
                 'sentence_separator': '\\n',
                 'timestamp': {'format': None, 'key': None, 'rcParams': None}},
 'description': '위키백과, 우리 모두의 백과사전',
 'fullname': 'Korean Wikipedia Corpus',
 'homepage': 'https://ko.wikipedia.org',
 'lang': 'ko',
 'license': 'CC Attribution / Share-Alike 3.

InstantiationException: Error in call to target 'ekorpkit.datasets.build.DatasetBuilder':
KeyError("['curid', 'url', 'title'] not in index")
full_key: corpus.builtin

In [3]:
# Compose the built-in `_dummy_fomc_minutes` corpus config, with its data directory
# set to ../data/fomc_minutes.
cfg = eKonf.compose("corpus/builtin=_dummy_fomc_minutes")
cfg.data_dir = "../data/fomc_minutes"
# Instantiate the builder and run the build; the progress bars in the output
# show 5 items being processed.
db = eKonf.instantiate(cfg)
db.build()

apply len_bytes to num_bytes:   0%|          | 0/5 [00:00<?, ?it/s]

apply len_sents to num_sents:   0%|          | 0/5 [00:00<?, ?it/s]

Normalizing column: text:   0%|          | 0/5 [00:00<?, ?it/s]

Splitting column: text:   0%|          | 0/5 [00:00<?, ?it/s]

apply len_bytes to num_bytes:   0%|          | 0/5 [00:00<?, ?it/s]

apply len_wospc to num_bytes_wospc:   0%|          | 0/5 [00:00<?, ?it/s]

apply len_words to num_words:   0%|          | 0/5 [00:00<?, ?it/s]

apply len_sents to num_sents:   0%|          | 0/5 [00:00<?, ?it/s]

apply len_segments to num_segments:   0%|          | 0/5 [00:00<?, ?it/s]

apply len_bytes to num_bytes:   0%|          | 0/5 [00:00<?, ?it/s]

apply len_sents to num_sents:   0%|          | 0/5 [00:00<?, ?it/s]

Normalizing column: text:   0%|          | 0/5 [00:00<?, ?it/s]

Splitting column: text:   0%|          | 0/5 [00:00<?, ?it/s]

apply len_bytes to num_bytes:   0%|          | 0/5 [00:00<?, ?it/s]

apply len_wospc to num_bytes_wospc:   0%|          | 0/5 [00:00<?, ?it/s]

apply len_words to num_words:   0%|          | 0/5 [00:00<?, ?it/s]

apply len_sents to num_sents:   0%|          | 0/5 [00:00<?, ?it/s]

apply len_segments to num_segments:   0%|          | 0/5 [00:00<?, ?it/s]

## Instantiating corpora

In [2]:
# Compose a multi-corpus config covering both minutes corpora stored under ../data.
cfg = eKonf.compose("corpus=corpora")
cfg.name = ["bok_minutes", "fomc_minutes"]
cfg.data_dir = "../data"
cfg.auto.load = True

# Instantiate the collection and print its summary (the names of the corpora it holds).
crps = eKonf.instantiate(cfg)
print(crps)

Corpora
----------
bok_minutes
fomc_minutes



In [3]:
# Display the data frame of the `bok_minutes` corpus (columns: id, text, split).
crps['bok_minutes'].data

Unnamed: 0_level_0,id,text,split
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,Economic Situation\n일부 위원은 관련부서에서 지난 3\/4분기 중 ...,train


In [4]:
# Display the data frame of the `fomc_minutes` corpus (columns: id, text, content_type, split).
crps['fomc_minutes'].data

Unnamed: 0_level_0,id,text,content_type,split
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,A meeting of the Federal Open Market Committee...,fomc_minutes,train
1,1,A meeting of the Federal Open Market Committee...,fomc_minutes,train
2,2,A meeting of the Federal Open Market Committee...,fomc_minutes,train
3,3,A meeting of the Federal Open Market Committee...,fomc_minutes,train
4,4,A meeting of the Federal Open Market Committee...,fomc_minutes,train


In [5]:
# Concatenate the loaded corpora so that `crps.data` exposes them as one frame.
crps.concat_corpora()

In [6]:
# Display the combined data; the `corpus` column names the corpus each row came from.
crps.data

Unnamed: 0,id,text,split,corpus,content_type
0,0,Economic Situation\n일부 위원은 관련부서에서 지난 3\/4분기 중 ...,train,bok_minutes,
1,0,A meeting of the Federal Open Market Committee...,train,fomc_minutes,fomc_minutes
2,1,A meeting of the Federal Open Market Committee...,train,fomc_minutes,fomc_minutes
3,2,A meeting of the Federal Open Market Committee...,train,fomc_minutes,fomc_minutes
4,3,A meeting of the Federal Open Market Committee...,train,fomc_minutes,fomc_minutes
5,4,A meeting of the Federal Open Market Committee...,train,fomc_minutes,fomc_minutes


In [7]:
# Display the combined metadata; columns that a corpus does not provide are empty for its rows.
crps.metadata

Unnamed: 0,id,mdate,rdate,filename,split,corpus,date,speaker,title
0,0,2018-11-30 10:00:00,2018-12-18 16:00:00,BOK_20181130_20181218,train,bok_minutes,,,
1,0,,,,train,fomc_minutes,1993-02-03,Alan Greenspan,FOMC Meeting Minutes
2,1,,,,train,fomc_minutes,1993-03-23,Alan Greenspan,FOMC Meeting Minutes
3,2,,,,train,fomc_minutes,1993-05-18,Alan Greenspan,FOMC Meeting Minutes
4,3,,,,train,fomc_minutes,1993-07-07,Alan Greenspan,FOMC Meeting Minutes
5,4,,,,train,fomc_minutes,1993-08-17,Alan Greenspan,FOMC Meeting Minutes


## Instantiating a corpus

In [8]:
# Compose a single-corpus config for `bok_minutes`, naming `mdate` as the timestamp key.
cfg = eKonf.compose("corpus")
cfg.name = "bok_minutes"
cfg.data_dir = "../data"
cfg.column_info.timestamp.key = "mdate"

# `crps` is rebound here: it now refers to one Corpus rather than the earlier Corpora collection.
crps = eKonf.instantiate(cfg)
print(crps)

Corpus : bok_minutes


In [9]:
# Display the corpus data frame (columns: id, text, split).
crps.data

Unnamed: 0_level_0,id,text,split
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,Economic Situation\n일부 위원은 관련부서에서 지난 3\/4분기 중 ...,train


In [10]:
# Display the corpus metadata frame (columns: id, mdate, rdate, filename, split).
crps.metadata

Unnamed: 0,id,mdate,rdate,filename,split
0,0,2018-11-30 10:00:00,2018-12-18 16:00:00,BOK_20181130_20181218,train


In [11]:
# Print the column names the corpus uses, space-separated on one line, in this order:
# id column, id columns, text column, data columns, metadata columns.
print(
    crps.ID,
    crps.IDs,
    crps.TEXT,
    crps.DATA,
    crps.METADATA,
)

id ['id', 'split'] text ['id', 'text', 'split'] ['id', 'mdate', 'rdate', 'filename', 'split']


In [12]:
# Merge the metadata columns into the data frame; afterwards `crps.data` also
# carries mdate, rdate and filename.
# NOTE(review): this changes `crps.data` in place — confirm that re-running the cell is harmless.
crps.merge_metadata()
crps.data

Unnamed: 0,id,text,split,mdate,rdate,filename
0,0,Economic Situation\n일부 위원은 관련부서에서 지난 3\/4분기 중 ...,train,2018-11-30 10:00:00,2018-12-18 16:00:00,BOK_20181130_20181218


In [14]:
# Use `mdate` as the timestamp source, then load timestamps; the resulting frame
# gains a `timestamp` column holding the `mdate` value.
crps.COLUMN.TIMESTAMP_INFO.key = 'mdate'
crps.load_timestamp()
crps.data

Unnamed: 0,id,text,split,mdate,rdate,filename,timestamp
0,0,Economic Situation\n일부 위원은 관련부서에서 지난 3\/4분기 중 ...,train,2018-11-30 10:00:00,2018-12-18 16:00:00,BOK_20181130_20181218,2018-11-30 10:00:00


In [15]:
# Pretty-print the corpus info dictionary (category, column info, data files, ...).
eKonf.pprint(crps.INFO)

{'category': 'formal',
 'column_info': {'_keys_': {'dataset': 'dataset',
                            'id': 'id',
                            'split': 'split',
                            'text': 'text',
                            'timestamp': 'timestamp'},
                 'columns': {'id': 'id',
                             'merge_meta_on': 'id',
                             'text': 'text',
                             'timestamp': None},
                 'data': {'id': 'int', 'text': 'str'},
                 'datetime': {'columns': None,
                              'format': None,
                              'rcParams': None},
                 'meta': {'filename': 'str',
                          'id': 'int',
                          'mdate': 'str',
                          'rdate': 'str'},
                 'segment_separator': '\\n\\n',
                 'sentence_separator': '\\n',
                 'timestamp': {'format': None, 'key': None, 'rcParams': None}},
 'data_files': 