## Wikiextractor를 통해 추출한 한국어 위키백과 텍스트로 FastText 임베딩 학습하기

In [1]:
import os
import re
from tqdm import tqdm
from kiwipiepy import Kiwi, Option
from kiwipiepy.utils import Stopwords

from gensim.models import FastText
from multiprocessing import Pool

## wikiextractor 설치 및 덤프 추출

- 설치: `pip install wikiextractor`
- 덤프 다운로드: `wget https://dumps.wikimedia.org/kowiki/latest/kowiki-latest-pages-articles.xml.bz2`
- 텍스트 추출: `python -m wikiextractor.WikiExtractor kowiki-latest-pages-articles.xml.bz2`

```
├── kowiki-latest-pages-articles6.xml-p3270441p3326027.bz2
├── model_pipeline.ipynb
└── text
    └── AA
        ├── wiki_00
        ├── wiki_01
        ├── wiki_02
        ├── wiki_03
        ├── wiki_04
        ├── wiki_05
        ├── wiki_06
        ├── wiki_07
        ├── wiki_08
        ├── wiki_09
        ├── wiki_10
        └── wiki_11
```

In [2]:
# Gather every extracted shard under ./text whose file name starts with 'wiki'.
wiki_files = [
    f'{root}/{f}'
    for root, dirs, files in os.walk('./text')
    for f in files
    if f.startswith('wiki')
]

In [3]:
# Display the collected shard paths (the notebook cell output follows).
wiki_files

['./text/AA/wiki_00',
 './text/AA/wiki_10',
 './text/AA/wiki_06',
 './text/AA/wiki_05',
 './text/AA/wiki_03',
 './text/AA/wiki_07',
 './text/AA/wiki_11',
 './text/AA/wiki_08',
 './text/AA/wiki_01',
 './text/AA/wiki_02',
 './text/AA/wiki_09',
 './text/AA/wiki_04']

In [4]:
# Shared Kiwi analyzer instance; tokenizer() calls kiwi.tokenize() on it.
kiwi = Kiwi()
# Stopwords object passed as the `stopwords=` argument of kiwi.tokenize() in tokenizer().
stopwords = Stopwords()

In [21]:
# Pre-compiled patterns used to clean raw extracted wiki text before tokenization.
# Patterns containing regex escapes are written as raw strings so that sequences
# such as `\.` or `\(` are not parsed as (invalid) Python string escapes.

# Markup residue: runs of single quotes, "=...=" heading lines, __TOC__,
# "ファイル:" links (through end of line), ":xx:" language prefixes, and newlines.
WIKI_REMOVE_CHARS = re.compile(r"'+|(=+.{2,30}=+)|__TOC__|(ファイル:).+|:(en|de|it|fr|es|kr|zh|no|fi):|\n", re.UNICODE)
# Runs of whitespace and the listed literal characters, collapsed by replacer().
WIKI_SPACE_CHARS = re.compile("(\\s|゙|゚|　)+", re.UNICODE)
# E-mail addresses anywhere in the text. The pattern is intentionally NOT anchored
# with ^...$: an anchored pattern only matches when the entire row is an address,
# so addresses embedded in a sentence would never be removed.
EMAIL_PATTERN = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)", re.UNICODE)
# URLs with an optional ftp/http/https scheme followed by "://".
URL_PATTERN = re.compile(r"(ftp|http|https)?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", re.UNICODE)
# Token-level residue (trailing "*" or ":", leading "파일:" or ";").
# NOTE(review): not referenced by any code visible in this file.
WIKI_REMOVE_TOKEN_CHARS = re.compile(r"(\*$|:$|^파일:.+|^;)", re.UNICODE)
# Two or more consecutive ASCII spaces (also matches a single space).
MULTIPLE_SPACES = re.compile(' +', re.UNICODE)
# Rows that begin (after optional whitespace) with "<" or "</"; worker() skips them.
WIKI_DOC_TAG = re.compile(r'\s*</?')

def replacer(content):
    """Return *content* with every match of the cleanup patterns replaced by a space.

    The patterns are applied one after another in this order: EMAIL_PATTERN,
    URL_PATTERN, WIKI_REMOVE_CHARS, WIKI_SPACE_CHARS, MULTIPLE_SPACES. The
    order matters because the last two collapse the spaces left behind by the
    earlier substitutions.
    """
    cleanup_steps = (
        EMAIL_PATTERN,
        URL_PATTERN,
        WIKI_REMOVE_CHARS,
        WIKI_SPACE_CHARS,
        MULTIPLE_SPACES,
    )
    for pattern in cleanup_steps:
        content = pattern.sub(' ', content)
    return content

def tokenizer(content):
    """Clean *content* with replacer() and return the result of kiwi.tokenize().

    The module-level `stopwords` object is passed to the analyzer and
    `normalize_coda=True` is enabled.
    """
    cleaned = replacer(content)
    return kiwi.tokenize(cleaned, stopwords=stopwords, normalize_coda=True)
    
def worker(file_path):
    """Tokenize one extracted wiki shard and return its usable rows.

    Args:
        file_path: Path of a text file to read line by line.

    Returns:
        A list with one entry per kept row; each entry is the list of
        ``form`` attributes of that row's tokens. Rows matching WIKI_DOC_TAG
        are skipped, and only rows yielding more than 5 and fewer than 1000
        tokens are kept.
    """
    return_tokens = []
    # Decode explicitly as UTF-8 (the dump text is expected to be UTF-8) instead
    # of relying on the platform default encoding, which is not UTF-8 everywhere
    # and would fail on or corrupt Korean text.
    with open(file_path, 'r', encoding='utf-8') as fr:
        for row in fr:
            # Skip tag rows such as the opening/closing markers of a document.
            if WIKI_DOC_TAG.match(row):
                continue
            tokens = tokenizer(row)
            # Drop rows that are too short or too long to be useful sentences.
            if 5 < len(tokens) < 1000:
                return_tokens.append([t.form for t in tokens])
    return return_tokens

### map vs apply vs map_async vs imap vs imap_unordered
https://tempdev.tistory.com/27

In [22]:
# Tokenize the shards in 3 worker processes; results arrive in completion order.
with Pool(3) as pool:
    all_list = list(pool.imap_unordered(worker, wiki_files))

# Merge the per-file lists into one list of token lists.
flat_list = []
for per_file in all_list:
    flat_list.extend(per_file)

In [24]:
# Create an untrained FastText model: 100-dimensional vectors, context window
# of 5, and tokens seen fewer than 3 times are ignored (gensim `min_count`).
model = FastText(vector_size=100, window=5, min_count=3)
# Build the vocabulary from the tokenized rows before training.
model.build_vocab(corpus_iterable=flat_list)
# Train for 10 passes over the same corpus; total_examples is the number of rows.
model.train(corpus_iterable=flat_list, total_examples=len(flat_list), epochs=10)

In [45]:
# Spot-check the trained vectors: similarity between '러시아' (Russia) and '소련' (Soviet Union).
model.wv.similarity('러시아', '소련')

0.78960735