# Wikipedia

In [1]:
import os
import re
import glob
import pickle
import logging
from datetime import datetime
from collections import Counter
import ftfy
import spacy
from textacy.datasets.wikipedia import Wikipedia, strip_markup
from tqdm import tqdm
tqdm.monitor_interval = 0

# The log file is truncated on every run (filemode='w'). Its directory is
# created first because the logging file handler opens the path directly and
# fails with FileNotFoundError when the parent directory is missing.
os.makedirs('logs', exist_ok=True)
logging.basicConfig(filename='logs/wikipedia.log', filemode='w', level=logging.INFO,
                    format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')

# Directory handed to textacy's Wikipedia dataset as data_dir.
WIKI_DATA_DIR = '/home/bacon/miniconda3/lib/python3.6/site-packages/textacy/data/wikipedia'
# Root directory under which per-language output is written.
OUT_DIR = 'wikipedia'

In [2]:
class WikipediaCorpus:
    """Build a lower-cased, one-sentence-per-line corpus from a Wikipedia dump.

    Articles are read through textacy's ``Wikipedia`` dataset, cleaned,
    split into sentences with a spaCy tokenizer, and written one file per
    article under ``OUT_DIR/<lg>/articles``. Corpus statistics are pickled
    to ``OUT_DIR/<lg>/metadata.pkl``.
    """

    # An article is kept only if it yields at least this many sentences.
    MIN_NUM_SENTS_PER_ARTICLE = 10
    # Inclusive bounds on the number of tokens in a kept sentence.
    MIN_NUM_WORDS_IN_SENT = 5
    MAX_NUM_WORDS_IN_SENT = 70

    # Tokens that close a sentence.
    SENT_ENDS = ['.', '!', '?']
    # Wiki table markup lines: optional indentation followed by "{|" (table
    # start) or "|" (which also covers "|-", "|+" and the closing "|}").
    TABLE_PREFIX = re.compile(r'\s*(?:\{\||\|)')
    # Anything that looks like an HTML/XML tag.
    TAG = re.compile('<[^<>]+>')

    BAD_POS = ['INTJ', 'SYM', 'X']  # not referenced by any method of this class

    def __init__(self, lg):
        """Load the tokenizer for *lg* and create the output directory.

        Args:
            lg: language code, used both as the spaCy model name and as the
                Wikipedia language.

        Raises:
            FileExistsError: if the output directory already exists. This is
                deliberate, so previously written data is never overwritten.
        """
        self.lg = lg
        # Only the tokenizer is used, so the other pipeline components are disabled.
        nlp = spacy.load(lg, disable=['tagger', 'parser', 'ner', 'textcat'])
        self.tokenizer = nlp.tokenizer
        self.out_dir = os.path.join(OUT_DIR, self.lg, 'articles')
        os.makedirs(self.out_dir)  # no exist_ok: fail instead of re-writing data

    def download(self):
        """Download the latest dump for this language into WIKI_DATA_DIR."""
        wp = Wikipedia(lang=self.lg, version='latest', data_dir=WIKI_DATA_DIR)
        wp.download()

    def prepare(self, N):
        """Write cleaned articles until at least *N* sentences are collected.

        Resets the running statistics, writes one file per kept article and
        finally pickles the metadata. The metadata is written even when the
        dump is exhausted before *N* sentences were found; in that case a
        warning is logged.

        Args:
            N: target number of sentences; processing stops after the first
                article that brings the total to *N* or more.
        """
        self.N = N
        self.num_sents = 0
        self.num_tokens = 0
        self.freq_dist = Counter()
        wp = Wikipedia(lang=self.lg, version='latest', data_dir=WIKI_DATA_DIR)
        # Each item unpacks to (id, <ignored>, content); the id names the output file.
        for page_id, _, content in tqdm(wp):
            n_sents, text = self.prepare_one_article(content)
            if not text:
                continue
            self.num_sents += n_sents
            tokens = text.split()
            self.num_tokens += len(tokens)
            self.freq_dist.update(tokens)
            self.write(text, page_id)
            if self.num_sents >= self.N:
                break
        else:
            # Loop ended without reaching the target: the dump ran out.
            logging.warning('Only %d of %d requested sentences available for %s',
                            self.num_sents, self.N, self.lg)
        self.write_metadata()

    def prepare_one_article(self, content):
        """Clean one article and split it into sentences.

        Args:
            content: raw article text.

        Returns:
            ``(num_sentences, text)`` where *text* is the lower-cased
            sentences joined by newlines, or ``(0, '')`` when the article
            has fewer than MIN_NUM_SENTS_PER_ARTICLE sentences.
        """
        # Drop table markup lines before the remaining markup is stripped.
        content = '\n'.join(line for line in content.split('\n')
                            if not self.TABLE_PREFIX.match(line))
        content = strip_markup(content)
        sentences = [sent
                     for line in self.clean_lines(content)
                     for sent in self.tokenize(line)]
        if len(sentences) >= self.MIN_NUM_SENTS_PER_ARTICLE:
            return len(sentences), '\n'.join(sentences).lower()
        return 0, ''

    def clean_lines(self, content):
        """Yield the lines of *content* that look like running text.

        Text is passed through ``ftfy.fix_text``, each line is stripped and
        has tags removed, and a line is kept only if it starts with an
        alphanumeric character and more than 90% of its non-whitespace
        characters are lower-case.
        """
        content = ftfy.fix_text(content)
        for line in content.split('\n'):
            line = self.TAG.sub('', line.strip())
            if not line or not line[0].isalnum():
                continue
            # The first character is alphanumeric, so the denominator is >= 1.
            num_lower = sum(ch.islower() for ch in line)
            num_visible = sum(not ch.isspace() for ch in line)
            if num_lower / num_visible > 0.9:
                yield line

    def _has_valid_length(self, tokens):
        """Return True if the token count is within the sentence bounds."""
        return self.MIN_NUM_WORDS_IN_SENT <= len(tokens) <= self.MAX_NUM_WORDS_IN_SENT

    def tokenize(self, content):
        """Yield space-joined sentences found in *content*.

        A sentence ends at any token in SENT_ENDS (the end token is kept);
        trailing tokens without an end token form a final sentence.
        Sentences outside the configured length bounds are dropped.
        """
        tokens = []
        for token in self.tokenizer(content):
            tokens.append(token.text)
            if token.text in self.SENT_ENDS:
                if self._has_valid_length(tokens):
                    yield ' '.join(tokens)
                tokens = []
        if tokens and self._has_valid_length(tokens):
            yield ' '.join(tokens)

    def write(self, text, i):
        """Write *text* to ``<out_dir>/<i>.txt``, replacing non-breaking spaces."""
        fname = os.path.join(self.out_dir, '{}.txt'.format(i))
        with open(fname, encoding='utf-8', mode='w') as f:
            f.write(text.replace('\xa0', ' '))

    def write_metadata(self):
        """Pickle the corpus statistics to ``OUT_DIR/<lg>/metadata.pkl``."""
        metadata = {'num_sents': self.num_sents, 'num_tokens': self.num_tokens,
                    'freq_dist': self.freq_dist, 'N': self.N}
        fname = os.path.join(OUT_DIR, self.lg, 'metadata.pkl')
        with open(fname, mode='wb') as f:
            pickle.dump(metadata, f)

In [3]:
def main(lgs, N):
    """Prepare a corpus for every language in *lgs* and log how long each took.

    Args:
        lgs: iterable of language codes, each passed to WikipediaCorpus.
        N: sentence target forwarded to WikipediaCorpus.prepare.
    """
    for language in lgs:
        started_at = datetime.now()
        # The download step is not invoked here; presumably the dump is
        # already on disk (call WikipediaCorpus.download() otherwise).
        WikipediaCorpus(language).prepare(N)
        elapsed = datetime.now() - started_at
        logging.info('Processing {} sentences for {} took {}'.format(N, language, elapsed))

In [4]:
# Run the pipeline for four languages with a target of 1000 sentences each.
main(lgs=['en', 'fr', 'it', 'es'], N=1000)

47it [00:00, 48.81it/s]
