# Wikipedia

In [1]:
import glob
import logging
import os
import pickle
import re
from collections import Counter
from datetime import datetime

import ftfy
import spacy
from textacy.datasets.wikipedia import Wikipedia, strip_markup
from tqdm import tqdm
# A monitor interval of 0 turns off tqdm's background monitor thread.
tqdm.monitor_interval = 0

# File-based logging configuration, currently disabled.
#logging.basicConfig(filename='logs/wikipedia.log', filemode='a', level=logging.INFO,
#                        format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')

# Directory handed to textacy's Wikipedia dataset as `data_dir`.
# NOTE(review): this is a machine-specific absolute path -- adjust per environment.
WIKI_DATA_DIR = '/home/bacon/miniconda3/lib/python3.6/site-packages/textacy/data/wikipedia'
# Root directory (relative to the working directory) under which all
# per-language outputs are written: OUT_DIR/<lg>/...
OUT_DIR = 'wikipedia'

In [2]:
class WikipediaCorpus:
    """Build a cleaned, tokenized, lower-cased sentence corpus from Wikipedia.

    Articles are read from a textacy ``Wikipedia`` dataset, stripped of
    markup, split into sentences with the spaCy tokenizer and written one
    file per article to ``OUT_DIR/<lg>/articles/<id>.txt`` (one sentence per
    line, tokens separated by single spaces). Corpus-level statistics are
    pickled to ``OUT_DIR/<lg>/metadata.pkl``.
    """

    # An article is kept only if it yields at least this many sentences.
    MIN_NUM_SENTS_PER_ARTICLE = 10
    # Sentences whose token count falls outside [MIN, MAX] are discarded.
    MIN_NUM_WORDS_IN_SENT = 5
    MAX_NUM_WORDS_IN_SENT = 70

    # Tokens that close a sentence.
    SENT_ENDS = ['.', '!', '?']
    # Lines that belong to wiki tables: optional leading whitespace followed
    # by '{|' (table start) or '|' (rows, cells and the closing '|}').
    # The previous pattern had '\{\)' (a typo for '\{\|') and applied '\s*'
    # to the first alternative only.
    TABLE_PREFIX = re.compile(r'\s*(?:\{\||\|)')
    # Any remaining <...> tag.
    TAG = re.compile('<[^<>]+>')

    def __init__(self, lg):
        """Load the spaCy tokenizer for ``lg`` and create the output directory.

        Args:
            lg: name passed both to ``spacy.load`` and to the textacy
                ``Wikipedia`` dataset as ``lang`` (e.g. ``'en'``).

        Raises:
            FileExistsError: if ``OUT_DIR/<lg>/articles`` already exists.
        """
        self.lg = lg
        # Only the tokenizer is used, so the other components are disabled.
        nlp = spacy.load(lg, disable=['tagger', 'parser', 'ner', 'textcat'])
        self.tokenizer = nlp.tokenizer
        self.out_dir = os.path.join(OUT_DIR, self.lg, 'articles')
        os.makedirs(self.out_dir) # errors if out_dir already exists so I don't re-write data

    def download(self):
        """Download the latest Wikipedia dump for this language into WIKI_DATA_DIR."""
        wp = Wikipedia(lang=self.lg, version='latest', data_dir=WIKI_DATA_DIR)
        wp.download()

    def prepare(self, N):
        """Process articles until at least ``N`` sentences have been written.

        Resets the running statistics, writes every accepted article to disk
        and finally pickles the metadata. The metadata is written whether the
        target was reached or the dump ran out of articles first, so a short
        dump no longer discards all collected counts.

        Args:
            N: target number of sentences.
        """
        self.N = N
        self.num_sents = 0
        self.num_tokens = 0
        self.freq_dist = Counter()
        wp = Wikipedia(lang=self.lg, version='latest', data_dir=WIKI_DATA_DIR)
        # Each item unpacks as (id, <unused>, content); the id names the file.
        for i, _, content in tqdm(wp):
            n, text = self.prepare_one_article(content)
            if not text:
                continue
            self.num_sents += n
            tokens = text.split()
            self.num_tokens += len(tokens)
            self.freq_dist.update(tokens)
            self.write(text, i)
            if self.num_sents >= self.N:
                break
        self.write_metadata()

    def prepare_one_article(self, content):
        """Clean one raw article and split it into sentences.

        Args:
            content: raw article text (wiki markup).

        Returns:
            ``(num_sentences, text)`` where ``text`` holds the lower-cased
            sentences joined by newlines, or ``(0, '')`` if the article has
            fewer than ``MIN_NUM_SENTS_PER_ARTICLE`` usable sentences.
        """
        # Drop table lines before the markup is stripped.
        content = '\n'.join(line for line in content.split('\n')
                            if not self.TABLE_PREFIX.match(line))
        content = strip_markup(content)
        sentences = [sent
                     for line in self.clean_lines(content)
                     for sent in self.tokenize(line)]
        article_length = len(sentences)
        if article_length >= self.MIN_NUM_SENTS_PER_ARTICLE:
            return article_length, '\n'.join(sentences).lower()
        return 0, ''

    def clean_lines(self, content):
        """Yield the lines of ``content`` that look like running prose.

        Text is repaired with ``ftfy.fix_text``; each line is stripped and
        has ``<...>`` tags removed. A line is kept only if it starts with an
        alphanumeric character and more than 90% of its non-whitespace
        characters are lower-case letters.
        """
        content = ftfy.fix_text(content)
        for line in content.split('\n'):
            line = self.TAG.sub('', line.strip())
            if line and line[0].isalnum():
                # line[0] is alphanumeric, so the denominator is at least 1.
                num_lower = sum(ch.islower() for ch in line)
                num_visible = sum(not ch.isspace() for ch in line)
                if num_lower / num_visible > 0.9:
                    yield line

    def _has_valid_length(self, tokens):
        """Return True if ``tokens`` is within the allowed sentence length."""
        return self.MIN_NUM_WORDS_IN_SENT <= len(tokens) <= self.MAX_NUM_WORDS_IN_SENT

    def tokenize(self, content):
        """Split ``content`` into space-joined token sentences.

        A sentence ends at any token listed in ``SENT_ENDS``; trailing tokens
        without a terminator form a final sentence. Sentences outside the
        allowed length range are dropped.

        Yields:
            One string per accepted sentence, tokens separated by a space.
        """
        tokens = []
        for token in self.tokenizer(content):
            text = token.text
            tokens.append(text)
            if text in self.SENT_ENDS:
                if self._has_valid_length(tokens):
                    yield ' '.join(tokens)
                tokens = []
        if tokens and self._has_valid_length(tokens):
            yield ' '.join(tokens)

    def write(self, text, i):
        """Write ``text`` to ``<out_dir>/<i>.txt`` as UTF-8.

        Non-breaking spaces are replaced by ordinary spaces first.
        """
        fname = os.path.join(self.out_dir, '{}.txt'.format(i))
        with open(fname, encoding='utf-8', mode='w') as f:
            f.write(text.replace('\xa0', ' '))

    def write_metadata(self):
        """Pickle the corpus statistics to ``OUT_DIR/<lg>/metadata.pkl``.

        The pickled dict has the keys ``num_sents``, ``num_tokens``,
        ``freq_dist`` (a ``Counter`` of tokens) and ``N``.
        """
        metadata = {'num_sents': self.num_sents, 'num_tokens': self.num_tokens,
                    'freq_dist': self.freq_dist, 'N': self.N}
        fname = os.path.join(OUT_DIR, self.lg, 'metadata.pkl')
        with open(fname, mode='wb') as f:
            pickle.dump(metadata, f)

In [3]:
def main(lgs, N):
    """Build the Wikipedia corpus for every language in ``lgs`` and log timings.

    Args:
        lgs: iterable of language names, each passed to ``WikipediaCorpus``.
        N: target number of sentences per language.

    Raises:
        FileExistsError: propagated from ``WikipediaCorpus`` when the output
            directory for a language already exists.
    """
    for lg in lgs:
        start = datetime.now()
        corpus = WikipediaCorpus(lg)
        # The dump is expected to be on disk already; otherwise run
        # corpus.download() before preparing.
        #corpus.download()
        corpus.prepare(N)
        elapsed = datetime.now() - start
        # The message is only emitted if logging is configured for INFO.
        logging.info('Processing %s sentences for %s took %s', N, lg, elapsed)

## Postprocessing

In [4]:
def postprocess(lg, T):
    """Replace out-of-vocabulary tokens in the ``lg`` articles with ``UNK``.

    The vocabulary is the ``T`` most frequent Wikipedia tokens plus every
    UD token that occurs in Wikipedia at all. Rewritten articles go to
    ``OUT_DIR/<lg>/unk-articles/`` under their original file names, and
    ``OUT_DIR/<lg>/unk-metadata.pkl`` receives a dict with ``word2id`` and
    ``not-in-wiki`` (the UD tokens absent from Wikipedia).

    Args:
        lg: language name (sub-directory of ``OUT_DIR`` and ``UD``).
        T: number of most frequent Wikipedia tokens to keep.

    Raises:
        FileExistsError: if the ``unk-articles`` directory already exists.
    """
    UNK = 'UNK'
    unk_dir = os.path.join(OUT_DIR, lg, 'unk-articles')
    os.makedirs(unk_dir) # errors if already exists

    # NOTE: pickle must only be used on files produced by this pipeline;
    # never load pickles from an untrusted source.
    with open(os.path.join(OUT_DIR, lg, 'metadata.pkl'), 'rb') as f:
        wiki_freq_dist = pickle.load(f)['freq_dist']
    with open(os.path.join('UD', lg, 'metadata.pkl'), 'rb') as f:
        ud_freq_dist = pickle.load(f)['freq_dist']

    vocab = {w for w, _ in wiki_freq_dist.most_common(T)}
    not_in_wiki = set()
    for t in ud_freq_dist:
        if t in wiki_freq_dist:
            vocab.add(t)
        else:
            not_in_wiki.add(t)

    article_fnames = glob.iglob(os.path.join(OUT_DIR, lg, 'articles/*.txt'))
    for fname in article_fnames:
        with open(fname, encoding='utf-8') as f:
            # split() (any whitespace) matches how the frequency distribution
            # was built; split(' ') produced empty tokens for consecutive
            # spaces, which were then rewritten as spurious UNKs.
            new_lines = [
                ' '.join(token if token in vocab else UNK
                         for token in line.split())
                for line in f
            ]
        new_fname = os.path.join(unk_dir, os.path.basename(fname))
        with open(new_fname, mode='w', encoding='utf-8') as f:
            f.write('\n'.join(new_lines))

    # Sorting makes the ids reproducible across runs (set order is not).
    word2id = {w: i for i, w in enumerate(sorted(vocab))}
    # UNK gets the next free id unless the vocabulary already contains it.
    word2id.setdefault(UNK, len(word2id))
    fname = os.path.join(OUT_DIR, lg, 'unk-metadata.pkl')
    metadata = {'word2id': word2id, 'not-in-wiki': not_in_wiki}
    with open(fname, 'wb') as f:
        pickle.dump(metadata, f)

In [5]:
N = 1000000 # number of sentences
T = 50000 # vocab size
LANGUAGES = ['en', 'fr', 'it', 'es']

# Without a handler at INFO level the timing messages below are silently
# dropped by the default WARNING-level root logger. basicConfig does nothing
# if logging has already been configured elsewhere.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')

# Step 1: build the per-language sentence corpora.
main(LANGUAGES, N)

# Step 2: replace out-of-vocabulary tokens with UNK, timing each language.
for lg in LANGUAGES:
    start = datetime.now()
    postprocess(lg, T)
    end = datetime.now()
    msg = 'Postprocesing {} took {}'.format(lg, end-start)
    logging.info(msg)

In [42]:
def check_missing_words(lg):
    """Report which UD words never occur in the Wikipedia corpus for ``lg``.

    Reads ``wikipedia/<lg>/unk-metadata.pkl`` and ``UD/<lg>/metadata.pkl``.

    Returns:
        A 3-tuple: the ``tensed_types`` missing from Wikipedia, the
        ``responsible_types`` missing from Wikipedia, and a ``Counter``
        mapping every missing UD word to its UD frequency.
    """
    unk_meta_path = os.path.join('wikipedia', lg, 'unk-metadata.pkl')
    with open(unk_meta_path, 'rb') as handle:
        unk_meta = pickle.load(handle)
    absent = unk_meta['not-in-wiki']

    ud_meta_path = os.path.join('UD', lg, 'metadata.pkl')
    with open(ud_meta_path, 'rb') as handle:
        ud_meta = pickle.load(handle)
    ud_counts = ud_meta['freq_dist']
    tensed_types = ud_meta['tensed_types']
    responsible_types = ud_meta['responsible_types']

    # Keep only the UD entries that Wikipedia does not contain.
    missing_vocab = Counter()
    for word, freq in ud_counts.items():
        if word in absent:
            missing_vocab[word] = freq

    missing_tensed = tensed_types & absent
    missing_responsible = responsible_types & absent
    return missing_tensed, missing_responsible, missing_vocab

def check_unk_proportion(lg):
    """Return the fraction of Wikipedia tokens for ``lg`` that map to UNK.

    Reads ``wikipedia/<lg>/metadata.pkl`` for the token frequencies and the
    total token count, and ``wikipedia/<lg>/unk-metadata.pkl`` for the
    vocabulary. The result is the summed frequency of all tokens absent
    from ``word2id`` divided by ``num_tokens``.
    """
    meta_path = os.path.join('wikipedia', lg, 'metadata.pkl')
    with open(meta_path, 'rb') as handle:
        wiki_meta = pickle.load(handle)
    counts = wiki_meta['freq_dist']
    total_tokens = wiki_meta['num_tokens']

    unk_meta_path = os.path.join('wikipedia', lg, 'unk-metadata.pkl')
    with open(unk_meta_path, 'rb') as handle:
        unk_meta = pickle.load(handle)
    known = unk_meta['word2id']

    # Total occurrences of every token that has no id in the vocabulary.
    unk_tokens = sum(freq for word, freq in counts.items() if word not in known)
    return unk_tokens / total_tokens

In [48]:
# Print, per language, how many UD words are missing from Wikipedia and the
# share of Wikipedia tokens that became UNK.
for lg in ('en', 'fr', 'it', 'es'):
    missing_tense, missing_responsible, missing_vocab = check_missing_words(lg)
    print(lg)
    for label, missing in (('Missing vocab: ', missing_vocab),
                           ('Missing tense: ', missing_tense),
                           ('Missing responsible: ', missing_responsible)):
        print(label, len(missing))
    # NOTE: despite the label, the printed value is a proportion of tokens.
    print('Number of UNKs: ', check_unk_proportion(lg))
    print()

en
Missing vocab:  2692
Missing tense:  133
Missing responsible:  38
Number of UNKs:  0.02040813786045971

fr
Missing vocab:  9455
Missing tense:  679
Missing responsible:  294
Number of UNKs:  0.023629998821879106

it
Missing vocab:  1894
Missing tense:  319
Missing responsible:  164
Number of UNKs:  0.04170310933736561

es
Missing vocab:  9561
Missing tense:  842
Missing responsible:  300
Number of UNKs:  0.02912028187030663

