#### Parsing data from the wiki10+ dataset

This consists of two files:

* HTML content of 20,764 wikipedia articles
* Corresponding article "tags"

Tags are hand-labelled categories, each article can have mutiple tags

#### Downloading / unzipping data

In [5]:
#%env WIKI10_DIR=../../data/wiki10
#!wget 'http://nlp.uned.es/social-tagging/wiki10+/wiki10+_documents.tar.bz2' -O $WIKI10_DIR'/content.tar.bz2'
#!wget 'http://nlp.uned.es/social-tagging/wiki10+/wiki10+_tag-data.tar.gz' -O $WIKI10_DIR'/tags.tar.gz'
#!tar -xzC $WIKI10_DIR -f $WIKI10_DIR'/tags.tar.gz' && rm $WIKI10_DIR'/tags.tar.gz'
#!tar -xjC $WIKI10_DIR -f $WIKI10_DIR'/content.tar.bz2' && rm $WIKI10_DIR'/content.tar.bz2'
#!ls $WIKI10_DIR

In [6]:
import json
from pathlib import Path
from lxml import etree, html
from itertools import islice
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np 
%matplotlib inline 

#### Parsing tag data
This is an XML file containing meta data about each document, including the tags assigned to it

In [7]:
DATA_DIR = Path('../../data/wiki10')

DOCUMENTS_DIR = DATA_DIR / 'documents'
TAGS_PATH = DATA_DIR / 'tag-data.xml'

DOCUMENTS_OUTPUT_DIR = DATA_DIR / 'text' 
TAGS_OUTPUT_PATH = DATA_DIR / 'tags.json'

In [8]:
tag_bytes = TAGS_PATH.read_bytes()
tag_tree = etree.fromstring(tag_bytes) 

Checking tag distribution 

In [9]:
tag_count = Counter(tag for tag in tag_tree.xpath('//name/text()')) 

In [10]:
tag_count_100 = tag_count.most_common(100)

In [11]:
tag_count_100

[('wikipedia', 16715),
 ('wiki', 8681),
 ('reference', 5914),
 ('history', 3829),
 ('research', 2980),
 ('science', 2610),
 ('interesting', 2085),
 ('programming', 2062),
 ('article', 1944),
 ('people', 1901),
 ('philosophy', 1856),
 ('culture', 1803),
 ('art', 1627),
 ('politics', 1554),
 ('software', 1550),
 ('design', 1492),
 ('language', 1390),
 ('books', 1354),
 ('technology', 1338),
 ('psychology', 1293),
 ('music', 1286),
 ('development', 1241),
 ('math', 1238),
 ('theory', 1157),
 ('religion', 1149),
 ('computer', 1132),
 ('literature', 1089),
 ('business', 1058),
 ('education', 1037),
 ('writing', 1003),
 ('health', 986),
 ('definition', 949),
 ('information', 911),
 ('economics', 897),
 ('cool', 889),
 ('web', 887),
 ('mathematics', 880),
 ('encyclopedia', 858),
 ('internet', 857),
 ('articles', 819),
 ('english', 794),
 ('fun', 759),
 ('architecture', 747),
 ('book', 737),
 ('inspiration', 735),
 ('film', 725),
 ('linux', 715),
 ('reading', 715),
 ('free', 690),
 ('biography

Excluding some generic tags

In [12]:
excluded_tags = {'wikipedia', 'wiki','reference', 'research', 'interesting', 'article', 
                      'definition', 'information', 'cool', '-', 'free', 'fun','articles', 'inspiration', 
                       'encyclopedia', 'tools', 'read', 'work', 'learning', 'info', 'social', 'words'
                       'list', 'ideas', 'todo', 'humor', 'of', 'future', 'data'}

In [13]:
categories = {tag for tag, _ in tag_count_100 if tag not in excluded_tags} 

In [14]:
len(categories) 

73

Parse tag XML:

* Filter tags which are not in category list
* Tag value stored the proportion of users which assigned this tag to this document
* Store result as json 

In [18]:
tag_dict = {} 
for document in tag_tree.xpath('/articles/article'):
    record = {} 
    if len(document) > 2: # skip documents with missing data
        record_id = document.xpath('hash/text()')[0]
        record['title'] = document.xpath('title/text()')[0]
        record['user_count'] = document.xpath('users/text()')[0]
        record['tags'] = {} 
        for tag in document.xpath('.//tag'):
            tag_name, tag_count = tag.xpath('./*/text()')
            if tag_name in categories:
                record['tags'][tag_name] = float(tag_count) / float(record['user_count']) 
        if record['tags']: # check we have found tags in category list
            tag_dict[record_id] = record
            
with TAGS_OUTPUT_PATH.open('w') as output_file:
    json.dump(tag_dict, output_file) 

#### Parsing text data
This is a series of HTML documents, one per Wikipedia article. We will extract the text from the HTML, and filter documents which do not contain any of the tags in our new categories list. 

In [20]:
filtered_ids = tag_dict.keys()

In [21]:
len(filtered_ids) 

19691

In [None]:
for document in DOCUMENTS_DIR.iterdir():
    document_id = document.name
    if document_id in filtered_ids:
        html_bytes = document.read_bytes()
        html_tree = html.fromstring(html_bytes)
        html_text = ' '.join(p.text_content() for p in html_tree.xpath('//p')).lower()
        OUTPUT_PATH = DOCUMENTS_OUTPUT_DIR / document.name
        OUTPUT_PATH.write_text(html_text)