In [None]:
import codecs
from IPython.display import clear_output
from lxml import html
import requests
import sys
from urlparse import parse_qs, urlparse
import logbook

In [None]:
def ensure_unicode(s):
    """Return ``s`` as a text (unicode) string.

    Text strings are returned unchanged; any other value is decoded as
    UTF-8 through its ``decode`` method.

    Args:
        s: a text string, or a UTF-8 encoded byte string.

    Returns:
        The text-string form of ``s``.

    Raises:
        AttributeError: if ``s`` is not text and has no ``decode`` method
            (for example ``None``).
        UnicodeDecodeError: if a byte string is not valid UTF-8.
    """
    # type(u'') is `unicode` on Python 2 and `str` on Python 3, so the check
    # does not depend on the Python-2-only `unicode` builtin.
    text_type = type(u'')
    if isinstance(s, text_type):
        return s
    return s.decode('utf-8')

In [None]:
# Experiment handle for this notebook: it supplies the logger (`log`) and the
# output file paths (`exp.get_filename(...)`) used by the cells below.
# NOTE(review): `Experiment` is not defined in this file; presumably `logbook`
# here is a project-local helper module — confirm.
exp = logbook.Experiment("04b_find_importance")
log = exp.get_logger()

In [None]:
# Download the project index and build `projects`, a list of
# (project_title, project_unique, project_list_url) tuples.
page = requests.get("https://tools.wmflabs.org/enwp10/cgi-bin/pindex.fcgi?sec=[All]")
# Fail loudly on an HTTP error: otherwise an error page would be parsed into
# an empty project list and every later cell would silently do nothing.
page.raise_for_status()
parser = html.HTMLParser(encoding='utf-8')
tree = html.document_fromstring(page.content, parser=parser)
rows = tree.xpath("//table[@class='wikitable']//tr")

log.info("Parsing projects")
projects = []
for row in rows:
    cells = row.xpath("td")
    # Rows with fewer than three <td> cells are skipped.
    if len(cells) < 3:
        continue
    # Parse project title
    title_node = cells[0][0]
    if len(title_node) == 0:
        # The first child has no nested element: its own text is used as
        # both the title and the unique name.
        title = ensure_unicode(title_node.text)
        project_title = title
        project_unique = title
    else:
        # The first child wraps another element; use that element's text as
        # the title and the `title` query parameter of its href as the
        # unique name.
        link = title_node[0]
        title = ensure_unicode(link.text)
        project_title = title
        try:
            url = link.attrib['href']
            query = urlparse(url).query
            # ensure_unicode (rather than a bare .decode) also copes with a
            # value that is already unicode, which .decode('utf8') would
            # first try to encode as ASCII and fail on non-ASCII names.
            project_unique = ensure_unicode(parse_qs(query)['title'][0])
        except KeyError:
            # No href or no `title` parameter: fall back to the visible title.
            project_unique = title
    # Parse list url: href of the first link in the third cell.
    project_list = cells[2].xpath("a[1]")[0].attrib['href']
    projects.append((project_title, project_unique, project_list))

In [None]:
# Parse articles
# For every project, request its article list (first 250 rows, sorted by
# importance then quality) and record
# [project_title, project_unique, article_url, importance] both in `articles`
# and, row by row, in importance_url.utf8.tsv.
log.info("Parsing project articles")
try:
    with open(exp.get_filename("importance_url.utf8.tsv"), "wb") as out:
        # Header is tab-separated so it matches the tab-separated data rows.
        out.write("proj_title\tproj_unique\tpage_url\timportance\n")
        articles = [] # [project_title, project_unique, article_url, importance]
        for project in projects:
            # Get list of articles in project
            query = urlparse(project[2]).query
            project_query = parse_qs(query)['project'][0]
            # parse_qs has already percent-decoded the project name, so it is
            # passed through `params` to be re-encoded; interpolating it raw
            # would corrupt the query for names containing '&', '#', '+', etc.
            url = "https://tools.wmflabs.org/enwp10/cgi-bin/list2.fcgi"
            page = requests.get(url, params=[
                ("run", "yes"),
                ("projecta", project_query),
                ("namespace", ""),
                ("pagename", ""),
                ("quality", ""),
                ("importance", ""),
                ("score", ""),
                ("limit", "250"),
                ("offset", "1"),
                ("sorta", "Importance"),
                ("sortb", "Quality"),
            ])
            parser = html.HTMLParser(encoding='utf-8')
            tree = html.document_fromstring(page.content, parser=parser)
            rows = tree.xpath("//table[@class='wikitable']//tr")
            if len(rows) == 0:
                # Best effort: a project without a result table is reported
                # and skipped rather than aborting the whole run.
                print("No rows in " + project[0])
                continue
            for row in rows:
                cells = row.xpath("td")
                article_data = [project[0], project[1]]
                try:
                    article_data.append(cells[1][0].attrib['href'])
                    article_data.append(cells[2][0].text)
                except IndexError:
                    # Rows lacking the expected cells/children are skipped.
                    continue
                out.write("\t".join(
                    field.encode('utf8') for field in article_data
                ) + "\n")
                # Flush after each row so the file reflects progress even if
                # a later request fails.
                out.flush()
                articles.append(article_data)
                clear_output()
except:
    # Bare except is deliberate: whatever went wrong is logged and then
    # re-raised unchanged.
    log.error(sys.exc_info())
    raise

In [None]:
# Replace url with page id
# For each article, fetch its `action=info` page on en.wikipedia.org and
# overwrite the URL stored in articles[i][2] with the integer page id.
try:
    for i, article in enumerate(articles):
        if isinstance(article[2], int):
            # Already converted by an earlier (possibly interrupted) run of
            # this cell; skipping makes the cell safe to re-run and resume.
            continue
        query = urlparse(article[2]).query
        article_title = parse_qs(query)['title'][0]
        # The title comes back percent-decoded from parse_qs, so let requests
        # re-encode it; interpolating it raw breaks titles such as "AT&T".
        info_url = "https://en.wikipedia.org/w/index.php"
        page = requests.get(info_url, params=[
            ("title", article_title),
            ("action", "info"),
        ])
        # Surface HTTP errors directly instead of as an IndexError on the
        # missing table row below.
        page.raise_for_status()
        parser = html.HTMLParser(encoding='utf-8')
        tree = html.document_fromstring(page.content, parser=parser)
        # Second <td> of the row with id 'mw-pageinfo-article-id' holds the id.
        cell = tree.xpath("//tr[@id='mw-pageinfo-article-id']//td")[1]
        page_id = int(cell.text.strip())
        articles[i][2] = page_id
        clear_output()
except:
    # Bare except is deliberate: the failure is logged and then re-raised.
    log.error(sys.exc_info())
    raise

In [None]:
# Write the final table (article URL replaced by page id) to
# importance.utf8.tsv as UTF-8, one tab-separated row per article.
with open(exp.get_filename("importance.utf8.tsv"), "wb") as out:
    # Header is tab-separated so it matches the tab-separated data rows.
    out.write("proj_title\tproj_unique\tpage_id\timportance\n")
    for article in articles:
        out.write("\t".join([
            article[0].encode('utf8'),
            article[1].encode('utf8'),
            str(article[2]),  # page id is an int at this point
            article[3].encode('utf8')
        ]) + "\n")

In [None]:
# Display the last two article records as the cell output.
articles[-2:]