In [None]:
import codecs
from lxml import html
from multiprocessing import Process, Queue
from Queue import Empty
import requests
import sys
import time
import traceback
from urlparse import parse_qs, urlparse
import logbook
from IPython.display import clear_output

# Keep a handle on the current stderr stream, then point sys.stderr at
# /dev/null so that anything subsequently written to stderr is discarded.
# NOTE(review): none of the cells shown here restore orig_stderr or close the
# /dev/null handle; presumably this is meant to silence noisy library
# warnings in the notebook output -- confirm.
orig_stderr = sys.stderr
sys.stderr = open("/dev/null", "wb")

In [None]:
def ensure_unicode(s):
    """Return `s` as a unicode string.

    Unicode input is handed back untouched; anything else is assumed to be
    a UTF-8 encoded byte string and is decoded accordingly.
    """
    return s if isinstance(s, unicode) else s.decode('utf-8')

In [None]:
# Create the experiment object for this notebook and obtain its logger.
# Later cells call exp.get_filename(...) to build the paths of the files
# they write, and log.info / log.error to report progress.
# NOTE(review): `logbook.Experiment` looks like a project-local helper rather
# than the PyPI `logbook` package -- confirm.
exp = logbook.Experiment("05b_find_importance")
log = exp.get_logger()

In [None]:
# Download the WP 1.0 project index and build `projects`, a list of
# (project_title, project_unique, project_list_url) tuples.
page = requests.get("https://tools.wmflabs.org/enwp10/cgi-bin/pindex.fcgi?sec=[All]")
# Fail loudly on an HTTP error instead of silently parsing an error page
# into an empty project list.
page.raise_for_status()
parser = html.HTMLParser(encoding='utf-8')
tree = html.document_fromstring(page.content, parser=parser)
rows = tree.xpath("//table[@class='wikitable']//tr")

log.info("Parsing projects")
projects = []
for row in rows:
    cells = row.xpath("td")
    if len(cells) < 3:
        # Header rows (no <td>) and short rows carry no project data
        continue
    # Parse project title
    name_node = cells[0][0]
    if len(name_node) == 0:
        # No nested link: the element's own text is both title and unique key
        project_title = ensure_unicode(name_node.text)
        project_unique = project_title
    else:
        link = name_node[0]
        project_title = ensure_unicode(link.text)
        try:
            query = urlparse(link.attrib['href']).query
            # ensure_unicode (rather than a bare .decode) also copes with
            # values that are already unicode, where .decode('utf8') would
            # first try an implicit ASCII encode and fail on non-ASCII text.
            project_unique = ensure_unicode(parse_qs(query)['title'][0])
        except KeyError:
            # No href attribute or no `title` parameter: fall back to the
            # displayed title
            project_unique = project_title
    # Parse list url
    project_list = cells[2].xpath("a[1]")[0].attrib['href']
    projects.append((project_title, project_unique, project_list))

In [None]:
# Parse articles
#
# For every project, fetch the first 250 entries of its assessment list
# (sorted by importance, then quality) and record one row per entry that
# has a talk-page link: [project_title, project_unique, talk_url, importance].
# Rows are written to importance_url.utf8.tsv and kept in `articles`.
log.info("Parsing project articles")
list_endpoint = "https://tools.wmflabs.org/enwp10/cgi-bin/list2.fcgi"
articles = []  # [project_title, project_unique, talk_url, importance]
try:
    with open(exp.get_filename("importance_url.utf8.tsv"), "wb") as out:
        out.write("proj_title\tproj_unique\tpage_url\timportance\n")
        for project in projects:
            # Get list of articles in project
            query = urlparse(project[2]).query
            project_query = parse_qs(query)['project'][0]
            # parse_qs has percent-decoded the project name, so it must be
            # re-encoded before going back into a URL; letting requests build
            # the query string keeps names containing '&', '+', '#' or
            # spaces intact.
            page = requests.get(list_endpoint, params=[
                ('run', 'yes'),
                ('projecta', project_query),
                ('namespace', ''),
                ('pagename', ''),
                ('quality', ''),
                ('importance', ''),
                ('score', ''),
                ('limit', '250'),
                ('offset', '1'),
                ('sorta', 'Importance'),
                ('sortb', 'Quality'),
            ])
            parser = html.HTMLParser(encoding='utf-8')
            tree = html.document_fromstring(page.content, parser=parser)
            rows = tree.xpath("//table[@class='wikitable']//tr")
            if len(rows) == 0:
                print("No rows in " + project[0])
                continue
            for row in rows:
                cells = row.xpath("td")
                # Skip header rows and rows without an importance element
                if len(cells) < 3 or len(cells[2]) == 0:
                    continue
                # The second link in the article cell is the talk page;
                # rows without one are skipped.
                links = cells[1].xpath("a")
                if len(links) < 2:
                    continue
                talk_href = links[1].attrib['href']
                importance = cells[2][0].text
                talk_data = [project[0], project[1], talk_href, importance]
                out.write("\t".join([field.encode('utf8') for field in talk_data]) + "\n")
                articles.append(talk_data)
            # Push each project's rows to disk so progress survives a crash
            out.flush()
except:
    # Log, persist the traceback next to the experiment output, and re-raise
    log.error(sys.exc_info())
    with open(exp.get_filename("error.txt"), "wb") as err_file:
        traceback.print_exc(file=err_file)
    raise

In [None]:
# Alternative: Load articles from file
# Re-reads a previously written importance_url TSV instead of scraping
# again; each data line becomes a list of unicode fields in `articles`.
articles = []
article_file = "output/04b_find_importance/2017-09-15 15:01:52 14665f3/importance_url.utf8.tsv"
log.info("Loading articles")
finished = 0
with open(article_file, "rb") as f:
    next(f)  # discard the header line
    for raw_line in f:
        fields = raw_line.decode('utf-8').strip().split(u"\t")
        articles.append(fields)
        finished = len(articles)
print(len(articles))

In [None]:
def get_id_from_url(article):
    """Return a copy of `article` with its URL replaced by the page id.

    `article` is a sequence whose third element (index 2) is a URL carrying
    the page name in a `title` query parameter. The page's "action=info"
    view on en.wikipedia.org is fetched and the numeric id is read from the
    `mw-pageinfo-article-id` table row.

    Returns a new list; the input sequence is not modified.

    Raises:
        KeyError: the URL has no `title` query parameter.
        IndexError: the info page has no article-id row (for example the
            page does not exist). Callers use this to skip the article.
    """
    article = list(article)
    url = article[2]
    # Python 2's parse_qs percent-decodes a unicode string one escape at a
    # time as latin-1, which garbles UTF-8 titles. Parsing UTF-8 bytes
    # instead yields the correct UTF-8 encoded title.
    if isinstance(url, unicode):
        url = url.encode('utf-8')
    article_title = parse_qs(urlparse(url).query)['title'][0]
    # parse_qs has percent-decoded the title; let requests re-encode it so
    # titles containing '&', '+', '#', '?' or spaces address the right page.
    page = requests.get(
        "https://en.wikipedia.org/w/index.php",
        params={'title': article_title, 'action': 'info'}
    )
    parser = html.HTMLParser(encoding='utf-8')
    tree = html.document_fromstring(page.content, parser=parser)
    # The row holds a label cell followed by the value cell
    cell = tree.xpath("//tr[@id='mw-pageinfo-article-id']//td")[1]
    article[2] = int(cell.text.strip())
    return article

def worker(worker_id, article_q, result_q, skipped_q, error_q, done_q, poll_timeout=1.0):
    """Resolve queued articles to page ids until the input queue is empty.

    Args:
        worker_id: identifier put on `done_q` when the worker finishes.
        article_q: input queue of article records.
        result_q: receives each record returned by `get_id_from_url`.
        skipped_q: receives the original record when `get_id_from_url`
            raises IndexError.
        error_q: receives a formatted traceback string if any other
            exception occurs.
        done_q: receives `worker_id` exactly once, whether the worker
            finished normally or failed.
        poll_timeout: seconds to wait for an item before checking whether
            the input queue is really empty.

    On an unexpected exception the worker reports it on `error_q` and then
    discards everything left on `article_q`.
    """
    loop_num = 0
    try:
        while True:
            loop_num += 1
            if loop_num % 200 == 0:
                # Pause for a second every 200 iterations
                time.sleep(1)
            try:
                # Blocking get: with block=False the timeout argument is
                # ignored and Empty is raised immediately.
                article = article_q.get(True, poll_timeout)
            except Empty:
                if article_q.qsize() > 0:
                    # Items are still pending; try again
                    continue
                break
            try:
                new_article = get_id_from_url(article)
            except IndexError:
                skipped_q.put(article)
                continue
            result_q.put(new_article)
    except Exception:
        # Unknown error, push to queue and empty input queue.
        # format_exc() reports the exception being handled; sys.last_* are
        # only set by the interactive interpreter and do not exist here.
        error_q.put(traceback.format_exc())
        while True:
            try:
                article_q.get(False)
            except Empty:
                break
    finally:
        # Always signal completion, even after a failure
        done_q.put(worker_id)

In [None]:
# Replace url with page id
#
# Feeds every article into `article_q`, starts `num_workers` daemon
# processes running `worker`, and waits until each article has been either
# resolved (result_q) or skipped (skipped_q). Results are left on the
# queues for the following cells to collect.
num_workers = 5
article_q = Queue()
result_q = Queue()
error_q = Queue()
skipped_q = Queue()
done_q = Queue()
log_every = 200
next_log = log_every
poll_interval = 0.5  # seconds between progress checks while waiting
num_articles = len(articles)
log.info("Replacing url with ids")
try:
    articles_put = 0
    for article in articles:
        article_q.put(article)
        articles_put += 1
    log.info("  Put %d articles" % articles_put)
    workers = []
    log.info("  %d Articles to process" % article_q.qsize())
    log.info("  Starting workers")
    for i in range(num_workers):
        w = Process(target=worker, args=(i, article_q, result_q, skipped_q, error_q, done_q))
        w.daemon = True
        workers.append(w)
        w.start()
    log.info("  Workers started")
    while result_q.qsize() + skipped_q.qsize() < num_articles:
        if error_q.qsize() > 0:
            # A worker failed. Report every queued error and stop waiting:
            # the remaining articles will never arrive, so waiting for the
            # full count (or blocking on result_q.get()) would hang forever.
            # Results gathered so far are left on result_q for inspection.
            while True:
                try:
                    log.error(error_q.get(True, 1.0))
                except Empty:
                    break
            break
        if done_q.qsize() >= num_workers:
            # Every worker has signalled completion; nothing more will come
            break
        if result_q.qsize() > next_log:
            next_log += log_every
            log.info("  %d/%d articles complete (%d skipped)" % (result_q.qsize(), num_articles, skipped_q.qsize()))
        # Sleep for real between polls instead of spinning a CPU core
        time.sleep(poll_interval)
    if error_q.qsize() > 0:
        log.error(error_q.get())
    log.info("Completed %d articles, skipped %d" % (result_q.qsize(), skipped_q.qsize()))
except:
    # Log whatever went wrong (including an interrupt) and re-raise
    log.error(sys.exc_info())
    raise

In [None]:
# Status snapshot: the value of exp.get_filename("") followed by the current
# sizes of the result, skipped, input and done queues.
# NOTE(review): get_filename("") presumably yields the experiment's output
# directory -- confirm.
exp.get_filename(""), result_q.qsize(), skipped_q.qsize(), article_q.qsize(), done_q.qsize()

In [None]:
# Accumulator for the records taken off result_q; re-running this cell
# discards anything collected so far.
id_articles = []

In [None]:
# Display the list of worker Process objects for inspection.
workers

In [None]:
# Move every record from result_q into id_articles.
while True:
    try:
        # Blocking get so the 10 s timeout is honoured. With block=False the
        # timeout is ignored and Empty is raised whenever the queue's feeder
        # lags behind, costing a redraw and a one-second sleep each time.
        article = result_q.get(True, 10.0)
        id_articles.append(article)
    except Empty:
        # Nothing arrived within the timeout: show progress and stop once
        # the queue reports that it is empty.
        clear_output()
        print(len(id_articles))
        if result_q.qsize() == 0:
            break
        time.sleep(1)

In [None]:
# Number of records collected from result_q so far.
len(id_articles)

In [None]:
# Write the resolved records as a UTF-8 encoded TSV:
# project title, project unique name, page id, importance.
with open(exp.get_filename("importance.utf8.tsv"), "wb") as out:
    out.write("proj_title\tproj_unique\tpage_id\timportance\n")
    for article in id_articles:
        out.write("%s\t%s\t%s\t%s\n" % (
            article[0].encode('utf8'),
            article[1].encode('utf8'),
            str(article[2]),
            article[3].encode('utf8'),
        ))


In [None]:
# Scratch arithmetic left over from an interactive session.
# NOTE(review): presumably the total number of articles minus the number
# resolved to a page id (i.e. how many were skipped) -- confirm.
492650 - 488935