In [16]:
import sys
import os
import time
import bz2
import urllib2

# Default read sizes, in bytes, for the readers defined further down:
# BZ2_CHUNK is the default for the URL readers (compressed bytes per read),
# STR_CHUNK is the default for the local-file readers (bytes returned by BZ2File.read).
BZ2_CHUNK = 10 * 1000 * 1024
STR_CHUNK = 400 * 1000 * 1024

#url = 'http://download.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles26.xml-p42567204p42663461.bz2'
#url = 'http://download.wikimedia.org/enwiki/latest/enwiki-latest-pages-meta-current1.xml-p10p30303.bz2'

# Fake a command line so the notebook behaves like "script.py <input>".
# Alternative inputs that were tried:
#   'extracted/AA/wiki_00.bz2'
#   'enwiki-latest-pages-meta-current1.xml-p10p30303.bz2'
#   'http://download.wikimedia.org/enwiki/latest/enwiki-latest-pages-meta-current1.xml-p10p30303.bz2'
sys.argv = ["", 'enwiki-latest-pages-articles26.xml-p42567204p42663461.bz2']
if len(sys.argv) > 1:
    url = sys.argv[1]

# An input containing "://" is treated as a URL, anything else as a local path.
http = '://' in url

In [61]:
import re

def keepText(text):
    """Return the part of `text` between the last "}}\\n" and "==References==".

    Only the text before the first "==References==" heading is considered.
    Returns "" when the heading is absent, or when fewer than two "}}\\n"
    sequences occur before it.
    """
    # NOTE(review): a single "}}\n" before the heading also yields "" -- confirm
    # that requiring at least two matches is intended.
    before_refs, heading, _ = text.partition("==References==")
    if not heading:
        return ""
    closings = [found.end(0) for found in re.finditer("}}\n", before_refs)]
    if len(closings) < 2:
        return ""
    return before_refs[closings[-1]:]

#parse one page using split(). Fast!
def parser(buf):
    """Extract title, redirect, id and text from one "<page>" fragment.

    For every field the value is the (left/right stripped) text that follows
    the first opening marker, cut at the first closing marker; a field whose
    markers are not both found stays ''.  The text field is then reduced by
    keepText(), and for redirect pages it is replaced by a
    '<redirect to ... />' placeholder built from the redirect attribute.

    Returns a dict with the keys 'title', 'redirect', 'id' and 'text'.
    """
    markers = (
        ('title', '<title>', '</title>'),
        ('redirect', '<redirect ', ' />'),
        ('id', '<id>', '</id>'),
        ('text', '<text ', '</text>'),
    )
    page = {}
    for field, opener, closer in markers:
        page[field] = ''
        # drop everything up to and including the opening marker
        after_opener = buf.split(opener)
        if len(after_opener) < 2:
            continue
        # drop the closing marker and everything after it
        before_closer = after_opener[1].strip().split(closer)
        if len(before_closer) < 2:
            continue
        page[field] = before_closer[0]

    page['text'] = keepText(page['text'])
    if page['redirect'] != '':
        page['text'] = '<redirect to ' + page['redirect'] + ' />'
    return page

#parse one page using regex. Slow!
import re
# Raw strings keep "\S" / "\s" as regex escapes instead of (invalid) string escapes.
# `regex` itself is kept for backward compatibility; parser1 no longer reads it at
# call time because a later cell rebinds the name `regex` to a single pattern string.
regex = [r"<title>[\S\s]+?</title>", r"<id>[\S\s]+?</id>", r"<ns>[\S\s]+?</ns>", r"<text[\S\s]+?</text>"]
_PARSER1_KEYS = ('title', 'id', 'ns', 'text')
_PARSER1_PATTERNS = tuple(re.compile(pattern, re.IGNORECASE) for pattern in regex)

def parser1(buf):
    """Extract title, id, ns and text from one "<page>" fragment using regexes.

    For each field the first (case-insensitive, non-greedy) match of
    "<key ...>...</key>" is taken and len(key) + 2 characters are trimmed from
    the front and len(key) + 3 from the back, which removes "<key>" and
    "</key>".  For 'text' the opening tag carries attributes, so the value
    still starts with the attribute remainder (e.g. 'xml:space="preserve">').

    Returns a dict with the keys 'title', 'id', 'ns' and 'text'; a field that
    does not match is ''.
    """
    d = {}
    for key, pattern in zip(_PARSER1_KEYS, _PARSER1_PATTERNS):
        # search() stops at the first hit; findall() used to scan the whole page
        found = pattern.search(buf)
        matched = found.group(0) if found else ''
        trim = len(key) + 2
        d[key] = matched[trim:-trim - 1]
    return d

In [62]:
##single thread

#support getPagesByUrl() and getPagesByPath()
def getPages(chunk):
    """Split `chunk` on "<page>" and run parser() on every piece.

    The first piece is whatever precedes the first "<page>" marker, so it is
    parsed as well.  Swap in parser1 below to use the regex based parser.
    """
    pieces = chunk.split("<page>")
    #return map(parser1, pieces)
    return map(parser, pieces)

#parse page with url input
def getPagesByUrl(url, bytes = BZ2_CHUNK):

    decompressor = bz2.BZ2Decompressor()
    req = urllib2.urlopen(url)
    pages = []
    b = 0
    while True:
        t1 = time.time()
        chunk = req.read(bytes)
        if not chunk:
            break
        text = decompressor.decompress(chunk)
        pages += getPages(text)
        b += len(text)
        print "%.6f"%(time.time()-t1) ,len(pages), b
    req.close()

    return pages, b

#parse page with file_path input
def getPagesByPath(path, bytes = STR_CHUNK):

    fd = bz2.BZ2File(path, 'rb')
    pages = []
    b = 0
    raw = ""   ####################    
    while True:
        t1 = time.time()
        chunk = fd.read(bytes)
        if not chunk:
            break
        pages += getPages(chunk)
        b += len(chunk)
        print "%.6f"%(time.time()-t1) ,len(pages), b
        raw += chunk ################
    fd.close()

    return pages, b, raw

In [63]:
#multithreading parsing
import threading
import Queue
def threadwork(chunk, out_queue):
    """Thread target: parse every "<page>" piece of `chunk` and queue the result.

    The parsed pages are put on `out_queue` as one item.  Swap in parser1
    below to use the regex based parser.
    """
    #parsed = map(parser1, chunk.split("<page>"))
    parsed = map(parser, chunk.split("<page>"))
    return out_queue.put(parsed)

#multithreading parsing pages with a file_path input
def getPagesByPath_multithread(path, bytes = STR_CHUNK):
    t0 = time.time()
    my_queue = Queue.Queue()
    thread_list = []
    n_thread = 0

    fd = bz2.BZ2File(path, 'rb')
    b = 0
    raw = ""   ####################
    while True:
        chunk = fd.read(bytes)
        if not chunk:
            break
        t = threading.Thread(target=threadwork, args=(chunk ,my_queue))
        thread_list.append(t)
        b += len(chunk)
        print "thread: %d start time: %.6f, buf: %d"%(n_thread, time.time()-t0 ,b)
        n_thread += 1
        t.start()
        raw += chunk ################
    fd.close()
    for t in thread_list:
        t.join()
    pages = []
    for i in xrange(n_thread):
        pages += my_queue.get()
        print "thread: %d done time: %.6f, page_len: %d"%(i, time.time()-t0 ,len(pages))

    return pages, b, raw   ##################

#multithreading parsing pages with an url input
def getPagesByUrl_multithread(url, bytes = BZ2_CHUNK):
    t0 = time.time()
    my_queue = Queue.Queue()
    thread_list = []
    n_thread = 0

    decompressor = bz2.BZ2Decompressor()
    req = urllib2.urlopen(url)
    b = 0
    while True:
        chunk = req.read(bytes)
        if not chunk:
            break
        text = decompressor.decompress(chunk)
        t = threading.Thread(target=threadwork, args=(text ,my_queue))
        thread_list.append(t)
        b += len(text)
        print "thread: %d start time: %.6f, buf: %d"%(n_thread, time.time()-t0 ,b)
        n_thread += 1
        t.start()
    req.close()
    for t in thread_list:
        t.join()
    pages = []
    for i in xrange(n_thread):
        pages += my_queue.get()
        print "thread: %d done time: %.6f, page_len: %d"%(i, time.time()-t0 ,len(pages))

    return pages, b

In [64]:
t0 = time.time()

if http:
    #reading data from url
    #pages, b = getPagesByUrl(url)
    pages, b = getPagesByUrl_multithread(url)
else:
    #reading data from local
    pages, b, raw = getPagesByPath(url)  
    #pages, b, raw = getPagesByPath_multithread(url)    ###################################
    len1 = os.path.getsize(url)
    print "file_size:",len1

print pages[0]
for kv in pages[3].items():
    print kv

print "total time: %.6f"%(time.time()-t0), len(pages), b, len(raw)

2.465536 29327 96847211
file_size: 20772450
{'redirect': '', 'text': '', 'id': '', 'title': ''}
('redirect', '')
('text', "'''Marko Virtanen''' (born December 10, 1968) is a [[Finland|Finnish]] former professional [[ice hockey]] player. He is currently the head coach of [[JYP Jyv\xc3\xa4skyl\xc3\xa4]] in the Finnish [[Liiga]].\n\nVirtanen assumed the position of head coach for JYP with the [[2013\xe2\x80\x9314 Liiga season]].&lt;ref name=&quot;jypliiga&quot;&gt;{{cite web | url=http://www.jypliiga.fi/uutiset/marko-virtanen-jypin-paavalmentajaksi-2013-2015 | title=Marko Virtanen JYPin p\xc3\xa4\xc3\xa4valmentajaksi 2013 \xe2\x80\x93 2015 | publisher=Jypliiga | accessdate=25 July 2014}}&lt;/ref&gt;\n\n")
('id', '42567219')
('title', 'Marko Virtanen')
total time: 2.467245 29327 96847211 96847211


In [65]:
# Show the parsed dicts of the first two real entries (index 0 is the text before the first "<page>").
print pages[1]
print pages[2]

{'redirect': 'title="Kim Hyeon-woo"', 'text': '<redirect to title="Kim Hyeon-woo" />', 'id': '42567205', 'title': 'Kim Hyeon-Woo'}
{'redirect': 'title="Lee Se-yeol"', 'text': '<redirect to title="Lee Se-yeol" />', 'id': '42567208', 'title': 'Lee Se-Yeol'}


In [66]:
# Index the parsed pages by id and record each page's position in `pages`.
# Note: this adds a 'pos' key to the dicts stored in `pages` (in place).
# Entries whose id is '' (e.g. pages[0]) all share the key '', so only the last one is kept.
id2page = {}
for i, p in enumerate(pages):
    p['pos'] = i
    id2page[p['id']]= p

In [67]:
pages_filtered = filter(lambda p: p['text'] != '', pages)
print len(pages_filtered)

17684


In [68]:
# Index the filtered pages by id and print id, title and text for three of them.
# Which three are shown depends on the dict's iteration order.
id2page_filtered = {p['id']: p for p in pages_filtered}
for k,v in id2page_filtered.items()[:3]:
    print k, v['title'],"\n",v['text']

42610148 Mahoş River 
<redirect to title="Mahoș River" />
42618301 8th Observation Squadron 
<redirect to title="914th Expeditionary Air Refueling Squadron" />
42615649 Tortyra contubernalis 
<redirect to title="Ornarantia contubernalis" />


In [69]:
#print id2page['42630050']
# Split the raw text the same way getPages() does, so raw_lst[i] can be compared
# with pages[i]; the printed lengths show whether the two lists line up.
raw_lst = raw.split("<page>")
print len(pages), len(raw_lst), len(pages_filtered)
#print [raw_lst[19438]], "\n*******************\n",pages[19438]['text']


29327 29327 17684


In [70]:
# Show the raw XML of the same two entries that were printed from `pages` above.
print raw_lst[1]
print raw_lst[2]


    <title>Kim Hyeon-Woo</title>
    <ns>0</ns>
    <id>42567205</id>
    <redirect title="Kim Hyeon-woo" />
    <revision>
      <id>605512342</id>
      <timestamp>2014-04-23T21:02:00Z</timestamp>
      <contributor>
        <username>Mohsen1248</username>
        <id>3761856</id>
      </contributor>
      <comment>Mohsen1248 moved page [[Kim Hyeon-Woo]] to [[Kim Hyeon-woo]]</comment>
      <model>wikitext</model>
      <format>text/x-wiki</format>
      <text xml:space="preserve">#REDIRECT [[Kim Hyeon-woo]]
{{R from move}}</text>
      <sha1>16kop2unyres98bcaxkfj1674q7xcfd</sha1>
    </revision>
  </page>
  

    <title>Lee Se-Yeol</title>
    <ns>0</ns>
    <id>42567208</id>
    <redirect title="Lee Se-yeol" />
    <revision>
      <id>605512408</id>
      <timestamp>2014-04-23T21:02:32Z</timestamp>
      <contributor>
        <username>Mohsen1248</username>
        <id>3761856</id>
      </contributor>
      <comment>Mohsen1248 moved page [[Lee Se-Yeol]] to [[Lee Se-yeol]]</comm

In [71]:
fd = bz2.BZ2File('extracted/AA/wiki_00.bz2', 'rb')
uncompressed = fd.read()
print "reading from bz2 time: %.6f"%(time.time()-t0) ,len(uncompressed)
fd.close()
print uncompressed[:1000]

def parsePage(buf):
    """Extract title, ns, id and text from one '<doc ...>' fragment.

    `buf` is one piece of the extracted file after splitting on '<doc ', e.g.
    'id="1" url="..." title="T">\\nbody\\n</doc>'.  For every field the value
    is the stripped text between the first opening marker and the first
    closing marker that follows it; a field whose markers are not both found
    stays ''.  The markers used for 'ns' ('<ns>', '</ns>') do not occur in the
    sample '<doc ...>' output shown in this notebook, so 'ns' stays '' for such input.

    Returns a dict with the keys 'title', 'ns', 'id' and 'text'.
    """
    keys_h_t = {'title': ('title="', '">'),
        'ns': ('<ns>', '</ns>'),
        'id': ('id="', '" url='),
        'text': ('">', '</doc>')}
    d = {k: '' for k in keys_h_t.keys()}
    for key, (head, tail) in keys_h_t.items():
        # Cut only at the FIRST head marker.  An unlimited split also cut at later
        # occurrences, which dropped the body of any page whose text contains '">'.
        lst = buf.split(head, 1)
        if len(lst) > 1:
            #get rid of tail
            lst = lst[1].strip().split(tail, 1)
            if len(lst) > 1:
                d[key] = lst[0].strip()
    return d
        

# One parsed dict per "<doc " piece of the extracted text (the piece before the first "<doc " included).
pages1 = map(parsePage, uncompressed.split("<doc "))

reading from bz2 time: 71.596046 16668087
<doc id="42567219" url="https://en.wikipedia.org/wiki?curid=42567219" title="Marko Virtanen">
Marko Virtanen

Marko Virtanen (born December 10, 1968) is a Finnish former professional ice hockey player. He is currently the head coach of JYP Jyväskylä in the Finnish Liiga.
Virtanen assumed the position of head coach for JYP with the 2013–14 Liiga season.

</doc>
<doc id="42567222" url="https://en.wikipedia.org/wiki?curid=42567222" title="File:David and Goliath -1700s.jpg">
File:David and Goliath -1700s.jpg


</doc>
<doc id="42567223" url="https://en.wikipedia.org/wiki?curid=42567223" title="2014–15 Northern Illinois Huskies men's basketball team">
2014–15 Northern Illinois Huskies men's basketball team

The 2014–15 Northern Illinois Huskies men's basketball team represented Northern Illinois University during the 2014–15 NCAA Division I men's basketball season. The Huskies, led by fourth year head coach Mark Montgomery, played their home games at

In [72]:
# Drop the entries for which no id was found, then index the rest by id.
# Note: this rebinds pages1, so re-running the cell filters the already filtered list.
pages1 = filter(lambda p: p['id'] != '', pages1)
print len(pages1)
id2page1 = {p['id']: p for p in pages1}
# Print id, title and text of two entries; which two depends on dict iteration order.
for k,v in id2page1.items()[:2]:
    print k, v['title'],"\n",v['text']

14946
42662895 Thomas Ignatius McCarthy 
Thomas Ignatius McCarthy

Thomas Ignatius McCarthy, LRIBA (born 31 January 1880, died 13 Feb 1951) was an architect based in Coalville, Leicestershire.
Early in the twentieth century, Thomas Ignatius McCarthy set up a practice as a surveyor and architect in Coalville, which was a partnership shared with Henry Collings (1880 - 1960). Collings was responsible for the design of the Coalville Clock Tower war memorial - a building admired by Pevsner.
Examples of work by Thomas Ignatius McCarthy (some possibly in conjunction with Henry Collings):
42615641 Category:Bronx building and structure stubs 
Category:Bronx building and structure stubs


In [74]:
# Compare which ids each approach kept: the first 20 ids from the XML parse
# versus the first 20 ids from the extracted file.
# The ids are strings, so sorted() orders them as text, not as numbers.
lst_filtered = sorted(id2page_filtered.keys())
lst1 = sorted(id2page1.keys())
print lst_filtered[:20]
print lst1[:20]

['42567205', '42567208', '42567219', '42567221', '42567223', '42567232', '42567236', '42567240', '42567245', '42567249', '42567253', '42567255', '42567257', '42567261', '42567270', '42567275', '42567280', '42567282', '42567283', '42567286']
['42567219', '42567222', '42567223', '42567226', '42567244', '42567248', '42567249', '42567253', '42567257', '42567261', '42567286', '42567287', '42567292', '42567295', '42567306', '42567319', '42567320', '42567337', '42567340', '42567343']


In [77]:
# Ids inspected one at a time; only the LAST assignment takes effect, the earlier
# lines are a record of the ids that were looked at before.
idx='42567222'
idx='42567226'
idx='42567244'
idx='42567248'
idx='42567249'
idx='42567287'
idx='42567292'
idx='42567306'
idx='42567320'
idx='42567337'
idx='42567343'
# Text kept by parser() for this id
print "**bz3  ****\n",id2page[idx]['text']
print
# Raw XML of the same page, located through the 'pos' recorded in id2page
print "**raw  ****\n",raw_lst[id2page[idx]['pos']]
print
# Text of the same id in the extracted file (KeyError if the id is not in id2page1)
print "**wikiex***\n",id2page1[idx]['text']
print 
#print id2page3[idx]['text']

**bz3  ****


**raw  ****

    <title>Category:Airdrieonians F.C. (1878) wartime guest players</title>
    <ns>14</ns>
    <id>42567343</id>
    <revision>
      <id>605514597</id>
      <timestamp>2014-04-23T21:17:57Z</timestamp>
      <contributor>
        <username>Jmorrison230582</username>
        <id>1894081</id>
      </contributor>
      <comment>[[WP:AES|←]]Created page with 'Players who guested during wartime matches for [[Airdrieonians F.C. (1878)|Airdrieonians]].  [[Category:Airdrieonians F.C. (1878) players| ]] Category:Scottish...'</comment>
      <model>wikitext</model>
      <format>text/x-wiki</format>
      <text xml:space="preserve">Players who guested during wartime matches for [[Airdrieonians F.C. (1878)|Airdrieonians]].

[[Category:Airdrieonians F.C. (1878) players| ]]
[[Category:Scottish Football League wartime guest players by club|Airdrie]]</text>
      <sha1>b7w1a86ggde42ddbj2msr1f1e2jkqld</sha1>
    </revision>
  </page>
  

**wikiex***
Category:Airdrieonians

In [None]:
import pycurl
from io import BytesIO

buf = BytesIO()

url = "https://en.wikipedia.org/wiki/Wikipedia:I just don't like it"
url = "https://en.wikipedia.org/?curid=42630050"
t0 = time.time()
with open('out.bz2', 'wb') as f: 
    c = pycurl.Curl()
    #c.setopt(c.URL, 'http://news.ycombinator.com')
    #c.setopt(c.URL, 'www.gutenberg.org/ebooks/4300')
    #c.setopt(c.URL, 'https://it.wikipedia.org/wiki/Armonium')
    #c.setopt(c.URL, "https://en.wikipedia.org/wiki/Wikipedia:I_just_don't_like_it")
    c.setopt(c.URL, url)    
    #c.setopt(c.WRITEDATA, f)
    c.setopt(c.WRITEDATA, buf)
    c.perform()
print "downloading time: %.6f"%(time.time()-t0)
body =  buf.getvalue()
buf.close()

print body
print body.decode('iso-8859-1')
"""
t0 = time.time()
input_file = bz2.BZ2File('out.bz2', 'rb')
uncompressed = input_file.read()

print "reading from bz2 time: %.6f"%(time.time()-t0) ,len(uncompressed),uncompressed
input_file.close()
"""

In [None]:


t0 = time.time()
with open('out.bz2', 'wb') as f: 
    c = pycurl.Curl()
    #c.setopt(c.URL, 'http://news.ycombinator.com')
    #c.setopt(c.URL, 'www.gutenberg.org/ebooks/4300')
    c.setopt(c.URL, 'https://it.wikipedia.org/wiki/Armonium')
    c.setopt(c.WRITEDATA, f)
    c.perform()
    buf.close()
print "downloading time: %.6f"%(time.time()-t0)

t0 = time.time()
input_file = bz2.BZ2File(fname, 'rb')
uncompressed = input_file.read()
input_file.close()
print "reading from bz2 time: %.6f"%(time.time()-t0) ,len(uncompressed)

In [None]:
import os
import time
import bz2
import urllib2
url = 'http://download.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles26.xml-p42567204p42663461.bz2'
#url = 'http://download.wikimedia.org/enwiki/latest/enwiki-latest-pages-meta-current1.xml-p10p30303.bz2'
fname = url.split('/')[-1]
len1 = os.path.getsize(fname)
print "size:",len1

"""
t0 = time.time()
os.system("wget " + url)
print "downloading time: %.6f"%(time.time()-t0)
#""" #164 sec

"""
t0 = time.time()
input_file = bz2.BZ2File(fname, 'rb')
uncompressed = input_file.read()
input_file.close()
print "reading from bz2 time: %.6f"%(time.time()-t0) ,len(uncompressed)
#""" #16 sec

n_bytes = 100*1000*1024
t0 = time.time()
decompressor = bz2.BZ2Decompressor()
req = urllib2.urlopen(url)
text=""
while True:
    t1 = time.time()
    chunk = req.read(n_bytes)
    if not chunk:
        break
    decompressed = decompressor.decompress(chunk)
    text += decompressed
    print "%.6f"%(time.time()-t1) ,len(chunk), len(decompressed)
req.close()
print "reading and decompressing from url time: %.6f"%(time.time()-t0) ,len(chunk), len(decompressed)

print len(text),"\n",text[-1000:]
#t0 = time.time()
#with open("tmp.txt", "w") as fw:
#    fw.write(buf)
#print "writing time: %.6f"%(time.time()-t0)


In [None]:
# Length of the decompressed text and its last 100 characters.
print len(text),"\n",text[-100:]

In [None]:
# Time splitting the whole text on "<page>"; lst[0] is whatever precedes the first marker.
t0=time.time()
lst =  text.split("<page>")
print "split pages time: %.6f"%(time.time()-t0) ,len(lst), len(lst[0])

In [None]:
# Show one raw page piece as a sample.
print lst[3]

In [None]:
#extracting header and text from each page
import re
# Pattern: from the opening "<text xml:" up to the first "==References==" (non-greedy).
# NOTE(review): this rebinds the module-level name `regex`, which parser1() above
# indexes as a list of patterns; calling parser1() after this cell has run would
# index into this string instead.
regex = u"<text xml:[\S\s]+?==References=="

# For every page piece keep (its first 500 characters, all matches of the pattern).
header_lst = map(lambda page: (page[:500], re.findall(regex, page, re.IGNORECASE)), lst)

In [None]:
# Show the (header, matches) tuple of one page as a sample.
print header_lst[3]

In [None]:
#parsing header and text
reg_title = u"<title>[\S\s]+?</title>"
reg_idx = u"<id>[\S\s]+?</id>"
def fmap(header, lst):
    """Turn one (header, matches) pair into a (title, idx, text) tuple.

    header -- string searched (case-insensitively) for the first
              "<title>...</title>" and the first "<id>...</id>"
    lst    -- sequence of text matches; only its first element is used

    Each of the three values is "" when nothing was found.
    """
    def first_between(pattern, open_len, close_len):
        # first match of `pattern` in the header, with its tags cut off
        found = re.search(pattern, header, re.IGNORECASE)
        if found is None:
            return ""
        return found.group(0)[open_len:-close_len]

    text = lst[0] if len(lst) > 0 else ""
    title = first_between(reg_title, 7, 8)   # len("<title>"), len("</title>")
    idx = first_between(reg_idx, 4, 5)       # len("<id>"), len("</id>")

    #TODO other info: eg. id, links, tokenizing, selection .....

    return (title, idx, text)

map_tuples = map(lambda (head, lst):  fmap(head, lst), header_lst)
print len(map_tuples), len(map_tuples[0]), len(map_tuples[1])

In [None]:
# Show a few of the (title, idx, text) tuples as samples.
print map_tuples[0]
print map_tuples[1]
print map_tuples[3]