# IEEE 802.16 Documents

In [327]:
import mechanicalsoup
from bs4 import BeautifulSoup
import re
import xlwt

In [320]:
# Target page: the contributions section of the IEEE 802.16 document index.
# `root` is the base that parse() prefixes to each document's href.
root = "http://www.ieee802.org/16/docs/"
url = "http://www.ieee802.org/16/docs/index.html#contrib"

# Open the index page in a stateful browser session; the response is the cell output
browser = mechanicalsoup.StatefulBrowser()
browser.open(url)

<Response [200]>

In [321]:
# Get html content: the page object for the URL the browser currently has open.
# Later cells query it with find_all() to pull out the <p> and <ul> elements.
html = browser.get_current_page()

In [322]:
# Section headings ("Official ..." / "Contributions ...") live in the page's <p> elements.
# The first four paragraphs and the final one are dropped before the headings are read.
# (Original note: 24 headings -- not re-verified against the live page.)
offiOrContri = html.find_all('p')[4:-1]

ocs = []
for oc in offiOrContri:
    # The heading text sits in <b> when that tag is present, otherwise in <strong>
    ocs.append((oc.strong if oc.b is None else oc.b).text)

In [323]:
# Other features, 24: every <ul> element on the page, in document order.
# The crawl loop pairs these lists positionally with the headings in `ocs`.
contexts = html.find_all('ul')

### Crawl all data into a list

In [324]:
# Walk every (section heading, <ul>) pair and collect one record per list child.
# NOTE: parse() must already be defined in the kernel when this cell runs.
res = []
total = len(offiOrContri)
for oc, context in zip(ocs[:total], contexts):
    # Section flag: 0 = official documents, 1 = contributions.
    # It is reset for every section so that an unrecognised heading is recorded
    # as None instead of silently inheriting the previous section's flag
    # (or raising NameError when it happens on the very first section).
    if "Official" in oc or "Documents" in oc:
        f = 0
    elif "Contributions" in oc:
        f = 1
    else:
        f = None
    for position, li in enumerate(context):
        # The first child of every list is skipped, as in the original loop --
        # presumably it is not a document entry; confirm against the page markup.
        if position == 0:
            continue
        # each record: [flag, time, title, author, fileLink, addiInfo]
        res.append([f, *parse(li)])
print("Crawler Done!")

Crawler Done!


### Save the list to an Excel file

In [330]:
# Build the workbook with a single sheet and put the column titles in row 0.
# `filename` is kept as-is; none of the cells shown here read it.
filename = ""
book = xlwt.Workbook(encoding="utf-8")
sheet1 = book.add_sheet("Sheet 1")

headers = [
    "Official/Contribution",
    "Time",
    "Title",
    "Author",
    "File Link",
    "Additional Information",
]
for col, header in enumerate(headers):
    sheet1.write(0, col, header)

In [331]:
# Data rows start at row 1, directly beneath the header row; one cell per record field
for row_idx, record in enumerate(res, start=1):
    for col_idx, value in enumerate(record):
        sheet1.write(row_idx, col_idx, value)

In [332]:
# Write the workbook to disk; the relative path resolves against the current working directory
book.save("802_16_Documents.xls")

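### Utility functions

**Note:** `parse` is called by the crawl cell above, so this cell has to be executed before that one when running on a fresh kernel.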

In [295]:
def _extract_date(text):
    """Return the first date found in ``text`` as ``YYYY-MM-DD``, or None.

    A four-digit-year date (``YYYY-MM-DD``) anywhere in ``text`` wins. Otherwise
    a ``NN/NN/NN`` date is used: its first two digits are read as the year and
    expanded to 19xx when >= 80, else to 20xx.
    """
    iso = re.search(r"\d{4}-\d{2}-\d{2}", text)
    if iso is not None:
        return iso.group(0)
    short = re.search(r"\d{2}/\d{2}/\d{2}", text)
    if short is None:
        return None
    short_date = short.group(0).replace("/", "-")
    century = "19" if int(short_date[:2]) >= 80 else "20"
    return century + short_date


def parse(li, base_url=None):
    """Split one document list entry into its record fields.

    Parameters
    ----------
    li : object exposing ``contents`` and ``find_all`` (a BeautifulSoup tag)
        The entry to parse. Its last child is treated as the title text, whose
        first ``( ... )`` group carries the authors and the date.
    base_url : str, optional
        Base that the entry's href is resolved against. Defaults to the
        notebook-level ``root``.

    Returns
    -------
    list
        ``[time, title, author, fileLink, addiInfo]`` where
        ``time`` is ``YYYY-MM-DD`` or None,
        ``title`` is the stripped text of the last child,
        ``author`` is each non-date part of the parenthetical followed by ``;``
        (None when there is none),
        ``fileLink`` is the resolved href of the first anchor (None when the
        entry has no anchor with an href),
        ``addiInfo`` is the markup between the first and last child with
        ``<br/>`` removed (None when empty).
    """
    # Local import keeps the cell self-contained without touching the import cell
    from urllib.parse import urljoin

    if base_url is None:
        base_url = root

    conts = li.contents
    # str() so that a trailing tag is handled as text instead of dispatching
    # to the tag's own find() method; an empty entry yields an empty title.
    tail = str(conts[-1]) if conts else ""

    # Text between the first "(" and the first ")" of the title line
    open_at = tail.find('(')
    close_at = tail.find(')')
    if open_at == -1 or close_at == -1:
        authAndTimeCtx = ""
    else:
        authAndTimeCtx = tail[open_at + 1:close_at]

    # File link: urljoin leaves absolute hrefs intact and resolves relative ones
    anchors = li.find_all('a', href=True)
    fileLink = urljoin(base_url, anchors[0]['href']) if anchors else None

    # title
    title = tail.strip()

    # time
    date = _extract_date(authAndTimeCtx)

    # author: every ";"/","-separated part that is neither blank nor a date.
    # Blank parts are skipped so an entry without a parenthetical gives None
    # rather than a lone ";".
    author_parts = [
        part for part in re.split('[;,]', authAndTimeCtx)
        if part.strip() and _extract_date(part) is None
    ]
    author = "".join(part + ";" for part in author_parts) or None

    # addiInfo: everything between the first and the last child, minus line breaks
    middle = [str(node) for node in conts[1:-1]]
    addiInfo = "".join(piece for piece in middle if piece != '<br/>') or None

    return [date, title, author, fileLink, addiInfo]

In [267]:
# Manual check of parse() on two entries copied from the index page (str1 is kept for ad-hoc runs)
str1 = "<li><a href='11/80216-11_0051.pdf' name='11_0051'>IEEE 802.16-11/0051</a><br/> IEEE 802.16 Working Group Letter Ballot #36: Announcement (Muya Wachira, Roger Marks, Phil Whitehead, 802.16 TG2, 01/01/10)</li>"
str2 = "<li><a href='11/80216-11_0041r3.zip' name='11_0041'>IEEE 802.16-11/0041r3</a> (<a href='11/80216-11_0041r2.zip'>IEEE 802.16-11/0041r2</a>: 2011-11-10; <a href='11/80216-11_0041r1.zip'>IEEE 802.16-11/0041r1</a>: 2011-11-08; <a href='11/80216-11_0041.zip'>IEEE 802.16-11/0041</a>: 2011-11-07)<br/> IEEE 802.16 WG Letter Ballot #34 Comments (99/11/16)</li>"
# Name the parser explicitly: without it BeautifulSoup guesses one per environment
# and emits a warning. lxml is MechanicalSoup's documented default, so the demo is
# parsed like the crawled page -- confirm if the browser's soup_config is customised.
soup = BeautifulSoup(str2, "lxml")
parse(soup.li)

['1999-11-16',
 'IEEE 802.16 WG Letter Ballot #34 Comments (99/11/16)',
 None,
 'http://www.ieee802.org/16/docs/11/80216-11_0041r3.zip',
 ' (<a href="11/80216-11_0041r2.zip">IEEE 802.16-11/0041r2</a>: 2011-11-10; <a href="11/80216-11_0041r1.zip">IEEE 802.16-11/0041r1</a>: 2011-11-08; <a href="11/80216-11_0041.zip">IEEE 802.16-11/0041</a>: 2011-11-07)']