In [5]:

import requests
import re
import docx2txt
import pdftotext
from io import BytesIO
import fulltext
import tempfile
from urllib.parse import urlparse 
import os
import urllib.parse

from bs4 import BeautifulSoup as bs4
from tqdm import tqdm
from datetime import datetime, timedelta
import re, json

In [3]:
# from epub2txt

import os
import sys
import urllib
try:
    from urllib import unquote
except:
    from urllib.parse import unquote
import zipfile

import xml.parsers.expat
import html2text
from glob import glob


class ContainerParser():
    """Read an EPUB ``META-INF/container.xml`` document and locate the package file.

    The value of the ``full-path`` attribute of the (last) ``rootfile``
    element is what ``parseContainer`` returns; an empty string is returned
    when no such element is present.
    """

    def __init__(self, xmlcontent=None):
        self.xml = xmlcontent
        self.rootfile = ""

    def startElement(self, name, attributes):
        """Expat start-tag callback: remember the path carried by ``rootfile``."""
        if name != "rootfile":
            return
        self.buffer = ""
        self.rootfile = attributes["full-path"]

    def parseContainer(self):
        """Parse the stored XML in one shot and return the root file path."""
        expat_parser = xml.parsers.expat.ParserCreate()
        expat_parser.StartElementHandler = self.startElement
        # Second argument marks this chunk as the final one.
        expat_parser.Parse(self.xml, 1)
        return self.rootfile


class BookParser():
    """Extract title, author and NCX location from an OPF package document.

    ``parseBook`` returns ``(title, author, ncx_href)``.  ``ncx_href`` is the
    ``href`` of the manifest ``item`` identified as the NCX table of contents,
    or ``""`` if none is found.  An item is treated as the NCX when its
    ``media-type`` is ``application/x-dtbncx+xml``; if no item carries that
    media type, the legacy heuristic (``id`` equal to ``ncx``, ``toc`` or
    ``ncxtoc``) is used instead.
    """

    # Manifest ids historically used by this parser to recognise the NCX item.
    NCX_IDS = ("ncx", "toc", "ncxtoc")
    # Media type of an NCX document in the OPF manifest.
    NCX_MEDIA_TYPE = "application/x-dtbncx+xml"

    def __init__(self, xmlcontent=None):
        self.xml = xmlcontent
        self.title = ""
        self.author = ""
        self.inTitle = 0
        self.inAuthor = 0
        self.ncx = ""
        # Initialised here so the character handler never sees a missing attribute.
        self.buffer = ""
        # True once the NCX was identified by media type; id matches no longer override it.
        self._ncx_by_media_type = False

    def startElement(self, name, attributes):
        """Expat start-tag callback: enter title/author capture or inspect a manifest item."""
        if name == "dc:title":
            self.buffer = ""
            self.inTitle = 1
        elif name == "dc:creator":
            self.buffer = ""
            self.inAuthor = 1
        elif name == "item":
            href = attributes.get("href")
            if href is None:
                return
            if attributes.get("media-type") == self.NCX_MEDIA_TYPE:
                self.ncx = href
                self._ncx_by_media_type = True
            elif (not self._ncx_by_media_type
                  and attributes.get("id") in self.NCX_IDS):
                # .get(): items without an id are skipped instead of raising KeyError.
                self.ncx = href

    def characters(self, data):
        """Expat character-data callback: accumulate text while inside title/creator."""
        if self.inTitle or self.inAuthor:
            self.buffer += data

    def endElement(self, name):
        """Expat end-tag callback: commit the accumulated title/author text."""
        if name == "dc:title":
            self.inTitle = 0
            self.title = self.buffer
            self.buffer = ""
        elif name == "dc:creator":
            self.inAuthor = 0
            # With several dc:creator elements the last one wins.
            self.author = self.buffer
            self.buffer = ""

    def parseBook(self):
        """Parse the stored XML and return ``(title, author, ncx_href)``."""
        parser = xml.parsers.expat.ParserCreate()
        parser.StartElementHandler = self.startElement
        parser.EndElementHandler = self.endElement
        parser.CharacterDataHandler = self.characters
        parser.Parse(self.xml, 1)
        return self.title, self.author, self.ncx


class NavPoint():
    """Plain record holding the fields of one table-of-contents entry.

    Attributes mirror the constructor arguments: ``id``, ``playorder``,
    ``level``, ``content`` and ``text``.
    """

    def __init__(self, id=None, playorder=None, level=0, content=None, text=None):
        self.id, self.playorder, self.level = id, playorder, level
        self.content, self.text = content, text


class TocParser():
    """Parse an NCX document into a flat, document-ordered list of ``NavPoint``.

    Each ``navPoint`` element becomes one ``NavPoint`` whose ``level`` is its
    nesting depth (0 for top-level entries), whose ``content`` is the
    URL-unquoted ``src`` of its ``content`` element and whose ``text`` is the
    character data of its ``text`` element.

    ``content`` and ``text`` elements that are not inside any open
    ``navPoint`` are ignored, so they cannot overwrite an entry that has
    already been closed.
    """

    def __init__(self, xmlcontent=None):
        self.xml = xmlcontent
        self.currentNP = None
        self.stack = []
        self.inText = 0
        self.toc = []
        # Initialised here so the character handler never sees a missing attribute.
        self.buffer = ""

    def startElement(self, name, attributes):
        """Expat start-tag callback."""
        if name == "navPoint":
            level = len(self.stack)
            # .get(): a navPoint lacking id/playOrder yields None instead of KeyError.
            self.currentNP = NavPoint(
                attributes.get("id"), attributes.get("playOrder"), level)
            self.stack.append(self.currentNP)
            self.toc.append(self.currentNP)
        elif name == "content":
            if self.currentNP is not None and "src" in attributes:
                self.currentNP.content = unquote(attributes["src"])
        elif name == "text":
            self.buffer = ""
            self.inText = 1

    def characters(self, data):
        """Expat character-data callback: accumulate label text."""
        if self.inText:
            self.buffer += data

    def endElement(self, name):
        """Expat end-tag callback."""
        if name == "navPoint":
            # Drop the entry being closed and make its parent (if any) current
            # again; previously the closed entry itself stayed current.
            self.stack.pop()
            self.currentNP = self.stack[-1] if self.stack else None
        elif name == "text":
            if self.inText and self.currentNP is not None:
                self.currentNP.text = self.buffer
            self.inText = 0

    def parseToc(self):
        """Parse the stored XML and return the list of ``NavPoint`` objects."""
        parser = xml.parsers.expat.ParserCreate()
        parser.StartElementHandler = self.startElement
        parser.EndElementHandler = self.endElement
        parser.CharacterDataHandler = self.characters
        parser.Parse(self.xml, 1)
        return self.toc


class epub2txt():
    """Convert an EPUB (path or binary file-like object) to plain text.

    ``convert`` returns a dict with keys ``'title'``, ``'author'`` and
    ``'content'``.  For every table-of-contents entry the content holds a
    ``"*"``-prefixed heading line, a ``"<text>{{{<depth>"`` marker line and
    the entry's document rendered through ``html2text``.
    """

    def __init__(self, epubfile=None):
        self.epub = epubfile

    def convert(self):
        """Read the archive and return ``{'title', 'author', 'content'}``."""
        content = []
        # The context manager closes the archive even when parsing fails.
        # ZipFile only closes file objects it opened itself, so a caller's
        # file-like object stays open.
        with zipfile.ZipFile(self.epub, "r") as archive:
            rootfile = ContainerParser(
                archive.read("META-INF/container.xml")).parseContainer()
            title, author, ncx = BookParser(archive.read(rootfile)).parseBook()

            # Directory of the package file; manifest hrefs are joined onto it.
            ops = "/".join(rootfile.split("/")[:-1])
            if ops != "":
                ops = ops + "/"
            toc = TocParser(archive.read(ops + ncx)).parseToc()

            for t in toc:
                heading = t.text or ""
                depth = t.level + 1
                content.append("*" * depth + " " + heading + "\n")
                content.append(heading + "{{{%d\n" % depth)
                if not t.content:
                    # Entry without a target document: keep the heading only.
                    continue
                # Strip the fragment so only the archive member name remains.
                html = archive.read(ops + t.content.split("#")[0])
                text = html2text.html2text(html.decode("utf-8"))
                content.append(text + "\n")

        return {
            'title': title,
            'author': author,
            'content':''.join(content),
        }


In [9]:
# Display the HTTP request headers dict.
# NOTE(review): in this file `headers` is only assigned in a later cell, so this
# cell depends on out-of-order execution and fails on a fresh top-to-bottom run.
headers

{'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.25 Safari/537.36'}

In [11]:
import io
from tqdm import tqdm
import PyPDF2


import urllib.parse

def url2extenstion(url):
    """Return the file extension (leading dot included) of the URL's path.

    Query string and fragment are ignored; ``''`` is returned when the path
    has no extension.
    """
    _, extension = os.path.splitext(urllib.parse.urlparse(url).path)
    return extension

# Sample documents to test against. `url` is rebound on every line, so only
# the LAST assignment is in effect when the cell finishes; reorder the lines
# to switch documents.
url = 'https://camelot-py.readthedocs.io/en/master/_static/pdf/us-030.pdf'
url = 'http://www.ouhk.edu.hk/REG/reg_ftae/FT_Student_Handbook.pdf'
url = 'http://library.umac.mo/ebooks/b28113706.pdf'
# Request headers sent with every download; read as a global by get_doc_from_url.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.25 Safari/537.36'}

def get_doc_from_url(url, timeout=30):
    """Download ``url`` into memory, showing a byte-accurate progress bar.

    Parameters
    ----------
    url : str
        Address of the document to fetch.  The module-level ``headers`` dict
        is sent with the request.
    timeout : float or tuple, optional
        Passed to ``requests.get``; bounds connection and per-read waits so a
        stalled server cannot hang the notebook.

    Returns
    -------
    io.BytesIO
        Buffer holding the response body.  The stream position is left at the
        END of the data; call ``seek(0)`` before reading.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, instead of silently
        returning the error page as if it were the document.
    """
    block_size = 1024
    f = io.BytesIO()

    # The context manager releases the connection even if the download fails.
    with requests.get(url, headers=headers, stream=True, timeout=timeout) as r:
        r.raise_for_status()

        # Total size in bytes; 0 when the server does not report it.
        total_size = int(r.headers.get('content-length', 0))

        # Count real bytes (not 1 KiB chunks) so the bar's unit is truthful;
        # total=None gives an open-ended bar when the size is unknown.
        with tqdm(total=total_size or None, unit='B', unit_scale=True,
                  unit_divisor=1024) as progress:
            for data in r.iter_content(block_size):
                f.write(data)
                progress.update(len(data))

    return f


# Download the document into an in-memory buffer; `f` and `_type` are reused
# as globals by later cells.
f = get_doc_from_url(url)
# Extension without the leading dot, e.g. 'pdf'.
_type = url2extenstion(url).lstrip('.')
# Bare expression in the middle of a cell: evaluated but not displayed.
_type
# Rewind before handing the buffer to the PDF reader.
f.seek(0)
fileReader = PyPDF2.PdfFileReader(f)
# Page count, used by the camelot cell to build its page list.
num_pages = fileReader.numPages

# Also persist the PDF to disk under a fixed name in the working directory,
# because the camelot cell reads it by file name.
temp_fn = 'temp_fn.pdf'
f.seek(0)
with open(temp_fn, 'wb') as f2:
    f2.write(f.read())

100%|██████████| 21.4K/21.4K [00:18<00:00, 1.18KMB/s]


<module 'camelot' from '/usr/local/lib/python3.5/dist-packages/camelot/__init__.py'>

In [None]:
# Slow: camelot analyses every page of the PDF written to `temp_fn`.
import camelot
# camelot page numbers are 1-based, so request pages 1..num_pages.
# The previous range started at 0, which is not a valid page number.
tables_raw = camelot.read_pdf(temp_fn, pages=','.join(str(i) for i in range(1, num_pages+1)))
tables_raw

In [None]:
# TODO
# 
# 1. Some tables found are actually figures
# 2. Some lines are pagination, or section title
# 


In [6]:

# Extract plain text from the downloaded buffer `f` according to `_type`.
# On any failure `content` is set to '' and the error is printed.
try:
    f.seek(0)
    if _type in ['txt','html','htm']:
        content = f.read().decode('utf-8')
    elif _type == 'doc':
        # Spill the bytes to a real file on disk and reopen it by name for
        # fulltext. A separate name is used for the file handles so the
        # download buffer `f` is not shadowed: previously `f` was rebound to
        # the write-only handle and then read from, which always failed.
        with tempfile.NamedTemporaryFile(suffix='.doc') as tmp:
            tmp.write(f.read())
            tmp.flush()
            with open(tmp.name,'rb') as doc_file:
                content = fulltext.get(doc_file, name='foo.doc', mime='application/doc',backend='doc')

    elif _type == 'docx':
        content = docx2txt.process(f)

    elif _type == 'pdf':
        # pdftotext.PDF iterates page by page; pages are joined by a blank line.
        content = "\n\n".join(pdftotext.PDF(f))
    else:
        print('type unknown')
        # Explicit exception instead of a bare `raise` with nothing to re-raise.
        raise ValueError('unsupported document type: %r' % (_type,))
except Exception as e:
    content = ''
    print('error! >>',e)


<TableList n=49>

In [7]:
# Convert every camelot table to a list of rows (lists of cell strings) and
# drop tables whose cells are all empty or whitespace.
tables = []
for t in tables_raw:
    rows = t.df.values.tolist()
    if not any(cell.strip() for row in rows for cell in row):
        continue
    tables.append(rows)

# Undo line wrapping inside cells:
#   * "<space>\n<char>"  -> "<space><char>"   (groups 1 and 2)
#   * "<CJK>\n<CJK>"     -> "<CJK><CJK>"      (groups 3 and 4)
# The replacement must reference all four groups. With only \1\2 the second
# alternative was replaced by two unmatched (empty) groups, which deleted the
# CJK characters on both sides of the newline.
tables_filtered = [[[re.sub(r'( )\n(.)|([\u4e00-\u9fff。；：，？！～])\n([\u4e00-\u9fff])',r'\1\2\3\4',e) for e in row] for row in tab ]
 for tab in tables]
tables_filtered

[[['Step',
   'Total protein (mg)',
   'Total activity (unit)',
   'Specific activity (Unit/mg)',
   'Yield (%)',
   'Purification (fold)'],
  ['Cell-free extract', '1381', '18.8', '0.0136', '100', '1'],
  ['(NH4)2SO4', '412', '12.2', '0.0296', '65', '2'],
  ['Ether-Toyopearl', '68.9', '16.1', '0.234', '85', '17'],
  ['DEAE-Toyopearl', '3.28', '13.3', '4.06', '71', '299'],
  ['DEAE-Sepharose', '1.65', '7.00', '4.24', '37', '312'],
  ['Phenyl-Toyopearl', '0.360', '6.38', '17.7', '34', '1302'],
  ['Butyl-Toyopearl', '0.198', '3.80', '19.2', '20', '1412']],
 [['cofactor', 'Relative activity (%)', 'kcat/Km (s-1/mM)'],
  ['NAD+', '100', '22.4'],
  ['NADP+', '<0.1', '-'],
  ['NADH', '100', '210.3'],
  ['NADPH', '<0.1', '-']],
 [['Substrate', 'Relative activity (%)', 'Km (mM)'],
  ['2-Phenylethanol', '100', '0.025'],
  ['(S)-2-Phenylpropanol', '156', '0.157'],
  ['(R)-2-Phenylpropanol', '63', '0.020'],
  ['Benzyl alcohol', '199', '0.012'],
  ['3-Phenylpropanol', '135', '0.033'],
  ['Ethanol',

In [12]:
# Locate each extracted table inside the plain-text `content`: for every table
# find a run of consecutive text lines made up solely of words that occur in
# the table, and record the run as an inclusive (first_line, last_line) span.
spans = []

for tab in tables:
    span = None
    
    # Every whitespace-separated token of every cell, duplicates included.
    target_words = [e.strip() for row in tab for e in row for e in re.split(r'[ \n]',e) if e.strip()]
    # Total token count: a span is complete once this many tokens were matched.
    num_words = len(target_words)
    # Unique tokens in reverse lexicographic order, which places a word before
    # any of its own prefixes (presumably so the longer alternative is tried
    # first in the regex alternations below).
    target_words = sorted(list(set(target_words)),reverse=True)

    # Matches any single table word.
    target_words_regex = re.compile(r'(?:'+'|'.join(re.escape(e) for e in target_words)+r')')
    # Matches a whole line consisting of one or more table words separated only by whitespace.
    target_words_only_regex = re.compile(r'^(\s*(?:%s))+\s*$'%'|'.join(re.escape(e) for e in target_words))


    # NOTE: `lines` is recomputed for every table and is read again by the
    # cell that splices the tables back into the text.
    lines = content.split('\n')
    start = 0          # index of the first line of the current candidate run
    accum = 0          # table-word occurrences matched so far in the run
    i = -1             # current line index (incremented at the top of the loop)
    span = None
    last_i = None      # index of the most recent matching line
    accum_i = 0        # number of matching lines in the run
    failed_count = 0   # non-matching lines seen since the last matching line
    for line in lines:
        i += 1

        if target_words_only_regex.match(line):
            # `last_failed` is assigned but never read.
            last_failed = None
            failed_count = 0
            matched = target_words_regex.findall(line)
            if accum == 0:
                start = i
            accum += len(matched)
            accum_i += 1
            last_i = i
            # Exact hit: every token of the table has been accounted for.
            if accum == num_words:
                span = (start, i)
                spans.append(span)
                break
        else:
            failed_count += 1
            # Tolerate a single non-matching line once more than two matching
            # lines have been accumulated.
            if accum_i > 2 and failed_count <= 1:
                continue

            # Approximate hit: over 90% of the tokens matched and the last
            # matching line ends with the final token of the table's last cell.
            # NOTE(review): `last_i` is tested for truthiness, so a run whose
            # last matching line is line 0 is rejected here — confirm intent.
            if last_i and accum/num_words > 0.9 and lines[last_i].endswith(re.split(r'[ \n]',tab[-1][-1])[-1] ):
                span = (start, last_i)
                spans.append(span)
                break
            # Otherwise discard the candidate run and start over.
            accum = 0
            accum_i = 0
            last_i = None
            
    if span == None:
        # Debug pass: no span was found. Repeat the scan with progress prints
        # and WITHOUT the 90% threshold, then stop processing further tables
        # (see the `break` at the end of this branch).
        print('?')
        start = 0
        accum = 0
        i = -1
        span = None
        last_i = None
        accum_i = 0
        failed_count = 0
        for line in lines:
            i += 1
            if accum > 0:
                print(i, ':', accum,'/',num_words)

            if target_words_only_regex.match(line):
                last_failed = None
                failed_count = 0
                matched = target_words_regex.findall(line)
                if accum == 0:
                    start = i
                accum += len(matched)
                accum_i += 1
                last_i = i
                if accum == num_words:
                    span = (start, i)
                    spans.append(span)
                    break
            else:
                failed_count += 1
                if accum_i > 2 and failed_count <= 1:
                    continue

                # Relaxed acceptance: only the "ends with last token" test remains.
                if last_i and lines[last_i].endswith(re.split(r'[ \n]',tab[-1][-1])[-1] ):
                    span = (start, last_i)
                    spans.append(span)
                    print(span)
                    break
                accum = 0
                accum_i = 0
                last_i = None

        # Leaves the outer `for tab in tables` loop: tables after the first
        # unmatched one are never searched.
        break
# Order spans by position in the text.
# NOTE(review): after sorting (and after any table that produced no span) the
# i-th span no longer necessarily belongs to the i-th table — verify against
# the code that pairs spans with tables by index.
spans = sorted(spans)

?
159 : 1 / 6
(158, 158)


In [13]:
def table2text(tab):
    """Render a table (list of rows of cell strings) as ``|||``-delimited text.

    Cells of a row are joined with ``' ||| '`` and every row is terminated by
    ``' |||\\n'``.  Empty or ``None`` cells become ``'<pad>'``.  Inside a
    cell, a newline that directly follows a space or a CJK character is
    removed.
    """
    rendered_rows = [
        ' ||| '.join(
            re.sub(r'([ \u4e00-\u9fff])\n', r'\1', (cell or '<pad>'))
            for cell in row
        )
        for row in tab
    ]
    return ' |||\n'.join(rendered_rows) + ' |||\n'

# Rebuild the document text, replacing each located table span by the
# `|||`-delimited rendering of the corresponding table.
new_lines = []

# Recomputed here instead of relying on the `lines` variable leaked from the
# span-matching loop (which is undefined when `tables` is empty).
lines = content.split('\n')

# NOTE(review): the i-th (sorted) span is paired with the i-th table. That is
# only right if every table produced a span and spans appear in table order —
# verify against the span-matching cell.
last = 0
for i, (a, b) in enumerate(spans):
    if last < a:
        # Untouched text between the previous table and this one.
        new_lines.extend(lines[last:a])

    tab = tables_filtered[i]
    new_lines.append(table2text(tab))

    # Spans are inclusive, so resume AFTER line b. Resuming at b repeated the
    # table's last raw line right after its rendering. max() keeps `last`
    # monotonic should two spans overlap.
    last = max(last, b + 1)
if last < len(lines):
    new_lines.extend(lines[last:])


text = '\n'.join(e.strip().replace('\t',' ') for e in new_lines)
print(text)

CHEMICAL BIOLOGY
Edited by Deniz Ekinci


Chemical Biology
Edited by Deniz Ekinci
Published by InTech
Janeza Trdine 9, 51000 Rijeka, Croatia
Copyright © 2012 InTech
All chapters are Open Access distributed under the Creative Commons Attribution 3.0
license, which allows users to download, copy and build upon published articles even for
commercial purposes, as long as the author and publisher are properly credited, which
ensures maximum dissemination and a wider impact of our publications. After this work
has been published by InTech, authors have the right to republish it, in whole or part, in
any publication of which they are the author, and to make other personal use of the
work. Any republication, referencing or personal use of the work must explicitly identify
the original source.
As for readers, this license allows users to download, copy and build upon published
chapters even for commercial purposes, as long as the author and publisher are properly
credited, which ensures maximum

In [291]:
# Sanity check: the two counts are equal only when every table produced a span.
len(spans), len(tables)

(30, 30)

In [11]:
# Print the raw extracted text (tabs shown as single spaces) for comparison
# with the table-substituted `text`.
print(content.replace('\t',' '))


CHEMICAL BIOLOGY
     Edited by Deniz Ekinci


Chemical Biology
Edited by Deniz Ekinci
Published by InTech
Janeza Trdine 9, 51000 Rijeka, Croatia
Copyright © 2012 InTech
All chapters are Open Access distributed under the Creative Commons Attribution 3.0
license, which allows users to download, copy and build upon published articles even for
commercial purposes, as long as the author and publisher are properly credited, which
ensures maximum dissemination and a wider impact of our publications. After this work
has been published by InTech, authors have the right to republish it, in whole or part, in
any publication of which they are the author, and to make other personal use of the
work. Any republication, referencing or personal use of the work must explicitly identify
the original source.
As for readers, this license allows users to download, copy and build upon published
chapters even for commercial purposes, as long as the author and publisher are properly
credited, which ensures ma

In [None]:

# Convert a downloaded EPUB held in memory to text.
buffer = io.BytesIO()
# NOTE(review): `r` is not assigned at notebook level in the visible cells
# (it is only a local inside get_doc_from_url) — this line presumably relies
# on a response object left over from a deleted cell; confirm or replace.
buffer.write(r.content)
book = epub2txt(buffer).convert()

In [None]:
from glob import glob
# List the EPUB files found exactly two directory levels below
# `The-Economist/` (path is relative to the working directory).
for fn in glob('The-Economist/*/*.epub'):
    print(fn)

In [None]:


# Fetch `url`; on failure report it and leave `response` as None.
# The original used `continue` here, which is a SyntaxError outside a loop.
# NOTE(review): `fetch_url` is not defined in the visible cells — confirm it
# is provided elsewhere.
try:
    response = fetch_url(url, timeout = 10)
except Exception as e:
    print('outgoing', url, 'unsuccessful', e)
    response = None

# Extension of the URL path, leading dot included (e.g. '.pdf').
_type = url2extenstion(url)


In [None]:
def process_link(url):
    """Fetch ``url`` and return the plain text of the document it leads to.

    If the URL does not itself end in a supported document extension
    (``txt``, ``doc``, ``docx``, ``pdf``), the response is treated as an HTML
    page and the first hyperlink pointing at a supported document is followed
    instead.  When no such link exists, the page's own text is returned for
    ``html``/``htm`` URLs.

    Parameters
    ----------
    url : str
        Address of a document or of a page linking to one.

    Returns
    -------
    str
        The extracted text with surrounding whitespace stripped, or ``''``
        when fetching or extraction fails or the type is unsupported.

    Notes
    -----
    Relies on ``fetch_url`` being defined elsewhere (it is not part of the
    visible cells) and returning a ``requests``-style response with
    ``.text``, ``.content`` and ``.url``.
    """
    supported = ['txt','doc','docx','pdf']

    try:
        response = fetch_url(url, timeout = 10)
    except Exception as e:
        print('outgoing', url, 'unsuccessful', e)
        return ''

    # url2extenstion keeps the leading dot; strip it so the comparisons
    # against bare extensions below can actually match.
    _type = url2extenstion(url).lstrip('.').lower()

    if _type not in supported:
        # Bound up front so the error message below is always printable.
        redirect = url
        try:
            # The file imports BeautifulSoup under the alias `bs4`.
            soup = bs4(response.text,'html.parser')
            for anchor in soup.select('a[href]'):
                redirect = urllib.parse.urljoin(response.url, anchor['href'])
                # Decide on the parsed extension, not a substring test, so
                # '.docx' is no longer mistaken for '.doc'.
                link_type = url2extenstion(redirect).lstrip('.').lower()
                if link_type in supported:
                    _type = link_type
                    response = fetch_url(redirect, timeout = 10)
                    break
        except Exception as e:
            print('outgoing', redirect, 'unsuccessful', e)
            return ''

    print(url, _type)

    try:
        if _type in ['txt','html','htm']:
            content = response.text
        elif _type == 'doc':
            # Spill the bytes to a real file on disk and reopen it by name
            # for fulltext.
            with tempfile.NamedTemporaryFile(suffix='.doc') as tmp:
                tmp.write(response.content)
                tmp.flush()
                with open(tmp.name,'rb') as doc_file:
                    content = fulltext.get(doc_file, name='foo.doc', mime='application/doc',backend='doc')

        elif _type == 'docx':
            content = docx2txt.process(BytesIO(response.content))

        elif _type == 'pdf':
            content = "\n\n".join(pdftotext.PDF(BytesIO(response.content)))
        else:
            print('type unknown')
            return ''
    except Exception as e:
        content = ''
        print('error! >>',e)

    return content.strip()
    

In [58]:
import sys
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password='', caching=True, check_extractable=True):
    """Feed every selected page of the PDF open on ``fp`` through ``device``.

    A ``PDFPageInterpreter`` is built from ``rsrcmgr`` and ``device``; the
    remaining arguments are forwarded unchanged to ``PDFPage.get_pages``.
    Nothing is returned — the output is whatever ``device`` writes.
    """
    page_interpreter = PDFPageInterpreter(rsrcmgr, device)
    selected_pages = PDFPage.get_pages(
        fp,
        pagenos,
        maxpages=maxpages,
        password=password,
        caching=caching,
        check_extractable=check_extractable
    )
    for pdf_page in selected_pages:
        page_interpreter.process_page(pdf_page)

from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
from pdfminer.image import ImageWriter
import io
# --- pdfminer driver: render the downloaded PDF buffer `f` into `outfp` ---
laparams = LAParams()
# debug option
debug = 0
# input option
password = ''
pagenos = set()
maxpages = 0
# output option
outfile = None
outtype = 'text'   # one of 'text', 'xml', 'html', 'tag'
imagewriter = None
layoutmode = 'normal'
codec = 'utf-8'
pageno = 1
scale = 1
caching = True
showpageno = True
# Propagate the debug level to the pdfminer classes.
PDFDocument.debug = debug
PDFParser.debug = debug
CMapDB.debug = debug
PDFResourceManager.debug = debug
PDFPageInterpreter.debug = debug
PDFDevice.debug = debug
#
rsrcmgr = PDFResourceManager(caching=caching)

# The converter writes encoded bytes into this in-memory buffer.
outfp = io.BytesIO()

if outtype == 'text':
    device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                           imagewriter=imagewriter)
elif outtype == 'xml':
    device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                          imagewriter=imagewriter)
elif outtype == 'html':
    device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                           layoutmode=layoutmode, laparams=laparams,
                           imagewriter=imagewriter)
elif outtype == 'tag':
    device = TagExtractor(rsrcmgr, outfp, codec=codec)
else:
    # A bare `raise` here had no active exception to re-raise.
    raise ValueError('unsupported outtype: %r' % (outtype,))
# `f` is the in-memory PDF produced by the download cell.
fp = f
try:
    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
                caching=caching, check_extractable=True)
finally:
    # Always release the converter, even when a page fails to parse.
    device.close()

In [59]:
# Rewind the converter's output buffer and read everything it produced.
# `content` now holds bytes (outfp is an io.BytesIO), whereas the earlier
# extraction cell stored a str under the same name.
outfp.seek(0)
content = outfp.read()

In [61]:
# Decode the converter output (bytes.decode() defaults to UTF-8) and print it.
print(content.decode())

香港公開大學
THE OPEN UNIVERSITY OF HONG KONG

香港九龍何文田
Ho Man Tin, Kowloon, Hong Kong

大學網站  OUHK website: www.ouhk.edu.hk

本
科
生
課
程
學
生
手
冊

2
0
1
8
–
2
0
1
9

U
n
d
e
r
g
r
a
d
u
a
t
e
 
P
r
o
g
r
a
m
m
e
s
 
S
t
u
d
e
n
t
 

H
a
n
d
b
o
o
k

Undergraduate Programmes

Student 
Handbook

本 科 生 課 程 學 生 手 冊

2018–2019

本科生課程

學生手冊

Undergraduate Programmes

Student
Handbook

2018–2019 

©  THE OPEN UNIVERSITY OF HONG KONG, 2018

香港公開大學2018

Ho Man Tin, Kowloon, Hong Kong

香港九龍何文田

As the content of this Handbook may change from time to time, readers are 
advised to refer to the online version (http://www.ouhk.edu.hk/ug_student_ 
handbook/eng) for the most updated information.

本手冊的內容會隨時因應需要而作出修訂，學生可以閱覽網上版本 
（http://www.ouhk.edu.hk/ug_student_handbook/chi）查閱最新資訊。

This Handbook is printed on environmentally friendly paper.
本手冊使用環保紙張印製。

Contents 目 錄

1  Academic Calendar 2018–2019 

2018–2019年度校曆表 

2  General Regulations for Undergraduate Programmes 

本科生課程的一般規例 

3  Programme Requiremen

In [57]:
# NOTE(review): `html2passage` is not defined in the visible cells, and this
# cell's execution count (57) precedes that of the pdfminer cells above —
# presumably it was run against HTML converter output from an earlier
# session; confirm before relying on it.
text = html2passage(content)
print(text)

Page 1 
 香港公開大學
 
 THE OPEN UNIVERSITY OF HONG KONG
 
 香港九龍何文田
 
 Ho Man Tin, Kowloon, Hong Kong
 
 大學網站   OUHK website: www.ouhk.edu.hk
 
 本
 
 科
 
 生
 
 課
 
 程
 
 學
 
 生
 
 手
 
 冊
 
 2
 
 0
 
 1
 
 8
 
 –
 
 2
 
 0
 
 1
 
 9
 
 U
 
 n
 
 d
 
 e
 
 r
 
 g
 
 r
 
 a
 
 d
 
 u
 
 a
 
 t
 
 e
 
 P
 
 r
 
 o
 
 g
 
 r
 
 a
 
 m
 
 m
 
 e
 
 s
 
 S
 
 t
 
 u
 
 d
 
 e
 
 n
 
 t
 
 H
 
 a
 
 n
 
 d
 
 b
 
 o
 
 o
 
 k
 
 Undergraduate Programmes
 
 Student 
 
 Handbook
 
 本 科 生 課 程 學 生 手 冊
 
 2018–2019
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 Page 2 
 
 Page 3 
 本科生課程
 
 學生手冊
 
 Undergraduate Programmes
 
 Student
 
 Handbook
 
 2018–2019 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 Page 4