In [11]:
import re, requests, os, time, random, csv, tqdm

In [7]:
def createBookBlocks(fn= './data/GUTINDEX.ALL.iso-8859-1.txt', encoding='iso-8859-1'):
    """Split a Project Gutenberg index file into per-book blocks of lines.

    A block starts at the first line whose last non-whitespace character is a
    digit, collects every following non-blank line, and ends at the next
    blank line (or at end of file). Non-blank lines seen before any block has
    started are skipped.

    Args:
        fn: Path of the index file to read.
        encoding: Text encoding used to decode the file. Defaults to
            ISO-8859-1, matching the default file name.

    Returns:
        A list of blocks; each block is a list of right-stripped lines.
    """
    gut_all= []
    book_block= []
    # `with` guarantees the handle is closed; the explicit encoding keeps the
    # result independent of the platform's default locale.
    with open(fn, encoding=encoding) as index_file:
        for raw_line in index_file:
            line= raw_line.rstrip()
            if not line:  # blank line: book end
                if book_block:
                    gut_all.append(book_block)
                book_block= []
            elif book_block or line[-1].isdigit():
                # Either a continuation of the current block, or a line
                # ending in a digit that opens a new one.
                book_block.append(line)
    # Flush the final block when the file does not end with a blank line.
    if book_block:
        gut_all.append(book_block)
    return gut_all

In [25]:
def getBookInd(book_block):
    """Return the last whitespace-separated token of the block's first line."""
    header_tokens= book_block[0].split()
    return header_tokens[-1]

def getBookTitle(book_block):
    """Return the block's first line without its last token, as ASCII text.

    The remaining tokens are re-joined with single spaces and any non-ASCII
    characters are dropped.
    """
    header_tokens= book_block[0].split()
    del header_tokens[-1:]  # discard the trailing token (no-op on an empty line)
    title= ' '.join(header_tokens)
    ascii_bytes= title.encode('ascii', 'ignore')
    return ascii_bytes.decode()
    
def getBookText(book_ind, timeout=30):
    """Download a book's text from the gutenberg.readingroo.ms mirror, lower-cased.

    The book's directory listing is fetched first; the first name ending in
    ``.txt`` found in that listing is then downloaded.

    Args:
        book_ind: Book number as a string of digits (e.g. ``'12345'``).
        timeout: Seconds to wait on each HTTP request before ``requests``
            raises a timeout error.

    Returns:
        The lower-cased text of the book, or ``None`` when the listing cannot
        be fetched, contains no ``.txt`` entry, or the download does not
        answer with HTTP 200.
    """
    # Mirror layout: every digit but the last is one directory level
    # (12345 -> 1/2/3/4/12345/); single-digit ids are stored under "0/".
    path= '/'.join(book_ind[:-1]) or '0'
    base_url= 'http://gutenberg.readingroo.ms/{path}/{ind}/'.format(path=path, ind=book_ind)
    listing= requests.get(base_url, timeout=timeout)
    if listing.status_code != 200:
        return None
    # Raw string with an escaped dot: match a literal ".txt" suffix only.
    text_fnl= re.findall(r'>(\S+\.txt)', listing.text)
    if not text_fnl:
        return None
    # base_url already ends in '/', so plain concatenation builds the URL
    # (os.path.join is meant for filesystem paths, not URLs).
    text_url= base_url + text_fnl[0]
    time.sleep(random.random()/10)  # small random pause between requests
    req= requests.get(text_url, timeout=timeout)
    if req.status_code != 200:
        print( req.status_code)
        print( text_url)
        print( 'book_ind:%s'%book_ind)
        # Do not hand an error page back to callers as if it were book text.
        return None
    return req.text.lower()



In [22]:
def getTextsMatchingKeywords(gut_all, 
                             keywords= (), 
                             outfn='./data/matchingTexts.csv', 
                             book_path_local='./data/texts'):
    """Fetch every indexed book and record those whose text contains a keyword.

    For each block in ``gut_all`` the book text is fetched with
    ``getBookText``. When at least one keyword occurs in it
    (case-insensitively), a row is written to the CSV at ``outfn`` and, if
    ``book_path_local`` is set, the ASCII-only text is saved there as
    ``<ind>.txt``.

    Args:
        gut_all: Iterable of book blocks (lists of index lines).
        keywords: Iterable of strings to search for.
        outfn: Path of the CSV to write (columns: ind, title, matching).
        book_path_local: Directory for the matching texts; a falsy value
            disables saving. The directory is created when missing.

    Returns:
        None.
    """
    # Lower-case each keyword once instead of once per book.
    keyword_pairs= [(s, s.lower()) for s in keywords]
    if book_path_local:
        os.makedirs(book_path_local, exist_ok=True)
    # newline='' lets the csv module control line endings itself.
    with open(outfn, 'w', newline='') as outf:
        # The field names must match the keys passed to writerow below.
        outc= csv.DictWriter(outf, fieldnames='ind,title,matching'.split(','))
        outc.writeheader()
        for book_block in tqdm.tqdm(gut_all):
            book_ind= getBookInd(book_block)
            book_title= getBookTitle(book_block)
            book_text= getBookText(book_ind)
            if not book_text:
                continue
            haystack= book_text.lower()
            matches= [s for s, needle in keyword_pairs if needle in haystack]
            if not matches:
                continue
            if book_path_local:
                text_fn= os.path.join(book_path_local, book_ind+'.txt')
                # `with` closes each saved text instead of leaking the handle.
                with open(text_fn, 'w') as textf:
                    textf.write(book_text.encode('ascii', 'ignore').decode())
            outc.writerow({'ind':int(book_ind), 
                           'title':book_title, 
                           'matching':';'.join(matches)})

In [27]:
def getTextsByAuthor(gut_all, author= 'Oscar Wilde', outfn='./data/matchingTexts.csv', book_path_local='./data/texts'):
    """Fetch every indexed book whose first index line contains ``by <author>``.

    For each such block the book text is fetched with ``getBookText``. When
    text is returned, a row is written to the CSV at ``outfn`` and, if
    ``book_path_local`` is set, the ASCII-only text is saved there as
    ``<ind>.txt``.

    Args:
        gut_all: Iterable of book blocks (lists of index lines).
        author: Author name looked up as the substring ``'by <author>'``.
        outfn: Path of the CSV to write (columns: ind, title).
        book_path_local: Directory for the downloaded texts; a falsy value
            disables saving. The directory is created when missing.

    Returns:
        None.
    """
    byline= 'by %s'%author
    if book_path_local:
        os.makedirs(book_path_local, exist_ok=True)
    # newline='' lets the csv module control line endings itself.
    with open(outfn, 'w', newline='') as outf:
        outc= csv.DictWriter(outf, fieldnames='ind,title'.split(','))
        outc.writeheader()
        for book_block in tqdm.tqdm(gut_all):
            if byline not in book_block[0]:
                continue
            book_ind= getBookInd(book_block)
            book_title= getBookTitle(book_block)
            book_text= getBookText(book_ind)
            if not book_text:
                continue
            if book_path_local:
                text_fn= os.path.join(book_path_local, book_ind+'.txt')
                # `with` closes each saved text instead of leaking the handle.
                with open(text_fn, 'w') as textf:
                    textf.write(book_text.encode('ascii', 'ignore').decode())
            outc.writerow({'ind':int(book_ind), 
                           'title':book_title})

In [14]:
# Parse the index into per-book blocks. No `parseIndex` is defined in this
# notebook; `createBookBlocks` is the index parser, so a fresh
# Restart & Run All needs that name here.
gut_ind= createBookBlocks()

### Cheshire cat

In [5]:
# Search phrases: "Cheshire" followed by each cat synonym.
keywords= {'Cheshire %s'%synonym
           for synonym in 'cat,feline,tomcat,tom,kitten,mouser,puss,kitty,furball'.split(',')}

In [None]:
# Fetch every indexed book and keep those mentioning any of the keywords.
getTextsMatchingKeywords(
    gut_all=gut_ind,
    keywords=keywords,
    outfn='./data/matchingTexts.csv',
    book_path_local='./data/texts',
)

### Oscar Wilde

In [28]:
# Write the author results to their own CSV: reusing
# './data/matchingTexts.csv' would overwrite the keyword-search results
# produced by the cell above on a full run.
getTextsByAuthor(gut_ind, 
                 author= 'Oscar Wilde', 
                 outfn='./data/matchingTexts_ow.csv', 
                 book_path_local='./data/texts_ow')


  0%|          | 0/43022 [00:00<?, ?it/s][A
  9%|▊         | 3692/43022 [00:00<00:01, 30462.26it/s][A
 11%|█         | 4606/43022 [00:00<00:04, 8018.32it/s] [A
 18%|█▊        | 7675/43022 [00:00<00:04, 8402.89it/s][A
 20%|█▉        | 8469/43022 [00:01<00:10, 3159.67it/s][A
 26%|██▌       | 11014/43022 [00:01<00:08, 3897.16it/s][A
 29%|██▉       | 12543/43022 [00:02<00:07, 4065.48it/s][A
 31%|███       | 13150/43022 [00:02<00:07, 3823.76it/s][A
 32%|███▏      | 13676/43022 [00:02<00:07, 3877.77it/s][A
 48%|████▊     | 20465/43022 [00:02<00:04, 5099.10it/s][A
 50%|████▉     | 21384/43022 [00:02<00:04, 4572.35it/s][A
 52%|█████▏    | 22552/43022 [00:03<00:04, 4447.30it/s][A
 58%|█████▊    | 25008/43022 [00:03<00:03, 5074.46it/s][A
 60%|█████▉    | 25692/43022 [00:03<00:04, 3853.09it/s][A
 61%|██████    | 26245/43022 [00:03<00:04, 3654.13it/s][A
 66%|██████▌   | 28415/43022 [00:04<00:03, 4304.74it/s][A
 67%|██████▋   | 28976/43022 [00:04<00:05, 2381.27it/s][A
 72%|███████