<a href="https://colab.research.google.com/github/daveshap/QuestionDetector/blob/main/DownloadGutenbergTop100.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download Top 100 Ebooks
Gutenberg has a bunch of books for free. Fiction has a lot of dialog. 

In [None]:
import requests

prefix = 'https://www.gutenberg.org/files/'
# final address example: https://www.gutenberg.org/files/84/84-0.txt
books = ['84','1342','25344','2542','11','1952','16328','1250','41','1080','46','5200','98','43','1232','1661','2701','345','844','23','160','209','205','76','1064','42108','408','74','1260','174','2591','16','1497','1400','2852','120','219','215','42884','3207','6130','42324','38269','158','203','2814','514','2500','3825','2600','40074','55','4300','58585','15399','45','1184','36','5740','1727','140','768','902','2554','996','113','3600','2148','19942','57775','244','27827','135','28054','7370','132','20203','13701','1254','1001','1063','11030','32415','779','3296','4363','34901','22381','12122','236','308','4517','35','376','147','1251','766','16643','1998','730']

for book in books:
  bookurl = prefix + '%s/%s-0.txt' % (book, book)
  #print('Downloading:', bookurl)
  response = requests.get(bookurl)
  body = response.content.decode('utf-8')
  with open('/content/drive/My Drive/Gutenberg/%s.txt' % book, 'w', encoding='utf-8') as file:
    file.write(body)

# Cleanup Docs
- Split books based on sections delimited by vertical whitespace
- Remove short sections, sections full of all caps, and sections with too many symbols

In [18]:
import os
import re

gut_dir = '/content/drive/My Drive/Gutenberg/'
data = list()

def clean_chunk(text):
  text = text.strip()  # trim the ends
  text = re.sub(r'\t', ' ', text)  # replace tabs with spaces
  text = re.sub(r'\s+', ' ', text)  # replace any extra whitespace with single
  return text
  
def measure_text(text)  :
  # returns count of: chars, word chars, digits, whitespace
  if len(text) == 0:
    return 0, 0, 0, 0
  pct_wordchars = len(re.findall('\w', text)) / len(text)
  pct_digits = len(re.findall('\d', text)) / len(text)
  pct_whitespace = len(re.findall('\s', text)) / len(text)
  pct_caps = len(re.findall('[A-Z]', text)) / len(text)
  return pct_wordchars, pct_digits, pct_whitespace, pct_caps

def split_vertical(text):
  results = list()
  chunks = re.split('\n{2,}', text)
  for chunk in chunks:
    morechunks = re.split('[\r\n]{4,}', chunk)
    for more in morechunks:
      results.append(more)
  return results

for file in os.listdir(gut_dir):
  if 'data' in file:
    continue
  with open(gut_dir + file, 'r', encoding='utf-8') as infile:
    body = infile.read()
  if '<!DOCTYPE html>' in body:
    continue
  chunks = split_vertical(body)
  #print('Reading file:', file, '...found chunks:', len(chunks))
  for chunk in chunks:
    # TODO check chunk for all caps, too many symbols, quotation marks, etc
    if '"' in chunk or '“' in chunk:  # if text is dialog, put it in
      data.append(clean_chunk(chunk))
      continue
    if '!' in chunk or '?' in chunk:
      data.append(clean_chunk(chunk))
      continue
    if len(chunk) < 15:  # line is just too short
      continue
    if '.' not in chunk:  # no punctuation at all
      continue 
    pwc, pd, pws, pc = measure_text(chunk)
    if pwc < 0.8:  # if too many symbols
      continue
    if pc > 0.6:  # if too many caps
      continue 
    data.append(clean_chunk(chunk))

with open(gut_dir + 'all_data.txt', 'w', encoding='utf-8') as file:
  for i in data:
    file.write(i + '\n\n')

print('All done, data saved!')

All done, data saved!


# Split Sentences
Use SpaCy to split sentences along boundaries, save the result with one sentence per line. 

In [21]:
!pip install spacy --quiet
!pip install pysbd --quiet

import spacy
from pysbd.utils import PySBDFactory
import os
import re

gut_dir = '/content/drive/My Drive/Gutenberg/'
data = list()

with open(gut_dir + 'all_data.txt', 'r', encoding='utf-8') as infile:
  body = infile.read()

nlp = spacy.blank('en')
nlp.add_pipe(PySBDFactory(nlp))

chunks = re.split('\n\n', body)
print('Processing chunks:', len(chunks))
count = 0
total = 1
for chunk in chunks:
  count += 1
  total += 1
  if count > 5000:
    count = 0
    print('Processed:', total)
  doc = nlp(chunk)
  for sent in list(doc.sents):
    data.append(sent)

with open(gut_dir + 'sentence_data.txt', 'w', encoding='utf-8') as outfile:
  for i in data:
    outfile.write(str(i) + '\n\n')

Processing chunks: 113896
Processed: 5002
Processed: 10003
Processed: 15004
Processed: 20005
Processed: 25006
Processed: 30007
Processed: 35008
Processed: 40009
Processed: 45010
Processed: 50011
Processed: 55012
Processed: 60013
Processed: 65014
Processed: 70015
Processed: 75016
Processed: 80017
Processed: 85018
Processed: 90019
Processed: 95020
Processed: 100021
Processed: 105022
Processed: 110023
