In [None]:
import re
# Regex fragments used by split_into_sentences. Raw strings keep regex
# escapes such as \s out of Python's own string-escape processing.
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov|edu|me)"
digits = r"([0-9])"


def split_into_sentences(text):
    """Split ``text`` into a list of sentences using rule-based heuristics.

    Periods that do not end a sentence (titles, web suffixes, decimals,
    ellipses, initials, acronyms, company suffixes) are temporarily replaced
    by the marker ``<prd>``. Every remaining ``.``, ``?`` and ``!`` is then
    followed by a ``<stop>`` marker, the ``<prd>`` markers are turned back
    into periods, and the text is split on ``<stop>``.

    Args:
        text: The text to split. Newlines are treated as spaces.

    Returns:
        A list of sentences with surrounding whitespace stripped. Trailing
        text without a terminating ``.``, ``?`` or ``!`` is returned as the
        final element rather than being discarded.
    """
    # Padding lets the space-anchored patterns below match at both ends.
    text = " " + text + "  "
    text = text.replace("\n", " ")

    # Protect periods that are not sentence boundaries.
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    text = re.sub(digits + "[.]" + digits, r"\1<prd>\2", text)
    text = text.replace("...", "<prd><prd><prd>")
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    # An acronym followed by a typical sentence opener does end a sentence.
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(
        alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
        r"\1<prd>\2<prd>\3<prd>",
        text,
    )
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)

    # Move a closing double quote in front of the terminator so the quote
    # stays attached to the sentence it closes. Curly and straight quotes
    # are treated alike for all three terminators.
    for mark in ".!?":
        for quote in ("”", "\""):
            text = text.replace(mark + quote, quote + mark)

    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    sentences = [piece.strip() for piece in text.split("<stop>")]
    # Only drop the final piece when it is empty (text ended on a
    # terminator); otherwise it is an unterminated last sentence to keep.
    if sentences and not sentences[-1]:
        sentences.pop()
    return sentences

In [None]:
import zipfile
import os
from torchvision import datasets

def extract_zip(zip_path, remove_finished=True):
    """Extract a zip archive into a directory named after the archive.

    Args:
        zip_path: Path to the archive. Its contents are extracted into the
            same path with the final extension removed
            (``a/text.zip`` -> ``a/text``).
        remove_finished: When true, delete the archive after extraction.
    """
    print('Extracting {}'.format(zip_path))
    # Strip only the trailing extension; str.replace would also remove any
    # '.zip' occurring earlier in the path (e.g. in a parent directory name).
    target_dir = os.path.splitext(zip_path)[0]
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(target_dir)
    if remove_finished:
        os.remove(zip_path)

def download_dataset(url, root='../data/data2'):
    """Download a zip archive from ``url`` into ``root`` and extract it.

    The archive is saved as ``root/text.zip`` and then passed to
    ``extract_zip`` with its default arguments.

    Args:
        url: Address of the zip archive to fetch.
        root: Directory that receives the download; created if missing.

    Returns:
        The path ``root/text``.
    """
    # exist_ok avoids a FileExistsError when root is already present but
    # root/text is not (for example after an interrupted earlier run).
    os.makedirs(root, exist_ok=True)
    datasets.utils.download_url(url, root, 'text.zip', None)
    extract_zip(os.path.join(root, 'text.zip'))
    return os.path.join(root, 'text')


# Fetch and unpack each Project Gutenberg archive in turn.
for url in (
    'https://www.gutenberg.org/files/98/98-0.zip',        # A Tale of Two Cities
    'https://www.gutenberg.org/files/24022/24022.zip',    # A Christmas Carol
    'https://www.gutenberg.org/files/883/883-0.zip',      # Our Mutual Friend
    'https://www.gutenberg.org/files/766/766-0.zip',      # David Copperfield
    'https://www.gutenberg.org/files/47529/47529-0.zip',  # Oliver Twist v1
    'https://www.gutenberg.org/files/47530/47530-0.zip',  # Oliver Twist v2
    'https://www.gutenberg.org/files/47531/47531-0.zip',  # Oliver Twist v3
    'https://www.gutenberg.org/files/786/786-0.zip',      # Hard Times
    'https://www.gutenberg.org/files/1023/1023.zip',      # Bleak House
):
    download_dataset(url)

# List the contents of the extraction directory
!ls ../data/data2/text

# Move each extracted text file into the working directory,
# renaming it from its numeric file name to the book's title
!mv ../data/data2/text/98-0.txt ./a_tale_of_two_cities.txt
!mv ../data/data2/text/24022.txt ./a_christmas_carol.txt
!mv ../data/data2/text/883-0.txt ./our_mutual_friend.txt
!mv ../data/data2/text/766-0.txt ./david_copperfield.txt
!mv ../data/data2/text/47529-0.txt ./oliver_twist_1.txt
!mv ../data/data2/text/47530-0.txt ./oliver_twist_2.txt
!mv ../data/data2/text/47531-0.txt ./oliver_twist_3.txt
!mv ../data/data2/text/786-0.txt ./hard_times.txt
!mv ../data/data2/text/1023.txt ./bleak_house.txt

In [None]:
# Tokenize all sentences and compile into a dataframe for each book
# All dataframes have a single column 'lines' -- one sentence per line
import pandas as pd  # `pd` is used below; imported here so the cell runs on its own


def _book_sentences_df(path):
    """Read the text file at ``path`` and return a one-column dataframe.

    The column is named 'lines' and holds the sentences produced by
    split_into_sentences, one per row.
    """
    # The context manager closes the file even if reading raises.
    with open(path) as book_file:
        book_text = book_file.read()
    book_df = pd.DataFrame()
    book_df['lines'] = split_into_sentences(book_text)
    return book_df


ttc_df = _book_sentences_df('./a_tale_of_two_cities.txt')  # T2C
cc_df = _book_sentences_df('./a_christmas_carol.txt')      # CC
omf_df = _book_sentences_df('./our_mutual_friend.txt')     # OMF
dc_df = _book_sentences_df('./david_copperfield.txt')      # DC
ot1_df = _book_sentences_df('./oliver_twist_1.txt')        # OT1
ot2_df = _book_sentences_df('./oliver_twist_2.txt')        # OT2
ot3_df = _book_sentences_df('./oliver_twist_3.txt')        # OT3
ht_df = _book_sentences_df('./hard_times.txt')             # HT
bh_df = _book_sentences_df('./bleak_house.txt')            # BH

In [None]:
# Remove the temporary per-book text files from the working directory
!rm ./a_tale_of_two_cities.txt
!rm ./a_christmas_carol.txt
!rm ./our_mutual_friend.txt
!rm ./david_copperfield.txt
!rm ./oliver_twist_1.txt
!rm ./oliver_twist_2.txt
!rm ./oliver_twist_3.txt
!rm ./hard_times.txt
!rm ./bleak_house.txt

In [None]:
# Stack the per-book dataframes row-wise into one dataframe,
# renumbering the rows with a fresh index
df = pd.concat(
    objs=[ttc_df, cc_df, omf_df, dc_df, ot1_df, ot2_df, ot3_df, ht_df, bh_df],
    ignore_index=True,
    axis=0,
)

In [None]:
# Write the combined dataframe to disk as a CSV file
df.to_csv(path_or_buf='./charles_dickens.csv')

In [None]:
# Show the sentence stored under index label 15000
print(df['lines'][15000])

’  ‘Don’t you fear ME no more, ma’am,’ said Betty; ‘I thought of it for good yesterday.
