# Preprocessor for Text 3
Preprocessing includes removing special characters, lowercasing all the letters and reducing the text to the main content.

### Setting Up

In [1]:
import regex as re                  # Used for matching words in the text
import unidecode                    # To remove Greek accents

### Reading the Book

In [2]:
# Load the whole book into memory as a list of lines (each entry keeps its trailing newline)
book_name = "The Secret of Toni by Molly Elliot Seawell.txt"

with open(book_name, encoding="utf-8") as book:
    lines = list(book)              # Iterating over a text file yields it line by line

# Preview the first ten lines to confirm that the file was read correctly
print(lines[:10])

['\ufeffThe Project Gutenberg eBook of The Secret of Toni, by Molly Elliot\n', 'Seawell\n', '\n', 'This eBook is for the use of anyone anywhere in the United States and\n', 'most other parts of the world at no cost and with almost no restrictions\n', 'whatsoever. You may copy it, give it away or re-use it under the terms\n', 'of the Project Gutenberg License included with this eBook or online at\n', 'www.gutenberg.org. If you are not located in the United States, you\n', 'will have to check the laws of the country where you are located before\n', 'using this eBook.\n']


### Preprocessing

In [3]:
# Locating the story itself: it runs from the first "CHAPTER I" heading
# up to (but not including) the last "THE END" line

start_marker = "CHAPTER I\n"
end_marker = "THE END\n"

begin_index = lines.index(start_marker)

# Searching the reversed list finds the LAST occurrence of the end marker;
# its offset from the end is then converted back to an index in the original list
offset_from_end = list(reversed(lines)).index(end_marker)
end_index = len(lines) - 1 - offset_from_end

print("The main content is from line numbers {} to {}".format(begin_index, end_index))

lines = lines[begin_index:end_index]        # Keeping only the main content

The main content is from line numbers 140 to 7366


In [4]:
# Dropping structural lines: chapter headings, part headings and empty lines

part_pattern = r"PART [IVX]+"
chapter_pattern = r"CHAPTER [IVX]+"


def _is_heading_or_blank(text_line):
    """Return True for an empty line or a line that starts with a part/chapter heading."""
    if text_line == '\n':
        return True
    if re.match(part_pattern, text_line):           # re.match only anchors at the start of the line
        return True
    return bool(re.match(chapter_pattern, text_line))


# Keep every line that is neither a heading nor an empty line
lines = [text_line for text_line in lines if not _is_heading_or_blank(text_line)]
print(lines[:10])


["Toni's name was Antoine Marcel, but he was never called by it but once\n", 'in his life, and that was at his baptism, when he was eight days old.\n', 'He had a shock of black hair and a snub nose, and the tan and freckles\n', 'on his face were an inch thick, but he had a pair of black eyes so soft\n', 'and bright and appealing that they might have belonged to one of the\n', 'houris of Paradise. His wide mouth was full of sharp, white teeth, and\n', 'when he smiled, which was very often, his smile began with his black\n', 'eyes and ended with his white teeth.\n', 'At ten years of age Toni was a complete man of the world--of his world,\n', 'that is. This consisted of a gay, sunny little old garrison town,\n']


In [5]:
# Removing special characters and combining all the lines into one string

joined_book = ''.join(lines)                            # Combining all the lines into a single string
joined_book = unidecode.unidecode(joined_book)          # Transliterating non-ASCII characters (e.g. accented letters) to ASCII
joined_book = joined_book.lower()                       # Turning all the characters to lower case

# Every kind of separator is first turned into '_'; the runs of '_' are then collapsed
# into a single space. The patterns are raw strings so that the backslash classes reach
# the regex engine unchanged instead of being treated as (invalid) string escapes.
joined_book = re.sub(r'[\s]+', '_', joined_book)        # Replacing whitespace (spaces, newlines) with '_'
joined_book = re.sub(r'-{2,}', '_', joined_book)        # Replacing dashes written as '--' with '_'
joined_book = re.sub(r'\W+', '_', joined_book)          # Replacing the remaining non-word characters (punctuation) with '_'
joined_book = re.sub(r'[_]+', ' ', joined_book)         # Replacing each run of '_' with a single ' '

print(joined_book[:1000])

# The encoding is stated explicitly so that the written file does not depend on the
# platform's default encoding
with open("T3.txt", "w", encoding="utf-8") as T3:
    T3.write(joined_book)


toni s name was antoine marcel but he was never called by it but once in his life and that was at his baptism when he was eight days old he had a shock of black hair and a snub nose and the tan and freckles on his face were an inch thick but he had a pair of black eyes so soft and bright and appealing that they might have belonged to one of the houris of paradise his wide mouth was full of sharp white teeth and when he smiled which was very often his smile began with his black eyes and ended with his white teeth at ten years of age toni was a complete man of the world of his world that is this consisted of a gay sunny little old garrison town bienville by name in the south of france he had his friends his foes his lady love and also he had arranged his plan of life he knew himself to be the most fortunate person in all bienville in the first place his mother madame marcel kept the only candy shop in the town and toni being the only child of his mother and she a widow enjoyed all the ad