# Preprocessor for Text 1
Preprocessing includes removing special characters, lowercasing all the letters and reducing the text to the main content.

### Setting Up

In [1]:
import regex as re                  # Used for matching words in the text
import unidecode                    # To remove Greek accents

### Reading the Book

In [2]:
book_name = "The Kingmakers, by Burton E. Stevenson.txt"
with open(book_name, encoding="utf-8") as book:
    lines = book.readlines()

print(lines[:10])

['\ufeffThe Project Gutenberg eBook of The Kingmakers, by Burton E. Stevenson\n', '\n', 'This eBook is for the use of anyone anywhere in the United States and\n', 'most other parts of the world at no cost and with almost no restrictions\n', 'whatsoever. You may copy it, give it away or re-use it under the terms\n', 'of the Project Gutenberg License included with this eBook or online at\n', 'www.gutenberg.org. If you are not located in the United States, you\n', 'will have to check the laws of the country where you are located before\n', 'using this eBook.\n', '\n']


### Preprocessing

In [3]:
# Extracting the main content from the text

begin_index = lines.index("CHAPTER I\n")
end_index = len(lines) - 1 - lines[::-1].index("THE END\n")
print("The main content is from line numbers {} to {}".format(begin_index, end_index))

lines = lines[begin_index:end_index]        # Reducing lines to main content

The main content is from line numbers 157 to 9729


In [4]:
# Removing chapter headings, part headings and empty lines

part_pattern = r"PART [IVX]+"
chapter_pattern = r"CHAPTER [IVX]+"

temp = []
for line in lines:
    is_valid = ((line == '\n') or re.match(part_pattern, line) or re.match(chapter_pattern, line))
    if(not is_valid):               # If the line is neither a chapter number nor a part heading nor an empty line
        temp.append(line)           # include it in the final list

lines = temp
print(lines[:10])


['THE COUNTESS RÃ‰MOND\n', 'Selden, entering from the dining-room, saw that the lounge was crowded,\n', 'and he paused for a moment to look about him. It was the half-hour\n', 'between dinner and the Sporting Club, and he was pleasantly aware of\n', 'the odours of good coffee and super-excellent tobacco, mingled with the\n', 'delicate and very expensive perfumes rising from the clothes, the hair,\n', 'the shoulders of the women lying indolently back in the deep chairs.\n', 'It was the women who dominated the scene. There were men present, to be\n', 'sure, but they were as unobtrusive to the eye, as strictly utilitarian,\n', 'as the donor kneeling humbly in the corner of the picture before the\n']


In [5]:
# Removing special characters and combining all the lines into one string

joined_book = ''.join(lines)                            # Combining all the lines to a single string
joined_book = unidecode.unidecode(joined_book)          # Removing Greek accents
joined_book = joined_book.lower()                       # Turing all the characters to lower case
joined_book = re.sub(r'\W+', '_', joined_book)           # Removing non-alphanumeric characters
joined_book = re.sub('[\s]+', '_', joined_book)            # Replacing spaces with '_'
joined_book = re.sub('[_]+', ' ', joined_book)             # Replacing '_' back to ' '

print(joined_book[:1000])

with open("T1.txt", "w") as T1:
    T1.write(joined_book)


the countess remond selden entering from the dining room saw that the lounge was crowded and he paused for a moment to look about him it was the half hour between dinner and the sporting club and he was pleasantly aware of the odours of good coffee and super excellent tobacco mingled with the delicate and very expensive perfumes rising from the clothes the hair the shoulders of the women lying indolently back in the deep chairs it was the women who dominated the scene there were men present to be sure but they were as unobtrusive to the eye as strictly utilitarian as the donor kneeling humbly in the corner of the picture before the madonna he had paid to have painted these men were donors too of many things besides paint but the resemblance ended there for there was nothing madonna like about the women they differed in being blonde or brune of various contours and of all ages but some subtle quality of spirit bound them together in a common sisterhood their gowns ran the gamut of the r