In [3]:
!pip install pdfminer
!pip install pdfminer.six
!pip install PyPDF2

Collecting pdfminer
  Downloading pdfminer-20191125.tar.gz (4.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m37.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycryptodome (from pdfminer)
  Downloading pycryptodome-3.19.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m75.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pdfminer
  Building wheel for pdfminer (setup.py) ... [?25l[?25hdone
  Created wheel for pdfminer: filename=pdfminer-20191125-py3-none-any.whl size=6140084 sha256=89e8bff7a21641588fd713dc4268a54fe5bf315da60d3e7bec0c5f4ffabf1f2b
  Stored in directory: /root/.cache/pip/wheels/4e/c1/68/f7bd0a8f514661f76b5cbe3b5f76e0033d79f1296012cbbf72
Successfully built pdfminer
Installing collected packages: pycryptodome, pdfminer
Successfully installed pdfmine

# 1. CONVERT PDF TO TEXT

In [1]:
from pdfminer.high_level import extract_text
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
import re
import os
import shutil

In [2]:
# Location noted by the author: /content/the_return_of_sherlock_holmes.pdf
doc_name = "the_return_of_sherlock_holmes"

# Every artefact for this document is kept in a folder named after it.
process_folder = doc_name
folder_missing = not os.path.exists(process_folder)
if folder_missing:
    os.makedirs(process_folder)

# Paths of the working copy of the PDF and of the extracted text.
path = f"{process_folder}/{doc_name}.pdf"
text_filename = f"{process_folder}/{doc_name}.txt"

# Copy the PDF into the working folder; failures are reported, not raised.
try:
    shutil.copy(f"{doc_name}.pdf", process_folder)
except FileNotFoundError:
    print(f"File '{doc_name}.pdf' not found. Please download it from the Resources Area.")
except Exception as e:
    print(f"An error occurred: {e}")
else:
    print(f"File '{doc_name}.pdf' successfully copied to '{process_folder}'")


File 'the_return_of_sherlock_holmes.pdf' successfully copied to 'the_return_of_sherlock_holmes'


### Save the entire text to a text file.

In [3]:
import os
from PyPDF2 import PdfReader
from pdfminer.high_level import extract_text

# Read the PDF (replace `path` with your own PDF file) and collect the text of
# every page. Extraction happens inside the `with` block so the underlying
# stream is still open while PyPDF2 reads page content, and the handle is
# always closed afterwards.
with open(path, "rb") as pdf_file:
    document = PdfReader(pdf_file)
    # Guard against a falsy result (e.g. None) so that the literal string
    # "None" is never written to the output file.
    all_pages_text = [page.extract_text() or "" for page in document.pages]

# Write the pages in order, each followed by a newline.
with open(text_filename, 'w', encoding="utf-8") as file:
    for page_text in all_pages_text:
        file.write(page_text + '\n')

### Split text by sentence

In [4]:
import os
import pandas as pd
import nltk

# Download the Punkt sentence-tokenizer data used by sent_tokenize.
# Newer NLTK releases look up the 'punkt_tab' resource instead of the pickled
# 'punkt' models, so both are fetched to work across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

def file_to_sentences(filepath):
    """
    Read the UTF-8 text file at ``filepath`` and split its contents into sentences.

    Returns the list of sentences produced by NLTK's ``sent_tokenize``.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return nltk.tokenize.sent_tokenize(raw_text)

def add_sentences_to_dataframe(sentences, dataframe):
    """
    Append sentences as new rows of the "Sentence" column.

    Parameters
    ----------
    sentences : iterable of str
        Sentences to add, in order.
    dataframe : pandas.DataFrame
        Frame to extend. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the sentences appended and a fresh 0..n-1 index,
        or ``dataframe`` itself when there is nothing to add.
    """
    new_sentences = list(sentences)
    if not new_sentences:
        return dataframe

    # DataFrame.append was removed in pandas 2.0, and appending one row at a
    # time copies the frame on every iteration. Build the new rows once and
    # concatenate a single time instead.
    new_rows = pd.DataFrame({'Sentence': new_sentences})
    return pd.concat([dataframe, new_rows], ignore_index=True)

# Location of the extracted text file for this document
filepath = f"{doc_name}/{doc_name}.txt"

# Split the whole document into sentences
all_sentences = file_to_sentences(filepath)

# Build the DataFrame in one step: one row per sentence in a "Sentence" column.
# (A previously created empty frame with a "text" column was never used and
# has been dropped.)
df = pd.DataFrame(all_sentences, columns=["Sentence"])

# Save the DataFrame to a CSV file
df.to_csv(f"{doc_name}_sentences.csv", index=False)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 2. PRE-PROCESS THE TEXT

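 - Load as unicode (which is usually the default in Python anyway)
 - Lowercase the entire text
 - Exclude all symbols except these: . , ; ? ! ' "
   (Note: the `clean_text` function below is stricter than this list. It keeps only the letters a–z and whitespace, so these punctuation marks and all digits are removed as well.)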

In [5]:
import re
import unicodedata

# Fold text down to plain ASCII
def to_ascii(text):
    """
    Return ``text`` with every non-ASCII character removed.

    The text is first NFKD-normalized, so an accented letter is decomposed
    into its base letter plus combining marks; the marks are then dropped
    while the base letter survives.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', errors='ignore')
    return ascii_bytes.decode('ascii')

# Normalize raw text into lowercase words separated by single spaces
def clean_text(text):
    """
    Lowercase ``text``, fold it to ASCII and keep only letters a-z.

    Every run of characters that is neither a-z nor whitespace (digits,
    punctuation, apostrophes, ...) is replaced by a space. Whitespace runs
    are then collapsed to one space and the result is stripped at both ends.
    """
    lowered = text.lower()
    ascii_only = to_ascii(lowered)
    letters_and_spaces = re.sub(r'[^a-z\s]+', ' ', ascii_only)
    single_spaced = re.sub(r'\s+', ' ', letters_and_spaces)
    return single_spaced.strip()

# Input: the raw extracted text. Output: the cleaned text, stored alongside it.
filepath = f"{doc_name}/{doc_name}.txt"
filepath_clean = f"{doc_name}/{doc_name}_clean.txt"

# Load the whole document into memory
with open(filepath, 'r', encoding='utf-8') as raw_file:
    content = raw_file.read()

# Lowercase, fold to ASCII and strip everything except letters and spaces
cleaned_content = clean_text(content)

# The cleaned text is pure ASCII at this point, so it is written as ASCII
with open(filepath_clean, 'w', encoding='ascii') as clean_file:
    clean_file.write(cleaned_content)

print(f"The text has been cleaned and saved to {filepath_clean}.")



The text has been cleaned and saved to the_return_of_sherlock_holmes/the_return_of_sherlock_holmes_clean.txt.


# 3. TOKENIZE THE TEXT

In [6]:
# Collect the distinct words of a text
def extract_unique_words(text):
    """Return the set of distinct whitespace-separated tokens found in ``text``."""
    return set(text.split())

# Load the cleaned text written by the pre-processing step
filepath_clean = f"{doc_name}/{doc_name}_clean.txt"
with open(filepath_clean, 'r', encoding='utf-8') as clean_file:
    content = clean_file.read()

# Deduplicate the words, then put them in sorted order
unique_words_set = extract_unique_words(content)
unique_words_list = sorted(unique_words_set)

# Store the vocabulary, one word per line
filepath_words = f"{doc_name}/{doc_name}_words.txt"
with open(filepath_words, 'w', encoding='utf-8') as words_file:
    words_file.writelines(word + '\n' for word in unique_words_list)

print(f"Total unique words: {len(unique_words_list)}")
print(f"The list of unique words has been saved to {filepath_words}")


Total unique words: 9088
The list of unique words has been saved to the_return_of_sherlock_holmes/the_return_of_sherlock_holmes_words.txt
