# In [5]:
import os
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Extra stop words merged into NLTK's English stop-word list by clean_text().
# The lone empty string is effectively a placeholder: it cannot match a real
# word, so add project-specific (lower-case) words here to filter them out.
ADDITIONAL_STOPWORDS = {""}

# Matches integers and decimals ("42", "3.14", "1.2.3"). Consuming the inner
# decimal points keeps a removed number from leaving a stray "." token behind.
_NUMBER_RE = re.compile(r'\d+(?:\.\d+)*')


def _load_stop_words():
    """Return NLTK's English stop words plus ADDITIONAL_STOPWORDS, fetching missing NLTK data."""
    import nltk

    try:
        stop_words = set(stopwords.words('english'))
    except LookupError:
        nltk.download('stopwords')
        stop_words = set(stopwords.words('english'))

    # The tokenizer models are a separate NLTK resource from the stop-word
    # corpus, so they can be missing even when the stop words are installed.
    try:
        word_tokenize('probe')
    except LookupError:
        nltk.download('punkt')
        # Newer NLTK releases look for 'punkt_tab' instead of 'punkt'.
        nltk.download('punkt_tab')

    stop_words.update(ADDITIONAL_STOPWORDS)
    return stop_words


def _clean_line(line, stop_words, removable):
    """Return *line* lower-cased, without numbers, stop words, or punctuation-only tokens."""
    kept = []
    for token in word_tokenize(_NUMBER_RE.sub('', line)):
        lowered = token.lower()
        if lowered in stop_words:
            continue
        # Single characters are noise, except the period that marks a
        # sentence boundary.
        if len(token) <= 1 and token != '.':
            continue
        # Test character by character: word_tokenize emits multi-character
        # punctuation tokens such as `` and '' that must be dropped as well.
        if all(char in removable for char in token):
            continue
        kept.append(lowered)
    return ' '.join(kept)


def clean_text(file_path: str, output_path: str) -> None:
    """Clean a UTF-8 text file line by line and write the result to *output_path*.

    For every input line, numbers are removed, the line is tokenized with
    NLTK's ``word_tokenize``, and tokens are lower-cased. English stop words
    (plus ``ADDITIONAL_STOPWORDS``), single-character tokens, and tokens made
    up only of punctuation are dropped. Period tokens are kept so sentence
    boundaries survive. The remaining tokens are joined with single spaces,
    and the cleaned lines are written separated by newlines.

    Missing NLTK resources (stop words, tokenizer models) are downloaded on
    first use.

    Args:
        file_path: Path of the text file to read.
        output_path: Path of the file to write. Its parent directory is
            created if it does not exist.

    Raises:
        FileNotFoundError: If *file_path* does not exist.
    """
    stop_words = _load_stop_words()

    # Every punctuation character except the period, which is preserved.
    removable = frozenset(string.punctuation) - {'.'}

    with open(file_path, 'r', encoding='utf-8') as file:
        cleaned_lines = [
            _clean_line(line, stop_words, removable) for line in file
        ]

    # dirname() is '' for a bare file name, and os.makedirs('') raises, so
    # only create the directory when there is one to create.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as output_file:
        output_file.write('\n'.join(cleaned_lines))

# Input and output locations, relative to the current working directory
input_file_path = '../../data/txt/pdf_contents.txt'
output_file_path = '../../data/txt/cleaned_contents.txt'

# Clean the input file and write the result to the output file
clean_text(file_path=input_file_path, output_path=output_file_path)