In [None]:
import pandas as pd
import os
import re
import nltk
import spacy
from spacy.language import Language

# Custom spaCy pipeline component that adjusts sentence boundaries
@Language.component("custom_boundary_component")
def custom_boundary_component(doc):
    """Keep the token that follows an ``f.`` token in the same sentence.

    For every token whose text is exactly ``"f."`` (lowercase) and that is
    not the last token of the document, the next token's ``is_sent_start``
    is set to ``False``.

    Args:
        doc: The document passed through the pipeline.

    Returns:
        The same ``doc`` object, modified in place.
    """
    final_index = len(doc) - 1
    for token in doc:
        is_abbreviation = token.text == "f."
        # The last token has no successor to update.
        if is_abbreviation and token.i < final_index:
            follower = doc[token.i + 1]
            follower.is_sent_start = False
    return doc

# Load resources
nltk.download('punkt')
nltk.download('stopwords')
# `import nltk` binds only the name `nltk`; the stop-word corpus reader must
# be imported explicitly, otherwise the next line raises NameError.
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Load the spaCy English model. clean_text calls `nlp` for sentence splitting,
# stop-word/punctuation flags and lemmas, so this is required, not optional.
nlp = spacy.load("en_core_web_sm")

# Add the custom boundary detection to the pipeline, ahead of the parser so
# the boundaries it sets are already in place when the parser runs.
nlp.add_pipe("custom_boundary_component", before="parser")

# Setup paths and columns (the CSV files use ';' as separator)
input_path = "Data/sp500_10k_items/items_filtered_10K_filings_2023_1.csv"
output_path = "Data/Data_Cleaning/items_cleaned_10K_filings_2023_1.csv"
columns_to_clean = ["Item_1", "Item_1A", "Item_7", "Item_8"]

# Cleaning function
def clean_text(text, lemmatize=True):
    """Reduce raw text to lowercase, stop-word-free sentences.

    The text is lowercased, stripped of URLs, punctuation (periods are kept
    as sentence delimiters) and numbers, then passed through the module-level
    spaCy pipeline ``nlp``. Within each sentence, stop words, punctuation,
    whitespace tokens and single-character words are dropped; sentences left
    with fewer than four words are discarded.

    Args:
        text: Raw text of one cell. Missing values (``None``/NaN) yield
            ``""``; other non-string values are converted with ``str()``.
        lemmatize: If true (default), emit each token's lemma; otherwise
            emit the token text.

    Returns:
        The surviving sentences, each terminated by ``"."`` and joined by
        single spaces, or ``""`` if nothing survives.
    """
    if pd.isna(text):
        return ""

    # Non-string cells (e.g. a column parsed as numeric) have no .lower();
    # coerce them instead of failing with AttributeError.
    if not isinstance(text, str):
        text = str(text)

    # Lowercase
    text = text.lower()

    # Remove URLs (http://, https://, www.)
    text = re.sub(r'http[s]?://\S+|www\.\S+', ' ', text)

    # Remove punctuation except periods, which are kept as sentence
    # delimiters. Note that \w also keeps underscores.
    text = re.sub(r"[^\w\s.]", " ", text)

    # Remove numbers together with their interior periods ("3.5", "1.2.3").
    # Stripping digits alone would leave the decimal point behind as a stray
    # "." that looks like a sentence end. A period that merely follows a
    # number ("in 2023.") does not match and is preserved.
    text = re.sub(r"\d+(?:\.\d+)*", " ", text)

    # Process with spaCy
    doc = nlp(text)

    cleaned_sentences = []
    for sent in doc.sents:
        words = [
            token.lemma_ if lemmatize else token.text
            for token in sent
            if not (token.is_stop or token.is_punct or token.is_space)
        ]

        # Remove any single character words (like "k", "q", etc.)
        words = [word for word in words if len(word) > 1]

        # Keep only sentences with at least 4 remaining words, and restore
        # the terminating period that token filtering removed.
        if len(words) >= 4:
            cleaned_sentences.append(" ".join(words) + ".")

    return " ".join(cleaned_sentences)

# Read the entire input file
df = pd.read_csv(input_path, sep=";")

# Apply cleaning function to each specified column
for col in columns_to_clean:
    df[col] = df[col].apply(clean_text)

# Make sure output directory exists. dirname() is "" for a bare filename and
# os.makedirs("") raises FileNotFoundError, so only create a real directory.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Save cleaned data
df.to_csv(output_path, index=False, sep=";")
print(f"Cleaned data saved to {output_path}")
