### Text Preprocessing: Normalization and Sentence Splitting ###

In [2]:
import json, re
from pathlib import Path
import spacy

# Load the English spaCy pipeline; in this cell it is used only for
# sentence splitting (doc.sents) inside preprocess_text.
nlp = spacy.load("en_core_web_sm")

# Paths: the input corpus (one JSON record per line) and the output file that
# receives the same records with an added "sentences" field.
corpus_path = Path("data/cleaned_v2/combined_corpus.jsonl")
output_path = Path("data/cleaned_v2/preprocessed_corpus.jsonl")

def normalize_numbers_units(text: str) -> str:
    """Normalize number formatting in ``text``.

    Two rewrites are applied:

    * Thousands separators are removed: ``50,000`` -> ``50000`` and
      ``1,250,000`` -> ``1250000``.
    * Whitespace between a digit and a percent sign is removed:
      ``20 %`` -> ``20%``.

    Args:
        text: Input string.

    Returns:
        The string with both rewrites applied; everything else is unchanged.
    """
    # Lookarounds keep the surrounding digits out of the match, so every
    # separator of a multi-group number is removed (a consuming pattern skips
    # every second comma and turns "1,000,000" into "1000,000"). The trailing
    # (?!\d) requires exactly three digits after the comma, so "2020,2021" is
    # not mistaken for a thousands-separated number.
    text = re.sub(r"(?<=\d),(?=\d{3}(?!\d))", "", text)
    # Remove any run of whitespace between a digit and "%", not just one char.
    text = re.sub(r"(?<=\d)\s+%", "%", text)
    return text

def preprocess_text(text: str) -> list:
    """Lowercase and normalize ``text``, then split it into sentences.

    Steps: lowercase, collapse whitespace runs to single spaces, normalize
    numbers/percentages via ``normalize_numbers_units``, then segment with the
    module-level spaCy pipeline ``nlp``.

    Args:
        text: Raw document text. ``None``, empty, or whitespace-only input
            yields an empty list without invoking spaCy.

    Returns:
        A list of stripped, non-empty sentence strings.
    """
    # Guard: nothing to segment (also avoids AttributeError on None).
    if not text or not text.strip():
        return []

    # NOTE(review): lowercasing happens before spaCy runs, so the pipeline
    # sees uncased text - confirm this is intended for sentence splitting.
    text = text.lower()
    # Collapse whitespace BEFORE number normalization so that spacing such as
    # "20  %" is already "20 %" when the percent rule runs.
    text = re.sub(r"\s+", " ", text)
    text = normalize_numbers_units(text)

    # Sentence splitting with spaCy; drop segments that are empty once stripped.
    doc = nlp(text)
    stripped = (sent.text.strip() for sent in doc.sents)
    return [sentence for sentence in stripped if sentence]

# Stream the corpus record by record: read one JSON object per line, attach
# its sentence list, and write the augmented record to the output file.
with open(corpus_path, "r", encoding="utf-8") as f_in, \
     open(output_path, "w", encoding="utf-8") as f_out:

    for line in f_in:
        # Skip blank lines (e.g. a stray empty line at the end of the file);
        # json.loads would otherwise raise and abort the whole run.
        if not line.strip():
            continue
        rec = json.loads(line)
        rec["sentences"] = preprocess_text(rec["text"])
        # Drop original full text if you want to save space
        # del rec["text"]
        f_out.write(json.dumps(rec, ensure_ascii=False) + "\n")

print("Preprocessed corpus saved to:", output_path)

Preprocessed corpus saved to: data/cleaned_v2/preprocessed_corpus.jsonl


### Text Preprocessing: Stopword Removal and Lemmatization

In [3]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Load spaCy model
nlp = spacy.load("en_core_web_sm", disable=["ner"])  # NER disabled for speed

def preprocess_with_lemma_and_stopwords(text: str):
    """
    Apply lowercasing, number normalization, lemmatization,
    and stopword removal using spaCy.

    Args:
        text: Raw input text. ``None``, empty, or whitespace-only input
            yields an empty list without invoking spaCy.

    Returns:
        A list of lemma strings, one per token that is alphabetic and not a
        stopword, in document order.
    """
    # Guard: nothing to tokenize (also avoids AttributeError on None).
    if not text or not text.strip():
        return []

    # Lowercase
    text = text.lower()

    # Normalize numbers. Lookarounds leave the digits unconsumed so every
    # separator of a multi-group number is removed; (?!\d) requires exactly
    # three digits after the comma.
    text = re.sub(r"(?<=\d),(?=\d{3}(?!\d))", "", text)   # 1,250,000 → 1250000
    text = re.sub(r"(?<=\d)\s+%", "%", text)               # 20  % → 20%
    text = re.sub(r"\s+", " ", text)

    # Process with spaCy
    doc = nlp(text)

    # Keep only non-stopwords, alphabetic tokens, and lemmatize
    tokens = [
        token.lemma_ for token in doc
        if not token.is_stop and token.is_alpha
    ]

    return tokens

# Example usage: prints the lemmas of the alphabetic, non-stopword tokens
# found in the sample sentence.
sample_text = "We are reducing our CO2 emissions by 20% through renewable initiatives."
print(preprocess_with_lemma_and_stopwords(sample_text))

['reduce', 'emission', 'renewable', 'initiative']


### Creating Lemmatized Corpus for Lexicon/Snippet Analysis

In [4]:
import json, re
from pathlib import Path
import spacy

# Load spaCy model (NER component disabled)
nlp = spacy.load("en_core_web_sm", disable=["ner"])

# Paths: the input corpus (one JSON record per line) and the output file that
# receives one record per document with its lemma token list.
input_file = Path("data/cleaned_v2/combined_corpus.jsonl")
output_file_tokens = Path("data/cleaned_v2/preprocessed_with_lemma.jsonl")

def normalize_numbers(text: str) -> str:
    """Normalize number formatting and whitespace in ``text``.

    * Thousands separators are removed: ``50,000`` -> ``50000`` and
      ``1,250,000`` -> ``1250000``.
    * Whitespace between a digit and a percent sign is removed:
      ``20 %`` -> ``20%``.
    * Every run of whitespace is collapsed to a single space (leading and
      trailing whitespace is collapsed, not stripped).

    Args:
        text: Input string.

    Returns:
        The normalized string.
    """
    # Lookarounds leave the digits unconsumed, so every separator in a
    # multi-group number is removed (a consuming pattern turns "1,000,000"
    # into "1000,000"). (?!\d) requires exactly three digits after the comma.
    text = re.sub(r"(?<=\d),(?=\d{3}(?!\d))", "", text)   # 1,250,000 -> 1250000
    # Any run of whitespace before "%" is removed, not just a single character.
    text = re.sub(r"(?<=\d)\s+%", "%", text)               # 20  % -> 20%
    text = re.sub(r"\s+", " ", text)
    return text

def preprocess_tokens(text: str):
    """Return the lemmas of the alphabetic, non-stopword tokens in ``text``.

    The text is lowercased and passed through ``normalize_numbers`` before
    being processed by the module-level spaCy pipeline ``nlp``.
    """
    normalized = normalize_numbers(text.lower())
    lemmas = []
    for tok in nlp(normalized):
        # Skip stopwords and anything that is not purely alphabetic.
        if tok.is_stop or not tok.is_alpha:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

# Create corpus: for each input record, keep its identifying fields and
# replace the full text with the list of lemmatized tokens.
with open(input_file, "r", encoding="utf-8") as f_in, \
     open(output_file_tokens, "w", encoding="utf-8") as f_out:

    for line in f_in:
        # Skip blank lines (e.g. a stray empty line at the end of the file);
        # json.loads would otherwise raise and abort the whole run.
        if not line.strip():
            continue
        rec = json.loads(line)
        rec_tok = {
            "company": rec["company"],
            "year": rec["year"],
            "file": rec["file"],
            "tokens": preprocess_tokens(rec["text"])
        }
        f_out.write(json.dumps(rec_tok, ensure_ascii=False) + "\n")

print("Lexicon/Snippet corpus created at:", output_file_tokens)

Lexicon/Snippet corpus created at: data/cleaned_v2/preprocessed_with_lemma.jsonl
