### Extracting the PDFs into .txt files ###

In [4]:
import os
import pdfplumber

# Every PDF found in input_dir is converted to a same-named .txt file in output_dir.
input_dir = "Datasets"
output_dir = "data/raw_reports/"
os.makedirs(output_dir, exist_ok=True)

# sorted() keeps the processing (and log) order independent of the filesystem.
for file in sorted(os.listdir(input_dir)):
    stem, ext = os.path.splitext(file)
    # Compare the extension case-insensitively so e.g. "REPORT.PDF" is not skipped.
    if ext.lower() != ".pdf":
        continue

    pdf_path = os.path.join(input_dir, file)
    # Swap only the trailing extension; str.replace(".pdf", ".txt") would also
    # rewrite a ".pdf" that happens to occur earlier in the file name.
    txt_path = os.path.join(output_dir, stem + ".txt")

    with pdfplumber.open(pdf_path) as pdf, open(txt_path, "w", encoding="utf-8") as out:
        for page in pdf.pages:
            text = page.extract_text()
            # Pages that yield no text are skipped instead of writing blank lines.
            if text:
                out.write(text + "\n")

    print(f"Extracted: {file} -> {txt_path}")


Extracted: HSBC 2024.pdf -> data/raw_reports/HSBC 2024.txt
Extracted: HSBC 2022.pdf -> data/raw_reports/HSBC 2022.txt
Extracted: HSBC 2023.pdf -> data/raw_reports/HSBC 2023.txt
Extracted: Nestle 2024.pdf -> data/raw_reports/Nestle 2024.txt
Extracted: Nestle 2023.pdf -> data/raw_reports/Nestle 2023.txt
Extracted: Nestle 2022.pdf -> data/raw_reports/Nestle 2022.txt
Extracted: Google 2024.pdf -> data/raw_reports/Google 2024.txt
Extracted: Google 2022.pdf -> data/raw_reports/Google 2022.txt
Extracted: Google 2023.pdf -> data/raw_reports/Google 2023.txt


### Cleaning all the extracted .txt files ###

In [5]:
import os, re, unicodedata
from pathlib import Path

base = Path("data")
input_dir = base / "raw_reports"     # raw extracted text lives in data/raw_reports
output_dir = base / "cleaned_v2"     # cleaned files will be written here
output_dir.mkdir(parents=True, exist_ok=True)

# Replacement tables
# LIGATURES maps typographic characters to plain-ASCII text; normalize_unicode
# applies every entry with str.replace. Despite the name it also covers curly
# quotes, dash variants and the ellipsis character.
LIGATURES = {
    "ﬁ": "fi", "ﬂ": "fl", "ﬀ": "ff", "ﬃ": "ffi", "ﬄ": "ffl",
    "’": "'", "‘": "'", "“": '"', "”": '"',
    "‐": "-", "–": "-", "—": " - ", "−": "-",
    "…": "...",
}
# Spellings rewritten by normalize_unicode: an entry containing the letter "e"
# becomes "CO2e", every other entry becomes "CO2".
CO2_VARIANTS = ["CO₂", "C0₂", "CO₂e", "CO2e", "tCO₂e", "tCO2e"]

# Regex patterns
# A line is dropped by clean_text when its stripped form matches any of these
# (matched from the start of the line, case-insensitively).
HEADER_PATTERNS = [
    r"^\s*Page\s+\d+\s*$",                 # "Page 12"
    r"^\s*\d{1,4}\s*/\s*\d{1,4}\s*$",      # "12 / 340"
    r"^\s*\d{1,4}\s*$",                    # a line holding only a 1-4 digit number
    r"^HSBC Holdings plc Annual Report.*$",
    r"^Google(’s|'s)? 20\d{2} (Environmental|Sustainability).*",
    r"^Nestl[eé] .*Sustainability.*$",
]
# http(s):// URLs and bare "www." addresses, up to the next whitespace.
URL_RE = re.compile(r"https?://\S+|www\.\S+", re.I)
# A line that starts with one or more bullet glyphs, or with "<digits>.",
# followed by whitespace; clean_text uses it to avoid merging list items.
BULLET_RE = re.compile(r"^\s*(?:[-•▪◦◻▶»]+|\d+\.)\s+")

def normalize_unicode(s: str) -> str:
    """Normalize Unicode characters, replace ligatures and unify CO2 spellings.

    Steps, in order:
      1. Collapse every entry of CO2_VARIANTS to "CO2" / "CO2e".
      2. Apply NFKC normalization.
      3. Replace each key of LIGATURES with its ASCII value.
      4. Collapse CO2_VARIANTS again.

    Args:
        s: Raw text.

    Returns:
        The normalized text.
    """
    def _collapse_co2(text: str) -> str:
        # Entries containing an "e" denote CO2-equivalent and map to "CO2e".
        for variant in CO2_VARIANTS:
            text = text.replace(variant, "CO2e" if "e" in variant.lower() else "CO2")
        return text

    # NFKC folds the subscript "₂" into a plain "2". If the table were applied
    # only afterwards, its subscript entries could never match and the OCR
    # variant "C0₂" (digit zero) would be left behind as "C02". Run the table
    # first, while the subscript forms are still present.
    s = _collapse_co2(s)
    s = unicodedata.normalize("NFKC", s)
    for k, v in LIGATURES.items():
        s = s.replace(k, v)
    # Second pass keeps the previous behaviour for spellings that only become
    # visible after NFKC (e.g. full-width letters folded to ASCII).
    return _collapse_co2(s)

def fix_hyphenation(s: str) -> str:
    """Undo end-of-line hyphenation.

    A hyphen directly followed by a newline is removed, together with the
    newline, whenever it sits between two word characters.
    """
    soft_break = re.compile(r"(?<=\w)-\n(?=\w)")
    return soft_break.sub("", s)

def clean_text(raw: str) -> str:
    """Turn the raw text of one report into paragraph-oriented clean text.

    The text is Unicode-normalized, line endings are unified to "\\n",
    end-of-line hyphenation is undone and URLs are removed. Lines matching
    HEADER_PATTERNS are then dropped, and the remaining lines are merged into
    paragraphs.

    Args:
        raw: Text as read from an extracted report file.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    text = fix_hyphenation(
        normalize_unicode(raw).replace("\r\n", "\n").replace("\r", "\n")
    )
    text = URL_RE.sub("", text)

    def is_header(candidate: str) -> bool:
        # True when the stripped line matches any header/footer pattern.
        for pattern in HEADER_PATTERNS:
            if re.match(pattern, candidate, flags=re.I):
                return True
        return False

    stripped_lines = (ln.strip() for ln in text.split("\n"))
    kept = [piece for piece in stripped_lines if not is_header(piece)]

    # Single pass with a pending paragraph buffer. A paragraph is closed when
    # the next line is blank, the next line starts a bullet, or the buffer
    # already ends in sentence punctuation.
    paragraphs = []
    pending = None
    for piece in kept:
        if pending is not None:
            must_break = (
                not piece
                or BULLET_RE.match(piece)
                or re.search(r"[.!?]$", pending)
            )
            if not must_break:
                pending += " " + piece
                continue
            paragraphs.append(pending)
            pending = None
        if piece:
            pending = piece
        else:
            # Blank lines are preserved as paragraph separators.
            paragraphs.append("")
    if pending is not None:
        paragraphs.append(pending)

    return "\n".join(paragraphs).strip()

# Clean each .txt file found in input_dir and store it under the same name in output_dir
for fn in os.listdir(input_dir):
    if fn.endswith(".txt"):
        raw = (input_dir / fn).read_text(encoding="utf-8", errors="ignore")
        cleaned = clean_text(raw)
        (output_dir / fn).write_text(cleaned, encoding="utf-8")

print("Cleaning completed. Output directory:", output_dir)


Cleaning completed. Output directory: data/cleaned_v2


### Creating a corpus by combining all files ###

In [6]:
import os, re, json
from pathlib import Path

# Locations: cleaned reports are read from input_dir, the corpus is written next to them
base = Path("data")
input_dir = base.joinpath("cleaned_v2")
output_file = input_dir.joinpath("combined_corpus.jsonl")

def parse_metadata(filename: str):
    """Extract company name and year from filename.

    Args:
        filename: File name such as "Nestle 2023.txt".

    Returns:
        A ``(company, year)`` tuple. ``company`` is "Google", "HSBC" or
        "Nestle" when the name starts with that word (ignoring case and
        diacritics), otherwise ``None``. ``year`` is the first "20xx" found in
        the name as an int, or ``None`` when there is none.
    """
    # Imported locally so the function does not depend on another cell's imports.
    import unicodedata

    year_match = re.search(r"(20\d{2})", filename)
    year = int(year_match.group(1)) if year_match else None

    # Fold diacritics before comparing: "Nestlé ..." (precomposed or with a
    # combining accent) must resolve to the same company as "Nestle ...".
    decomposed = unicodedata.normalize("NFKD", filename)
    folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch)).lower()

    company = None
    for prefix, label in (("google", "Google"), ("hsbc", "HSBC"), ("nestle", "Nestle")):
        if folded.startswith(prefix):
            company = label
            break
    return company, year

# One JSON object per line: company, year, source file name and the full cleaned text
with open(output_file, "w", encoding="utf-8") as out:
    txt_names = (name for name in sorted(os.listdir(input_dir)) if name.endswith(".txt"))
    for fn in txt_names:
        company, year = parse_metadata(fn)
        text = (input_dir / fn).read_text(encoding="utf-8", errors="ignore")
        record = {"company": company, "year": year, "file": fn, "text": text}
        out.write(json.dumps(record, ensure_ascii=False))
        out.write("\n")

print("Corpus created at:", output_file)

Corpus created at: data/cleaned_v2/combined_corpus.jsonl


### Running a quick sanity check to confirm everything works properly ###

In [7]:
import json
import pandas as pd
from pathlib import Path

# Read the JSONL corpus back, one record per line
corpus_path = Path("data/cleaned_v2/combined_corpus.jsonl")

with corpus_path.open("r", encoding="utf-8") as f:
    records = [json.loads(line) for line in f]

# Tabular view of the corpus for inspection
df = pd.DataFrame(records)

# Headline figures
print("Total records:", len(df))
print("Companies:", df["company"].unique())
print("Years:", df["year"].unique())


def _total_chars(texts):
    """Sum of the character counts of all texts in one group."""
    return sum(len(t) for t in texts)


# Characters and document count per (company, year)
grouped = df.groupby(["company", "year"])
summary = grouped.agg(
    n_chars=("text", _total_chars),
    n_docs=("file", "count"),
).reset_index()

display(summary)

Total records: 9
Companies: ['Google' 'HSBC' 'Nestle']
Years: [2022 2023 2024]


Unnamed: 0,company,year,n_chars,n_docs
0,Google,2022,43444,1
1,Google,2023,347288,1
2,Google,2024,347750,1
3,HSBC,2022,294158,1
4,HSBC,2023,312433,1
5,HSBC,2024,233796,1
6,Nestle,2022,321825,1
7,Nestle,2023,357262,1
8,Nestle,2024,124295,1
