In [4]:
import os

# Sanity check: show which report PDFs are visible from the notebook's
# working directory before running the extraction pipeline.
report_dir_listing = os.listdir('../data/reports')
print(report_dir_listing)

['McKinsey_State.pdf', 'EY_FPA.pdf', 'BCG_Reckoning.pdf', 'KPMG_Insights.pdf', 'McKinsey_Bank.pdf']


In [5]:
import os

# Show the kernel's working directory; relative paths such as
# '../data/reports' are resolved against it.
current_dir = os.getcwd()
print(current_dir)

/workspaces/genai-finance-nlp-analysis/notebooks


In [None]:
import os
import PyPDF2
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

# Set workspace root
WORKSPACE_ROOT = '/workspaces/genai-finance-nlp-analysis'

# Set NLTK data path and download to project directory
nltk_data_dir = os.path.join(WORKSPACE_ROOT, 'nltk_data')
os.makedirs(nltk_data_dir, exist_ok=True)
# Register the project-local directory only once, so re-running this cell
# does not keep appending duplicate entries to NLTK's search path.
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

# Lookup path passed to nltk.data.find -> package id passed to nltk.download.
NLTK_RESOURCES = {
    'tokenizers/punkt': 'punkt',
    'tokenizers/punkt_tab': 'punkt_tab',
    'corpora/stopwords': 'stopwords',
    'corpora/wordnet': 'wordnet',
}

# Check each resource on its own: a single missing resource must not trigger
# a download attempt for every package.
for resource_path, package_id in NLTK_RESOURCES.items():
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_id, download_dir=nltk_data_dir)

# Paths (scalable for 10+ reports)
REPORT_DIR = os.path.join(WORKSPACE_ROOT, 'data/reports')

# Report key -> PDF file name inside REPORT_DIR.
reports = dict(
    McKinsey_State='McKinsey_State.pdf',
    BCG_Reckoning='BCG_Reckoning.pdf',
    EY_FPA='EY_FPA.pdf',
    McKinsey_Bank='McKinsey_Bank.pdf',
    KPMG_Insights='KPMG_Insights.pdf',
)

# Initialize NLTK resources shared by clean_text
english_stop_words = stopwords.words('english')
stop_words = set(english_stop_words)  # set membership tests are O(1)
lemmatizer = WordNetLemmatizer()

def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Pages that yield no text are skipped. The remaining page texts are joined
    with a newline so the last word of one page cannot fuse with the first
    word of the next page into a single bogus token.

    Args:
        file_path: Path to the PDF file to read.

    Returns:
        The extracted text of all pages, or '' when the file cannot be read
        or parsed (the error is printed instead of raised, so one bad report
        does not abort the whole run).
    """
    try:
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            if reader.is_encrypted:
                reader.decrypt('')  # Provide password if known
            # Consume the pages while the file handle is still open.
            page_texts = [page.extract_text() for page in reader.pages]
        return '\n'.join(page_text for page_text in page_texts if page_text)
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return ''

def clean_text(text, min_token_length=4):
    """Normalize raw report text into a space-separated string of lemmas.

    Steps, in order: lowercase, replace every character that is not an ASCII
    letter or whitespace with a space, tokenize, drop stop words and tokens
    shorter than ``min_token_length``, then lemmatize what remains.

    Args:
        text: Raw text to clean. Non-string or blank input yields ''.
        min_token_length: Minimum number of characters a token needs to be
            kept. The default of 4 keeps the original "longer than 3" rule.

    Returns:
        The cleaned tokens joined by single spaces, or '' when the input is
        empty/invalid or cleaning fails (the error is printed, not raised).
    """
    if not isinstance(text, str) or not text.strip():
        return ''
    try:
        # Substitute a space instead of deleting, so words separated only by
        # punctuation or digits ("cost/benefit", "AI-driven") stay separate
        # rather than being glued into one token.
        text = re.sub(r'[^a-z\s]', ' ', text.lower())
        tokens = word_tokenize(text)
        cleaned = [
            lemmatizer.lemmatize(token)
            for token in tokens
            if token not in stop_words and len(token) >= min_token_length
        ]
        return ' '.join(cleaned)
    except Exception as e:
        print(f"Error cleaning text: {e}")
        return ''

# Pull the raw text out of every configured report.
extracted_texts = {
    report_name: extract_text_from_pdf(os.path.join(REPORT_DIR, pdf_name))
    for report_name, pdf_name in reports.items()
}

# Clean only the reports that actually produced text; empty extractions are left out.
cleaned_texts = {
    report_name: clean_text(raw_text)
    for report_name, raw_text in extracted_texts.items()
    if raw_text
}

# Persist one row per report (report name, cleaned text) as CSV.
output_dir = os.path.join(WORKSPACE_ROOT, 'data')
output_path = os.path.join(output_dir, 'cleaned_texts.csv')
os.makedirs(output_dir, exist_ok=True)
df = pd.DataFrame(
    {
        'Firm_Report': list(cleaned_texts.keys()),
        'Text': list(cleaned_texts.values()),
    }
)
df.to_csv(output_path, index=False)
print(f"Saved cleaned texts to {output_path}")

[nltk_data] Downloading package punkt to /workspaces/genai-finance-
[nltk_data]     nlp-analysis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /workspaces/genai-
[nltk_data]     finance-nlp-analysis/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /workspaces/genai-
[nltk_data]     finance-nlp-analysis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /workspaces/genai-finance-
[nltk_data]     nlp-analysis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Saved cleaned texts to /workspaces/genai-finance-nlp-analysis/data/cleaned_texts.csv
