In [4]:
import os
print(os.listdir('../data/reports'))

['McKinsey_State.pdf', 'EY_FPA.pdf', 'BCG_Reckoning.pdf', 'KPMG_Insights.pdf', 'McKinsey_Bank.pdf']


In [5]:
import os
print(os.getcwd())

/workspaces/genai-finance-nlp-analysis/notebooks


In [None]:
import os
import PyPDF2
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

# Set workspace root
WORKSPACE_ROOT = '/workspaces/genai-finance-nlp-analysis'

# Set NLTK data path and download to project directory
nltk_data_dir = os.path.join(WORKSPACE_ROOT, 'nltk_data')
os.makedirs(nltk_data_dir, exist_ok=True)
nltk.data.path.append(nltk_data_dir)
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('tokenizers/punkt_tab')
    nltk.data.find('corpora/stopwords')
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('punkt', download_dir=nltk_data_dir)
    nltk.download('punkt_tab', download_dir=nltk_data_dir)
    nltk.download('stopwords', download_dir=nltk_data_dir)
    nltk.download('wordnet', download_dir=nltk_data_dir)

# Paths (scalable for 10+ reports)
REPORT_DIR = os.path.join(WORKSPACE_ROOT, 'data/reports')  
reports = {
    'McKinsey_State': 'McKinsey_State.pdf',
    'BCG_Reckoning': 'BCG_Reckoning.pdf',
    'EY_FPA': 'EY_FPA.pdf',
    'McKinsey_Bank': 'McKinsey_Bank.pdf',
    'KPMG_Insights': 'KPMG_Insights.pdf'  
}

# Initialize NLTK resources
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def extract_text_from_pdf(file_path):
    """Extract text from a PDF file."""
    try:
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            if reader.is_encrypted:
                reader.decrypt('')  # Provide password if known
            text = ''
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text
            return text
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return ''

def clean_text(text):
    """Clean: lower, remove non-alpha, tokenize, lemmatize, remove stops."""
    if not isinstance(text, str) or not text.strip():
        return ''
    try:
        text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
        tokens = word_tokenize(text)
        cleaned = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words and len(token) > 3]
        return ' '.join(cleaned)
    except Exception as e:
        print(f"Error cleaning text: {e}")
        return ''

# Extract and clean texts
extracted_texts = {name: extract_text_from_pdf(os.path.join(REPORT_DIR, file)) for name, file in reports.items()}
cleaned_texts = {name: clean_text(text) for name, text in extracted_texts.items() if text}

# Save to CSV
output_dir = os.path.join(WORKSPACE_ROOT, 'data')
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, 'cleaned_texts.csv')
df = pd.DataFrame(cleaned_texts.items(), columns=['Firm_Report', 'Text'])
df.to_csv(output_path, index=False)
print(f"Saved cleaned texts to {output_path}")

[nltk_data] Downloading package punkt to /workspaces/genai-finance-
[nltk_data]     nlp-analysis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /workspaces/genai-
[nltk_data]     finance-nlp-analysis/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /workspaces/genai-
[nltk_data]     finance-nlp-analysis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /workspaces/genai-finance-
[nltk_data]     nlp-analysis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Saved cleaned texts to /workspaces/genai-finance-nlp-analysis/data/cleaned_texts.csv


ML Modeling – VADER Sentiment Analysis

In [13]:
# Import VADER and plotting libraries
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt

# Initialise VADER analyser
analyzer = SentimentIntensityAnalyzer()

# Calculate sentiment score for each report
df['Sentiment'] = df['Text'].apply(lambda x: analyzer.polarity_scores(x)['compound']) # Why: VADER gives sentiment (-1 to +1); compound score summarizes positivity/negativity
print(df[['Firm_Report', 'Sentiment']]) # Why: Display Firm_Report and Sentiment for verification

      Firm_Report  Sentiment
0  McKinsey_State     0.9999
1   BCG_Reckoning     0.9998
2          EY_FPA     1.0000
3   McKinsey_Bank     1.0000
4   KPMG_Insights     0.9999


In [21]:
# Create output directory for visualisations
os.makedirs('viz', exist_ok=True) # Ensure viz directory exists for saving plots

# Plot sentiment bar chart
plt.figure(figsize=(8, 15)) # Set figure size for readability
df.groupby('Firm_Report') ['Sentiment'].mean().plot(kind='bar') # Visualize average sentiment per report
plt.title('Sentiment on AI in FS')
plt.ylabel('VADER Sentiment Score') # Label y-axis (positive >0.05, negative <0)
plt.savefig('viz/sentiment_bar.png') # Save plot for Streamlit and portfolio
plt.close() # Close plot to free memory

# Generate Chart .js config for interactive visualisation
sentiment_data = df['Sentiment'].tolist() # Extract sentiment scores for Chart.js
print({
    "type": "chartjs",
    "data": {
        "labels": list(df['Firm_Report']), # Use report names as x-axis labels
        "datasets": [{
            "label": "Sentiment Score",
            "data": sentiment_data, # Plot actual sentiment scores
            "backgroundColor": ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd"], # Distinct colors for 5 reports
            "bordercolor": ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd"],
            "borderWidth": 1
        }]
    },
    "options": {
        "scales": {
            "y": {"beginAtZero": True, "title": {"display": True, "text": "Sentiment Score"}}, # Start y-axis at 0, label it
            "x": {"title": {"display": True, "text": "Consultancy Report"}} # Label x-axis
        },
       "plugins": {"title": {"display": True, "text": "Sentiment Analysis of AI/ FS Reports" }} # Add chart title
        }
    }
)




{'type': 'chartjs', 'data': {'labels': ['McKinsey_State', 'BCG_Reckoning', 'EY_FPA', 'McKinsey_Bank', 'KPMG_Insights'], 'datasets': [{'label': 'Sentiment Score', 'data': [0.9999, 0.9998, 1.0, 1.0, 0.9999], 'backgroundColor': ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd'], 'bordercolor': ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd'], 'borderWidth': 1}]}, 'options': {'scales': {'y': {'beginAtZero': True, 'title': {'display': True, 'text': 'Sentiment Score'}}, 'x': {'title': {'display': True, 'text': 'Consultancy Report'}}}, 'plugins': {'title': {'display': True, 'text': 'Sentiment Analysis of AI/ FS Reports'}}}}


Quick Verification for Credibility as the VADER score indicates extremely positive sentiment:

In [23]:
# Verify text quality and lengths
print(df[['Firm_Report', 'Text']].assign(Text_Length=df['Text'].str.len())) # Check cleaned text lengths to identify short texts
print(df[['Firm_Report', 'Sentiment']]) # Confirm sentiment scores for all reports

      Firm_Report                                               Text  \
0  McKinsey_State  state march alex singla alexander sukharevsky ...   
1   BCG_Reckoning  financial institution report bank reckoning st...   
2          EY_FPA  transforming practical guide maturity transfor...   
3   McKinsey_Bank  getty imagesglobal banking practice building b...   
4   KPMG_Insights  kpmg global tech report financial service insi...   

   Text_Length  
0        32174  
1        19712  
2        65518  
3       144125  
4        31996  
      Firm_Report  Sentiment
0  McKinsey_State     0.9999
1   BCG_Reckoning     0.9998
2          EY_FPA     1.0000
3   McKinsey_Bank     1.0000
4   KPMG_Insights     0.9999
