In [None]:
#!pip install transformers umap-learn matplotlib seaborn scikit-learn

from transformers import BertTokenizer, BertModel
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import umap
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


## PIP installs

In [None]:
!pip install requests
!pip install beautifulsoup4
!pip install PyPDF2

## WebScraper

In [None]:
from utils.web_scraper import WebScraper

# Page to scrape, named separately so it is easy to spot and swap.
article_url = "https://www.mckinsey.com/capabilities/operations/our-insights/future-proofing-the-supply-chain"

# Whatever scrape_url returns for that page is kept in `text`.
scraper = WebScraper()
text = scraper.scrape_url(article_url)

## PDF Scraper

In [None]:
from utils.pdf_extractor import extract_multiple_pdfs,process_pdf_folder

#processed_count = process_pdf_folder()
# Folder names for this run, passed explicitly by keyword.
# NOTE(review): the commented-out call above presumably relies on the helper's
# own default folders - verify in utils.pdf_extractor.
pdf_folders = {
    "input_folder": "sc_input",
    "output_folder": "sc_output",
    "processed_folder": "sc_processed",
}
processed_count = process_pdf_folder(**pdf_folders)
    

## Process all Text files by referencing their directories

In [None]:
import importlib
import utils.process_text_files
importlib.reload(utils.process_text_files)


from utils.process_text_files import process_files

from collections import Counter

# Only the call that touches the file system is guarded; the reporting below
# runs in the `else` branch once the three result lists exist.
try:
    sentences, labels, numeric_labels = process_files(["sc_output", "fin_output", "scraped_content"])
except FileNotFoundError as exc:
    # The missing path is taken from the exception itself (falling back to its
    # message when no filename is attached), so the handler cannot fail on an
    # undefined variable.
    print(f"File not found: {exc.filename or exc}")
    print("Make sure you've run the scraper first!")
else:
    print(f"Found {len(sentences)} risk-related sentences")
    print("\nFirst few examples:")
    # Preview at most 50 sentences, each cut to its first 100 characters.
    for i in range(min(50, len(sentences))):
        print(f"{i+1}. [{labels[i]}] {sentences[i][:100]}...")

    print(f"Found {len(sentences)} risk-related sentences")

    # Count sentences for each label
    label_counts = Counter(labels)

    print("\nSentence count by label:")
    for label, count in sorted(label_counts.items()):
        print(f"{label}: {count}")

In [None]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint.
checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(checkpoint)

# output_hidden_states=True makes the model output expose `hidden_states`,
# which the embedding helper below reads layer by layer.
model = BertModel.from_pretrained(checkpoint, output_hidden_states=True)
model.eval()


In [None]:
def get_cls_embeddings(text_list):
    """Return the first-token ([CLS]) hidden state of every layer for each text.

    Each text is tokenized on its own (truncated to at most 512 tokens) and run
    through the module-level ``model`` with gradients disabled. The tokenized
    inputs are moved to the device the model lives on, and the results are
    copied back to the CPU before conversion to NumPy, so the function also
    works when the model has been moved off the CPU.

    Args:
        text_list: Iterable of strings to embed.

    Returns:
        np.ndarray of shape (samples, layers, hidden_dim), where ``layers`` is
        the number of entries in the model output's ``hidden_states``. For an
        empty ``text_list`` the array has shape (0, layers, hidden_dim), with
        the sizes read from ``model.config``, so layer slicing such as
        ``result[:, layer, :]`` stays valid.
    """
    # Run on whichever device the model's weights currently live on.
    device = next(model.parameters()).device
    all_cls_layers = []

    for text in text_list:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        hidden_states = outputs.hidden_states  # one (batch, seq_len, hidden_dim) tensor per layer
        # Position [0, 0] is the first token of the only sequence in the batch.
        cls_per_layer = torch.stack([layer[0, 0] for layer in hidden_states])
        all_cls_layers.append(cls_per_layer.detach().cpu().numpy())

    if not all_cls_layers:
        # hidden_states holds the embedding output plus one entry per encoder layer.
        n_layers = model.config.num_hidden_layers + 1
        return np.empty((0, n_layers, model.config.hidden_size), dtype=np.float32)

    return np.stack(all_cls_layers)  # shape: (samples, layers, hidden_dim)


In [None]:
# Class name -> integer id. The key order matters: a later cell turns the keys
# into the list of label names passed to the plotting helper.
label_map = {
    "Legal": 0,
    "Supply Chain": 1,
    "Financial": 2,
}

# One array for all sentences: cls_embeddings.shape = (N samples, 13 layers, 768 dims)
cls_embeddings = get_cls_embeddings(sentences)

## Plot UMAP of all sentences in Supply Chain and Finances

In [None]:
import umap
import plotly.express as px
import pandas as pd

def plot_umap_per_layer(embeddings, labels, label_names, sentences, layers=[1, 4, 8, 12]):
    """Show one interactive UMAP scatter plot per requested layer.

    For every index in ``layers`` the slice ``embeddings[:, layer, :]`` is
    projected with a fresh ``umap.UMAP(random_state=42)`` and drawn with Plotly.
    Points are coloured by class name and show their sentence on hover.

    Args:
        embeddings: Array indexed as [sample, layer, feature].
        labels: One index into ``label_names`` per sample.
        label_names: Sequence of class names, looked up by the values in ``labels``.
        sentences: One hover text per sample.
        layers: Layer indices to plot.

    Returns:
        None. Each figure is displayed with ``fig.show()``.
    """
    for layer_idx in layers:
        # Project this layer's embeddings with a freshly seeded reducer.
        projection = umap.UMAP(random_state=42).fit_transform(embeddings[:, layer_idx, :])

        # One row per sample: the two UMAP coordinates, class name and hover text.
        points = pd.DataFrame({
            'x': projection[:, 0],
            'y': projection[:, 1],
            'label': [label_names[class_idx] for class_idx in labels],
            'sentence': sentences,
        })

        figure = px.scatter(
            points,
            x='x',
            y='y',
            color='label',
            hover_data={'sentence': True, 'x': False, 'y': False},
            title=f"UMAP of Layer {layer_idx} CLS Embeddings",
        )
        figure.update_traces(marker=dict(size=6))
        figure.show()


In [None]:

# Class names in label_map's key order, used to turn numeric labels into names.
label_names = list(label_map)
plot_umap_per_layer(cls_embeddings, numeric_labels, label_names, sentences)

In [None]:
!pip install bertopic

## Plot UMAP of BERTopic topics for all sentences in Supply Chain and Finances

In [None]:
# UMAP of all BERTopic topics (topic -1 is shown as "Outlier")
def plot_bertopic_umap(cls_embeddings, sentences, layers=[1, 4, 8, 12]):
    """Show one UMAP scatter plot per layer, coloured by BERTopic keywords.

    A BERTopic model is fitted on ``sentences``; every topic is captioned with
    its top keywords joined by ", ", and topic -1 is captioned "Outlier". For
    each index in ``layers`` the slice ``cls_embeddings[:, layer, :]`` is
    projected with ``umap.UMAP(random_state=42)`` and drawn with Plotly, with
    the sentence as hover text.

    NOTE(review): the header, step 1 and the start of step 2 were missing from
    this cell and have been reconstructed from the surviving body, the call
    ``plot_bertopic_umap(cls_embeddings, sentences)`` and the sibling
    ``plot_bertopic_filtered_umap``. In particular the "top five words" cut-off
    mirrors that sibling - confirm against the original notebook.

    Args:
        cls_embeddings: Array indexed as [sample, layer, feature].
        sentences: List of strings, one per sample.
        layers: Layer indices to plot.

    Returns:
        None. Each figure is displayed with ``fig.show()``.
    """
    # Imported locally because the cells above this one do not import BERTopic.
    from bertopic import BERTopic

    # Step 1: Fit BERTopic
    topic_model = BERTopic(language="english")
    topics, _ = topic_model.fit_transform(sentences)

    # Step 2: Caption every non-outlier topic with its top keywords
    topic_words = {
        topic: ", ".join(word for word, _ in topic_model.get_topic(topic)[:5])
        for topic in set(topics)
        if topic != -1
    }
    topic_words[-1] = "Outlier"  # Handle outliers

    # Step 3: Build readable topic labels
    topic_labels = [topic_words[t] for t in topics]

    # Step 4: Plot UMAP per selected BERT layer
    for layer in layers:
        reducer = umap.UMAP(random_state=42)
        layer_embeds = cls_embeddings[:, layer, :]
        umap_proj = reducer.fit_transform(layer_embeds)

        df = pd.DataFrame({
            'x': umap_proj[:, 0],
            'y': umap_proj[:, 1],
            'label': topic_labels,
            'sentence': sentences
        })

        fig = px.scatter(
            df, x='x', y='y',
            color='label',
            hover_data={'sentence': True, 'x': False, 'y': False},
            title=f"UMAP of Layer {layer} CLS Embeddings (BERTopic Keywords)"
        )
        fig.update_traces(marker=dict(size=6))
        fig.show()



In [None]:
# Draw the per-layer UMAP plots labelled with BERTopic keywords for all sentences.
plot_bertopic_umap(cls_embeddings, sentences)

In [None]:
from bertopic import BERTopic
import umap
import plotly.express as px
import pandas as pd

def plot_bertopic_filtered_umap(cls_embeddings, sentences, layers=[1, 4, 8, 12]):
    """Show per-layer UMAP plots restricted to supply-chain / financial topics.

    A BERTopic model is fitted on ``sentences``. A topic is kept when at least
    one of its first five keywords appears in the supply-chain or financial
    keyword sets; the outlier topic (-1) is always kept and captioned
    "Outlier". For each index in ``layers`` the kept rows of
    ``cls_embeddings[:, layer, :]`` are projected with
    ``umap.UMAP(random_state=42)`` and drawn with Plotly, coloured by topic
    caption with the sentence as hover text.

    Args:
        cls_embeddings: Array indexed as [sample, layer, feature]; rows are
            selected with a list of integer positions.
        sentences: List of strings, one per sample.
        layers: Layer indices to plot.

    Returns:
        None. Each figure is displayed with ``fig.show()``.
    """
    # Fit the topic model on the raw sentences.
    topic_model = BERTopic(language="english")
    topics, _ = topic_model.fit_transform(sentences)

    # Vocabulary that marks a topic as relevant.
    supply_keywords = {"shipment", "delay", "vendor", "logistics", "port", "customs", "invoice", "raw", "supply"}
    financial_keywords = {"revenue", "cost", "loss", "liability", "inflation", "price", "profit", "expense"}

    # Caption each real topic and remember which ones mention a relevant keyword.
    captions = {}
    kept_topics = set()
    for topic_id in set(topics):
        if topic_id != -1:
            top_words = [word for word, _ in topic_model.get_topic(topic_id)[:5]]
            captions[topic_id] = ", ".join(top_words)
            for candidate in top_words:
                if candidate in supply_keywords or candidate in financial_keywords:
                    kept_topics.add(topic_id)
                    break

    # Outliers get a fixed caption and are included in the plots.
    captions[-1] = "Outlier"
    kept_topics.add(-1)

    # Restrict embeddings, sentences and captions to the kept rows.
    kept_rows = [row for row, topic_id in enumerate(topics) if topic_id in kept_topics]
    kept_embeddings = cls_embeddings[kept_rows]
    kept_sentences = [sentences[row] for row in kept_rows]
    kept_captions = [captions[topics[row]] for row in kept_rows]

    # One figure per requested layer.
    for layer_idx in layers:
        projection = umap.UMAP(random_state=42).fit_transform(kept_embeddings[:, layer_idx, :])

        points = pd.DataFrame({
            'x': projection[:, 0],
            'y': projection[:, 1],
            'label': kept_captions,
            'sentence': kept_sentences,
        })

        figure = px.scatter(
            points,
            x='x',
            y='y',
            color='label',
            hover_data={'sentence': True, 'x': False, 'y': False},
            title=f"UMAP of Layer {layer_idx} CLS Embeddings (Filtered Topics)",
        )
        figure.update_traces(marker=dict(size=6))
        figure.show()


In [None]:
# Plot only the topics that mention supply-chain or financial keywords (plus outliers).
plot_bertopic_filtered_umap(cls_embeddings=cls_embeddings, sentences=sentences)

In [None]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from bertopic import BERTopic
from sklearn.preprocessing import normalize

# 👉 INPUT your real data here:
# cls_embeddings: shape (num_samples, num_layers, 768)
# sentences: List of strings

# Example dummy data:
# cls_embeddings = np.random.rand(100, 13, 768)
# sentences = ["your sentence 1", "your sentence 2", ..., "your sentence N"]

def calculate_cluster_tightness(cls_embeddings, sentences, layers=[12]):
    """Measure how tightly each relevant BERTopic cluster sits around its centroid.

    A BERTopic model is fitted on ``sentences``. A topic is kept when at least
    one of its first five keywords appears in the supply-chain or financial
    keyword sets; the outlier topic (-1) is always kept and captioned
    "Outlier". For each index in ``layers`` and each kept topic caption with
    at least two points, the mean Euclidean distance of the points to their
    centroid is computed on ``cls_embeddings[:, layer, :]``.

    Args:
        cls_embeddings: Array-like indexed as [sample, layer, feature].
        sentences: List of strings, one per sample.
        layers: Layer indices to evaluate.

    Returns:
        pd.DataFrame with columns ``Topic``, ``Layer``,
        ``Avg_Distance_to_Centroid`` and ``Num_Points``, one row per
        (layer, topic caption). The columns are present even when no cluster
        qualifies, so sorting or selecting by column name still works on an
        empty result.
    """
    result_columns = ['Topic', 'Layer', 'Avg_Distance_to_Centroid', 'Num_Points']

    # Step 1: Fit BERTopic
    topic_model = BERTopic(language="english")
    topics, _ = topic_model.fit_transform(sentences)

    # Step 2: Define filtering keywords
    supply_keywords = {"shipment", "delay", "vendor", "logistics", "port", "customs", "invoice", "raw", "supply"}
    financial_keywords = {"revenue", "cost", "loss", "liability", "inflation", "price", "profit", "expense"}
    relevant_keywords = supply_keywords | financial_keywords

    # Step 3: Caption every topic and keep those that mention a relevant keyword.
    # Outliers (-1) get a fixed caption and are always kept.
    topic_words = {-1: "Outlier"}
    topic_filter = {-1}
    for topic in set(topics):
        if topic == -1:
            continue
        words = [word for word, _ in topic_model.get_topic(topic)[:5]]
        topic_words[topic] = ", ".join(words)
        if not relevant_keywords.isdisjoint(words):
            topic_filter.add(topic)

    # Step 4: Filter embeddings and captions by topic.
    # An explicit integer dtype keeps the row selection valid when nothing is kept.
    filtered_indices = [i for i, t in enumerate(topics) if t in topic_filter]
    filtered_embeddings = np.asarray(cls_embeddings)[np.asarray(filtered_indices, dtype=np.intp)]
    label_list = [topic_words[topics[i]] for i in filtered_indices]
    filtered_labels = np.array(label_list, dtype=object)
    # Captions in order of first appearance.
    unique_labels = list(dict.fromkeys(label_list))

    # Step 5: Calculate tightness for each layer and topic
    results = []
    for layer in layers:
        layer_embeds = filtered_embeddings[:, layer, :]  # (N, hidden_dim)

        for label in unique_labels:
            cluster_points = layer_embeds[filtered_labels == label]
            # A single point has no spread to measure.
            if len(cluster_points) < 2:
                continue
            centroid = cluster_points.mean(axis=0)
            mean_dist = float(np.linalg.norm(cluster_points - centroid, axis=1).mean())
            results.append({
                'Topic': label,
                'Layer': layer,
                'Avg_Distance_to_Centroid': mean_dist,
                'Num_Points': len(cluster_points)
            })

    return pd.DataFrame(results, columns=result_columns)



In [None]:
# Score every kept topic cluster, then print the table with the smallest
# average distance to centroid first.
results_df = calculate_cluster_tightness(cls_embeddings, sentences)
print(results_df.sort_values(by="Avg_Distance_to_Centroid"))

In [None]:
# (rows, columns) of the tightness table built in the previous cell
results_df.shape