In [1]:
#Extract text and metadata from PDF files in a directory and compile into a CSV file.
import pdfplumber
import pandas as pd
from pathlib import Path

#Iterate over all PDF files in the directory
PDF_DIR = Path("countries")
OUTPUT_CSV = "document.csv"
rows = []
for pdf_path in PDF_DIR.glob("*.pdf"):
    all_text = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                all_text.append(text)

    full_text = "\n".join(all_text)

#Extract metadata from filename
    stem = pdf_path.stem              
    parts = stem.split("_")

    country = parts[0] if len(parts) > 0 else None
    year = parts[1] if len(parts) > 1 and parts[1].isdigit() else None
    strategy_name = "_".join(parts[2:]) if len(parts) > 2 else None

    rows.append({
        "doc_id": f"{country}_{year}" if year else country,
        "country": country,
        "year": year,
        "strategy_name": strategy_name,
        "file_name": pdf_path.name,
        "text": full_text
    })

df = pd.DataFrame(rows)

# Convert year to numeric, setting errors to NaN for non-numeric values
df["year"] = pd.to_numeric(df["year"], errors="coerce")

df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 fl

In [2]:
import pdfplumber
import pandas as pd
from pathlib import Path
from bertopic import BERTopic
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer

# --- PART 1: EXTRACTION (Same as before) ---
PDF_DIR = Path("countries_edited")
rows = []

for pdf_path in PDF_DIR.glob("*.pdf"):
    with pdfplumber.open(pdf_path) as pdf:
        full_text = "\n".join([p.extract_text() for p in pdf.pages if p.extract_text()])
        
        # Increase granularity: split by sentences or small paragraphs
        chunks = [c.strip() for c in full_text.split('\n') if len(c.strip()) > 50]
        
        stem = pdf_path.stem               
        parts = stem.split("_")
        country = parts[0] if len(parts) > 0 else "Unknown"

        for chunk in chunks:
            rows.append({"country": country, "text": chunk, "file": pdf_path.name})

df = pd.DataFrame(rows)
docs = df['text'].tolist()

print(f"Total documents for modeling: {len(docs)}")

# --- PART 2: THE "TINY DATA" CONFIGURATION ---

# 1. Use PCA instead of UMAP: PCA is linear and won't crash on small N
dim_model = PCA(n_components=min(2, len(docs)-1)) 

# 2. Use KMeans instead of HDBSCAN: 
# HDBSCAN requires a minimum density that small data doesn't have.
# n_clusters=3 (Adjust this based on how many topics you expect)
cluster_model = KMeans(n_clusters=min(3, len(docs)))

topic_model = BERTopic(
    umap_model=dim_model,
    hdbscan_model=cluster_model,
    vectorizer_model=CountVectorizer(stop_words="english"),
)

# --- PART 3: RUN MODELING ---
try:
    topics, probs = topic_model.fit_transform(docs)
    df['topic'] = topics
    print(topic_model.get_topic_info())
except Exception as e:
    print(f"Error encountered: {e}")
    print("If you have fewer than 5 docs, BERTopic might not be the right tool.")

Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value


Total documents for modeling: 4033
   Topic  Count                                          Name  \
0      0   2656  0_research_government_development_technology   
1      1    754   1_quantum_computing_technologies_technology   
2      2    623              2_quantum_technologies_uk_sector   

                                      Representation  \
0  [research, government, development, technology...   
1  [quantum, computing, technologies, technology,...   
2  [quantum, technologies, uk, sector, support, i...   

                                 Representative_Docs  
0  [National security needs often drive the advan...  
1  [quantum computing technologies. and software,...  
2  [commercial success of quantum technologies br...  


In [61]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
import pandas as pd

# 1. Prepare the Data
# If your PDFs are very long, BERTopic might struggle. 
# It's often better to use a list of strings (documents).
docs = df['text'].tolist()

# 2. Configure the sub-models (Optional but recommended for stability)
# UMAP reduces dimensions, HDBSCAN clusters them
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# 3. Initialize and Fit BERTopic
# We use 'all-MiniLM-L6-v2' as it is fast and accurate for general text
topic_model = BERTopic(
    embedding_model="all-MiniLM-L6-v2",
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    calculate_probabilities=True,
    verbose=True
)

topics, probs = topic_model.fit_transform(docs)

# 4. Add results back to your original DataFrame
df['topic'] = topics

# 5. Review the Topics
topic_info = topic_model.get_topic_info()
print(topic_info.head())

# 6. Save the results
df.to_csv("document_with_topics.csv", index=False)
topic_model.save("my_bertopic_model")


# ... (Keep your previous PDF extraction and BERTopic fitting code) ...

# 1. Get the topic information table
# This contains Topic ID, Count, Name, and Representation (keywords)
topic_info = topic_model.get_topic_info()

# 2. Save the topic-to-word mapping to a CSV
# This creates a file where you can see: Topic 1 -> "green, energy, hydrogen..."
topic_info.to_csv("topic_definitions.csv", index=False)

# 3. (Optional) Get an even more detailed word-score mapping
# This creates a "long" format CSV with words and their c-TF-IDF scores
topic_representation = topic_model.get_topics() # Dictionary format
rows_list = []
for topic_id, words in topic_representation.items():
    for word, score in words:
        rows_list.append({
            "topic_id": topic_id,
            "word": word,
            "score": score
        })

word_score_df = pd.DataFrame(rows_list)
word_score_df.to_csv("topic_words_with_scores.csv", index=False)

print("Topic definitions saved to 'topic_definitions.csv'")

2026-01-02 16:32:08,409 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 127/127 [00:16<00:00,  7.49it/s]
2026-01-02 16:32:27,764 - BERTopic - Embedding - Completed ✓
2026-01-02 16:32:27,766 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-02 16:32:58,778 - BERTopic - Dimensionality - Completed ✓
2026-01-02 16:32:58,778 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-02 16:32:59,695 - BERTopic - Cluster - Completed ✓
2026-01-02 16:32:59,697 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-02 16:32:59,812 - BERTopic - Representation - Completed ✓


   Topic  Count                                               Name  \
0     -1   1457                                   -1_and_the_to_of   
1      0    253                      0_quantum_technologies_of_the   
2      1    123  1_technologies_applications_technology_applica...   
3      2    119               2_canada_canadian_government_ontario   
4      3    113                           3_uk_quantum_leading_the   

                                      Representation  \
0  [and, the, to, of, for, in, quantum, research,...   
1  [quantum, technologies, of, the, sector, to, a...   
2  [technologies, applications, technology, appli...   
3  [canada, canadian, government, ontario, has, e...   
4  [uk, quantum, leading, the, business, ensure, ...   

                                 Representative_Docs  
0  [To continue to grow this offer and ensure qua...  
1  [across the different quantum technologies, ou...  
2  [applications, as well as an international lev...  
3  [work and live in C

In [63]:
# 1. Get the topic information table
# This contains Topic ID, Count, Name, and Representation (keywords)
topic_info = topic_model.get_topic_info()

# 2. Save the topic-to-word mapping to a CSV
# This creates a file where you can see: Topic 1 -> "green, energy, hydrogen..."
topic_info.to_csv("topic_definitions.csv", index=False)

In [62]:
seed_topic_list = [
    ["university", "research", "science", "academia", "phd", "knowledge", "education"], # Academia
    ["industry", "business", "innovation", "startup", "sme", "market", "commercialization"], # Industry
    ["government", "policy", "regulation", "ministry", "funding", "strategy", "public"], # Government
    ["society", "citizen", "public_engagement", "inclusion", "civil_society", "ngo", "participation"] # Civil Society
]

2026-01-02 16:27:55,739 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.11it/s]
2026-01-02 16:27:59,332 - BERTopic - Embedding - Completed ✓
2026-01-02 16:27:59,332 - BERTopic - Guided - Find embeddings highly related to seeded topics.
Batches: 100%|██████████| 1/1 [00:00<00:00, 41.59it/s]
2026-01-02 16:27:59,366 - BERTopic - Guided - Completed ✓
2026-01-02 16:27:59,366 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-02 16:27:59,397 - BERTopic - Dimensionality - Completed ✓
2026-01-02 16:27:59,397 - BERTopic - Cluster - Start clustering the reduced embeddings


ValueError: k must be less than or equal to the number of training points