In [3]:
#Extract text and metadata from PDF files in a directory and compile into a CSV file.
import pdfplumber
import pandas as pd
from pathlib import Path

# Build one row per PDF found in PDF_DIR (full text plus metadata parsed from
# the file name) and write the collection to OUTPUT_CSV.
PDF_DIR = Path("countries_edited")
OUTPUT_CSV = "document.csv"

# Fixed column order. Passing it to the DataFrame constructor guarantees that the
# CSV always gets a header row, even when no PDF was found; a header-less file
# makes a later pd.read_csv fail with "EmptyDataError: No columns to parse from file".
COLUMNS = ["doc_id", "country", "year", "strategy_name", "file_name", "text"]

rows = []
# sorted(): glob yields files in arbitrary order; sorting keeps the row order reproducible.
for pdf_path in sorted(PDF_DIR.glob("*.pdf")):
    all_text = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:  # skip pages that yield no text
                all_text.append(text)

    full_text = "\n".join(all_text)

    # Extract metadata from the file name, split on "_":
    # first part -> country, second part (digits only) -> year, remainder -> strategy name.
    stem = pdf_path.stem
    parts = stem.split("_")

    country = parts[0]  # str.split always returns at least one element
    year = parts[1] if len(parts) > 1 and parts[1].isdigit() else None
    strategy_name = "_".join(parts[2:]) if len(parts) > 2 else None

    rows.append({
        "doc_id": f"{country}_{year}" if year else country,
        "country": country,
        "year": year,
        "strategy_name": strategy_name,
        "file_name": pdf_path.name,
        "text": full_text
    })

if not rows:
    # Make the "wrong folder / wrong working directory" case visible right here
    # instead of letting it surface in a later cell.
    print(f"No PDF files found in {PDF_DIR.resolve()} - {OUTPUT_CSV} will contain only the header row.")

df = pd.DataFrame(rows, columns=COLUMNS)

df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value


In [2]:
#Data Pipeline Source: https://garyeckstein.com/clean-text-for-data-analysis/ (slightly modified)
#Framework from Preprocessing Techniques for Text Mining - An Overview by Mohan, Vijayarani. (2015). 



In [2]:
# Load the compiled documents from document.csv into a DataFrame
import pandas as pd

df = pd.read_csv(filepath_or_buffer="document.csv")

# Print a summary of the frame
df.info()

# Preview the first ten rows
df.head(n=10)

EmptyDataError: No columns to parse from file

In [4]:
#Preprocessing Steps:
#1. Remove Hyphenation & Lowercase
# An empty text field in the CSV is read back as NaN; normalise it to "" so the
# string operations below always work on str values.
df['text'] = df['text'].fillna('').astype(str)
# Re-join words that were split across a line break ("tech-\nnology" -> "technology").
# Only a hyphen that directly follows a word character and ends the line is removed,
# so suspended hyphens ("short- and long-term") and spaced dashes are left alone
# instead of gluing the neighbouring words together.
df['text'] = df['text'].str.replace(r'(?<=\w)-[ \t]*[\r\n]+\s*', '', regex=True)
df['text'] = df['text'].str.lower()

#2. Remove Whitespaces
def remove_whitespaces(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is dropped as well.

    Args:
        text: Document text. Non-string values (for example the NaN that
            pandas uses for an empty CSV field, or None) are treated as an
            empty document instead of raising AttributeError.

    Returns:
        str: The words of ``text`` joined by single spaces; "" when there
        are no words.
    """
    if not isinstance(text, str):
        return ""
    return " ".join(text.split())
# Normalise the whitespace of every document, then preview the result
df['text'] = df['text'].apply(func=remove_whitespaces)
df.head(n=10)



Unnamed: 0,doc_id,country,year,strategy_name,file_name,text
0,CAN_2022,CAN,2022,Canada's National Quantum Strategy,CAN_2022_Canada's National Quantum Strategy.pdf,-----------------io executive summary advances...
1,GER_2023,GER,2023,Quantum Technologies Conceptual Framework Prog...,GER_2023_Quantum Technologies Conceptual Frame...,1. exploiting the potential of quantum technol...
2,UK_2023,UK,2023,National Quantum Strategy,UK_2023_National Quantum Strategy.pdf,ministerial foreword the first generation of q...
3,USA_2018,USA,2018,NATIONAL STRATEGIC OVERVIEW FOR QUANTUM INFO...,USA_2018_NATIONAL STRATEGIC OVERVIEW FOR QUAN...,lsn report on 2017 operationalizing software d...


In [5]:
#Extraction Step: Tokenization
import nltk
from nltk import word_tokenize

# Fetch the Punkt resources before tokenizing
for resource in ('punkt', "punkt_tab"):
    nltk.download(resource)

# Replace each document string by its list of tokens
df['text'] = df['text'].apply(word_tokenize)
df.head(10)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,doc_id,country,year,strategy_name,file_name,text
0,CAN_2022,CAN,2022,Canada's National Quantum Strategy,CAN_2022_Canada's National Quantum Strategy.pdf,"[--, --, --, --, --, --, --, --, -io, executiv..."
1,GER_2023,GER,2023,Quantum Technologies Conceptual Framework Prog...,GER_2023_Quantum Technologies Conceptual Frame...,"[1., exploiting, the, potential, of, quantum, ..."
2,UK_2023,UK,2023,National Quantum Strategy,UK_2023_National Quantum Strategy.pdf,"[ministerial, foreword, the, first, generation..."
3,USA_2018,USA,2018,NATIONAL STRATEGIC OVERVIEW FOR QUANTUM INFO...,USA_2018_NATIONAL STRATEGIC OVERVIEW FOR QUAN...,"[lsn, report, on, 2017, operationalizing, soft..."


In [6]:
#Stop Word Elimination

#1. Load NLTK stopwords
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

# English stop-word list used by remove_stopwords; printed once for inspection
en_stopwords = stopwords.words('english')
print(en_stopwords)

## Load custom stopwords -> optional, maybe a specific one for Policy Documents?
# with open(r'C:\your-folder\custom_stopwords.csv', 'r') as f:
#    custom_stopwords = f.read().strip().split(",")
#custom_stopwords[-10:]


#2. Remove stopwords
def remove_stopwords(text, stop_words=None):
    """Return the tokens of ``text`` that are not stop words.

    Args:
        text: Iterable of string tokens (one document).
        stop_words: Optional collection of tokens to drop. When omitted, the
            notebook-level ``en_stopwords`` list is used (looked up at call
            time). Pass e.g. ``en_stopwords + custom_stopwords`` to extend it.

    Returns:
        list: The remaining tokens, in their original order.
    """
    if stop_words is None:
        stop_words = en_stopwords
    # Build a set once per call: membership tests are then constant-time
    # instead of scanning the whole stop-word list for every token.
    excluded = frozenset(stop_words)
    return [token for token in text if token not in excluded]

# Filter the stop words out of every document's token list, then preview
df['text'] = df['text'].apply(func=remove_stopwords)
df.head(n=10)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

Unnamed: 0,doc_id,country,year,strategy_name,file_name,text
0,CAN_2022,CAN,2022,Canada's National Quantum Strategy,CAN_2022_Canada's National Quantum Strategy.pdf,"[--, --, --, --, --, --, --, --, -io, executiv..."
1,GER_2023,GER,2023,Quantum Technologies Conceptual Framework Prog...,GER_2023_Quantum Technologies Conceptual Frame...,"[1., exploiting, potential, quantum, technolog..."
2,UK_2023,UK,2023,National Quantum Strategy,UK_2023_National Quantum Strategy.pdf,"[ministerial, foreword, first, generation, qua..."
3,USA_2018,USA,2018,NATIONAL STRATEGIC OVERVIEW FOR QUANTUM INFO...,USA_2018_NATIONAL STRATEGIC OVERVIEW FOR QUAN...,"[lsn, report, 2017, operationalizing, software..."


In [7]:
#Remove punctuation
from nltk.tokenize import RegexpTokenizer
def remove_punct(text):
    """Strip punctuation from a token list while keeping hyphenated compounds.

    The tokens are joined with spaces and re-split into maximal runs of
    letters/digits, optionally linked by single internal hyphens
    ("state-of-the-art"). Everything else (punctuation, underscores,
    leading, trailing or repeated hyphens) is discarded.

    Args:
        text: Iterable of string tokens (one document).

    Returns:
        list: The cleaned tokens, in order of appearance.
    """
    import re  # local import: only this function needs it

    # [^\W_] is "\w without the underscore", i.e. any Unicode letter or digit.
    # An ASCII-only class such as [a-zA-Z0-9] would cut accented words apart
    # ("québec" -> "qu", "bec").
    word_pattern = r"[^\W_]+(?:-[^\W_]+)*"
    return re.findall(word_pattern, ' '.join(text))

# Strip punctuation from every document's token list, then preview
df['text'] = df['text'].apply(func=remove_punct)
df.head(n=10)

Unnamed: 0,doc_id,country,year,strategy_name,file_name,text
0,CAN_2022,CAN,2022,Canada's National Quantum Strategy,CAN_2022_Canada's National Quantum Strategy.pdf,"[io, executive, summary, advances, quantum, sc..."
1,GER_2023,GER,2023,Quantum Technologies Conceptual Framework Prog...,GER_2023_Quantum Technologies Conceptual Frame...,"[1, exploiting, potential, quantum, technologi..."
2,UK_2023,UK,2023,National Quantum Strategy,UK_2023_National Quantum Strategy.pdf,"[ministerial, foreword, first, generation, qua..."
3,USA_2018,USA,2018,NATIONAL STRATEGIC OVERVIEW FOR QUANTUM INFO...,USA_2018_NATIONAL STRATEGIC OVERVIEW FOR QUAN...,"[lsn, report, 2017, operationalizing, software..."


In [8]:
#Lemmatize (Improve or use Stemming!!)
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize,pos_tag

# Download the NLTK packages used by this cell
for resource in ('averaged_perceptron_tagger_eng', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

def lemmatization(text):
    """Lemmatize a list of tokens using each token's part of speech.

    Args:
        text: List of string tokens (one document).

    Returns:
        list: One lemma per input token, in the same order.
    """
    # Penn Treebank tag prefix -> WordNet POS code. Adjective tags start with
    # "J" (JJ, JJR, JJS) whereas WordNet calls adjectives "a", so lower-casing
    # the first tag letter never selects the adjective lemmatizer.
    penn_to_wordnet = {'J': 'a', 'N': 'n', 'R': 'r', 'V': 'v'}

    wordnet = WordNetLemmatizer()
    # Every other tag falls back to 'n' (noun).
    return [
        wordnet.lemmatize(token, penn_to_wordnet.get(tag[:1], 'n'))
        for token, tag in pos_tag(text)
    ]

# Lemmatize every document's token list
df['text'] = df['text'].apply(func=lemmatization)
df.head(n=10)

# Convert list to string: join the lemmas of each document with single spaces
df['text'] = [' '.join(str(lemma) for lemma in lemmas) for lemmas in df['text']]
df.head(n=10)

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,doc_id,country,year,strategy_name,file_name,text
0,CAN_2022,CAN,2022,Canada's National Quantum Strategy,CAN_2022_Canada's National Quantum Strategy.pdf,io executive summary advance quantum science p...
1,GER_2023,GER,2023,Quantum Technologies Conceptual Framework Prog...,GER_2023_Quantum Technologies Conceptual Frame...,1 exploit potential quantum technology germany...
2,UK_2023,UK,2023,National Quantum Strategy,UK_2023_National Quantum Strategy.pdf,ministerial foreword first generation quantum ...
3,USA_2018,USA,2018,NATIONAL STRATEGIC OVERVIEW FOR QUANTUM INFO...,USA_2018_NATIONAL STRATEGIC OVERVIEW FOR QUAN...,lsn report 2017 operationalizing software defi...


In [9]:
# Remove whitespaces
def remove_whitespaces(text):
    """Return ``text`` with each whitespace run reduced to one space.

    Splitting on whitespace also discards leading and trailing whitespace.
    """
    words = text.split()
    return " ".join(words)
# Re-normalise the whitespace of every document string, then preview
df['text'] = df['text'].apply(func=remove_whitespaces)
df.head(n=10)

Unnamed: 0,doc_id,country,year,strategy_name,file_name,text
0,CAN_2022,CAN,2022,Canada's National Quantum Strategy,CAN_2022_Canada's National Quantum Strategy.pdf,io executive summary advance quantum science p...
1,GER_2023,GER,2023,Quantum Technologies Conceptual Framework Prog...,GER_2023_Quantum Technologies Conceptual Frame...,1 exploit potential quantum technology germany...
2,UK_2023,UK,2023,National Quantum Strategy,UK_2023_National Quantum Strategy.pdf,ministerial foreword first generation quantum ...
3,USA_2018,USA,2018,NATIONAL STRATEGIC OVERVIEW FOR QUANTUM INFO...,USA_2018_NATIONAL STRATEGIC OVERVIEW FOR QUAN...,lsn report 2017 operationalizing software defi...


In [10]:
# Write the frame to PreprocessedText.csv as UTF-8, without the index column
df.to_csv(
    path_or_buf="PreprocessedText.csv",
    encoding='utf-8',
    index=False,
)