# 1. Set-up

## 1.1 Library

In [None]:
import json
import os
import re

import pandas as pd

# language detection
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

# text pre-processing 
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from cleantext import clean

## 1.2 Read in data

### 1.2.1 Stakeholder Scraped Texts

#### 1.2.1.1 Stakeholder HTML Texts

In [None]:
# first stakeholders html
# Reads every "<org>_<file>.json" below stakeholder_html_dir, keeps the documents whose
# "text" field is detected as English, and collects them in the `html_stakeholder` DataFrame.
stakeholder_html_dir = "master_thesis_2025/stakeholder_data_extraction_pipeline/data/processed_data/html_text"

stakeholder_html_list = []
data_type = "stakeholder_html"

# langdetect is non-deterministic unless seeded; a fixed seed keeps the English
# filter (and therefore the number of documents) identical between runs
DetectorFactory.seed = 0

# check that files are detected -- counted once up front rather than re-walking
# the whole tree (and printing) for every directory the main loop visits
file_count = sum(len(files) for _, _, files in os.walk(stakeholder_html_dir))
print(f"Total files detected: {file_count}")

for root, dirs, files in os.walk(stakeholder_html_dir):
    for f in files:
        full_file_path = os.path.join(root, f)  # save full file path

        # extract file_id, org_num, file_num using regex
        match = re.match(r"(\d+)_(\d+)\.json", f)
        if not match:
            print(f"Skipping {f}: Filename format not recognized")
            continue
        org_num, file_num = match.groups()
        file_id = f"{org_num}_{file_num}"

        with open(full_file_path, "r", encoding="utf-8") as file:
            try:
                data = json.load(file)
            except json.JSONDecodeError as e:
                print(f"error {full_file_path}: {e}")
                continue

        if "text" not in data:
            print(f'{full_file_path} no text')
            continue

        # extract text from json
        text_content = data["text"]

        # detect language; detect() raises when the text offers nothing to classify
        # (e.g. empty or digits-only) and must not abort the whole load
        try:
            lang = detect(text_content)
        except (LangDetectException, TypeError) as e:
            print(f"Skipping {full_file_path}: language detection failed ({e})")
            continue

        # FOR NOW ONLY EXTRACT ENGLISH TEXTS
        if lang == 'en':
            stakeholder_html_list.append({
                "file_id": file_id,
                "org": org_num,
                "file_num": file_num,
                "content": text_content,
                "data_type": data_type
            })

# convert the list to df
html_stakeholder = pd.DataFrame(stakeholder_html_list)
print(html_stakeholder.head())
print(html_stakeholder.tail())

Total files detected: 1487
    file_id  org file_num                                            content  \
0  102_1084  102     1084  Civil society organisations express concerns a...   
1  102_1088  102     1088  Societies and businesses face increasing uncer...   
2  102_1091  102     1091  EU’s ‘Fit for 55’ is unfit and unfair The Euro...   
3  102_1095  102     1095  WHEN: 12 December 2024 I TIME: 14:00 - 15:30 (...   
4  102_1096  102     1096  Climate and Energy WG meeting – 14 September –...   

          data_type  
0  stakeholder_html  
1  stakeholder_html  
2  stakeholder_html  
3  stakeholder_html  
4  stakeholder_html  
     file_id org file_num                                            content  \
1144   9_141   9      141  The past three years have been very crucial fo...   
1145   9_142   9      142  Minespider announces strategic partnership wit...   
1146   9_143   9      143  Executive Vice President and Chief Commercial ...   
1147   9_145   9      145  Aluminum MMI:

#### 1.2.1.2 Stakeholder PDF Texts

In [None]:
# then read in stakeholders pdf
# Reads every "<org>_<file>.json" below stakeholder_pdf_dir, keeps the documents flagged
# as English, joins their pages into one string and collects them in `pdf_stakeholder`.
stakeholder_pdf_dir = "master_thesis_2025/stakeholder_data_extraction_pipeline/data/processed_data/pdf_text"

# keywords to skip (files containing these will be ignored)
skip_keywords = ['Concerns-Based Adoption Model']  # found overlap manually, also called CBAM
skip_keywords_lower = [keyword.lower() for keyword in skip_keywords]  # lower-cased once, not per file

# list to store processed data
stakeholder_pdf_list = []
data_type = 'stakeholder_pdf'

# check that files are detected -- counted once up front rather than re-walking
# the whole tree (and printing) for every directory the main loop visits
file_count = sum(len(files) for _, _, files in os.walk(stakeholder_pdf_dir))
print(f"Total files detected: {file_count}")

# walk through json directory
for root, dirs, files in os.walk(stakeholder_pdf_dir):
    for f in files:
        full_file_path = os.path.join(root, f)  # save full file path

        # extract file_id, org_num, file_num using regex
        match = re.match(r"(\d+)_(\d+)\.json", f)
        if not match:
            print(f"Skipping {f}: Filename format not recognized")
            continue
        org_num, file_num = match.groups()
        file_id = f"{org_num}_{file_num}"

        with open(full_file_path, "r", encoding="utf-8") as file:
            try:
                data = json.load(file)
            except json.JSONDecodeError as e:
                print(f"error {f}: {e}")
                continue

        # ensure the necessary keys exist and process only English-language documents
        # (documents failing this check are skipped silently, as before)
        if not ('language' in data and 'pages' in data and data['language'] == 'Languages.ENGLISH'):
            continue

        # a page stored as null would otherwise make the join raise a TypeError
        full_text = " ".join(page or "" for page in data['pages'].values())

        # Skip if any of the skip keywords are found in the text
        full_text_lower = full_text.lower()
        if any(keyword in full_text_lower for keyword in skip_keywords_lower):
            print(f"Skipping {f} due to keywords")
            continue

        stakeholder_pdf_list.append({
            "file_id": file_id,
            "org": org_num,
            "file_num": file_num,
            "content": full_text,
            "data_type": data_type
        })

# convert the list to df
pdf_stakeholder = pd.DataFrame(stakeholder_pdf_list)
print(pdf_stakeholder.head())
print(pdf_stakeholder.tail())

Total files detected: 481
Skipping 222_2149.json due to keywords
    file_id  org file_num                                            content  \
0  100_1060  100     1060  if\n8:\nS\n| ‘|\n! ry\nZ,\nY\nUp\ni\nA\nPo\n\\...   
1  101_1066  101     1066  Carbon Border Adjustments: Climate Protection ...   
2  101_1068  101     1068  Carbon\nMarket\nWatch\nA brief explanation of ...   
3  101_1072  101     1072  Carbon\nMarket\nWatch\nA brief explanation of ...   
4  101_1073  101     1073  O Carbon\nMarket\nWatch\nCarbon Market Watch’s...   

         data_type  
0  stakeholder_pdf  
1  stakeholder_pdf  
2  stakeholder_pdf  
3  stakeholder_pdf  
4  stakeholder_pdf  
     file_id org file_num                                            content  \
385   91_998  91      998  ©) MATERIALS\nBOLIDEN\nBoliden — Metals for\ng...   
386  92_1008  92     1008  A Close Brothers\nNOT FOR RELEASE, PUBLICATION...   
387  92_1015  92     1015  A Close Brothers\nNOT FOR RELEASE, PUBLICATION...   
388  92_

### 1.2.2 EC Texts

In [None]:
# first EC legislation
# Loads the legislation JSON (a list of entries), keeps the entries whose "text" is
# detected as English and collects them in `legislation_df`.
# Forward slashes keep the path usable on every OS and match the stakeholder paths.
ec_leg_file_path = "master_thesis_2025/eu_data_extraction/EC/legislation/data/results/legislation_data.json"

# List to store extracted data
ec_leg_data_list = []
data_type = "ec_legislation"

# langdetect is non-deterministic unless seeded; fix the seed for reproducible filtering
DetectorFactory.seed = 0

# Read JSON file; a malformed file is reported and leaves the list empty
with open(ec_leg_file_path, "r", encoding="utf-8") as file:
    try:
        data = json.load(file)  # Load JSON file
    except json.JSONDecodeError as e:
        print(f"Error reading {ec_leg_file_path}: {e}")
        data = []

for entry in data:
    if "text" not in entry:
        # name the offending entry, not only the file every entry comes from
        print(f"{ec_leg_file_path}: entry {entry.get('title', 'Unknown')!r} has no text")
        continue

    text_content = entry["text"]

    # Detect language; detect() raises when the text offers nothing to classify
    try:
        lang = detect(text_content)
    except (LangDetectException, TypeError) as e:
        print(f"Skipping entry {entry.get('title', 'Unknown')!r}: language detection failed ({e})")
        continue

    # Only include English texts
    if lang == "en":
        ec_leg_data_list.append({
            "file_id": entry.get("title", "Unknown"),  # Use "title" as file_id
            "content": text_content,
            "data_type": data_type
        })

# Convert list to DataFrame
legislation_df = pd.DataFrame(ec_leg_data_list)

print(legislation_df.head())
print(legislation_df.tail())

                         file_id  \
0        CELEX_32023R0956_EN_TXT   
1        CELEX_32023R1773_EN_TXT   
2  COM_2025_87_1_EN_ACT_part1_v5   
3         COM_2025_87_annexes_EN   
4          OJ_L_202403210_EN_TXT   

                                             content       data_type  
0  L 130/52\nEN\n16.5.2023\nOfficial Journal of t...  ec_legislation  
1  L 228/94\nEN\nOfficial Journal of the European...  ec_legislation  
2  a\nEUROPEAN\nCOMMISSION\nBrussels, 26.2.2025\n...  ec_legislation  
3  pi\nEUROPEAN\nCOMMISSION\nBrussels, 26.2.2025 ...  ec_legislation  
4  EA Official Journal\nof the European Union\n20...  ec_legislation  
                         file_id  \
1        CELEX_32023R1773_EN_TXT   
2  COM_2025_87_1_EN_ACT_part1_v5   
3         COM_2025_87_annexes_EN   
4          OJ_L_202403210_EN_TXT   
5              SWD-Omnibus-87_En   

                                             content       data_type  
1  L 228/94\nEN\nOfficial Journal of the European...  ec_legislation 

In [None]:
# then EC press releases
# Loads the press-release JSON (a list of entries) and collects every entry that has
# a "text" field in `ec_pressrelease_df`.
# Forward slashes keep the path usable on every OS and match the stakeholder paths.
ec_press_release_path = "master_thesis_2025/eu_data_extraction/EC/press_release/data/EC_CBAM_articles.json"

# list to store extracted data
ec_press_release_list = []
data_type = "ec_press_release"

# read JSON file; a malformed file is reported and leaves the list empty
with open(ec_press_release_path, "r", encoding="utf-8") as file:
    try:
        data = json.load(file)  # Load JSON file
    except json.JSONDecodeError as e:
        print(f"Error reading {ec_press_release_path}: {e}")
        data = []

# no language filter: per the original note, only English news is included in this file
for entry in data:
    if "text" not in entry:
        # name the offending entry, not only the file every entry comes from
        print(f"{ec_press_release_path}: entry {entry.get('title', 'Unknown')!r} has no text")
        continue

    ec_press_release_list.append({
        "file_id": entry.get("title", "Unknown"),  # Use "title" as file_id
        "content": entry["text"],
        "data_type": data_type
    })

# Convert list to DataFrame
ec_pressrelease_df = pd.DataFrame(ec_press_release_list)

print(ec_pressrelease_df.head())
print(ec_pressrelease_df.tail())

                                             file_id  \
0  Remarks by Executive Vice-President Dombrovski...   
1  Joint statement on the second meeting of the E...   
2  Commission simplifies rules on sustainability ...   
3  A Clean Industrial Deal for competitiveness an...   
4  Speech at European Economic and Social Committ...   

                                             content         data_type  
0  MerciBruno, dear colleagues,\nAs Russia's aggr...  ec_press_release  
1  The second meeting of the EU-India Trade and T...  ec_press_release  
2  The European Commission has adopted a new pack...  ec_press_release  
3  Today, the Commission presents theClean Indust...  ec_press_release  
4  Madame President, Honourable Members,\nI am re...  ec_press_release  
                                              file_id  \
56  Opening address by President von der Leyen on ...   
57  Frans Timmermans at Parliament Plenary session...   
58  Closing Remarks EVP Timmermans on Fit for 55 a... 

In [None]:
# then EC have your say (feedback)
# Loads the combined have-your-say JSON (a list of entries) and collects every entry
# that has a "feedback" field in `ec_hys_df`; no language filter is applied here.
# Forward slashes keep the path usable on every OS and match the stakeholder paths.
ec_hys_path = "master_thesis_2025/eu_data_extraction/EC/have_your_say/data/combined_hys.json"

# list to store extracted data
ec_hys_list = []
data_type = "ec_hys"

# read JSON file; a malformed file is reported and leaves the list empty
with open(ec_hys_path, "r", encoding="utf-8") as file:
    try:
        data = json.load(file)  # Load JSON file
    except json.JSONDecodeError as e:
        print(f"Error reading {ec_hys_path}: {e}")
        data = []

# NOTE(review): the recorded output lists file_ids 525305 and 525248 twice in the first
# rows -- confirm whether combined_hys.json holds duplicate entries that should be dropped.
for entry in data:
    if "feedback" not in entry:
        # name the offending entry, not only the file every entry comes from
        print(f"{ec_hys_path}: entry {entry.get('file_id')!r} has no text")
        continue

    # combine 'feedback' and 'pdf_text' if they exist (empty / missing parts are dropped)
    combined_text = " ".join(filter(None, [entry.get("feedback", ""), entry.get("pdf_text", "")]))

    ec_hys_list.append({
        "file_id": entry.get("file_id"),
        "content": combined_text,
        "data_type": data_type
    })

# Convert list to DataFrame
ec_hys_df = pd.DataFrame(ec_hys_list)

print(ec_hys_df.head())
print(ec_hys_df.tail())

  file_id                                            content data_type
0  525305                                 See attached file.    ec_hys
1  525248                                               n.a.    ec_hys
2  525305                                 See attached file.    ec_hys
3  525248                                               n.a.    ec_hys
4  525246   AEGIS Europe is an industry alliance that bri...    ec_hys
     file_id                                            content data_type
398  3497854  Hello Dear Madam/Sir We are reaching you about...    ec_hys
399  3497758  The delivery of the information from the suppl...    ec_hys
400  3497706  The correct functioning of the CBAM Regulation...    ec_hys
401  3497693  Around chapter III, I believe the objective fo...    ec_hys
402  3497684  the register should promote sensible climate a...    ec_hys


### 1.2.3 EP Texts

In [None]:
# first the committee meetings
# Loads the committee-meeting JSON (a list of entries) and collects every entry that
# has a "text" field in `ep_comm_df`.
# Forward slashes keep the path usable on every OS and match the stakeholder paths.
ep_comm_path = "master_thesis_2025/eu_data_extraction/EP/committee_meetings/committee_meetings_data.json"

# list to store extracted data
ep_comm_list = []
data_type = "ep_committee_meetings"

# read JSON file; a malformed file is reported and leaves the list empty
with open(ep_comm_path, "r", encoding="utf-8") as file:
    try:
        data = json.load(file)  # Load JSON file
    except json.JSONDecodeError as e:
        print(f"Error reading {ep_comm_path}: {e}")
        data = []

for entry in data:
    if "text" not in entry:
        # name the offending entry, not only the file every entry comes from
        print(f"{ep_comm_path}: entry {entry.get('title')!r} has no text")
        continue

    ep_comm_list.append({
        "file_id": entry.get("title"),  # use file title as file id
        "content": entry["text"],
        "data_type": data_type
    })

# Convert list to DataFrame
ep_comm_df = pd.DataFrame(ep_comm_list)

print(ep_comm_df.head())
print(ep_comm_df.tail())

                            file_id  \
0                         1244098EN   
1                COM_COM20210564_EN   
2                         1270822EN   
3                COM_COM20220101_EN   
4  FINALVotingListMAR3FdR1270822_EN   

                                             content              data_type  
0  European Parliament\n2019-2024\n((( = =—r\n(4\...  ep_committee_meetings  
1  pa\nEUROPEAN\nCOMMISSION\nBrussels, 14.7.2021\...  ep_committee_meetings  
2  European Parliament\n2019-2024\n((( = =—r\n(4\...  ep_committee_meetings  
3  a\nEUROPEAN\nCOMMISSION\nBrussels, 14.3.2022\n...  ep_committee_meetings  
4  European Parliament\n2019-2024\n7 SN J ANA\nSN...  ep_committee_meetings  
               file_id                                            content  \
78           1248940EN  European Parliament\n2019-2024\n(( =—r\nSS w=\...   
79           1248941EN  European Parliament\n2019-2024\n(( =—r\nSS w=\...   
80         CAs_CBAM_EN  European Parliament\n2019 - 2024\nG G SS G

In [None]:
# then ep press releases
# Loads the EP press-room JSON (a list of entries) and collects every entry that has
# a "text" field in `ep_pr_df`.
# Forward slashes keep the path usable on every OS and match the stakeholder paths.
ep_pr_path = "master_thesis_2025/eu_data_extraction/EP/press_release/data/ep_pressroom/EP_CBAM_articles.json"

# list to store extracted data
ep_pr_list = []
data_type = "ep_press_releases"

# read JSON file; a malformed file is reported and leaves the list empty
with open(ep_pr_path, "r", encoding="utf-8") as file:
    try:
        data = json.load(file)  # Load JSON file
    except json.JSONDecodeError as e:
        print(f"Error reading {ep_pr_path}: {e}")
        data = []

for entry in data:
    if "text" not in entry:
        # name the offending entry, not only the file every entry comes from
        print(f"{ep_pr_path}: entry {entry.get('title')!r} has no text")
        continue

    ep_pr_list.append({
        "file_id": entry.get("title"),  # use file title as file id
        "content": entry["text"],
        "data_type": data_type
    })

# Convert list to DataFrame
ep_pr_df = pd.DataFrame(ep_pr_list)

print(ep_pr_df.head())
print(ep_pr_df.tail())

                                             file_id  \
0  European Parliament Press Kit for the European...   
1  "Own Resources": Parliament's position on new ...   
2  MEPs clear way for new EU revenue, call on EU ...   
3  MEPs urge member states to adopt EU income str...   
4  Fit for 55: Parliament adopts key laws to reac...   

                                             content          data_type  
0  In this press kit, you will find a selection o...  ep_press_releases  
1  Parliament has always been an advocate of new ...  ep_press_releases  
2  New EU revenue based on the Emissions Trading ...  ep_press_releases  
3  Following the presentation of updated proposal...  ep_press_releases  
4  Free allowances in the Emissions Trading Syste...  ep_press_releases  
                                              file_id  \
10  MEPs to G20: increase climate change targets b...   
11  CBAM: Parliament pushes for higher ambition in...   
12  Fit for 55 in 2030: Parliament wants a more 

In [None]:
# then political group press releases
# Loads the political-group press-release JSON (a list of entries) and collects every
# entry that has a "text" field in `ep_group_pr_df`.
# Forward slashes keep the path usable on every OS and match the stakeholder paths.
ep_group_pr_path = "master_thesis_2025/eu_data_extraction/EP/press_release/data/political_groups/group_press_releases.json"

# list to store extracted data
ep_group_pr_list = []
data_type = "ep_group_press_releases"

# read JSON file; a malformed file is reported and leaves the list empty
with open(ep_group_pr_path, "r", encoding="utf-8") as file:
    try:
        data = json.load(file)  # Load JSON file
    except json.JSONDecodeError as e:
        print(f"Error reading {ep_group_pr_path}: {e}")
        data = []

for entry in data:
    if "text" not in entry:
        # name the offending entry, not only the file every entry comes from
        print(f"{ep_group_pr_path}: entry {entry.get('title')!r} has no text")
        continue

    ep_group_pr_list.append({
        "file_id": entry.get("title"),  # use file title as file id
        "content": entry["text"],
        "data_type": data_type
    })

# Convert list to DataFrame
ep_group_pr_df = pd.DataFrame(ep_group_pr_list)

print(ep_group_pr_df.head())
print(ep_group_pr_df.tail())

                                             file_id  \
0  Prevent unfair competition by non-European cli...   
1          De-carbonise, not de-industrialise Europe   
2  Fit for 55: we want to de-carbonise, not de-in...   
3         We are increasing CO2 cuts from 55% to 57%   
4                  EU Budget is running out of money   

                                             content                data_type  
0  The EPP Group wants to introduce a Carbon Bord...  ep_group_press_releases  
1  The EPP Group fully supports the move towards ...  ep_group_press_releases  
2  On Tuesday, the European Parliament’s plenary ...  ep_group_press_releases  
3  The EPP Group has greatly improved the Fit for...  ep_group_press_releases  
4  In just four years’ time, vital EU funds suppo...  ep_group_press_releases  
                                              file_id  \
39  European Parliament approves catalyst for clea...   
40  Commission proposals welcome but not enough to...   
41  MEPs pre

## 1.3 Combine data

In [36]:
# Stack all source frames row-wise into a single corpus with a fresh 0..n-1 index.
# Only the two stakeholder frames are built with 'org' and 'file_num' keys.
all_docs = pd.concat(
    objs=[
        html_stakeholder,    # scraped stakeholders (html)
        pdf_stakeholder,     # scraped stakeholders (pdf)
        legislation_df,      # EC legislation
        ec_hys_df,           # EC have-your-say feedback
        ec_pressrelease_df,  # EC press releases
        ep_comm_df,          # EP committee documents
        ep_pr_df,            # EP press releases
        ep_group_pr_df,      # EP political group press releases
    ],
    ignore_index=True,
)
print(all_docs.head())

# Plain Python list with one string per document
docs = list(all_docs["content"].astype(str))
print(f'number of documents: {len(docs)}')

    file_id  org file_num                                            content  \
0  102_1084  102     1084  Civil society organisations express concerns a...   
1  102_1088  102     1088  Societies and businesses face increasing uncer...   
2  102_1091  102     1091  EU’s ‘Fit for 55’ is unfit and unfair The Euro...   
3  102_1095  102     1095  WHEN: 12 December 2024 I TIME: 14:00 - 15:30 (...   
4  102_1096  102     1096  Climate and Energy WG meeting – 14 September –...   

          data_type  
0  stakeholder_html  
1  stakeholder_html  
2  stakeholder_html  
3  stakeholder_html  
4  stakeholder_html  
number of documents: 2151


# 2. Clean text


In [38]:
import re
from textblob import TextBlob
from cleantext import clean
import wordninja  # for word segmentation

# Function to clean OCR noise
def clean_ocr_noise(text):
    """Remove common OCR/PDF-extraction artefacts from a string.

    Steps, in order:
      1. join words that were hyphenated across a line break ("decarbon-\\nisation");
      2. turn the remaining newlines into spaces;
      3. collapse runs of whitespace;
      4. drop every non-ASCII character;
      5. collapse whitespace again, because removing a free-standing non-ASCII
         character (e.g. the dash in "a – b") leaves two adjacent spaces behind.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string. Leading/trailing whitespace is not stripped.
    """
    text = re.sub(r'-\s*\n\s*', '', text)          # Remove hyphenated line breaks
    text = re.sub(r'\n+', ' ', text)               # Replace newlines with space
    text = re.sub(r'\s{2,}', ' ', text)            # Reduce multiple spaces
    text = re.sub(r'[^\x00-\x7F]+', '', text)      # Remove non-ASCII chars (optional)
    text = re.sub(r'\s{2,}', ' ', text)            # Close the gaps the removal above opened
    return text

# Function to clean repeated characters
def clean_repeated_chars(text):
    """Delete stand-alone tokens made of one character repeated 3+ times, then tidy spaces.

    A token such as "aaa" or "1111" is removed entirely; repeated characters inside a
    longer word (e.g. "soooon") are left untouched. Afterwards every whitespace run is
    squeezed to a single space and the result is stripped.
    """
    without_repeats = re.sub(r'\b(\w)\1{2,}\b', '', text)
    squeezed = re.sub(r'\s+', ' ', without_repeats)
    return squeezed.strip()

# Function to remove junk tokens (e.g., short tokens, numbers, non-alphabetic tokens)
def remove_junk_tokens(doc):
    """Return the tokens of `doc` that look like real words.

    A token is kept only if it is longer than two characters, is not a one- or
    two-letter lowercase fragment, contains no digit and is purely alphabetic.
    The order of the surviving tokens is unchanged.
    """
    def _is_wordlike(token):
        if len(token) <= 2:                      # very short tokens
            return False
        if re.match(r'^[a-z]{1,2}$', token):     # fragments like "ae", "yy"
            return False
        if re.search(r'\d', token):              # anything containing a digit
            return False
        return token.isalpha()                   # alphabetic tokens only

    kept = []
    for token in doc:
        if _is_wordlike(token):
            kept.append(token)
    return kept

# Function to remove hex strings and IDs (e.g., long hex strings, long numeric IDs)
def remove_hex_and_ids(doc):
    """Return the tokens of `doc` without hash-like and ID-like tokens.

    Dropped are tokens that consist entirely of 8 or more lowercase hex characters
    (a-f, 0-9) and tokens that consist entirely of 5 or more digits. Matching is
    case-sensitive; the order of the surviving tokens is unchanged.
    """
    kept = []
    for token in doc:
        if re.fullmatch(r'[a-f0-9]{8,}', token):   # long hex/hash strings
            continue
        if re.fullmatch(r'\d{5,}', token):         # long numeric IDs
            continue
        kept.append(token)
    return kept

# Keyword-specific British-to-American spelling remapping.
# The table and the combined pattern are built once, not once per document.
_BRITISH_TO_AMERICAN = {
    "organisations": "organizations",
    "organisation": "organization",
    "realisation": "realization",
    "digitalisation": "digitalization",
    "decarbonisation": "decarbonization",
    "recognise": "recognize",
    "analyse": "analyze",
    "labour": "labor",
    "centre": "center",
    "theatre": "theater",
    "favour": "favor",
    "colour": "color",
    "honour": "honor",
    "metre": "meter",
    "defence": "defense",
    "licence": "license",
    "programme": "program",
    "travelling": "traveling",
    "realise": "realize",
    "modernise": "modernize",
}

# One alternation for all keywords (longest first), bounded by \b so that only whole
# words match; IGNORECASE lets "Organisation" / "DEFENCE" be found as well.
_BRITISH_WORD_RE = re.compile(
    r'\b(?:'
    + '|'.join(sorted(map(re.escape, _BRITISH_TO_AMERICAN), key=len, reverse=True))
    + r')\b',
    re.IGNORECASE,
)


def british_to_american_keywords(text):
    """Replace a fixed list of British spellings with their American counterparts.

    Only whole words are replaced ("labourer" and "reorganisation" stay as they are).
    The casing of the matched word is carried over: lowercase stays lowercase,
    a capitalised word stays capitalised and an all-caps word stays all-caps.

    Args:
        text: the string to normalise.

    Returns:
        The string with the listed keywords respelled.
    """
    def _respell(match):
        word = match.group(0)
        american = _BRITISH_TO_AMERICAN.get(word.casefold())
        if american is None:          # case-insensitive match with no table entry
            return word
        if word.isupper():
            return american.upper()
        if word[0].isupper():
            return american.capitalize()
        return american

    return _BRITISH_WORD_RE.sub(_respell, text)

# Function to clean the text (apply all the steps)
def clean_text(text):
    """Run the full cleaning pipeline on one document and return the cleaned string.

    Pipeline:
      1. re-join words hyphenated across a line end;
      2. cleantext.clean(): fix unicode, blank out URLs / e-mails, replace phone
         numbers, remove line breaks (case is kept);
      3. clean_ocr_noise() and clean_repeated_chars();
      4. british_to_american_keywords();
      5. whitespace tokenisation, remove_hex_and_ids(), re-join with single spaces.

    Args:
        text: raw document text.

    Returns:
        The cleaned text as a single-line string.
    """
    # De-hyphenate BEFORE clean(): with no_line_breaks=True clean() is expected to
    # flatten all line breaks, so the identical rule inside clean_ocr_noise() never
    # finds a newline to anchor on when it runs afterwards.
    # Non-string input is passed to clean() unchanged, as before.
    if isinstance(text, str):
        text = re.sub(r'-\s*\n\s*', '', text)

    # Use clean() from cleantext to remove URLs, emails, phone numbers, etc.
    text = clean(
        text=text,
        fix_unicode=True,
        to_ascii=False,  # kept off: transliterating to ASCII here was too aggressive
        no_emails=True,
        no_urls=True,
        no_line_breaks=True,
        no_phone_numbers=True,
        lower=False,
        replace_with_url="",
        replace_with_email=""
    )

    # Remove OCR noise and drop repeated-character tokens
    text = clean_ocr_noise(text)
    text = clean_repeated_chars(text)

    # Apply keyword-specific British-to-American spelling remapping
    text = british_to_american_keywords(text)

    # Tokenize on whitespace and remove hex and ID-like tokens
    # (remove_junk_tokens is intentionally not applied at this stage)
    tokens = remove_hex_and_ids(text.split())

    # Rejoin tokens back into a cleaned string
    return ' '.join(tokens)

# Example usage: run clean_text on one short sample (curly quotes, a line break and
# the British spelling "modernise") and print the result for a visual check.
#sample_text = """EU's Civil society organisations express concerns about the agreement reached in COREPER II on a Carbon Border Adjustment Mechanism (CBAM) compromise text that does not address important issues linked to the EU Emission Trading System (ETS) Directive and other key aspects of the Fit for 55 package. Together with other NGOs, we support a fair and effective CBAM, designed and implemented as an alternative to current EU ETS carbon leakage measures such as free allowances and indirect cost compensation."""
sample_text = """
EU’s ‘Fit for 55’ is unfit and unfair and must modernise. The European Commission is missing another historic opportunity to phase out fossil fuels in the ‘Fit for 55’ package, 
leaving the door open for coal, gas and oil to stay in the EU energy system for at least another two decades while sending the “polluter pays” bill to EU citizens."""

cleaned_text = clean_text(sample_text)
print(cleaned_text)

EU's 'Fit for 55' is unfit and unfair and must modernize. The European Commission is missing another historic opportunity to phase out fossil fuels in the 'Fit for 55' package, leaving the door open for coal, gas and oil to stay in the EU energy system for at least another two decades while sending the "polluter pays" bill to EU citizens.


In [39]:
# Clean every document and keep the result beside the raw text in a new
# 'cleaned_content' column of all_docs.
all_docs['cleaned_content'] = all_docs['content'].map(clean_text)

# Quick before/after comparison on the first rows
print(all_docs.head()[['file_id', 'content', 'cleaned_content']])

    file_id                                            content  \
0  102_1084  Civil society organisations express concerns a...   
1  102_1088  Societies and businesses face increasing uncer...   
2  102_1091  EU’s ‘Fit for 55’ is unfit and unfair The Euro...   
3  102_1095  WHEN: 12 December 2024 I TIME: 14:00 - 15:30 (...   
4  102_1096  Climate and Energy WG meeting – 14 September –...   

                                     cleaned_content  
0  Civil society organizations express concerns a...  
1  Societies and businesses face increasing uncer...  
2  EU's 'Fit for 55' is unfit and unfair The Euro...  
3  WHEN: 12 December 2024 I TIME: 14:00 - 15:30 (...  
4  Climate and Energy WG meeting 14 September Onl...  


In [None]:
# Write the combined frame (raw and cleaned text, plus the index) to an .xlsx file
import openpyxl  # presumably imported so a missing Excel engine fails here -- TODO confirm
all_docs.to_excel(excel_writer="master_thesis_2025/RAW_all_comms.xlsx")

In [23]:
# Rebuild `docs`, this time from the cleaned text: one string per document.
# NOTE(review): the recorded output below (1539) differs from the 2151 documents
# reported after the concat -- re-run from a fresh kernel to confirm the count.
docs = list(all_docs["cleaned_content"].astype(str))
print(f'number of documents: {len(docs)}')

number of documents: 1539
