# 1. Set-Up

In [5]:
# import necessary libraries
import re
from collections import Counter
from fuzzywuzzy import fuzz, process
from concurrent.futures import ThreadPoolExecutor
import functools
import pandas as pd
import json

In [None]:
# load a JSON file from disk
def get_files(file_path):
    """Parse the UTF-8 encoded JSON file at *file_path* and return its content."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

# Input locations
# a) meeting logs of the 9th and 10th parliamentary term
path_9th_term = 'master_thesis_2025/cbam_meeting_information/data/all_meetings/cleaned_meetings_9term.json'
path_10th_term = 'master_thesis_2025/cbam_meeting_information/data/cbam_specific_meetings/test/MEP_MEETINGS_CBAM_01.04.2025.json'

# load both meeting logs (9th term first, then 10th term)
meetings_9term, meetings_10term = (
    get_files(path) for path in (path_9th_term, path_10th_term)
)

# b) transparency registry export
path_t_reg = 'master_thesis_2025/cbam_meeting_information/data/transparency_registry/07.2024_registered_orgs_grouped.json'
t_reg = get_files(path_t_reg)

# 2. Match meeting log with transparency registry

In [None]:
# define a list of search keywords
# NOTE: contains_keywords() compiles these with re.IGNORECASE, so the two
# spellings below match exactly the same texts; one entry would suffice.
keywords = ["CBAM", "cbam"]

In [9]:
import json
import re
from concurrent.futures import ThreadPoolExecutor
import functools
from fuzzywuzzy import fuzz, process

# Helper: normalise whitespace in free-text fields
def clean_text(text):
    """Collapse every whitespace run in *text* to one space and trim the ends.

    Values that are not strings are handed back unchanged.
    """
    if not isinstance(text, str):
        return text
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

# Helper function to find similar names or acronyms (fuzzy matching)
# Cache of previous lookups: name -> (matched entry, score) or None.
# NOTE: the cache is keyed by name only, so it assumes every call uses the
# same org_list; clear it before matching against a different list.
org_match_cache = {}
def find_similar_names(name, org_list):
    """Return the best fuzzy match for *name* in *org_list*.

    Args:
        name: Text to look up (the cleaned "meeting_with" field).
        org_list: Candidate strings to match against.

    Returns:
        A ``(matched_entry, score)`` tuple when the best candidate scores
        strictly above 90 with ``fuzz.partial_ratio``; otherwise ``None``.
        ``None`` is also returned for empty or non-string names.
    """
    # An empty name can never match, and a non-string name cannot be scored
    # (an unhashable one cannot even be used as a cache key). Returning early
    # avoids a pointless scan of the whole candidate list and a crash.
    if not isinstance(name, str) or not name.strip():
        return None

    if name in org_match_cache:
        return org_match_cache[name]

    # limit=1 asks for the single best-scoring candidate only.
    # NOTE(review): partial_ratio scores the best substring alignment, so very
    # short candidates may score highly against long names — confirm that the
    # threshold of 90 is strict enough for the registry data.
    matches = process.extract(name, org_list, scorer=fuzz.partial_ratio, limit=1)

    result = None
    if matches and matches[0][1] > 90:  # Matching threshold: score must exceed 90
        result = (matches[0][0], matches[0][1])

    org_match_cache[name] = result  # Cache hits and misses alike
    return result

# Compile (and memoise) the case-insensitive whole-word regex for a keyword tuple
@functools.lru_cache(maxsize=32)
def _compile_keyword_pattern(keywords):
    """Return a compiled pattern matching any of *keywords* as a whole word."""
    alternatives = '|'.join(re.escape(keyword) for keyword in keywords)
    return re.compile(r'\b(?:' + alternatives + r')\b', re.IGNORECASE)

# Function to check if any keyword is present in the text
def contains_keywords(text, keywords):
    """Tell whether *text* contains at least one of *keywords*.

    Matching is case-insensitive and anchored on word boundaries, so a
    keyword embedded inside a longer word does not count.

    Args:
        text: Value to search; anything that is not a string yields ``False``.
        keywords: Iterable of keyword strings.

    Returns:
        ``True`` if any keyword occurs in *text*, else ``False``.
    """
    if not isinstance(text, str):
        return False

    keyword_tuple = tuple(keywords)
    # Without keywords the alternation would be empty and the pattern would
    # match at any word boundary, i.e. report a hit for nearly every text.
    if not keyword_tuple:
        return False

    # The pattern depends only on the keywords, so it is compiled once per
    # distinct keyword set instead of once per call.
    pattern = _compile_keyword_pattern(keyword_tuple)
    return bool(pattern.search(text))

# Function to analyze individual meeting data
def analyze_meeting(entry, org_list, keywords):
    """Collect the keyword-matching meetings of one MEP entry.

    Args:
        entry: Mapping for one MEP; its "name" and "meetings" keys are read.
            "meetings" is expected to map meeting ids to meeting dicts with
            the optional keys "reason", "meeting_with" and "date".
        org_list: Candidate organisation strings for fuzzy matching.
        keywords: Keywords a meeting must mention in its reason or counterpart.

    Returns:
        A list with one dict per qualifying meeting (keys "MEP",
        "Meeting ID", "Date", "Meeting With", "Reason", "Org Match",
        "Org Match Similarity"). Empty when nothing qualifies.
    """
    mep_name = entry.get("name", "Unknown MEP")
    meetings = entry.get("meetings", {})

    # Skip processing if meetings is not a dictionary. Checking for dict
    # (rather than only excluding str) also covers lists and None, which
    # would otherwise fail on .items().
    if not isinstance(meetings, dict):
        return []

    qualifying_meetings = []
    for meeting_id, meeting in meetings.items():
        if not isinstance(meeting, dict):
            continue

        reason = clean_text(meeting.get("reason", ""))
        meeting_with = clean_text(meeting.get("meeting_with", ""))
        date = meeting.get("date", "Date not provided")

        # Proceed only if a keyword is found in either field
        if not (contains_keywords(reason, keywords)
                or contains_keywords(meeting_with, keywords)):
            continue

        # Look the counterpart up in the organisation list
        matched_org = find_similar_names(meeting_with, org_list)

        qualifying_meetings.append({
            "MEP": mep_name,
            "Meeting ID": meeting_id,
            "Date": date,
            "Meeting With": meeting_with,
            "Reason": reason,
            "Org Match": matched_org[0] if matched_org else None,  # matched org name
            "Org Match Similarity": matched_org[1] if matched_org else None,  # similarity score
        })

    return qualifying_meetings

# Function to analyze data with parallel processing
def analyze_data(data, org_list, keywords, output_file):
    """Run analyze_meeting over every entry of *data* and save the matches.

    Args:
        data: Iterable of MEP entries, each passed to analyze_meeting.
        org_list: Candidate organisation strings for fuzzy matching.
        keywords: Keywords a meeting must mention to qualify.
        output_file: Path of the JSON file to write. It is only written when
            at least one qualifying meeting exists.

    Returns:
        The list of all qualifying meeting dicts, in the order of *data*
        (empty when nothing matched), so callers can use the result directly
        instead of re-reading the JSON file.
    """
    worker = functools.partial(analyze_meeting, org_list=org_list, keywords=keywords)

    # executor.map yields results in input order; any exception raised by a
    # worker surfaces here while the results are consumed.
    with ThreadPoolExecutor() as executor:
        all_qualifying_meetings = [
            meeting
            for qualifying_meetings in executor.map(worker, data)
            for meeting in qualifying_meetings
        ]

    # Save to JSON if any qualifying meetings are found
    if all_qualifying_meetings:
        with open(output_file, 'w', encoding='utf-8') as json_file:
            json.dump(all_qualifying_meetings, json_file, ensure_ascii=False, indent=4)
        print(f"Qualifying meeting data saved to {output_file}")
    else:
        print("No qualifying meetings found. No file was created.")

    return all_qualifying_meetings

# Build the flat candidate list used for fuzzy matching from the registry JSON
def extract_org_names_and_acronyms(json_data):
    """Flatten the registry mapping into a list of match candidates.

    For every organisation the name comes first, followed by each truthy
    "transparency_no" value found in its detail records.

    NOTE(review): the value collected as "acronym" is read from the
    "transparency_no" key — confirm this is the intended field.
    """
    candidates = []
    for name, records in json_data.items():
        candidates.append(name)  # full organisation name
        candidates.extend(
            value
            for value in (record.get("transparency_no") for record in records)
            if value
        )
    return candidates

In [None]:
# candidate strings for fuzzy matching, taken from the transparency registry
org_list = extract_org_names_and_acronyms(t_reg)

# filter each term's meeting log by keyword; matches are written to output_file
term9_matches = analyze_data(
    meetings_9term,
    org_list,
    keywords,
    output_file='master_thesis_2025/cbam_meeting_information/data/cbam_specific_meetings/ep/cbam_term9_meetings.json',
)
term10_matches = analyze_data(
    meetings_10term,
    org_list,
    keywords,
    output_file='master_thesis_2025/cbam_meeting_information/data/cbam_specific_meetings/ep/cbam_term10_meetings.json',
)

# 3. Transform into Excel and add website

In [12]:
def _registry_details_by_org(registry_data):
    """Map each organisation name to the registry fields shown in the report."""
    registry_keys = {
        "Website": "website",
        "Category": "registration_category",
        "Members": "members",
        "Budget": "total_budget",
        "Mission": "mission",
    }
    lookup = {}
    for org, details in registry_data.items():
        # Only the first registry record of an organisation is used. An
        # organisation without any record falls back to "N/A" everywhere
        # instead of raising IndexError.
        first_record = details[0] if details else {}
        lookup[org] = {
            column: first_record.get(key, "N/A")
            for column, key in registry_keys.items()
        }
    return lookup


def extract_meeting_data(json_files, registry_file, output_excel):
    """Aggregate matched meetings per organisation and export them to Excel.

    Args:
        json_files: Paths of JSON files, each holding a list of meeting dicts
            with the keys "Org Match", "Reason" and "MEP".
        registry_file: Path of the transparency-registry JSON (organisation
            name -> list of detail records).
        output_excel: Path of the Excel file to write.

    Meetings without an "Org Match" are skipped. Each remaining organisation
    becomes one row with its registry details plus the distinct meeting
    reasons and MEP names, joined with "; " in sorted order so the output is
    identical from run to run.
    """
    detail_columns = ["Website", "Category", "Members", "Budget", "Mission"]
    columns = (
        ["Organization Name"]
        + detail_columns
        + ["Reasons for Meeting", "MEPs Met With"]
    )

    # Load transparency registry data
    with open(registry_file, 'r', encoding='utf-8') as f:
        org_to_details = _registry_details_by_org(json.load(f))

    # Combine data from all JSON files
    all_meetings = []
    for file in json_files:
        with open(file, 'r', encoding='utf-8') as f:
            all_meetings.extend(json.load(f))

    # Collect the distinct reasons and MEPs per matched organisation;
    # dict insertion order keeps organisations in order of first appearance.
    reasons_by_org = {}
    meps_by_org = {}
    for meeting in all_meetings:
        org_name = meeting["Org Match"]
        if not org_name:
            continue  # Skip if there's no org match
        reasons_by_org.setdefault(org_name, set()).add(str(meeting["Reason"]))
        meps_by_org.setdefault(org_name, set()).add(str(meeting["MEP"]))

    # Prepare data for the Excel file
    final_data = []
    for org_name, reasons in reasons_by_org.items():
        org_details = org_to_details.get(org_name, {})
        row = {"Organization Name": org_name}
        for column in detail_columns:
            row[column] = org_details.get(column, "N/A")
        # Sets have no stable iteration order for strings across runs, so
        # sort before joining to get reproducible cell contents.
        row["Reasons for Meeting"] = "; ".join(sorted(reasons))
        row["MEPs Met With"] = "; ".join(sorted(meps_by_org[org_name]))
        final_data.append(row)

    # Passing the columns explicitly keeps the header row even when no
    # organisation was matched.
    df = pd.DataFrame(final_data, columns=columns)
    df.to_excel(output_excel, index=False)
    print(f"Data saved to {output_excel}")

In [None]:
# inputs: the per-term match files and the transparency registry
json_files = [
    'master_thesis_2025/cbam_meeting_information/data/cbam_specific_meetings/ep/cbam_term9_meetings.json',
    'master_thesis_2025/cbam_meeting_information/data/cbam_specific_meetings/ep/cbam_term10_meetings.json',
]
registry_file = 'master_thesis_2025/cbam_meeting_information/data/transparency_registry/07.2024_registered_orgs_grouped.json'

# destination of the aggregated Excel sheet
output_excel = "master_thesis_2025/cbam_meeting_information/data/cbam_specific_meetings/ep/aggregated_cbam_meeting_data.xlsx"

# aggregate per organisation and write the Excel file
extract_meeting_data(json_files, registry_file, output_excel)