# 1. CLEAN JSONS

First, import and clean the meeting data. 

In [6]:
import json
import re

# Helper function to clean text
def clean_text(text):
    if isinstance(text, str):
        text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace and line breaks
    return text

# Function to clean a meeting entry
def clean_meeting(meeting):
    if isinstance(meeting, dict):
        return {key: clean_text(value) for key, value in meeting.items()}
    elif isinstance(meeting, str):
        return clean_text(meeting)
    else:
        return meeting

# Function to clean the entire dataset
def clean_data(data, term):
    for entry in data:
        entry["name"] = clean_text(entry.get("name", ""))
        entry["group"] = clean_text(entry.get("group", ""))
        entry["origin_country"] = clean_text(entry.get("origin_country", ""))
        entry["national_party"] = clean_text(entry.get("national_party", ""))
        entry["assistants"] = entry.get("assistants", {})

        # Clean meetings if it exists and is a dictionary
        meetings = entry.get("meetings", {})
        if isinstance(meetings, dict):
            for meeting_id, meeting in meetings.items():
                meetings[meeting_id] = clean_meeting(meeting)
        elif isinstance(meetings, str):
            # If meetings is a string like "No meetings for this MEP", just clean it
            entry["meetings"] = clean_text(meetings)

        # Add the term information
        entry["term"] = term

    return data

# File paths
path_9th_term = r"C:\Users\Emilia\Documents\Uni Helsinki\Year Three\AMO Freelance\transparency register\9 term\raw data\9term_meetings_ALL_INFO.json"
path_10th_term = r"C:\Users\Emilia\Documents\Uni Helsinki\Year Three\AMO Freelance\transparency register\10 term\raw data\national party 10-11-2024\mep_meetings_FULL_w_nationalParty.json"

# Load and clean the 9th term data
with open(path_9th_term, 'r', encoding='utf-8') as file:
    data_9th_term = json.load(file)
    cleaned_9th_term = clean_data(data_9th_term, term=9)

# Load and clean the 10th term data
with open(path_10th_term, 'r', encoding='utf-8') as file:
    data_10th_term = json.load(file)
    cleaned_10th_term = clean_data(data_10th_term, term=10)

# Save the cleaned data to new JSON files
with open('cleaned_9th_term.json', 'w', encoding='utf-8') as file:
    json.dump(cleaned_9th_term, file, indent=4, ensure_ascii=False)

with open('cleaned_10th_term.json', 'w', encoding='utf-8') as file:
    json.dump(cleaned_10th_term, file, indent=4, ensure_ascii=False)

print("Data cleaned successfully and saved to 'cleaned_9th_term.json' and 'cleaned_10th_term.json'.")

Data cleaned successfully and saved to 'cleaned_9th_term.json' and 'cleaned_10th_term.json'.


Next, import the transparency registry and clean it. 

In [10]:
path_transparency_register = "C:/Users/Emilia/Documents/Uni Helsinki/Year Three/AMO Freelance/transparency register/2024_registered_orgs_grouped.json"

with open(path_transparency_register, 'r', encoding='utf-8') as file:
    transparency_register = json.load(file)

print(transparency_register)

{'"Együtt könnyebb" Női Egészségért Alapítvány': [{'transparency_no': '788724251901-07', 'reg_date': '2023-11-10T01:40:24.574+00:00', 'registration_category': 'Other organisations, public or mixed entities', 'acronym': 'NEA', 'hq_city': 'Budapest', 'hq_country': 'HUNGARY'}], '"NURSING UP" IL SINDACATO DEGLI INFERMIERI ITALIANI': [{'transparency_no': '657585813025-52', 'reg_date': '2014-02-21T09:22:49.118+00:00', 'registration_category': 'Trade unions and professional associations', 'acronym': 'NURSING UP', 'hq_city': 'ROMA', 'hq_country': 'ITALY'}], '"Асоциация на търговците на нехранителни стоки" Сдружение': [{'transparency_no': '927291749114-62', 'reg_date': '2023-02-16T15:13:12.481+00:00', 'registration_category': 'Trade and business associations', 'acronym': 'N/A', 'hq_city': 'Sofia', 'hq_country': 'BULGARIA'}], '#DiasporaVote!': [{'transparency_no': '417597743204-78', 'reg_date': '2021-06-17T05:40:45.707+00:00', 'registration_category': 'Non-governmental organisations, platforms a

# 2. ANALYSE NAME SIMILARITIES/LANGUAGES USED 

In [5]:
import re
from collections import Counter
from langdetect import detect
from fuzzywuzzy import fuzz, process
from concurrent.futures import ThreadPoolExecutor
import functools

Set up keyword and company lists

In [7]:
company_list = [
    "Tiktok", "Shein", "Temu", "MG Motor", "Volvo Cars", "ZTE Corporation",  
    "Pirelli", "KUKA Robotics", "Huawei Technologies", "Nuctech", "BYD (Build Your Dreams)",  
    "Lenovo Group", "Geely Automobile Holdings", "SAIC Motor Corporation", "NIO Inc.", 
    "Xiaomi Corporation", "Haier Group Corporation", "Hisense Group", "China National Chemical Corporation (ChemChina)",
    "China COSCO Shipping Corporation", "China Three Gorges Corporation", "State Grid Corporation of China", 
    "China General Nuclear Power Group (CGN)", "Tencent Holdings", "Alibaba Group", "JD.com", "Drone DJI", 
    "Ant Group", "DJI Europe B.V.", "Hangzhou Hikvision Digital Technology Co., Ltd.", "Taiwan Semiconductor Manufacturing Company Ltd" # taken from transparency registry
    ]

In [None]:
c_keywords = [
    "China", "Chinese", "Beijing", "Mandarin", "Confucius", # English
    "Китай", "Китайски", "Пекин", "Мандарин", "Конфуций", # Bulgarian
    "Kina", "Kineski", "Peking", "Mandarinski", "Konfucije", # Croatian
    "Čína", "Čínský", "Peking", "Mandarínština", "Konfucius", # Czech
    "Kina", "Kinesisk", "Beijing", "Mandarin", "Konfucius", # Danish
    "China", "Chinees", "Peking", "Mandarijn", "Confucius", # Dutch
    "Hiina", "Hiina keel", "Peking", "Mandariin", "Konfutsius", # Estonian
    "Kiina", "Kiinalainen", "Peking", "Mandariini", "Kungfutse", # Finnish
    "Chine", "Chinois", "Pékin", "Mandarin", "Confucius", # French
    "China", "Chinesisch", "Peking", "Mandarin", "Konfuzius", # German
    "Κίνα", "Κινέζικα", "Πεκίνο", "Μανδαρινικά", "Κομφούκιος", # Greek
    "Kína", "Kínai", "Peking", "Mandarin", "Konfuciusz", # Hungarian
    "An tSín", "Sínis", "Béising", "Mandairínis", "Confucius", # Irish
    "Cina", "Cinese", "Pechino", "Mandarino", "Confucio", # Italian
    "Ķīna", "Ķīniešu", "Pekina", "Mandarīnu", "Konfūcijs", # Latvian
    "Kinija", "Kinų", "Pekinas", "Mandarinų", "Konfucijus", # Lithuanian
    "Ċina", "Ċiniż", "Beijing", "Mandarin", "Konfuzju", # Maltese
    "Chiny", "Chiński", "Pekin", "Mandaryński", "Konfucjusz", # Polish
    "China", "Chinês", "Pequim", "Mandarim", "Confúcio", # Portuguese
    "China", "Chinez", "Beijing", "Mandarină", "Confucius", # Romanian
    "Čína", "Čínsky", "Peking", "Mandarínčina", "Konfucius", # Slovak
    "Kitajska", "Kitajski", "Peking", "Mandarinščina", "Konfucij", # Slovenian
    "China", "Chino", "Pekín", "Mandarín", "Confucio", # Spanish
    "Kina", "Kinesiska", "Peking", "Mandarin", "Konfucius" # Swedish
]

In [1]:
hk_keywords = [
    "Hong Kong", # English
    "Hong Kong", # Dutch
    "Хонконг",   # Bulgarian
    "Hong Kong", # Croatian
    "Hong Kong", # Czech
    "Hong Kong", # Danish
    "Hong Kong", # Estonian
    "Hong Kong", # Finnish
    "Hong Kong", # French
    "Hong Kong", # German
    "Χονγκ Κονγκ", # Greek 
    "Hong Kong", # Hungarian
    "Hong Kong", # Irish
    "Hong Kong", # Italian
    "Hong Kong", # Latvian
    "Hong Kong", # Lithuanian
    "Hong Kong", # Maltese
    "Hong Kong", # Polish
    "Hong Kong", # Portuguese
    "Hong Kong", # Romanian
    "Hong Kong", # Slovak
    "Hong Kong", # Slovenian
    "Hong Kong", # Spanish
    "Hong Kong", # Swedish¨
]


In [3]:
t_keywords = [
    "Taiwan", "Taipei", # English
    "Тайван", "Тайпе", # Bulgarian
    "Tajvan", "Taipei", # Croatian
    "Tchaj-wan", "Tchaj-pej", # Czech
    "Taiwan", "Taipei", # Danish
    "Taiwan", "Taipei", # Dutch
    "Taiwan", "Taipei", # Estonian
    "Taiwan", "Taipei", # Finnish
    "Taïwan", "Taipei", # French
    "Taiwan", "Taipeh", # German
    "Ταϊβάν", "Ταϊπέι", # Greek
    "Tajvan", "Tajpej", # Hungarian
    "Taiwan", "Taipei", # Irish
    "Taiwan", "Taipei", # Italian
    "Taivāna", "Taibei", # Latvian
    "Taivanas", "Taipėjus", # Lithuanian
    "Tajwan", "Taipei", # Maltese
    "Tajwan", "Tajpej", # Polish
    "Taiwan", "Taipé", # Portuguese
    "Taiwan", "Taipei", # Romanian
    "Taiwan", "Tchaj-pej", # Slovak
    "Tajvan", "Tajpej", # Slovenian
    "Taiwán", "Taipéi", # Spanish
    "Taiwan", "Taipei", # Swedish
]

Define functions that will be used to analyse the meeting data

In [69]:
# Helper function to clean text
def clean_text(text):
    if isinstance(text, str):
        text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace and line breaks
    return text

# Helper function to detect language (use a cache to improve speed for repeated texts)
language_cache = {}

def detect_language(text):
    if text in language_cache:
        return language_cache[text]
    
    try:
        language = detect(text)
        language_cache[text] = language  # Cache the result
        return language
    except Exception:
        return "unknown"

# Helper function to find similar names (with fuzz score threshold)
company_match_cache = {}

def find_similar_names(name, company_list):
    if name in company_match_cache:
        return company_match_cache[name]
    
    matches = process.extract(name, company_list, scorer=fuzz.partial_ratio, limit=1)
    if matches and matches[0][1] > 95:  # Threshold for similarity (adjustable)
        company_match_cache[name] = (matches[0][0], matches[0][1])  # Cache the result
        return matches[0][0], matches[0][1]
    
    company_match_cache[name] = None  # Cache the result for non-matches
    return None

# Function to check if any keyword is present in the text
def contains_keywords(text):
    if not isinstance(text, str):
        return False
    # Use a case-insensitive search for any of the keywords
    pattern = re.compile(r'\b(?:' + '|'.join(re.escape(keyword) for keyword in keywords) + r')\b', re.IGNORECASE)
    return bool(pattern.search(text))

# Function to analyze individual meeting data (this will be used in parallel)
def analyze_meeting(entry, company_list):
    merged_meetings = []  # To store merged meetings with keyword matches
    meetings = entry.get("meetings", {})
    mep_name = entry.get("name", "Unknown MEP")  # Assuming this is where the MEP name is stored

    # If meetings is a string (e.g., "No meetings for this MEP"), skip further processing
    if isinstance(meetings, str):
        return merged_meetings

    # Ensure meetings is a dictionary before iterating
    for meeting_id, meeting in meetings.items():
        # Ensure each meeting is a dictionary
        if isinstance(meeting, dict):
            # Clean and detect language
            reason = clean_text(meeting.get("reason", ""))
            meeting_with = clean_text(meeting.get("meeting_with", ""))
            language = detect_language(reason)
            date = meeting.get("date", "Date not provided")  # Assumed field for date

            # Check if any keywords are present in the reason or meeting_with
            reason_has_keywords = contains_keywords(reason)
            meeting_with_has_keywords = contains_keywords(meeting_with)

            # Check for company name match
            company_match_result = find_similar_names(meeting_with, company_list)
            company_match = None
            score = None
            if company_match_result:
                company_match, score = company_match_result

            if reason_has_keywords or meeting_with_has_keywords or company_match:
                # Create human-readable summary
                meeting_summary = f"MEP: {mep_name}\n"  # Include the MEP's name
                meeting_summary += f"Meeting ID: {meeting_id}\n"
                meeting_summary += f"Date: {date}\n"  # Include the date the meeting took place
                meeting_summary += f"Meeting With: {meeting_with}\n"
                meeting_summary += f"Reason: {reason}\n"
                meeting_summary += f"Language: {language}\n"

                # Check for keyword matches
                if reason_has_keywords:
                    meeting_summary += "  - Keyword match found in reason.\n"
                if meeting_with_has_keywords:
                    meeting_summary += "  - Keyword match found in meeting with.\n"

                # Check for company match
                if company_match:
                    meeting_summary += f"  - Company match found: {company_match} (Score: {score})\n"
                else:
                    meeting_summary += "  - No company match found.\n"

                # Add the formatted meeting to the merged list
                merged_meetings.append(meeting_summary)

    return merged_meetings

# Function to analyze data with parallel processing
def analyze_data(data, company_list):
    all_merged_meetings = []

    # Use ThreadPoolExecutor for parallel processing
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(functools.partial(analyze_meeting, company_list=company_list), data))

        # Combine all merged meetings from all results
        for merged_meetings in results:
            all_merged_meetings.extend(merged_meetings)

    # Format the output in a human-readable format
    readable_output = "\n\n".join(all_merged_meetings)  # Add extra newlines to separate meetings

    return readable_output

Run the functions on the clean meeting data

In [None]:
ninth_term_matches = analyze_data(cleaned_9th_term, company_list)



In [70]:
tenth_term_matches = analyze_data(cleaned_10th_term, company_list)





explore the analyis outputs

In [73]:
# Print summary of results for 9th term
print("9th Term Analysis:")
print(ninth_term_matches)


9th Term Analysis:
MEP: Alex AGIUS SALIBA
Meeting ID: 0
Date: 08-07-2024
Meeting With: Mr Lu Kang, Vice Minister of the International Department of Communist Party of China
Reason: Relationship between Malta and China and how both countries can continue to collaborate in various areas
Language: en
  - Keyword match found in reason.
  - Keyword match found in meeting with.
  - No company match found.


MEP: Alex AGIUS SALIBA
Meeting ID: 2
Date: 20-02-2024
Meeting With: Huawei
Reason: To discuss the latest innovative solutions in e-health as well as our solutions for smart hospitals and for bringing telemedicine to rural, mountainous and island areas
Language: en
  - Company match found: Huawei Technologies (Score: 100)


MEP: Alex AGIUS SALIBA
Meeting ID: 11
Date: 17-08-2023
Meeting With: Ambassador for China in Malta
Reason: EU - China relations
Language: en
  - Keyword match found in reason.
  - Keyword match found in meeting with.
  - No company match found.


MEP: Alex AGIUS SALIBA


In [76]:
# Print summary of results for 10th term
print("\n10th Term Analysis:")
print(tenth_term_matches)


10th Term Analysis:
MEP: Mika AALTOLA
Meeting ID: 17
Date: 15-10-2024
Meeting With: ONE
Reason: Development Policy
Language: nl
  - Company match found: Drone DJI (Score: 100)


MEP: Alex AGIUS SALIBA
Meeting ID: 0
Date: 21-10-2024
Meeting With: H.E. Mr Fu Ziying, Vice-Chairman on Foreign Affairs of National Congress, China and Chinese Delegation
Reason: The long standing collaboration between Malta and China, Russian aggression on Ukraine and the current conflict in the Middle East
Language: en
  - Keyword match found in reason.
  - Keyword match found in meeting with.
  - No company match found.


MEP: Mathilde ANDROUËT
Meeting ID: 4
Date: 17-09-2024
Meeting With: Représentation de la Chine auprès de l'UE
Reason: Les enjeux économiques et commerciaux UE-Chine
Language: fr
  - Keyword match found in reason.
  - Keyword match found in meeting with.
  - No company match found.


MEP: Marc ANGEL
Meeting ID: 8
Date: 08-10-2024
Meeting With: Taipei Representative Office in the European Un

summarize the output 

In [86]:
import re

# Function to parse the meeting information into a structured dictionary
def parse_meeting_info(meeting_text):
    meetings = []
    
    # Split the text into individual blocks by MEP
    entries = meeting_text.strip().split("\n\n")
    
    for entry in entries:
        # Initialize a dictionary for each meeting entry
        meeting_data = {}
        
        # Extract MEP name
        mep_match = re.search(r"MEP:\s*(.*?)\n", entry)
        if mep_match:
            meeting_data['mep_name'] = mep_match.group(1).strip()
        
        # Extract meeting ID
        meeting_id_match = re.search(r"Meeting ID:\s*(\d+)", entry)
        if meeting_id_match:
            meeting_data['meeting_id'] = meeting_id_match.group(1).strip()
        
        # Extract date
        date_match = re.search(r"Date:\s*(\d{2}-\d{2}-\d{4})", entry)
        if date_match:
            meeting_data['date'] = date_match.group(1).strip()
        
        # Extract meeting with
        meeting_with_match = re.search(r"Meeting With:\s*(.*?)\n", entry)
        if meeting_with_match:
            meeting_data['meeting_with'] = meeting_with_match.group(1).strip()
        
        # Extract reason
        reason_match = re.search(r"Reason:\s*(.*?)\n", entry)
        if reason_match:
            meeting_data['reason'] = reason_match.group(1).strip()
        
        # Extract language
        language_match = re.search(r"Language:\s*(\w+)", entry)
        if language_match:
            meeting_data['language'] = language_match.group(1).strip()
        
        # Extract company matches and keyword matches
        company_matches = re.findall(r"Company match found: (.*?) \(Score: \d+\)", entry)
        keyword_matches = re.findall(r"Keyword match found", entry)
        
        meeting_data['company_matches'] = company_matches
        meeting_data['keyword_matches'] = len(keyword_matches)  # Count of keyword matches
        
        # Append the parsed meeting data to the list
        meetings.append(meeting_data)
    
    return meetings

# Example usage:
nine_parsed_meetings = parse_meeting_info(ninth_term_matches)
ten_parsed_meetings = parse_meeting_info(tenth_term_matches)

In [84]:
import pandas as pd

# Function to generate the aggregated summary table
def generate_and_save_aggregated_summary(meetings, output_file):
    # Dictionary to store aggregated data per MEP
    aggregated_data = {}

    # Iterate over each parsed meeting
    for meeting in meetings:
        mep_name = meeting['mep_name']
        company_matches = meeting['company_matches']
        keyword_matches = meeting['keyword_matches']

        # Initialize data for the MEP if not already present
        if mep_name not in aggregated_data:
            aggregated_data[mep_name] = {
                'Total Meetings': 0,
                'Total Company Matches': 0,
                'Total Keyword Matches': 0
            }

        # Update the counts
        aggregated_data[mep_name]['Total Meetings'] += 1
        aggregated_data[mep_name]['Total Company Matches'] += len(company_matches)
        aggregated_data[mep_name]['Total Keyword Matches'] += keyword_matches

    # Create a list for the aggregated summary
    aggregated_summary = [
        {
            'MEP': mep,
            'Total Meetings': counts['Total Meetings'],
            'Total Company Matches': counts['Total Company Matches'],
            'Total Keyword Matches': counts['Total Keyword Matches']
        }
        for mep, counts in aggregated_data.items()
    ]

    # Convert aggregated summary to a DataFrame
    aggregated_df = pd.DataFrame(aggregated_summary)

    # Sort the DataFrame by 'Total Meetings' in descending order
    aggregated_df = aggregated_df.sort_values(by='Total Meetings', ascending=False)

    # Save the sorted DataFrame to an Excel file
    aggregated_df.to_excel(output_file, index=False)

    return aggregated_df

In [85]:
# Example usage:
output_file_path = 'nine_aggregated_summary.xlsx'
nine_aggregated_df = generate_and_save_aggregated_summary(nine_parsed_meetings, output_file_path)

# Print the sorted aggregated summary table
print("\nAggregated Summary Table (Sorted by Total Meetings):")
print(nine_aggregated_df.to_string(index=False))



Aggregated Summary Table (Sorted by Total Meetings):
                        MEP  Total Meetings  Total Company Matches  Total Keyword Matches
         Reinhard BÜTIKOFER             164                      0                    172
                 David LEGA              23                      1                     30
              Helmut SCHOLZ              18                      0                     27
              Iuliu WINKLER              18                      0                     25
           Johan DANIELSSON              16                     16                      0
        Kathleen VAN BREMPT              15                      2                     16
                Svenja HAHN              13                      2                     12
             Jörgen WARBORN              11                      6                      6
     Ibán GARCÍA DEL BLANCO              11                      9                      2
      Marie-Pierre VEDRENNE              10   

In [87]:
# Example usage:
output_file_path = 'ten_aggregated_summary.xlsx'
ten_aggregated_df = generate_and_save_aggregated_summary(ten_parsed_meetings, output_file_path)

# Print the sorted aggregated summary table
print("\nAggregated Summary Table (Sorted by Total Meetings):")
print(ten_aggregated_df.to_string(index=False))


Aggregated Summary Table (Sorted by Total Meetings):
                     MEP  Total Meetings  Total Company Matches  Total Keyword Matches
             Lukas MANDL               8                      2                     13
           Pierre PIMPIE               3                      0                      6
     Kathleen VAN BREMPT               2                      0                      3
    Borja GIMÉNEZ LARRAZ               2                      0                      3
            Beata SZYDŁO               2                      0                      3
        Dario TAMBURRANO               2                      1                      1
     Isabel WISELER-LIMA               2                      0                      2
          Bernard GUETTA               2                      0                      4
             César LUENA               2                      0                      2
     Sebastian TYNKKYNEN               2                      1             