# 1. SET-UP

In [1]:
import json
import pandas as pd
import json
import os
from fuzzywuzzy import process

## 1.1 Merge manual JSON and other json

In [6]:
# Load the JSON data (replace with your actual file paths)
with open('C:/Users/Emilia/Documents/Uni Helsinki/Year Three/AMO Freelance/assistant task/9 term/raw data/national party included/9term_apas_w_nationalParty.json', 'r', encoding='utf-8') as f1, open('C:/Users/Emilia/Documents/Uni Helsinki/Year Three/AMO Freelance/assistant task/9 term/raw data/national party included/manual additions/mep_assistants_manually_added.json', 'r', encoding='utf-8') as f2:
    data1 = json.load(f1)
    data2 = json.load(f2)

# Concatenate the two lists
merged_data = data1 + data2

# Save the merged data to a new file
with open('ALL_9TERM_TO_10TERM_MEPS.json', 'w', encoding='utf-8') as f_out:
    json.dump(merged_data, f_out, ensure_ascii=False, indent=4)

#print("The JSON files have been concatenated successfully!")

## 1.2 Open Merged Data

In [7]:
# 1. READ IN DATA
file_path = "C:/Users/Emilia/Documents/Uni Helsinki/Year Three/AMO Freelance/assistant task/9 term/raw data/final national party merged/ALL_9TERM_TO_10TERM_MEPS.json"

# Check if file exists
if not os.path.exists(file_path):
    raise FileNotFoundError(f"The file at {file_path} does not exist.")

# Open and load the JSON data from the file using UTF-8 encoding
try:
    with open(file_path, 'r', encoding='utf-8') as file:
        nine_term_data = json.load(file)
except UnicodeDecodeError as e:
    raise ValueError(f"Encoding error: {e}")
except json.JSONDecodeError as e:
    raise ValueError(f"Error decoding JSON: {e}")

## 1.3 Explore JSON file

In [8]:
def get_unique_keys(data, prefix=''):
    unique_keys = set()

    if isinstance(data, dict):
        for key, value in data.items():
            full_key = f"{prefix}.{key}" if prefix else key
            unique_keys.add(full_key)
            unique_keys.update(get_unique_keys(value, full_key))
    elif isinstance(data, list):
        for item in data:
            unique_keys.update(get_unique_keys(item, prefix))

    return unique_keys

# Load your JSON file (replace 'file.json' with your file path)
with open(file_path, 'r', encoding='utf-8') as f:
    json_data = json.load(f)

# Get all unique keys
unique_keys = get_unique_keys(json_data)

# Print the unique keys
print("Unique entry categories (keys) in the JSON file:")
for key in sorted(unique_keys):
    print(key)

Unique entry categories (keys) in the JSON file:
assistants
assistants.Accredited assistants
assistants.Accredited assistants (grouping)
assistants.Local assistants
assistants.Local assistants (grouping)
assistants.Paying agents
assistants.Paying agents (grouping)
assistants.Service providers
assistants.Trainees
country
date_scraped
group
name
national_party


In [9]:
# Initialize sets for unique entries
unique_names = set()
unique_groups = set()
unique_countries = set()

# Iterate through the data and add entries to sets
for entry in json_data:
    unique_names.add(entry.get('name'))
    unique_groups.add(entry.get('group'))
    unique_countries.add(entry.get('country'))

# Print the count of unique entries
print(f"Unique 'name' entries: {len(unique_names)}")
print(f"Unique 'group' entries: {len(unique_groups)}")
print(f"Unique 'country' entries: {len(unique_countries)}")

# print unique groups 
for group in unique_groups:
    print(f'{group}\n')

Unique 'name' entries: 746
Unique 'group' entries: 13
Unique 'country' entries: 28
None

European Conservatives and Reformists Group

Non-attached Members

The Left group in the European Parliament - GUE/NGL

Confederal Group of the European United Left - Nordic Green Left

Renew Europe Group (04.02.2020-16.06.2024), Group of the European People's Party (Christian Democrats) (17.06.2024-15.07.2024)

Group of the Greens/European Free Alliance

Group of the European People's Party (Christian Democrats)

Renew Europe Group

Group of the Progressive Alliance of Socialists and Democrats in the European Parliament

Renew Europe Group (until 12-06-2024), Group of the European People's Party (Christian Democrats)

Group of the European United Left - Nordic Green Left

Identity and Democracy Group



# 2. CLEAN

## 2.1 Convert json to dictionary

In [18]:
# 2. JSON TO PANDAS DATAFRAME
assistant_to_details = {}  # Dictionary to track which assistants work for which MEPs and parties

# Iterate over each MEP's data
for mep in nine_term_data:
    mep_name = mep['name']
    mep_group = mep['group']
    mep_party = mep['national_party']
    mep_country = mep['country']
    mep_date_scraped = mep.get('date_scraped', None)  # Extract the date scraped

    # Check for Accredited assistants
    if 'Accredited assistants' in mep['assistants']:
        assistants = mep['assistants']['Accredited assistants']
        
        for assistant in assistants:
            if assistant not in assistant_to_details:
                assistant_to_details[assistant] = {
                    'assistant_type': 'apa',
                    'meps': set(),
                    'groups': set(),
                    'parties': set(),
                    'countries': set(),
                    'dates_scraped': set()  # Add a set for dates
                }
            assistant_to_details[assistant]['meps'].add(mep_name)
            assistant_to_details[assistant]['groups'].add(mep_group)
            assistant_to_details[assistant]['parties'].add(mep_party)
            assistant_to_details[assistant]['countries'].add(mep_country)
            if mep_date_scraped:  # Add date if it exists
                assistant_to_details[assistant]['dates_scraped'].add(mep_date_scraped)

    # Check for Accredited assistants (grouping)
    if 'Accredited assistants (grouping)' in mep['assistants']:
        assistants_grouping = mep['assistants']['Accredited assistants (grouping)']
        
        for assistant in assistants_grouping:
            if assistant not in assistant_to_details:
                assistant_to_details[assistant] = {
                    'assistant_type': 'apa grouped',
                    'meps': set(),
                    'groups': set(),
                    'parties': set(),
                    'countries': set(),
                    'dates_scraped': set()  # Add a set for dates
                }
            else:
                # If already exists, change to "both" if it's a grouped assistant
                if assistant_to_details[assistant]['assistant_type'] == 'apa':
                    assistant_to_details[assistant]['assistant_type'] = 'both'
                    
            assistant_to_details[assistant]['meps'].add(mep_name)
            assistant_to_details[assistant]['groups'].add(mep_group)
            assistant_to_details[assistant]['parties'].add(mep_party)
            assistant_to_details[assistant]['countries'].add(mep_country)
            if mep_date_scraped:  # Add date if it exists
                assistant_to_details[assistant]['dates_scraped'].add(mep_date_scraped)

## 2.2 Explore similar assistant names

In [19]:
# 3. Fuzzy Matching for Manual Exploration of Names
def explore_similar_assistants(assistant_details):
    assistants_list = list(assistant_details.keys())
    similar_assistants_dict = {}

    for assistant in assistants_list:
        # Find similar assistants
        similar_assistants = process.extract(assistant, assistants_list, limit=None)
        # Filter out matches between 92% and 98% (exclude 100%)
        similar_assistants = [(a, score) for a, score in similar_assistants if 92 <= score < 100]
        
        # Store results in the dictionary if there are any matches
        if similar_assistants:
            similar_assistants_dict[assistant] = similar_assistants
            
    return similar_assistants_dict

# Get similar assistants for merging
similar_assistants_for_merging = explore_similar_assistants(assistant_to_details)

In [20]:
print(similar_assistants_for_merging)
for assistant in similar_assistants_for_merging: 
    print(f'{assistant}\n')

{'Maria Magdalena GONZALEZ GOZALBO': [('Magdalena GONZALEZ GOZALBO', 95)], 'Josep MERCADAL BAQUERO': [('Josep/Pepe MERCADAL BAQUERO', 95)], 'PAULA SENDIN RODRIGUEZ': [('Paula SENDÍN RODRIGUEZ', 98)], 'Maria Immaculada IBANEZ LANA': [('Maria Inmaculada IBANEZ LANA', 96)], 'Gilles Willy SEGERS': [('Gilles Willy B SEGERS', 95)], 'Magdalena GONZALEZ GOZALBO': [('Maria Magdalena GONZALEZ GOZALBO', 95)], 'Josep/Pepe MERCADAL BAQUERO': [('Josep MERCADAL BAQUERO', 95), ('Pepe MERCADAL BAQUERO', 95)], 'Claudia MARTINEZ MUNOZ': [('CLAUDIA MARTÍNEZ MUÑOZ', 95)], 'MARIA MERCEDES GARCIA MUNOZ': [('MARIA MERCEDES GARCIA MUÑOZ', 98)], 'Maria Inmaculada IBANEZ LANA': [('Maria Immaculada IBANEZ LANA', 96)], 'Gilles Willy B SEGERS': [('Gilles Willy SEGERS', 95)], 'Paula SENDIN RODRIGUEZ': [('Paula SENDÍN RODRIGUEZ', 98)], 'Paula SENDÍN RODRIGUEZ': [('PAULA SENDIN RODRIGUEZ', 98), ('Paula SENDIN RODRIGUEZ', 98)], 'CLAUDIA MARTÍNEZ MUÑOZ': [('Claudia MARTINEZ MUNOZ', 95)], 'MARIA MERCEDES GARCIA MUÑOZ': [

## 2.3 Merge assistants with similar names

In [21]:
# 4. Merge assistants with high similarity scores
merged_assistants = {}

# Function to merge details
def merge_assistant_details(assistant_names):
    merged_details = {
        'assistant_type': None,
        'meps': set(),
        'groups':set(),
        'parties': set(),
        'countries': set(),
        'dates_scraped': set(),
    }
    
    for name in assistant_names:
        if name in assistant_to_details:
            details = assistant_to_details[name]
            merged_details['meps'].update(details['meps'])
            merged_details['groups'].update(details['groups'])
            merged_details['parties'].update(details['parties'])
            merged_details['countries'].update(details['countries'])
            merged_details['dates_scraped'].update(details['dates_scraped'])
            # Set the assistant_type to the most specific type found (apa > apa grouped > both)
            if merged_details['assistant_type'] is None:
                merged_details['assistant_type'] = details['assistant_type']
            else:
                # Determine the type hierarchy
                if details['assistant_type'] == 'apa grouped':
                    merged_details['assistant_type'] = 'both'
                elif merged_details['assistant_type'] == 'apa grouped':
                    merged_details['assistant_type'] = 'both'

    return merged_details

# Handle exact matches by checking lowercase equivalence first
lowercase_dict = {}
for assistant in assistant_to_details.keys():
    lower_name = assistant.lower()
    if lower_name not in lowercase_dict:
        lowercase_dict[lower_name] = [assistant]
    else:
        lowercase_dict[lower_name].append(assistant)

# Merge exact matches (lowercase)
for assistants in lowercase_dict.values():
    if len(assistants) > 1:  # Only merge if there are duplicates
        merged_details = merge_assistant_details(assistants)
        # Keep the name with ASCII characters if available
        ascii_names = [name for name in assistants if all(ord(char) < 128 for char in name)]
        merged_assistant_name = max(ascii_names, key=lambda x: (x.lower(), x)) if ascii_names else assistants[0]
        merged_assistants[merged_assistant_name] = merged_details

# Now merge based on fuzzy matches (92% to 98%)
for assistant, similar in similar_assistants_for_merging.items():
    # Get the set of unique assistants in this group
    similar_assistants_set = set([assistant] + [name for name, score in similar])
    
    # Check for existing lowercase matches to avoid duplicates
    merged_details = merge_assistant_details(similar_assistants_set)

    # Handle the naming preference with ASCII
    ascii_names = [name for name in similar_assistants_set if all(ord(char) < 128 for char in name)]
    
    if ascii_names:
        # Choose the name with ASCII characters to keep
        merged_assistant_name = max(ascii_names, key=lambda x: (x.lower(), x))
    else:
        # Just keep the first one in the set if no ASCII names are found
        merged_assistant_name = next(iter(similar_assistants_set))

    # If this merged assistant already exists, combine the details
    if merged_assistant_name not in merged_assistants:
        merged_assistants[merged_assistant_name] = merged_details
    else:
        existing_details = merged_assistants[merged_assistant_name]
        existing_details['meps'].update(merged_details['meps'])
        existing_details['groups'].update(merged_details['groups'])
        existing_details['parties'].update(merged_details['parties'])
        existing_details['countries'].update(merged_details['countries'])
        existing_details['dates_scraped'].update(merged_details['dates_scraped'])

In [22]:
# 5. Add non-merged assistants to the final output
for assistant, details in assistant_to_details.items():
    if assistant not in merged_assistants:
        merged_assistants[assistant] = details

# 3. CONVERT TO DATAFRAME

In [25]:
# 6. Create the final DataFrame
rows = []
for assistant, details in merged_assistants.items():
    meps_list = ', '.join(details['meps'])
    groups_list = ', '.join(details['meps'])
    parties_list = ', '.join(details['parties'])
    countries_list = ', '.join(details['countries'])
    term = '9'
    dates_scraped_list = ', '.join(sorted(details['dates_scraped']))  # Join unique dates, sorted
    
    rows.append({
        'assistant_name': assistant,
        'assistant_type': details['assistant_type'],  # Maintain original type
        'mep(s)': meps_list,
        'mep(s) country': countries_list,
        'political_group(s)': groups_list,
        'mep(s) national parties': parties_list,
        'date_scraped': dates_scraped_list,  # Add the dates scraped here
        'term': term
    })

assistants_9term = pd.DataFrame(rows)

In [31]:
#print(assistants_9term.head())
print(assistants_9term['mep(s) country'].unique())

['Ireland' 'Spain' 'Germany' 'Netherlands' 'Belgium' 'Sweden' 'Poland'
 'Romania' 'France' 'Italy' 'Austria' 'Czechia' 'United Kingdom' 'Greece'
 'Malta' 'Hungary' 'Greece, Italy' 'Bulgaria' 'Latvia' 'Lithuania'
 'Estonia' 'United Kingdom, Romania' 'Denmark' 'Finland, United Kingdom'
 'Croatia' 'Portugal' 'Luxembourg' 'United Kingdom, Ireland' 'Cyprus'
 'Finland' 'Germany, Austria' 'Slovakia' 'Slovenia' 'Spain, France'
 'United Kingdom, France' 'Spain, Greece' 'Greece, Austria, Malta']
