## 1. SET-UP

In [31]:
import pandas as pd
import json
import os

In [32]:
# 1. Load JSON files for each term
def load_mep_data(json_path):
    """Load MEP data from a JSON file."""
    print(f"Loading MEP data from: {json_path}")
    if not os.path.exists(json_path):
        print(f"File not found: {json_path}")
        exit(1)
    
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            file_content = f.read().strip()  # Strip any extra whitespace
            if not file_content:
                raise ValueError("File is empty")
            
            # Validate JSON by loading
            meps_data = json.loads(file_content)  
            print(f"Loaded {len(meps_data)} MEPs successfully.")
            return meps_data
    
    except json.JSONDecodeError as e:
        print(f"JSON decode error at character {e.pos}: {e}")
    except ValueError as e:
        print(f"ValueError: {e}")
    except Exception as e:
        print(f"Failed to load MEP data: {e}")
    exit(1)

# File paths (replace with your paths)
data_9th_term = load_mep_data('C:/Users/Emilia/Documents/Uni Helsinki/Year Three/AMO Freelance/assistant task/9 term/raw data/final/9TERM_ALL_STANDARDIZED.json')
data_10th_term = load_mep_data('C:/Users/Emilia/Documents/Uni Helsinki/Year Three/AMO Freelance/assistant task/10 term/raw data/29-10-2024/mep_assistants.json')

# Convert data to DataFrame
df_9th = pd.json_normalize(data_9th_term)
df_9th['term'] = 9  # Add term info
df_10th = pd.json_normalize(data_10th_term)
df_10th['term'] = 10

# Concatenate dataframes
all_meps = pd.concat([df_9th, df_10th], ignore_index=True)

Loading MEP data from: C:/Users/Emilia/Documents/Uni Helsinki/Year Three/AMO Freelance/assistant task/9 term/raw data/final/9TERM_ALL_STANDARDIZED.json
Loaded 735 MEPs successfully.
Loading MEP data from: C:/Users/Emilia/Documents/Uni Helsinki/Year Three/AMO Freelance/assistant task/10 term/raw data/29-10-2024/mep_assistants.json
Loaded 719 MEPs successfully.


In [33]:
# keep only columns that we will use in analysis
columns_to_keep = [
    'name', 
    'party', 
    'country', 
    'term', 
    'assistants.Accredited assistants', 
    'assistants.Accredited assistants (grouping)'
]

# filter the DataFrame
meps_apas = all_meps[columns_to_keep]

# rename the columns by removing the 'assistants.' prefix
meps_apas.columns = [col.replace('assistants.', '') for col in meps_apas.columns]

# 2. CLEAN & STANDARDIZE

### 2.1 Party names to abbreviations

In [34]:
# Create a mapping dictionary for party names to abbreviations
party_abbreviations = {
    'Renew Europe Group': 'Renew',
    'European Conservatives and Reformists Group': 'ECR',
    "Group of the European People's Party (Christian Democrats)": 'EPP',
    'Group of the Progressive Alliance of Socialists and Democrats in the European Parliament': 'S&D',
    'Identity and Democracy Group': 'ID',
    'Group of the Greens/European Free Alliance': 'G/EFA',
    'Confederal Group of the European United Left - Nordic Green Left': 'GUE/NGL',
    'Non-attached Members': 'NA',
    'The Left group in the European Parliament - GUE/NGL': 'GUE/NGL',
    'Group of the European United Left - Nordic Green Left': 'GUE/NGL',
    'Patriots for Europe Group': 'PFE',
    'Europe of Sovereign Nations Group': 'ESN'
}

meps_apas.loc[:, 'group_abbrv'] = meps_apas['party'].map(party_abbreviations)
#print(meps_apas['party'].unique())
#print(meps_apas['group_abbrv'].unique())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meps_apas.loc[:, 'group_abbrv'] = meps_apas['party'].map(party_abbreviations)


### 2.3 Flatten & pivot (assistants are row level)

In [35]:
# Function to extract only relevant assistants
def extract_assistants(row):
    assistants = []
    # Focus only on "Accredited assistants" and "Accredited assistants (grouping)"
    relevant_groups = ['Accredited assistants', 'Accredited assistants (grouping)']
    
    for group in relevant_groups:
        # Check if the group exists in the row
        if group in row and isinstance(row[group], list):
            names = row[group]
            for name in names:
                assistants.append({
                    'assistant_name': name,
                    'mep_name': row['name'],  
                    'group': row['group_abbrv'],
                    'country': row['country'],
                    'term': row['term']
                })
    return assistants

# Flatten assistants for easier comparison
assistant_data = pd.DataFrame(
    [item for sublist in meps_apas.apply(extract_assistants, axis=1) for item in sublist]
)
print(assistant_data)

                         assistant_name         mep_name  group   country  \
0                 Anna Sophia BENGTSSON  Abir AL-SAHLANI  Renew    Sweden   
1               John August HULTENGAARD  Abir AL-SAHLANI  Renew    Sweden   
2                  Tyra Louise LUNDBERG  Abir AL-SAHLANI  Renew    Sweden   
3     Linn Christina Brunhilde OETTERLI  Abir AL-SAHLANI  Renew    Sweden   
4               Sylwia Joanna BETKOWSKA     Adam JARUBAS    EPP    Poland   
...                                 ...              ...    ...       ...   
7495          Andréa Laure Marie MOULIN    Željana ZOVKO    EPP   Croatia   
7496                         Polona KEK       Milan ZVER    EPP  Slovenia   
7497                     Petra SKRINJAR       Milan ZVER    EPP  Slovenia   
7498                        Peter SUHEL       Milan ZVER    EPP  Slovenia   
7499                     Dominik STRAKL       Milan ZVER    EPP  Slovenia   

      term  
0        9  
1        9  
2        9  
3        9  
4        9

### 2.3 Remove duplicate assistant names in same term

In [None]:
# FIRST REMAP THE NAMES
from fuzzywuzzy import fuzz

def find_similar_names(df, same_term=True):
    similar_pairs = []

    # Iterate through each MEP
    for mep in df['mep_name'].unique():
        # Filter for the current MEP
        if same_term:
            mep_data = df[df['mep_name'] == mep]
        else:
            mep_data = df[df['mep_name'] == mep]  # No term filter for this case

        names = mep_data['assistant_name'].tolist()

        # Check for similarity between each pair of assistant names
        for i in range(len(names)):
            for j in range(i + 1, len(names)):
                # Calculate similarity score
                score = fuzz.ratio(names[i].lower(), names[j].lower())
                
                # Adjusting the threshold between 90 and 99
                if 85 <= score < 100:  # Use the desired range here
                    similar_pairs.append((names[i], names[j], mep))

    return similar_pairs

# create remapping based on similar names
name_mapping = {
    "Magdalena NOWACKA": "Magdalena HILLS-NOWACKA", 
    "PAULA SENDIN RODRIGUEZ": "Paula SENDÍN RODRIGUEZ",
    "Eleonora Nikolaycheva GUIGOVA": "Eleonora Nikolaycheva GUIGOVA-NOSKER",
    "Eleonora Nikolaycheva GUIGOVA-NOSKER": "Eleonora Nikolaycheva GUIGOVA-NOSKER",
    "Stefanie SIFFT": "Stefanie SIFFT",
    "Stefanie Gabi SIFFT": "Stefanie SIFFT",
    "Anne-Cecile Juliette GAULT": "Anne-Cecile Juliette GAULT",
    "Anne-Cecile Juliette Rachel GAULT": "Anne-Cecile Juliette GAULT",
    "Sophie Anne Geraldine Marie GUIL": "Sophie Anne Geraldine Marie GUIL",
    "Sophie Anne Geraldine Marie Genevieve GUIL": "Sophie Anne Geraldine Marie GUIL",
    "Ana LOPEZ GONZALEZ": "ANA LÓPEZ GONZÁLEZ",
    "ANA LÓPEZ GONZÁLEZ": "ANA LÓPEZ GONZÁLEZ",
    "Claudia MARTINEZ MUNOZ": "Claudia MARTÍNEZ MUÑOZ",
    "CLAUDIA MARTÍNEZ MUÑOZ": "Claudia MARTÍNEZ MUÑOZ",
    "Maria Mercedes GARCIA MUNOZ": "MARIA MERCEDES GARCIA MUÑOZ",
    "MARIA MERCEDES GARCIA MUÑOZ": "MARIA MERCEDES GARCIA MUÑOZ",
    "Gilles Willy B SEGERS": "GILLES WILLY SEGERS",
    "GILLES WILLY SEGERS": "GILLES WILLY SEGERS",
    "Magdalena GONZALEZ GOZALBO": "Maria Magdalena GONZALEZ GOZALBO",
    "Maria Magdalena GONZALEZ GOZALBO": "Maria Magdalena GONZALEZ GOZALBO",
    "Arturo VILLARROYA GONZALEZ": "Arturo VILLARROYA GONZÁLEZ",
    "Fernando Jose NUNEZ ROBRES PATINO": "Fernando Jose NUNEZ-ROBRES PATINO",
    "Fernando Jose NUNEZ-ROBRES PATINO": "Fernando Jose NUNEZ-ROBRES PATINO",
    "Bibiana CARRETO PEREZ BARBADILLO": "BIBIANA CARRETO PÉREZ BARBADILLO",
    "BIBIANA CARRETO PÉREZ BARBADILLO": "BIBIANA CARRETO PÉREZ BARBADILLO",
    "Magdalena GONZALEZ GOZALBO": "Maria Magdalena GONZALEZ GOZALBO",
    "PAULA SENDIN RODRIGUEZ": "Paula SENDÍN RODRIGUEZ",
    "Paula SENDIN RODRIGUEZ": "Paula SENDÍN RODRIGUEZ",
    "MARIA MERCEDES GARCIA MUNOZ": "MARIA MERCEDES GARCIA MUÑOZ", 
    "ARTURO VILLARROYA GONZALEZ": "Arturo VILLARROYA GONZÁLEZ",
    "ARTURO VILLARROYA GONZALEZ":"Arturo VILLARROYA GONZÁLEZ"
}

# remap 
assistant_data['assistant_name'] = assistant_data['assistant_name'].replace(name_mapping)

# Find similar names considering only the same term
similar_names_same_term = find_similar_names(assistant_data, same_term=True)
print("Similar assistant names for the same MEP in the same term:")
for name1, name2, mep in similar_names_same_term:
    print(f"{name1} and {name2} for {mep} are similar.")

print(assistant_data)

Similar assistant names for the same MEP in the same term:
Michal MOJTO and Michaela MOJTOVÁ for Monika BEŇOVÁ are similar.
Michal MOJTO and Michaela MOJTOVÁ for Monika BEŇOVÁ are similar.
Michal MOJTO and Michaela MOJTOVÁ for Erik KALIŇÁK are similar.
                         assistant_name         mep_name  group   country  \
0                 Anna Sophia BENGTSSON  Abir AL-SAHLANI  Renew    Sweden   
1               John August HULTENGAARD  Abir AL-SAHLANI  Renew    Sweden   
2                  Tyra Louise LUNDBERG  Abir AL-SAHLANI  Renew    Sweden   
3     Linn Christina Brunhilde OETTERLI  Abir AL-SAHLANI  Renew    Sweden   
4               Sylwia Joanna BETKOWSKA     Adam JARUBAS    EPP    Poland   
...                                 ...              ...    ...       ...   
7495          Andréa Laure Marie MOULIN    Željana ZOVKO    EPP   Croatia   
7496                         Polona KEK       Milan ZVER    EPP  Slovenia   
7497                     Petra SKRINJAR       Milan ZV

In [49]:
# THEN REMOVE ANY EXACT MATCHES 
# Find exact matches within each MEP category
# Find exact matches within each MEP and term
def find_exact_matches(df):
    exact_matches = {}

    # Group by 'mep_name' and 'term' and find duplicates in 'remapped_name'
    for (mep, term), group in df.groupby(['mep_name', 'term']):
        duplicates = group['assistant_name'].value_counts()
        # Only keep names that occur more than once
        duplicate_names = duplicates[duplicates > 1].index.tolist()
        
        if duplicate_names:
            exact_matches[(mep, term)] = duplicate_names

    return exact_matches

# Get exact matches
exact_matches = find_exact_matches(assistant_data)

# Print out exact matches
print("Exact matches in MEP categories (for the same term):")
for (mep, term), names in exact_matches.items():
    print(f"\nMEP: {mep} | Term: {term}")
    for name in names:
        print(f" - {name}")

# remove duplicates 
unique_meps_apas = assistant_data.drop_duplicates(subset=['mep_name', 'term', 'assistant_name'])

Exact matches in MEP categories (for the same term):

MEP: Alicia HOMS GINEL | Term: 9
 - Alberto BONDESIO MARTINEZ
 - Cristian VILLAR PRIETO
 - Elena PEREDA LAGARTOS
 - Daniel DIEZ CECILIA
 - Josep MERCADAL BAQUERO
 - Joan SERRA MINGOT
 - Maria Magdalena GONZALEZ GOZALBO

MEP: Antonio LÓPEZ-ISTÚRIZ WHITE | Term: 9
 - Carlos Casimiro SALVADOR ARMENDARIZ

MEP: Esteban GONZÁLEZ PONS | Term: 9
 - Carlos FERNANDEZ OJEA

MEP: Francisco José MILLÁN MON | Term: 9
 - MARIA MASEDA VARELA

MEP: Iratxe GARCÍA PÉREZ | Term: 9
 - Paula SENDÍN RODRIGUEZ
 - Maria Magdalena GONZALEZ GOZALBO
 - Izaskun BERNAL CERDEIRA
 - Mercedes MARISCAL CAMPOS
 - Nayra Maria PRADO MARRERO
 - Daniel DIEZ CECILIA
 - Alberto BONDESIO MARTINEZ
 - MEL RAVELO CORDOVES
 - BLANCA SAENZ DE BURUAGA SANCHEZ
 - Jaime Daniel DE FRUTOS GONZALEZ
 - Roi VILLAR VAZQUEZ
 - Carmen MAGDALENA VALLEJO
 - Ana MARTINEZ SANJURJO
 - Ignacio Aitor DE LA PUERTA MARCO
 - Victor Patricio DONATE PAVON

MEP: Isabel BENJUMEA BENJUMEA | Term: 9
 - Te

## 3. Create Network Graph

In [None]:
import networkx as nx
import plotly.graph_objects as go

# Create a directed graph
G = nx.DiGraph()

# Add nodes and edges
for _, row in unique_meps_apas.iterrows():
    G.add_node(row['mep_name'], type='mep')
    G.add_node(row['assistant_name'], type='assistant')
    G.add_edge(row['mep_name'], row['assistant_name'])

# Create positions for the nodes using spring layout
pos = nx.spring_layout(G)

# Extract edges and nodes for visualization
edge_x = []
edge_y = []
for edge in G.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x.append(x0)
    edge_x.append(x1)
    edge_x.append(None)  # None for break in the line
    edge_y.append(y0)
    edge_y.append(y1)
    edge_y.append(None)  # None for break in the line

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines')

# Add nodes for MEPs
node_x = []
node_y = []
node_text = []
for node in G.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
    node_text.append(node)

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers+text',
    text=node_text,
    hoverinfo='text',
    marker=dict(
        showscale=True,
        colorscale='YlGnBu',
        size=10,
        color='blue'
    )
)

# Create the figure
fig = go.Figure(data=[edge_trace, node_trace],
                layout=go.Layout(
                    title='Assistants Network Graph',
                    titlefont=dict(size=16),
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=0,l=0,r=0,t=40),
                    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                )

# Show the plot
fig.show()


                         assistant_name         mep_name  group   country  \
0                 Anna Sophia BENGTSSON  Abir AL-SAHLANI  Renew    Sweden   
1               John August HULTENGAARD  Abir AL-SAHLANI  Renew    Sweden   
2                  Tyra Louise LUNDBERG  Abir AL-SAHLANI  Renew    Sweden   
3     Linn Christina Brunhilde OETTERLI  Abir AL-SAHLANI  Renew    Sweden   
4               Sylwia Joanna BETKOWSKA     Adam JARUBAS    EPP    Poland   
...                                 ...              ...    ...       ...   
7495          Andréa Laure Marie MOULIN    Željana ZOVKO    EPP   Croatia   
7496                         Polona KEK       Milan ZVER    EPP  Slovenia   
7497                     Petra SKRINJAR       Milan ZVER    EPP  Slovenia   
7498                        Peter SUHEL       Milan ZVER    EPP  Slovenia   
7499                     Dominik STRAKL       Milan ZVER    EPP  Slovenia   

      term  
0        9  
1        9  
2        9  
3        9  
4        9