In [1]:
# In this final processing step, we bucket the valid tags into broader
# groups. The initial grouping was produced with ChatGPT (GPT-4o) in the
# browser; we then manually reviewed the result and adjusted it by hand.

In [2]:
import math
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

In [3]:
# The 94 tags that the broad groups below must cover, kept in their original order.
# NOTE(review): presumably pasted from the output of an earlier frequency
# analysis — confirm the source.
frequent_tags = [
    "democracy", "war", "finance", "climate", "migration", "technology",
    "parliament procedures", "human rights", "industry", "economy",
    "agriculture", "innovation", "elections", "environment", "politics",
    "health", "energy", "trade", "gender equality",
    "international relations", "education", "security", "corruption",
    "healthcare", "rule of law", "women's rights", "solidarity",
    "legislation", "sustainability", "poverty", "pandemic", "cooperation",
    "Brexit", "terrorism", "diplomacy", "transparency", "discrimination",
    "employment", "justice", "EU Parliament", "geopolitics", "biodiversity",
    "peace", "equality", "COVID-19", "sanctions", "violence against women",
    "climate change", "religion", "youth", "culture", "children's rights",
    "foreign policy", "EU policies", "labor rights", "defense",
    "media freedom", "EU enlargement", "tourism", "transportation",
    "vaccines", "food security", "infrastructure", "humanitarian aid",
    "vaccination", "social issues", "international cooperation",
    "renewable energy", "journalism", "animal welfare", "violence",
    "disinformation", "racism", "freedom of speech", "development",
    "research", "inequality", "media", "sovereignty", "social rights",
    "law", "governance", "history", "extremism", "LGBT rights", "freedom",
    "energy crisis", "social policy", "freedom of expression",
    "EU institutions", "foreign affairs", "mental health", "social justice",
    "abortion",
]

In [4]:
# Maps each broad group name to the list of tags it covers. The group names
# are later used as column names, and key order is preserved.
# Note: "poverty" is listed in two groups (economy and human rights), so a
# row carrying that tag is flagged in both.
broad_groups = {
    "Parliament issues and procedures": [
        "parliament procedures", "EU Parliament", "legislation",
    ],
    "Democracy and Governance": [
        "democracy", "elections", "politics", "rule of law",
        "corruption", "transparency", "justice",
        "EU policies", "EU institutions", "governance", "law",
    ],
    "War and Security": [
        "war", "security", "terrorism",
        "sanctions", "defense", "peace",
    ],
    "International Relations and Diplomacy": [
        "foreign policy", "diplomacy", "geopolitics", "foreign affairs",
        "Brexit", "international relations", "sovereignty", "cooperation",
        "EU enlargement", "international cooperation",
    ],
    "Economy, Finance and Infrastructure": [
        "finance", "economy", "trade", "industry", "employment", "poverty",
        "inequality", "tourism", "infrastructure", "transportation",
        "agriculture", "energy", "energy crisis",
    ],
    "Environment and Climate": [
        "climate", "environment", "sustainability", "biodiversity",
        "climate change", "renewable energy", "animal welfare",
    ],
    "Health": [
        "health", "healthcare", "pandemic", "COVID-19", "vaccines",
        "vaccination", "mental health", "abortion",
    ],
    "Technology and Innovation": [
        "technology", "innovation", "research", "development",
    ],
    "Human Rights, Freedoms and Social Issues": [
        "human rights", "freedom of speech", "freedom of expression",
        "media freedom", "journalism", "media", "disinformation", "violence",
        "violence against women", "racism", "freedom", "social rights",
        "equality", "social issues", "social policy", "social justice",
        "poverty", "discrimination", "solidarity", "women's rights",
        "gender equality", "children's rights", "youth", "LGBT rights",
        "food security", "migration", "religion", "labor rights",
        "humanitarian aid", "extremism", "culture", "education", "history",
    ],
}


In [5]:
# View over the per-group tag lists stored in the dictionary
lists = broad_groups.values()

# Collapse those lists into a single flat list of tags (group order first,
# then in-group order) so coverage of the tags can be checked
flattened_list = [
    tag
    for group_tags in lists
    for tag in group_tags
]

In [6]:
# Every tag used in the dictionary must be one of the frequent tags.
# The check is asserted, not just displayed, so that a "Run All" cannot carry
# on to bucket the dataset with a misspelled or stale tag in a group.
unknown_tags = [tag for tag in flattened_list if tag not in frequent_tags]
assert not unknown_tags, f"Tags in broad_groups missing from frequent_tags: {unknown_tags}"

# Still show the (expected to be empty) list as the cell output
unknown_tags

[]

In [7]:
# Every frequent tag must be assigned to at least one broad group.
# The check is asserted, not just displayed, so that a "Run All" cannot carry
# on while some frequent tag is silently left out of every group.
unmapped_tags = [tag for tag in frequent_tags if tag not in flattened_list]
assert not unmapped_tags, f"Frequent tags not assigned to any broad group: {unmapped_tags}"

# Still show the (expected to be empty) list as the cell output
unmapped_tags

[]

In [8]:
# Load the larger dataset that the broad-group classification is mapped onto
validated_data_path = "../../output/llm-validated/validated-data.feather"
df = pd.read_feather(validated_data_path)

In [9]:
def contains_keywords(tags, keywords):
    """Return True if at least one keyword is exactly one of the given tags.

    Matching is case-insensitive and ignores surrounding whitespace, but it is
    a whole-tag comparison: the keyword "war" does not match the tag
    "civil war".

    Parameters
    ----------
    tags : str, iterable of str, or None
        Either a comma-separated string of tags or an iterable of individual
        tags. ``None`` and non-iterable scalars (e.g. a float NaN marking a
        missing value) are treated as "no tags".
    keywords : iterable of str
        The tags that count as a hit.

    Returns
    -------
    bool
        True when any normalised keyword equals any normalised tag.
    """
    if tags is None:
        return False

    if isinstance(tags, str):
        raw_tags = tags.split(',')
    else:
        try:
            raw_tags = list(tags)
        except TypeError:
            # Not a string and not iterable: there is nothing to match against
            return False

    # A set gives constant-time membership tests for every keyword
    normalised_tags = {
        tag.strip().lower() for tag in raw_tags if isinstance(tag, str)
    }
    return any(keyword.strip().lower() in normalised_tags for keyword in keywords)

# One boolean column per broad group: True when the row's valid_tags
# contain at least one of that group's tags
for group, keywords in broad_groups.items():
    group_flags = df['valid_tags'].map(
        lambda cell: contains_keywords(str(cell), keywords)
    )
    df[group] = group_flags

In [10]:
# Persist the bucketed dataset. The output folder is created first because
# to_feather does not create missing parent directories, which would make a
# fresh "Run All" fail at the very last cell.
bucketed_path = Path("../../output/llm-bucketed/bucketed_valid_data.feather")
bucketed_path.parent.mkdir(parents=True, exist_ok=True)
df.to_feather(bucketed_path)