In [1]:
import glob
import pandas as pd
import re

In [2]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI

In [3]:
OPENAI_API_KEY = "YOUR API KEY GOES HERE"
OPENAI_API_MODEL = 'gpt-3.5-turbo-0125'

In [5]:
# Get the classified files
files = glob.glob("../../output/classified-llm/*.csv")

In [6]:
df = pd.concat([pd.read_csv(file) for file in files]).reset_index(drop=True)

In [7]:
# Removes columns that we don't need
# df = df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0', 'level_0', 'index'])

In [8]:
df['level_0'].value_counts()

102379    1
128680    1
128682    1
128683    1
128684    1
         ..
324594    1
324595    1
324596    1
324597    1
161144    1
Name: level_0, Length: 59377, dtype: int64

In [9]:
# Let's take a look at all the retrieved tags

# All tags
all_tags = df.topic.str.split(',') # Split in the commas
flattened_list = [item.strip() for sublist in all_tags for item in sublist] # Makes one big list
all_tags_df = pd.DataFrame(flattened_list).rename(columns={0:'tag'}) # Dataframe with one row for wich time a tag appeared

In [10]:
display(all_tags_df)

Unnamed: 0,tag
0,gender pay gap
1,equality
2,women's rights
3,gender equality
4,work-life balance
...,...
186090,climate
186091,water management
186092,international relations
186093,climate


In [11]:
# Total tags applied
all_tags_df.tag.shape[0]

186095

In [12]:
# Total tags applied
all_tags_df.tag.unique().shape[0]

7942

In [13]:
# That seems like a disproportionate ammount of tags.
# Some of them might be applying to one single speech.
# Let's take a look at the value counts.
display(all_tags_df.tag.value_counts(), all_tags_df.tag.value_counts(normalize=True))

democracy                     21893
war                           11995
finance                       11664
climate                        9314
migration                      7930
                              ...  
barriers removal                  1
market services                   1
European arrest warrant           1
digital crimes                    1
water resources management        1
Name: tag, Length: 7942, dtype: int64

democracy                     0.117644
war                           0.064456
finance                       0.062678
climate                       0.050050
migration                     0.042613
                                ...   
barriers removal              0.000005
market services               0.000005
European arrest warrant       0.000005
digital crimes                0.000005
water resources management    0.000005
Name: tag, Length: 7942, dtype: float64

In [14]:
# The top entries concentrate most of the speeches.
# Let's see exactly how many.

In [15]:
# Counts the tags
value_counts = all_tags_df.tag.value_counts()

# Get the 100 most common ones
frequent_tags = value_counts.head(100).index

In [16]:
# Taking a look at the substrings. Let's remove the mentions to specific
# places and stick with general themes and issues.
frequent_tags

Index(['democracy', 'war', 'finance', 'climate', 'migration', 'technology',
       'parliament procedures', 'human rights', 'industry', 'economy',
       'agriculture', 'innovation', 'elections', 'environment', 'politics',
       'European Union', 'health', 'energy', 'trade', 'gender equality',
       'international relations', 'education', 'security', 'corruption',
       'healthcare', 'rule of law', 'women's rights', 'solidarity',
       'legislation', 'sustainability', 'poverty', 'pandemic', 'cooperation',
       'Brexit', 'terrorism', 'diplomacy', 'transparency', 'discrimination',
       'employment', 'justice', 'EU Parliament', 'geopolitics', 'biodiversity',
       'peace', 'equality', 'COVID-19', 'sanctions', 'violence against women',
       'climate change', 'religion', 'youth', 'culture', 'children's rights',
       'Russia', 'foreign policy', 'EU policies', 'labor rights', 'defense',
       'media freedom', 'Ukraine', 'EU enlargement', 'tourism',
       'transportation', 'vacc

In [17]:
frequent_tags = frequent_tags.drop(['European Union', 'Russia', 'Ukraine', 'China', 'Europe', 'EU'])

In [18]:
frequent_tags

Index(['democracy', 'war', 'finance', 'climate', 'migration', 'technology',
       'parliament procedures', 'human rights', 'industry', 'economy',
       'agriculture', 'innovation', 'elections', 'environment', 'politics',
       'health', 'energy', 'trade', 'gender equality',
       'international relations', 'education', 'security', 'corruption',
       'healthcare', 'rule of law', 'women's rights', 'solidarity',
       'legislation', 'sustainability', 'poverty', 'pandemic', 'cooperation',
       'Brexit', 'terrorism', 'diplomacy', 'transparency', 'discrimination',
       'employment', 'justice', 'EU Parliament', 'geopolitics', 'biodiversity',
       'peace', 'equality', 'COVID-19', 'sanctions', 'violence against women',
       'climate change', 'religion', 'youth', 'culture', 'children's rights',
       'foreign policy', 'EU policies', 'labor rights', 'defense',
       'media freedom', 'EU enlargement', 'tourism', 'transportation',
       'vaccines', 'food security', 'infrastructure

In [19]:
# On the original dataframe, select all the speeches that have at least one of this tags
pattern = "|".join(frequent_tags)

# Use str.contains to check if any of the strings are present
df['contains_string'] = df['topic'].str.contains(pattern, case=False, regex=True)

# How many speeches have at least one tag that is among the most common?
df['contains_string'].value_counts(normalize=True)

# 98.5% of the speeches contain at least one this tags!!!
# Let's look at the ones that doesn't.

True     0.984034
False    0.015966
Name: contains_string, dtype: float64

In [20]:
# Overall, the share of English speeches in the group 
# for which we DON'T have data is nearly 10% LARGER to the one we have
# data for. This could show some bias: the classifier has
# more problems dealing with foreign languages.

In [21]:
df[df.contains_string].language.str.lower().value_counts(normalize=True)

english                             0.272365
german                              0.111417
french                              0.104349
spanish                             0.088826
italian                             0.065892
polish                              0.057574
portuguese                          0.043626
dutch                               0.032090
romanian                            0.031851
greek                               0.025039
czech                               0.021975
swedish                             0.020777
hungarian                           0.020692
croatian                            0.019323
slovak                              0.015643
bulgarian                           0.015044
finnish                             0.011279
danish                              0.010782
slovenian                           0.010680
lithuanian                          0.006743
maltese                             0.004775
estonian                            0.003645
latvian   

In [22]:
df[~df.contains_string].language.str.lower().value_counts(normalize=True)

english       0.359705
spanish       0.083333
german        0.075949
french        0.075949
italian       0.056962
polish        0.047468
dutch         0.040084
portuguese    0.039030
croatian      0.032700
romanian      0.032700
czech         0.025316
greek         0.022152
bulgarian     0.018987
hungarian     0.017932
swedish       0.014768
slovenian     0.011603
danish        0.011603
slovak        0.011603
finnish       0.005274
latvian       0.004219
estonian      0.004219
maltese       0.003165
lithuanian    0.003165
irish         0.002110
Name: language, dtype: float64

In [23]:
# Share of speeches by language that didn't get any valid tags
(df[~df.contains_string].language.str.lower().value_counts() / \
df.language.str.lower().value_counts()) * 100

# No language was hugely affected

bulgarian                           2.006689
croatian                            2.672414
czech                               1.834862
danish                              1.716069
dutch                               1.986409
english                             2.097816
english, french                          NaN
estonian                            1.843318
finnish                             0.753012
french                              1.167126
french, dutch                            NaN
french, german, english                  NaN
french, greek                            NaN
german                              1.093892
german, french                           NaN
german, greek                            NaN
greek                               1.415094
hungarian                           1.386623
irish                               1.960784
italian                             1.383197
latvian                             2.222222
latvian, french, german, english         NaN
lithuanian

In [25]:
# Removes entries with invalid strings
df = df[df.contains_string]

In [26]:
# Let's create a regex pattern with capture groups to get the frequent tags that were applied into each.
pattern = '|'.join(f'(?:{substring})' for substring in frequent_tags)

In [27]:
def find_substrings(text, pattern):
    matches = re.findall(pattern, text, re.IGNORECASE)
    return ', '.join(matches)

In [28]:
# Apply the function to the DataFrame
df['valid_tags'] = df['topic'].apply(lambda x: find_substrings(x, pattern))

In [29]:
# Clean's data up for further processing
df = df[['level_0', 'speech', 'date', 'term', 'year', 'speech_length_in_characters', 'language', 'valid_tags']]

In [33]:
df = df.reset_index(drop=True)

In [34]:
# Saves
df.to_feather("../../output/llm-validated/validated-data.feather")

In [None]:
frequent_tags