In [1]:
import re
import spacy
import pandas as pd

## Preprocessing

In [2]:
# Names of the 27 EU member states. Each name is interpolated into the
# per-country parquet file names ("{country}_master.parquet.gzip") used below.
eu_member_states = [
    "Austria",
    "Belgium",
    "Bulgaria",
    "Croatia",
    "Cyprus",
    "Czechia",
    "Denmark",
    "Estonia",
    "Finland",
    "France",
    "Germany",
    "Greece",
    "Hungary",
    "Ireland",
    "Italy",
    "Latvia",
    "Lithuania",
    "Luxembourg",
    "Malta",
    "Netherlands",
    "Poland",
    "Portugal",
    "Romania",
    "Slovakia",
    "Slovenia",
    "Spain",
    "Sweden"
]

# Load the spaCy pipeline once; it is shared by preprocessing and entity extraction.
nlp = spacy.load('en_core_web_lg') # Remember to download the model first: $ python -m spacy download en_core_web_lg

In [None]:
class eudata:
    """One country's master news table together with the spaCy pipeline used to clean it.

    Attributes:
        country: Country name used to build the parquet file name.
        data:    DataFrame read from ``{data_dir}/{country}_master.parquet.gzip``.
        nlp:     Callable spaCy pipeline applied to each article.
    """

    def __init__(self, country, nlp, data_dir="/Users/carlostoruno/Documents/GitHub/EU-copilot/data"):
        """Load the master parquet file for ``country``.

        Args:
            country:  Country name, interpolated into the file name.
            nlp:      Loaded spaCy pipeline (any callable returning an iterable of tokens).
            data_dir: Directory holding the ``*_master.parquet.gzip`` files. The default
                      is the original hard-coded location; pass a different directory to
                      run the notebook on another machine.
        """
        self.country = country
        self.data    = pd.read_parquet(f"{data_dir}/{country}_master.parquet.gzip")
        self.nlp     = nlp

    def process_text(self, text):
        """Return the lower-cased lemmas of ``text`` joined by single spaces.

        URL-like tokens are removed first; stop words and tokens that are not purely
        alphabetic are dropped. Non-string input (e.g. ``None`` or NaN for a missing
        article) yields an empty string instead of raising.
        """
        # Missing articles arrive as None/NaN; re.sub would raise TypeError on them
        # and abort the processing of the whole country.
        if not isinstance(text, str):
            return ""

        # Remove URLs
        text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)

        # Process text with spaCy
        doc = self.nlp(text)
        return " ".join(
            token.lemma_.lower()
            for token in doc
            if not token.is_stop and token.is_alpha
        )

    def add_proctext(self):
        """Add a ``cleaned_text`` column built from ``content_trans`` and return the frame.

        The column is added to ``self.data`` in place; the returned object is that same
        DataFrame.
        """
        self.data["cleaned_text"] = [
            self.process_text(article)
            for article in self.data["content_trans"].to_list()
        ]
        return self.data


In [7]:
# Load and preprocess every member state's master table, in list order.
data = []
for country in eu_member_states:
    data.append(eudata(country, nlp=nlp).add_proctext())

In [12]:
# Write each processed frame back over its country's master file.
# data[count] lines up with eu_member_states[count] because `data` was built from that list.
for count, country in enumerate(eu_member_states):
    target_path = f"/Users/carlostoruno/Documents/GitHub/EU-copilot/data/{country}_master.parquet.gzip"
    data[count].to_parquet(target_path, compression="gzip")

## Getting entities

In [12]:
def get_entities(text, nlp=nlp):
    """Return the unique words of the named entities found in ``text``.

    Entities whose label is purely temporal or numeric (plus PRODUCT and LANGUAGE)
    are skipped. The remaining entity strings are split on whitespace and
    de-duplicated word by word, keeping first-seen order, then joined with spaces.

    Args:
        text: Text to run through the pipeline.
        nlp:  spaCy pipeline; defaults to the module-level ``nlp`` bound when this
              function was defined.

    Returns:
        A single space-separated string of unique entity words.
    """
    skipped_labels = {
        "DATE", "PRODUCT", "LANGUAGE", "TIME", "PERCENT",
        "MONEY", "QUANTITY", "ORDINAL", "CARDINAL",
    }

    # A dict keeps insertion order, so it doubles as an ordered set of words.
    seen_words = {}
    for ent in nlp(text).ents:
        if ent.label_ in skipped_labels:
            continue
        for word in str(ent).split():
            seen_words.setdefault(word, None)

    return " ".join(seen_words)

In [16]:
# Add an "entities" column to every country's master file and save it back in place.
# NOTE(review): entities are extracted from "cleaned_text", not from the original
# article text; NER may behave differently on preprocessed text — confirm this is intended.
for country in eu_member_states:
    master_path = f"../data/news-data/{country}_master.parquet.gzip"
    data = pd.read_parquet(master_path)
    data["entities"] = data["cleaned_text"].apply(get_entities)
    data.to_parquet(master_path, compression="gzip")