In [28]:
import concurrent.futures
import multiprocessing
from os.path import isfile

import numpy as np
import pandas as pd
from tqdm import tqdm

In [29]:
def add_word_boundaries(df):
    r"""Return a new frame with every cell of ``df`` wrapped as ``\b<cell>\b``.

    Each cell is rendered through an f-string, so non-string values are
    stringified first (e.g. a float NaN becomes the literal pattern
    ``\bnan\b``). Cell text is inserted verbatim: regex metacharacters in a
    name are NOT escaped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells are the names to anchor.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index and columns as ``df``; ``df`` itself is
        left unmodified.
    """
    # Column-wise ``Series.map`` replaces ``DataFrame.applymap``, which has
    # been deprecated since pandas 2.1; the element-wise result is the same.
    return df.apply(lambda column: column.map(lambda x: f'\\b{x}\\b'))

In [30]:
def process_chunk(chunk, speeches):
    """Match each country's regex against the speeches and save the result.

    Parameters
    ----------
    chunk : iterable of (name, regex) pairs
        Country name and the regex pattern to search for.
    speeches : pandas.DataFrame
        Frame with a ``speech`` column holding the text to search.

    For every pair, a one-column boolean frame (column named after the
    lower-cased country) is written to
    ``../output/countries-matching/<name>.feather``. Countries whose file is
    already on disk are skipped.
    """
    for country, pattern in chunk:

        print("Processing", country)
        column = country.lower()
        target = f"../output/countries-matching/{column}.feather"

        if not isfile(target):
            # Case-insensitive regex search over every speech
            matches = speeches['speech'].str.contains(pattern, case=False)
            frame = pd.DataFrame(matches).rename(columns={"speech": column})
            frame.to_feather(target)
        else:
            print(target, "already exists")

In [31]:
def main():
    """Flag, for every country, which speeches mention it (thread-pool version).

    Reads the speeches frame and the manually revised country-name table,
    builds one alternation regex per country from its translations in
    ``languages``, splits the (English name, regex) pairs into one chunk per
    CPU core and hands each chunk to ``process_chunk`` on a thread pool.
    Results are written by ``process_chunk``; this function returns ``None``.

    Any exception raised inside a worker is re-raised here through
    ``future.result()``.
    """
    # Column names
    languages = [
        'bg', 'hr', 'cs', 'da', 'nl', 'et', 'fi', 'fr', 'de', 'el', 'hu', 'ga',
        'it', 'lv', 'lt', 'mt', 'pl', 'pt', 'ro', 'sk', 'sl', 'es', 'sv', 'en'
    ]

    # Reads the data in
    speeches = pd.read_feather("../output/processed/all-speeches-df.feather")
    country_names = pd.read_csv("../output/country-names/country-names-manual-revision.csv")

    # Saves the country names
    english_names = country_names['en']

    # Trim stray whitespace BEFORE anchoring. Stripping afterwards is a no-op,
    # because every anchored value starts and ends with the characters ``\b``,
    # so padding would stay inside the pattern (e.g. ``\bFrance \b``).
    translations = country_names[languages].apply(
        lambda column: column.map(
            lambda value: value.strip() if isinstance(value, str) else value
        )
    )

    # Add word boundaries
    bounded_names = add_word_boundaries(translations)

    # Creates the regex patterns: one alternation of all translations per row
    regex_patterns = bounded_names.apply(lambda row: '|'.join(row), axis=1)

    # Creates name and regex matches
    pairs = list(zip(english_names, regex_patterns))

    # Discover how many CPU cores we have available
    max_workers = multiprocessing.cpu_count()

    # Splits the pairs in this many chunks
    chunks = np.array_split(pairs, max_workers)

    # Process the chunks concurrently.
    # NOTE(review): regex matching over a pandas column presumably holds the
    # GIL, so threads may not give a real speed-up -- confirm by timing.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:

        futures = [executor.submit(process_chunk, chunk, speeches) for chunk in chunks]

        for future in concurrent.futures.as_completed(futures):
            # Surfaces any exception raised inside the worker
            future.result()

    print("Processing completed.")

In [32]:
# Run the main function: every country without an existing feather file is
# matched against the speeches and its result is written to disk.
main()


Processing Afghanistan
../output/countries-matching/afghanistan.feather already exists
Processing Albania
../output/countries-matching/albania.feather already exists
Processing Algeria
../output/countries-matching/algeria.feather already exists
Processing Andorra
../output/countries-matching/andorra.feather already exists
Processing Angola
../output/countries-matching/angola.feather already exists
Processing Antigua and Barbuda
../output/countries-matching/antigua and barbuda.feather already exists
Processing Argentina
../output/countries-matching/argentina.feather already exists
Processing Armenia
../output/countries-matching/armenia.feather already exists
Processing Australia
../output/countries-matching/australia.feather already exists
Processing Austria
../output/countries-matching/austria.feather already exists
Processing Azerbaijan
Processing Bangladesh
../output/countries-matching/bangladesh.feather already exists
Processing Barbados
../output/countries-matching/azerbaijan.feath

In [17]:
from os.path import isfile
import pandas as pd
import re
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor
import multiprocessing

In [2]:
# Function to add word boundaries to each entry
def add_word_boundaries(df):
    r"""Return a new frame with every cell of ``df`` wrapped as ``\b<cell>\b``.

    Cells are joined by string concatenation, so a non-string cell (such as
    a float NaN) raises ``TypeError``. Cell text is inserted verbatim: regex
    metacharacters in a name are NOT escaped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells are the names to anchor.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index and columns as ``df``; ``df`` itself is
        left unmodified.
    """
    # Column-wise ``Series.map`` replaces ``DataFrame.applymap``, which has
    # been deprecated since pandas 2.1; the element-wise result is the same.
    return df.apply(lambda column: column.map(lambda x: r'\b' + x + r'\b'))

In [33]:
def main():
    """Flag, for every country, which speeches mention it (sequential version).

    Reads the speeches frame and the manually revised country-name table,
    builds one alternation regex per country from its translations in
    ``languages``, and runs ``find_matches`` on each (English name, regex)
    pair in turn. Each result is written to
    ``../output/countries-matching/<name>.feather``; countries whose file
    already exists are skipped. Returns ``None``.
    """

    # This is defined inside the function so it can access speeches in the main scope
    def find_matches(pair):
        """Search all speeches for one country's regex and save the result."""
        name, regex = pair
        print("Processing", name)
        name = name.lower()
        fname = f"../output/countries-matching/{name}.feather"

        if isfile(fname):
            print(fname, "already exists")
            return

        # Compute-heavy part
        results = speeches['speech'].str.contains(regex, case=False)
        results = pd.DataFrame(results).rename(columns={"speech": name})
        results.to_feather(fname)

    # Column names
    languages = [
        'bg', 'hr', 'cs', 'da', 'nl', 'et', 'fi', 'fr', 'de', 'el', 'hu', 'ga',
        'it', 'lv', 'lt', 'mt', 'pl', 'pt', 'ro', 'sk', 'sl', 'es', 'sv', 'en'
    ]

    # Reads the data in
    speeches = pd.read_feather("../output/processed/all-speeches-df.feather")
    country_names = pd.read_csv("../output/country-names/country-names-manual-revision.csv")

    # Saves the country names
    english_names = country_names['en']

    # Trim stray whitespace BEFORE anchoring. Stripping afterwards is a no-op,
    # because every anchored value starts and ends with the characters ``\b``,
    # so padding would stay inside the pattern (e.g. ``\bFrance \b``).
    translations = country_names[languages].apply(
        lambda column: column.map(
            lambda value: value.strip() if isinstance(value, str) else value
        )
    )

    # Add word boundaries
    bounded_names = add_word_boundaries(translations)

    # Creates the regex patterns: one alternation of all translations per row
    regex_patterns = bounded_names.apply(lambda row: '|'.join(row), axis=1)

    # Iterates pair wise with the regex patterns and the names
    pairs = zip(english_names, regex_patterns)

    for pair in pairs:
        find_matches(pair)

In [34]:
%%time
# Time one full run of main(); the %%time cell magic reports the elapsed time.
if __name__ == '__main__':
    main()

Processing Afghanistan
../output/countries-matching/afghanistan.feather already exists
Processing Albania
../output/countries-matching/albania.feather already exists
Processing Algeria
../output/countries-matching/algeria.feather already exists
Processing Andorra
../output/countries-matching/andorra.feather already exists
Processing Angola
../output/countries-matching/angola.feather already exists
Processing Antigua and Barbuda
../output/countries-matching/antigua and barbuda.feather already exists
Processing Argentina
../output/countries-matching/argentina.feather already exists
Processing Armenia
../output/countries-matching/armenia.feather already exists
Processing Australia
../output/countries-matching/australia.feather already exists
Processing Austria
../output/countries-matching/austria.feather already exists
Processing Azerbaijan
../output/countries-matching/azerbaijan.feather already exists
Processing Bahamas
../output/countries-matching/bahamas.feather already exists
Processi

In [None]:
speec