In [52]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import time


def search_and_extract_info(query, outlist = None):
    if outlist == None:
        outlist = []
    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument('--no-sandbox')

    # Create a service instance with the path to ChromeDriver
    s =  Service("/home/ubuntu/.wdm/drivers/chromedriver/linux64/126.0.6478.182/chromedriver-linux64/chromedriver")

    # Initialize the WebDriver with the specified options and service
    driver = webdriver.Chrome(service=s, options=chrome_options)

    print(f"{query=}")
    try:
        driver.get("https://finto.fi/udcs/fi/")
        
        search_input_locator = (By.ID, "search-field")
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable(search_input_locator))
        search_input = driver.find_element(*search_input_locator)
        search_input.clear()
        search_input.send_keys(query)
        
        search_button_locator = (By.ID, "search-all-button")
        WebDriverWait(driver, 60).until(EC.element_to_be_clickable(search_button_locator))
        search_button = driver.find_element(*search_button_locator)
        search_button.click()
        
        # Attempt to scroll to the bottom of the page to ensure dynamic content loads
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        
        results_locator = (By.CLASS_NAME, "search-result")
        WebDriverWait(driver, 60).until(EC.presence_of_element_located(results_locator))
        
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        #print("Parsing HTML...")
        
        first_result = soup.find('div', class_='search-result')
        if first_result:
            notation_span = first_result.find('span', class_='notation')
            pref_label_a = first_result.find('a', class_='prefLabel conceptlabel')
            
            if notation_span and pref_label_a:
                numeric_id = notation_span.text.strip()
                title = pref_label_a.text.strip()
                output = {'udk': numeric_id, 'explanation': title}
                outlist.append(output)
                driver.quit()
                return output
        driver.quit()
            
    except Exception as e:
        #print(f"An error occurred: {e}")
        driver.quit()
        return None

df = pd.read_csv('/home/ubuntu/git/fennica/inst/examples/output.tables/UDK_discarded.csv', sep='\t')
queries = df['Name'].tolist()


In [2]:
from concurrent.futures import ThreadPoolExecutor
import concurrent
def process_query_parallel(queries):
    all_info = []
    with ThreadPoolExecutor(max_workers=50) as executor:
        futures = {executor.submit(search_and_extract_info, query): query for query in queries[:100]}
        for future in concurrent.futures.as_completed(futures):
            query = futures[future]
            try:
                info = future.result()
                if info:
                    all_info.append(info)
            except Exception as exc:
                print(f'Query "{query}" generated an exception: {exc}')
    return all_info

In [55]:
#### In case you need multiprocessing 

# import threading, multiprocessing 
# def main_multiprocessing():
#     start_time = time.time() 
#     with multiprocessing.Manager() as manager:
#         outlist = manager.list()
#         processes = [] 
#         for query in queries[:2]: # each thread a new 'click' 
#             ps = multiprocessing.Process(target=search_and_extract_info, args=(query,outlist,))    
#             ps.start() # could sleep 1 between 'clicks' with `time.sleep(1)``
#             processes.append(ps)        
#         for ps in processes:
#             ps.join() # Main wait for processes finish
#             outlist = list(outlist)

#     return outlist

#all_info = main_multiprocessing()

In [22]:
all_info = process_query_parallel(queries)

query='08'
query='06'
query='242'
query='22.06'
query='910.4'
query='244'
query='111'
query='630.9'
query='631.1'
query='639'
query='71'
query='633'
query='141.332'
query='329.14'
query='634/635'
query='630.2'
query='252'
query='641.55'
query='631.4'
query='92'
query='245'
query='630.1'
query='087.5'
query='059'
query='630.8'
query='22.09'
query='630.5'
query='711.2'
query='82.085'
query='637.1'
query='241'
query='839.31'
query='630.7'
query='639.2'
query='641.5'
query='266.1'
query='528'
query='796'
query='581.9'
query='784'
query='894.451'
query='796.4'
query='159.922.7'
query='948.5'
query='631.3'
query='807.1'
query='930.85'
query='264'
query='711.4'
query='234'
query='711.5'
query='781'
query='796/797'
query='809.454.2'
query='860'
query='9'
query='948.01'
query='061.5'
query='133'
query='238'
query='336.72'
query='613.2'
query='243'
query='266'
query='630.3'
query='630.6'
query='875'
query='398'
query='378.048'
query='266.3'
query='269'
query='792'
query='801.32'
query='808.2'
qu

query='821.511.111-1'
query='829'
query='84.71'
query='840.0'
query='86'
query='872.14'
query='882.0'
query='886.1'
query='89-3'
query='891.1'
query='891.541'
query='893.541'
query='894'
query='894.514'
query='894.541.'
query='894.5451'
query='894.79'
query='895(024.7)'
query='903.2'
query='904'
query='911.37'
query='911.5/.7'
query='911.52/.53'
query='914.0'
query='914.30'
query='914.36'
query='914.5'
query='914.60'
query='914.69'
query='914.802'
query='914.806'
query='914.807'
query='914.81'
query='914.90'
query='914.98'
query='915.2'
query='915.4'
query='915/919'
query='929.5./9.'
query='929.6'
query='929.7'
query='93'
query='940.48'
query='943.086'
query='944.04'
query='944.05'
query='947.084.3'
query='947.45'
query='947.7'
query='948.0.01'
query='948.0.02'
query='948.0.04/.05'
query='948.0.07/.08'
query='948.0.082'
query='948.0.083'
query='948.026/.029'
query='948.051'
query='956.94'
query='98'
query='[314.1+929]'
query='[71/78+821.113.6+821.511.111]'
query='[718+736]'
query='[73+

In [58]:
all_info = []
with ThreadPoolExecutor(max_workers=50) as executor:
    futures = {executor.submit(search_and_extract_info, query): query for query in queries}
    for future in concurrent.futures.as_completed(futures):
        query = futures[future]
        try:
            info = future.result()
            if info:
                all_info.append(info)
        except Exception as exc:
            print(f'Query "{query}" generated an exception: {exc}')


query='242'
query='244'
query='08'
query='06'
query='630.9'
query='22.06'
query='111'
query='141.332'
query='71'
query='910.4'
query='631.1'
query='639'
query='92'
query='633'
query='631.4'
query='252'
query='634/635'
query='630.8'
query='641.55'
query='329.14'
query='630.1'
query='245'
query='630.2'
query='059'
query='82.085'
query='631.3'
query='087.5'
query='639.2'
query='22.09'
query='796'
query='711.2'
query='630.5'
query='839.31'
query='266.1'
query='637.1'
query='241'
query='784'
query='581.9'
query='630.7'
query='641.5'
query='234'
query='528'
query='894.451'
query='159.922.7'
query='796.4'
query='948.5'
query='930.85'
query='807.1'
query='264'
query='711.4'
query='243'
query='266'
query='630.3'
query='613.2'
query='378.048'
query='630.6'
query='398'
query='875'
query='261'
query='801.32'
query='266.3'
query='820.0'
query='329.15'query='792'

query='808.2'
query='512'
query='232'
query='284.1'
query='664'
query='629.113'
query='269'
query='236'
query='849.541'
query='871'
query

In [59]:
all_info

[{'udk': '244', 'explanation': 'Japanilainen buddhalaisuus'},
 {'udk': '242', 'explanation': 'Mahajana-buddhalaisuus. Suuri vaunu'},
 {'udk': '06', 'explanation': 'Erilaiset yhteisöt'},
 {'udk': '252', 'explanation': 'Mesopotamian uskonnot'},
 {'udk': '631.3', 'explanation': 'Maatalouskoneet, -välineet ja -laitteet'},
 {'udk': '910.4', 'explanation': 'Tutkimusmatkat'},
 {'udk': '631.1',
  'explanation': 'Maatilan hoito ja organisointi (maatalousekonomia)'},
 {'udk': '111', 'explanation': 'Yleinen metafysiikka. Ontologia'},
 {'udk': '639', 'explanation': 'Metsästys. Kalastus. Kalanviljely'},
 {'udk': '639.2', 'explanation': 'Kalastus. Kalatalous'},
 {'udk': '08', 'explanation': 'Moniaiheiset teokset. Kokoomateokset'},
 {'udk': '241',
  'explanation': 'Hinajana-buddhalaisuus. Pieni vaunu. Theravada-buddhalaisuus. Pālin koulukunta'},
 {'udk': '796.4', 'explanation': 'Voimistelu. Akrobatia. Yleisurheilu'},
 {'udk': '234', 'explanation': 'Jainalaisuus'},
 {'udk': '528',
  'explanation': 'Ge

In [61]:
import csv

# Path to udk_monografia.csv
csv_path = '/home/ubuntu/git/fennica/inst/examples/udk_monografia.csv'

# Prepare new data for appending
new_data = [f"{info['udk']};{info['explanation']}" for info in all_info]

# Append new data to udk_monografia.csv
with open(csv_path, 'a', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter=';')
    for row in new_data:
        writer.writerow([row])

print("Data has been appended to udk_monografia.csv.")

Error: need to escape, but no escapechar set