In [7]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import time


def search_and_extract_info(query, outlist = None):
    """Search finto.fi's UDC vocabulary for ``query`` and return the first hit.

    A fresh headless Chrome instance is started for every call, the query is
    typed into the site's search field, and the first ``search-result``
    element of the rendered page is parsed with BeautifulSoup.

    Parameters
    ----------
    query : str
        Text typed into the search field.
    outlist : list, optional
        If given, the result dict is also appended to this list. It is used
        by the (commented-out) multiprocessing variant, which passes a
        shared list. A new private list is used when omitted.

    Returns
    -------
    dict or None
        ``{'udk': <notation text>, 'explanation': <preferred label text>}``
        for the first search result, or ``None`` when no usable result was
        found or any error (timeout, WebDriver failure, ...) occurred.
    """
    if outlist is None:
        outlist = []

    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument('--no-sandbox')

    # Create a service instance with the path to ChromeDriver
    s = Service("/home/ubuntu/.wdm/drivers/chromedriver/linux64/126.0.6478.182/chromedriver-linux64/chromedriver")

    # Initialize the WebDriver with the specified options and service
    driver = webdriver.Chrome(service=s, options=chrome_options)

    print(f"{query=}")
    try:
        driver.get("https://finto.fi/udcs/fi/")

        # WebDriverWait.until() returns the located element, so there is no
        # need for a second find_element() lookup after the wait.
        search_input_locator = (By.ID, "search-field")
        search_input = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(search_input_locator)
        )
        search_input.clear()
        search_input.send_keys(query)

        search_button_locator = (By.ID, "search-all-button")
        search_button = WebDriverWait(driver, 60).until(
            EC.element_to_be_clickable(search_button_locator)
        )
        search_button.click()

        # Attempt to scroll to the bottom of the page to ensure dynamic content loads
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        results_locator = (By.CLASS_NAME, "search-result")
        WebDriverWait(driver, 60).until(EC.presence_of_element_located(results_locator))

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        first_result = soup.find('div', class_='search-result')
        if not first_result:
            return None

        notation_span = first_result.find('span', class_='notation')
        pref_label_a = first_result.find('a', class_='prefLabel conceptlabel')
        if not (notation_span and pref_label_a):
            return None

        output = {
            'udk': notation_span.text.strip(),
            'explanation': pref_label_a.text.strip(),
        }
        outlist.append(output)
        return output

    except Exception:
        # Deliberate best-effort: a failed lookup (timeout, missing element,
        # WebDriver error) is reported to the caller as "no result".
        return None
    finally:
        # Always release the browser, including on KeyboardInterrupt, so
        # interrupted runs do not leave orphaned Chrome processes behind.
        driver.quit()

# Load the tab-separated table of discarded UDC codes. The 'udk' column is
# forced to str so that codes are never parsed as numbers: numeric inference
# would drop leading zeros (e.g. '004.7' -> 4.7) if every value in the column
# happened to look numeric.
df = pd.read_csv(
    '/home/ubuntu/git/fennica/inst/examples/output.tables/UDK_discarded.csv',
    sep='\t',
    dtype={'udk': str},
)

# Search terms, one per row of the 'udk' column.
queries = df['udk'].tolist()


In [6]:
# Print the loaded UDC codes to check that the CSV was read as expected.
print(queries)

['681.3', '65.012.4', '372.851', '364.65', '681.3.06', '658.11', '711.4', '929,', '001.891', '061.3', '339.923', '396', '371.671.12', '65.012.2', '372.880.1', '31', '681.324', '364.444', '372.880.20', '65.012', '355.48', '641.55', '338.23', '159.922.7', '004.7', '801.316.4', '331.45', '82.0', '378.662', '372.880.397', '504.45', '336.76', '65.012.6', '371.214', '372.82', '65.016', '504.06', '658.112.3', '65.011.4', '65.011.8', '519.68', '658.64', '004.42', '894.541.09', '656.08', '711.1', '37.0', '658.562', '398.2', '504.03', '681.3.02', '331.105.44', '336.1/.5', '371.13', '65.017.2/.3', '613.81', '504.054', '82/89', '364.27', '378.1', '316.356.2', '519.688', '316.66', '061.5', '658.155', '336.74', '339.564', '37.015.3', '504.064.4', '504.064', '65.012.122', '800', '061.1', '801.32', '372.891', '711.7', '860', '504.05', '330.34', '372.88', '082.2', '351.778.5', '657.1', '656.61', '372.3', '372.893', '368.4', '398.22', '331.56', '658.512.2', '657.47', '339.13', '364.44', '869.0', '681.32

In [8]:
from concurrent.futures import ThreadPoolExecutor
import concurrent
def process_query_parallel(queries, max_workers=50, limit=100, worker=None):
    """Run a lookup function over ``queries`` in a thread pool.

    Parameters
    ----------
    queries : sequence
        Search terms; must support slicing.
    max_workers : int, optional
        Size of the thread pool (default 50, the previously hard-coded
        value). With the default worker every thread starts its own headless
        Chrome instance, so lower this on memory-constrained machines.
    limit : int or None, optional
        Only the first ``limit`` queries are processed (default 100, the
        previously hard-coded cap). Pass ``None`` to process all of them.
    worker : callable, optional
        Function called as ``worker(query)``; defaults to
        ``search_and_extract_info``.

    Returns
    -------
    list
        The truthy results, in completion order (not input order). A worker
        that raises is reported on stdout and its query is skipped.
    """
    if worker is None:
        worker = search_and_extract_info

    selected = queries[:limit]
    if len(selected) == 0:
        # Nothing to do; avoid spinning up a pool for no work.
        return []

    all_info = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to its query so failures can be attributed.
        futures = {executor.submit(worker, query): query for query in selected}
        for future in concurrent.futures.as_completed(futures):
            query = futures[future]
            try:
                info = future.result()
            except Exception as exc:
                print(f'Query "{query}" generated an exception: {exc}')
            else:
                if info:
                    all_info.append(info)
    return all_info

In [55]:
#### In case you need multiprocessing 

# import threading, multiprocessing 
# def main_multiprocessing():
#     start_time = time.time() 
#     with multiprocessing.Manager() as manager:
#         outlist = manager.list()
#         processes = [] 
#         for query in queries[:2]: # each process performs a new 'click'
#             ps = multiprocessing.Process(target=search_and_extract_info, args=(query,outlist,))    
#             ps.start() # could pause 1 s between 'clicks' with `time.sleep(1)`
#             processes.append(ps)        
#         for ps in processes:
#             ps.join() # Main process waits for each worker process to finish
#             outlist = list(outlist)

#     return outlist

#all_info = main_multiprocessing()

In [9]:
# Trial run: process_query_parallel only handles the first 100 queries by default.
all_info = process_query_parallel(queries)

query='37.0'
query='004.7'
query='339.923'
query='159.922.7'
query='372.880.397'
query='894.541.09'
query='658.11'
query='331.45'
query='681.3.06'
query='371.671.12'
query='372.851'
query='65.011.4'
query='65.011.8'
query='929,'
query='372.880.1'
query='65.012.2'
query='656.08'
query='504.06'
query='82.0'
query='338.23'
query='371.214'
query='336.76'
query='711.1'
query='65.012'
query='504.45'
query='658.64'
query='364.65'
query='65.016'
query='372.82'
query='364.444'
query='65.012.6'
query='681.3'
query='004.42'
query='372.880.20'
query='711.4'
query='31'
query='658.112.3'
query='398.2'
query='001.891'
query='519.68'
query='65.012.4'
query='801.316.4'
query='061.3'
query='355.48'
query='658.562'
query='504.03'
query='681.324'
query='378.662'
query='396'
query='641.55'
query='681.3.02'
query='331.105.44'
query='336.1/.5'
query='371.13'
query='65.017.2/.3'
query='613.81'
query='504.054'
query='364.27'
query='82/89'
query='316.356.2'
query='519.688'
query='378.1'
query='061.5'
query='316

In [10]:
# Full run over every query: one task per query on a 50-thread pool, keeping
# only the truthy results (failed lookups come back as None).
all_info = []
with ThreadPoolExecutor(max_workers=50) as executor:
    # Remember which query each future belongs to, for error reporting.
    futures = {}
    for query in queries:
        futures[executor.submit(search_and_extract_info, query)] = query

    for future in concurrent.futures.as_completed(futures):
        query = futures[future]
        try:
            info = future.result()
        except Exception as exc:
            print(f'Query "{query}" generated an exception: {exc}')
        else:
            if info:
                all_info.append(info)


query='061.3'
query='159.922.7'
query='65.012.4'
query='681.324'
query='372.880.1'
query='396'
query='364.65'
query='929,'
query='001.891'
query='656.08'
query='711.4'
query='65.012.2'
query='65.012.6'
query='658.11'
query='681.3'
query='372.880.20'
query='31'
query='339.923'
query='681.3.06'
query='364.444'
query='372.851'
query='371.671.12'
query='355.48'
query='65.012'
query='378.662'
query='82.0'
query='372.880.397'
query='338.23'
query='801.316.4'
query='641.55'
query='504.06'
query='004.42'
query='398.2'
query='004.7'
query='65.011.8'
query='65.011.4'
query='658.112.3'
query='894.541.09'
query='336.76'
query='504.45'
query='371.214'
query='519.68'
query='658.64'
query='65.016'
query='658.562'
query='711.1'
query='504.03'
query='331.45'
query='37.0'
query='372.82'
query='681.3.02'
query='331.105.44'
query='336.1/.5'
query='371.13'
query='65.017.2/.3'
query='504.054'
query='316.66'
query='061.5'
query='801.32'
query='316.356.2'
query='504.064.4'
query='519.688'
query='378.1'
query=

In [11]:
# Display the collected {'udk': ..., 'explanation': ...} records.
all_info

[{'udk': '311', 'explanation': 'Tilastotiede. Tilaston teoria'},
 {'udk': '004.7', 'explanation': 'Tietoliikenne. Tietoverkot'},
 {'udk': '82.02',
  'explanation': 'Kirjallisuuden koulukunnat, suuntaukset ja liikkeet'},
 {'udk': '004.42', 'explanation': 'Tietokoneohjelmointi. Tietokoneohjelmat'},
 {'udk': '37.01',
  'explanation': 'Kasvatuksen perusteet. Teoria. Tutkimusmenetelmät'},
 {'udk': '061.1', 'explanation': 'Julkisyhteisöt ja hallinnollinen yhteistyö'},
 {'udk': '004.4', 'explanation': 'Tietokoneohjelmat'},
 {'udk': '502.3/.7', 'explanation': 'Ympäristön osat'},
 {'udk': '004.43', 'explanation': 'Tietokonekielet'},
 {'udk': '303', 'explanation': 'Yhteiskuntatieteiden tutkimusmenetelmät'},
 {'udk': '93/94', 'explanation': 'Historia'},
 {'udk': '159.942', 'explanation': 'Tunteet. Emootiot'},
 {'udk': '005.5', 'explanation': 'Johtamistoimet. Määräykset'},
 {'udk': '004.9', 'explanation': 'Tietokonesovellukset'},
 {'udk': '537.8',
  'explanation': 'Sähkömagnetismi. Sähkömagneettin

In [12]:
import csv

# Path to udk.csv, the lookup table that receives the newly scraped codes
csv_path = '/home/ubuntu/git/fennica/inst/examples/udk.csv'

# One (udk, explanation) row per scraped record
new_data = [(info['udk'], info['explanation']) for info in all_info]

# Append the rows to udk.csv as real two-column, semicolon-delimited records.
# Previously each record was pre-joined with ';' and written as a single field
# through a comma-delimited writer, so any explanation containing a comma had
# its entire line wrapped in quotes, corrupting the semicolon-separated file.
with open(csv_path, 'a', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter=';')
    writer.writerows(new_data)

print("Data has been appended to udk.csv.")

Data has been appended to udk_monografia.csv.
