In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import time


def search_and_extract_info(query, outlist=None):
    """Search finto.fi (UDC summary, Finnish UI) for ``query`` and return the first hit.

    A headless Chrome session is started for every call. The function opens
    https://finto.fi/udcs/fi/, types ``query`` into the search field, clicks
    the search button, waits for a ``search-result`` element and parses the
    first one out of the page source.

    Args:
        query: Text typed into the search field.
        outlist: Optional list-like collector. When a hit is found it is also
            appended here. A fresh list is used when omitted.

    Returns:
        ``{'udk': <notation text>, 'explanation': <preferred label text>}``
        for the first search result, or ``None`` when the first result lacks
        either element or when any exception is raised during the lookup.

    Raises:
        Whatever ``webdriver.Chrome`` raises if the browser cannot be started
        (that call sits outside the guarded section), and whatever
        ``driver.quit()`` raises during cleanup.
    """
    # Identity check: `== None` would dispatch to a custom __eq__ on
    # list-like collectors instead of testing "was an argument given".
    if outlist is None:
        outlist = []

    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument('--no-sandbox')

    # Create a service instance with the path to ChromeDriver
    s = Service("/home/ubuntu/.wdm/drivers/chromedriver/linux64/126.0.6478.182/chromedriver-linux64/chromedriver")

    # Initialize the WebDriver with the specified options and service
    driver = webdriver.Chrome(service=s, options=chrome_options)

    print(f"{query=}")
    try:
        driver.get("https://finto.fi/udcs/fi/")

        # Type the query once the search field is interactable.
        search_input_locator = (By.ID, "search-field")
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable(search_input_locator))
        search_input = driver.find_element(*search_input_locator)
        search_input.clear()
        search_input.send_keys(query)

        # Submit the search.
        search_button_locator = (By.ID, "search-all-button")
        WebDriverWait(driver, 60).until(EC.element_to_be_clickable(search_button_locator))
        search_button = driver.find_element(*search_button_locator)
        search_button.click()

        # Attempt to scroll to the bottom of the page to ensure dynamic content loads
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        results_locator = (By.CLASS_NAME, "search-result")
        WebDriverWait(driver, 60).until(EC.presence_of_element_located(results_locator))

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        first_result = soup.find('div', class_='search-result')
        if not first_result:
            return None

        notation_span = first_result.find('span', class_='notation')
        pref_label_a = first_result.find('a', class_='prefLabel conceptlabel')
        if not (notation_span and pref_label_a):
            return None

        output = {
            'udk': notation_span.text.strip(),
            'explanation': pref_label_a.text.strip(),
        }
        outlist.append(output)
        return output

    except Exception:
        # Deliberately best-effort: any failure during the lookup (for
        # example one of the waits above giving up) counts as "no result".
        return None

    finally:
        # Single cleanup point. Runs on every exit path, including
        # KeyboardInterrupt, so no headless Chrome process is left behind.
        driver.quit()

# Read the tab-separated UDK_discarded table.
df = pd.read_csv(
    '/home/ubuntu/git/fennica/inst/examples/output.tables/UDK_discarded.csv',
    sep='\t',
)

# Only the entries at positions 7501..49999 of the 'udk' column are queried.
queries = df['udk'].tolist()[7501:50000]


In [2]:
from concurrent.futures import ThreadPoolExecutor
import concurrent
def process_query_parallel(queries, max_workers=50, limit=100):
    """Run ``search_and_extract_info`` over ``queries`` on a thread pool.

    Args:
        queries: Sequence of query values; each one is passed as the single
            argument to ``search_and_extract_info``.
        max_workers: Size of the thread pool. Defaults to 50, the value that
            used to be hard-coded.
        limit: Only the first ``limit`` queries are processed. Defaults to
            100, matching the previously hard-coded ``queries[:100]``. Pass
            ``None`` to process every query.

    Returns:
        A list of the truthy results, in completion order (not input order).
        Queries whose lookup raised are reported on stdout and skipped.
    """
    selected = queries if limit is None else queries[:limit]

    all_info = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to its query so failures can be attributed.
        futures = {executor.submit(search_and_extract_info, query): query for query in selected}
        for future in concurrent.futures.as_completed(futures):
            query = futures[future]
            try:
                info = future.result()
            except Exception as exc:
                print(f'Query "{query}" generated an exception: {exc}')
            else:
                # Falsy results (None) mean "nothing found" and are dropped.
                if info:
                    all_info.append(info)
    return all_info

In [3]:
#### In case you need multiprocessing 

# import threading, multiprocessing 
# def main_multiprocessing():
#     start_time = time.time() 
#     with multiprocessing.Manager() as manager:
#         outlist = manager.list()
#         processes = [] 
#         for query in queries[:2]: # each thread a new 'click' 
#             ps = multiprocessing.Process(target=search_and_extract_info, args=(query,outlist,))    
#             ps.start() # could sleep 1 between 'clicks' with `time.sleep(1)``
#             processes.append(ps)        
#         for ps in processes:
#             ps.join() # Main wait for processes finish
#             outlist = list(outlist)

#     return outlist

#all_info = main_multiprocessing()

In [4]:
#all_info = process_query_parallel(queries)

In [5]:
# Fan every query out over a pool of 50 worker threads and gather the
# non-empty results as the lookups complete.
all_info = []
with ThreadPoolExecutor(max_workers=50) as executor:
    # Remember which query each future belongs to so failures can be reported.
    futures = {}
    for query in queries:
        futures[executor.submit(search_and_extract_info, query)] = query

    for future in concurrent.futures.as_completed(futures):
        query = futures[future]
        try:
            info = future.result()
        except Exception as exc:
            print(f'Query "{query}" generated an exception: {exc}')
        else:
            if info:
                all_info.append(info)


query='581.143'
query='347.15/.17'
query='331.368'
query='612.79'
query='65.012.4:65.011.8'
query='929 Arwidsson'
query='35.072'
query='613.261'
query='141.81'
query='78.082'
query='552.52'
query='378.14.014.13'
query='547.917'
query='577.151.6'
query='332.25'
query='721.011.2'
query='338.97'
query='908 Pihtipudas'
query='628.48'
query='551.586'
query='362.7/.8'
query='398.87'
query='331.556.462'
query='728.51'
query='553.3/.9'
query='37.014.4'
query='614.1'
query='582.287.237'
query='331.109.6'
query='929 Ahlqvist'
query='581.526.426.2'
query='683.9'
query='65.015.12'
query='615.326'
query='546.175'
query='948.0.084'
query='929 Åström'
query='331.456'
query='8/9'
query='001.94'
query='641.5.06'
query='631.879'
query='75.071 Simberg'
query='929 Romanov'
query='338.439.6'
query='612.392.01'
query='35.071'
query='809.453'
query='681.53'
query='291.21'
query='69.059.6'
query='550.35'
query='929 Kivelä'
query='796.89'
query='629.734'
query='378.36'
query='621.3.011'
query='528.47/.48'
quer

In [1]:
# Display the collected results. `all_info` is only defined once the
# scraping cell that builds it has been run in the current kernel session.
all_info

NameError: name 'all_info' is not defined

In [27]:
import csv

# Path to udk_monografia.csv
csv_path = '/home/ubuntu/git/fennica/inst/examples/udk_monografia.csv'

# One row per result, with the code and its explanation as SEPARATE fields.
# Pre-joining them into a single "udk;explanation" string would make
# csv.writer treat the pair as one field and wrap it in quotes, because the
# field contains the delimiter.
new_data = [(info['udk'], info['explanation']) for info in all_info]

# Append new data to udk_monografia.csv
with open(csv_path, 'a', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter=';')
    writer.writerows(new_data)

print("Data has been appended to udk_monografia.csv.")

Data has been appended to udk_monografia.csv.
