In [13]:
import datetime, json, time, re, traceback
import os
from urllib.parse import quote_plus

import pandas as pd
from duckduckgo_search import DDGS
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from tqdm import tqdm

# Search terms, one per tracked company. get_news_links runs one search per
# (source, company) pair, so list order is also the scraping order.
companies = [
    'Ecopetrol', 'Enel', 'Grupo Argos', 'Terpel', 'Vanti', 'Promigas',
    'Bancolombia', 'BBVA', 'Grupo AVAL', 'Davivienda', 'Credibanco',
    'Grupo Sura', 'Itau', 'Nequi',
    'Grupo Exito', 'Cencosud', 'Tiendas D1', 'Alkosto', 'Falabella', 'Sodimac',
    'Caracol', 'Empresa Claro', 'ETB',
    'GeoPark', 'Empresa Brilla', 'Banco de Occidente', 'Cenit',
]

# Keys of the news sites to query, in the order they are visited.
sources = [
    'larepublica',
    'semana',
    'eltiempo',
    'lasillavacia',
    'elcolombiano',
    'portafolio',
    'elespectador',
]

# Source key -> domain used in the `site:` filter of each search query.
sources_links = {
    'semana': 'semana.com',
    'larepublica': 'larepublica.co',
    'eltiempo': 'eltiempo.com',
    'lasillavacia': 'lasillavacia.com',
    'elcolombiano': 'elcolombiano.com',
    'portafolio': 'portafolio.co',
    'elespectador': 'elespectador.com',
}

def api_search(company, source, sources_links, sleep):
    """Search DuckDuckGo (via ``duckduckgo_search``) for a company on one news site.

    Parameters
    ----------
    company : str
        Search term placed in front of the ``site:`` filter.
    source : str
        Key into ``sources_links`` selecting the site to search.
    sources_links : dict
        Maps source keys to the domain used in the ``site:`` filter.
    sleep : float or None
        Seconds to pause after the request. ``None`` falls back to 3 seconds,
        the pause that used to be hard-coded regardless of this argument.

    Returns
    -------
    The value returned by ``DDGS().text`` for the query, unchanged. Up to 10
    results are requested with ``timelimit='w'`` and safe search off.
    """
    results = DDGS().text(
        keywords=f'{company} site:{sources_links[source]}',
        max_results=10,
        safesearch='off',
        timelimit='w',
    )
    # Pause between consecutive calls (presumably to stay under rate limits).
    # Honour the caller's value instead of silently ignoring it.
    pause_seconds = 3 if sleep is None else sleep
    time.sleep(pause_seconds)
    return results

def selenium_search(driver, company, source, sources_links):
    """Load a DuckDuckGo results page in ``driver`` and scrape its result links.

    Parameters
    ----------
    driver
        Object exposing ``get(url)`` and ``page_source`` (a Selenium WebDriver
        in this notebook).
    company : str
        Search term placed in front of the ``site:`` filter.
    source : str
        Key into ``sources_links`` selecting the site to search.
    sources_links : dict
        Maps source keys to the domain used in the ``site:`` filter.

    Returns
    -------
    pandas.Series
        De-duplicated ``href`` values taken from ``<a href="..." rel`` tags
        whose value contains ``https://www``. Empty (object dtype) when the
        page has no such links.
    """
    # URL-encode the whole query so names with spaces or reserved characters
    # (e.g. "Grupo Argos") produce a valid query string.
    query = quote_plus(f'{company} site:{sources_links[source]}')
    driver.get(f'https://duckduckgo.com/?q={query}&t=h_&df=w&ia=web')
    time.sleep(1) # give page a chance to fully load
    content = driver.page_source

    # Capture only the href value. [^"]* cannot cross a closing quote, so a
    # match can never start at one anchor and end at a later anchor's `rel`.
    hrefs = re.findall(r'<a href="([^"]*)" rel', content, flags=re.IGNORECASE)

    # Explicit object dtype keeps the `.str` accessor usable when nothing matched.
    links = pd.Series(hrefs, dtype='object').drop_duplicates()
    return links[links.str.contains(r'https://www.+', regex=True, case=False)]

def get_news_links(companies, sources, sources_links):
    """Scrape result links for every (source, company) pair and save them as JSON.

    Parameters
    ----------
    companies : list of str
        Search terms; one search is run per company and source.
    sources : list of str
        Keys into ``sources_links`` naming the sites to search.
    sources_links : dict
        Maps source keys to the domain used in the ``site:`` filter.

    Returns
    -------
    dict or None
        ``{source: {company: [url, ...]}}`` containing only pairs that
        produced at least one link. The same structure is written to
        ``engine_search/week_<ISO week number>.json``. If anything raises
        during scraping or saving, the traceback is printed and ``None``
        is returned.
    """
    search_results = {}
    # Start the browser first so a launch failure does not leave an open bar.
    driver = webdriver.Chrome()
    pbar = tqdm(total=len(companies)*len(sources))
    try:
        for source in sources:
            aux = {}
            for company in companies:
                pbar.update(1)
                pbar.set_description(f'source:{source}-company:{company}')
                results = selenium_search(driver, company, source, sources_links)
                if not len(results):
                    continue
                if isinstance(results, (list, tuple)):
                    # API-style results: a sequence of dicts carrying an 'href' key.
                    aux[company] = [hit['href'] for hit in results]
                else:
                    # selenium_search returns a pandas Series of URLs.
                    aux[company] = results.tolist()
            if aux:
                search_results[source] = aux

        week_number = datetime.datetime.now().isocalendar()[1]
        # Create the output folder if needed; otherwise a missing directory
        # would discard the whole scrape at the very last step.
        os.makedirs('engine_search', exist_ok=True)
        with open(f'engine_search/week_{week_number}.json', 'w') as f:
            json.dump(search_results, f)
        return search_results
    except Exception:
        traceback.print_exc()
        print('failed scraping...')
        return None
    finally:
        # Single cleanup path for both success and failure.
        pbar.close()
        driver.quit()

In [14]:
# Run one search per (source, company) pair; the result is None if scraping failed.
search_results = get_news_links(
    companies=companies,
    sources=sources,
    sources_links=sources_links,
)

source:elespectador-company:Cenit: 100%|██████████| 189/189 [07:41<00:00,  2.44s/it]             


In [15]:
# Show the scraped links, nested as source -> company -> list of URLs.
search_results

{'larepublica': {'Ecopetrol': ['https://www.larepublica.co/resultados-de-ecopetrol',
   'https://www.larepublica.co/economia/escandalos-contra-roa-ponen-el-riesgo-el-futuro-de-ecopetrol-3857482',
   'https://www.larepublica.co/economia/ecopetrol-entrego-ecoparque-solar-de-22-1-megavatios-construido-dentro-de-reficar-3839538',
   'https://www.larepublica.co/economia/grupo-ecopetrol-genero-mas-de-128-000-empleos-en-2023-3797261',
   'https://www.larepublica.co/economia/el-estado-colombiano-paga-mas-de-20-billones-anuales-en-subsidio-de-diesel-3877082',
   'https://www.larepublica.co/economia/petro-abre-puerta-para-que-ecopetrol-explore-gas-y-petroleo-en-frontera-con-venezuela-3838075',
   'https://www.larepublica.co/economia/fracking-en-la-filial-permian-de-ecopetrol-en-ee-uu-impulso-la-produccion-petrolera-3856800',
   'https://www.larepublica.co/empresas/grupo-epm-anuncio-que-sus-filiales-de-energia-se-acogeran-a-la-reduccion-tarifaria-3878808',
   'https://www.larepublica.co/gas-natur