In [30]:
import html2text, re, time, tqdm
from datetime import datetime
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import Html2TextTransformer
from transformers import AutoModel
from numpy.linalg import norm
import pandas as pd

async def do_webscraping(link):
    try:
        urls = [link]
        loader = AsyncHtmlLoader(urls)
        docs = loader.load()

        html2text_transformer = Html2TextTransformer()
        docs_transformed = html2text_transformer.transform_documents(docs)

        if docs_transformed != None and len(docs_transformed) > 0:
            metadata = docs_transformed[0].metadata
            title = metadata.get('title', '')
            return {
                'summary': docs_transformed[0].page_content,
                'title': title,
                'metadata': metadata,
                'clean_content': html2text.html2text(docs_transformed[0].page_content)
            }
        else:
            return None

    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

In [5]:
structured_response = []
search_results = [
  r'https://caracol.com.co/2024/07/01/ante-deficit-ecopetrol-estaria-negociando-para-importar-gas-natural-licuado-desde-eeuu/',
  r'https://www.elcolombiano.com/negocios/fracking-estados-unidos-ecopetrol-negocia-gas-natural-con-estados-unidos-BG24887475',
  r'https://www.lafm.com.co/economia/ecopetrol-esta-en-negociaciones-para-importar-gas-a-estados-unidos',
  r'https://www.pulzo.com/economia/ecopetrol-haria-negocio-con-estados-unidos-crisis-gas-colombia-PP3756548A',
  r'https://www.infobae.com/colombia/2024/06/29/remezon-en-ecopetrol-se-avecinan-cambios-en-la-alta-gerencia-a-partir-del-1-de-julio/',
  r'https://www.eltiempo.com/economia/empresas/los-cambios-en-la-alta-gerencia-de-ecopetrol-que-empezaran-a-regir-a-partir-del-1-de-julio-3357513',
  r'https://www.valoraanalitik.com/ecopetrol-anuncia-nuevos-cambios-en-varias-vicepresidencias/',
  r'https://www.pulzo.com/economia/ecopetrol-anuncio-cuatro-nuevos-cambios-con-cargos-importantes-compania-PP3755005A'
]
for link in search_results:
  print(link)
  response = await do_webscraping(link)
  if response != None:
    structured_response.append(response)

https://caracol.com.co/2024/07/01/ante-deficit-ecopetrol-estaria-negociando-para-importar-gas-natural-licuado-desde-eeuu/


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  1.64it/s]


https://www.elcolombiano.com/negocios/fracking-estados-unidos-ecopetrol-negocia-gas-natural-con-estados-unidos-BG24887475


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  1.42it/s]


https://www.lafm.com.co/economia/ecopetrol-esta-en-negociaciones-para-importar-gas-a-estados-unidos


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  1.30it/s]


https://www.pulzo.com/economia/ecopetrol-haria-negocio-con-estados-unidos-crisis-gas-colombia-PP3756548A


Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.08s/it]


https://www.infobae.com/colombia/2024/06/29/remezon-en-ecopetrol-se-avecinan-cambios-en-la-alta-gerencia-a-partir-del-1-de-julio/


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  1.62it/s]


https://www.eltiempo.com/economia/empresas/los-cambios-en-la-alta-gerencia-de-ecopetrol-que-empezaran-a-regir-a-partir-del-1-de-julio-3357513


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  1.68it/s]


https://www.valoraanalitik.com/ecopetrol-anuncia-nuevos-cambios-en-varias-vicepresidencias/


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.40it/s]


https://www.pulzo.com/economia/ecopetrol-anuncio-cuatro-nuevos-cambios-con-cargos-importantes-compania-PP3755005A


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  1.07it/s]


In [6]:
structured_response[1]

{'summary': 'x\n\n  * bookmarkGuardados\n  * graphic_eqPódcast\n  * play_arrowVideos\n  * photo_cameraReportajes gráficos\n  * account_circleSuscríbete\n\npico y placa\n\nPico y Placa Medellín\n\n#### Pico y Placa Medellín\n\n## viernes\n\n7 y 9\n\n7 y 9\n\npico y placa\n\nPico y Placa Medellín\n\n#### Pico y Placa Medellín\n\n## jueves\n\n3 y 6\n\n3 y 6\n\npico y placa\n\nPico y Placa Medellín\n\n#### Pico y Placa Medellín\n\n## miercoles\n\n2 y 0\n\n2 y 0\n\npico y placa\n\nPico y Placa Medellín\n\n#### Pico y Placa Medellín\n\n## martes\n\n1 y 4\n\n1 y 4\n\npico y placa\n\nPico y Placa Medellín\n\n#### Pico y Placa Medellín\n\n## domingo\n\nno\n\nno\n\npico y placa\n\nPico y Placa Medellín\n\n#### Pico y Placa Medellín\n\n## sabado\n\nno\n\nno\n\npico y placa\n\nPico y Placa Medellín\n\n#### Pico y Placa Medellín\n\n## lunes\n\n5 y 8\n\n5 y 8\n\n  * search\n  * account_circleMi Cuenta\n  * bookmarkGuardados\n  * descriptionPeriódico impreso\n  * graphic_eqPódcast\n  * play_arrowVide

In [27]:
news = {}
for i, doc in enumerate(structured_response):
    doc = doc['clean_content'].replace(doc['title'], '')
    texts = pd.Series(re.split('#', doc)).str.strip()
    texts = texts[texts.str.len()>100].reset_index(drop=True)
    lens = texts.apply(lambda text: pd.Series(text.split('\n')).str.strip().str.len().mean())
    texts = texts[lens>20]
    news[structured_response[i]['title'].strip()] = texts

In [184]:
def get_dates(news):
    dates = []
    for value in news.values():
        date = value[value.str.contains('[0-9]{1,4}\W[0-9]{1,2}\W[0-9]{1,4}', regex=True)]
        date = date.str.extract('([0-9]{1,4}\W[0-9]{1,2}\W[0-9]{1,4})')
        date = date[0].str.split('\W', regex=True, expand=True)
        if len(date):
            lens = date.iloc[0].str.len()
            if lens.tolist()==lens.sort_values(ascending=False).tolist():
                date_ = '-'.join(date.iloc[0].tolist())
            else:
                date_ = '-'.join(date.iloc[0][::-1].tolist())
        else:
            now = datetime.now()
            date_ = f'{now.year}-{0 if now.month<10 else ""}{now.month}-{0 if now.day<10 else ""}{now.day}'
        dates.append(date_)
    return dates

class scrap_news:
    model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-es', trust_remote_code=True) # trust_remote_code is needed to use the encode method

    def cos_sim(self, a, b):
        return (a @ b.T) / (norm(a)*norm(b))

    def get_similarity(self, df, title, cutoff=0.4):
        texts = self.model.encode(df.tolist())
        title = self.model.encode(title)
        embedds = []
        for text in texts:
            embedds.append(self.cos_sim(text, title))
        df = df.to_frame().assign(SIMILARITY=embedds)
        df = df[df.SIMILARITY>cutoff]
        return df

    def scrap(self, news):
        self.similarity, self.content = {}, []
        for key, value in tqdm.tqdm(news.items()):
            self.similarity[key] = self.get_similarity(value, key)
            value = self.similarity[key][0]
            value = value.str.replace('(\n)|(\*)', ' ', regex=True).str.replace(' +', ' ', regex=True).str.strip()
            self.content.append('\n\n\n\n'.join(value.tolist()))

In [185]:
scraper = scrap_news()
scraper.scrap(news)
dates = get_dates(news)

  0%|          | 0/8 [00:00<?, ?it/s]

 12%|█▎        | 1/8 [01:29<10:26, 89.48s/it]


RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 14828660640 bytes.

In [None]:
company = 'Ecopetrol'
news_docs = []
for i in range(len(scraper.content)):
    doc = {
        'content': scraper.content[i],
        'metadata': {
            'title': structured_response[i]['title'],
            'source': structured_response[i]['metadata']['source'],
            'date': dates[i],
            'company': company
        }
    }
    news_docs.append(doc)

In [2]:
import time
from tqdm import tqdm
from duckduckgo_search import DDGS

companies  = [
    "Ecopetrol", "Enel", "Grupo Argos", "Terpel", "Vanti", "Promigas", "Bancolombia", "BBVA", "Grupo AVAL", "Davivienda", "Credibanco",
    "Grupo Sura", "Itau", "Nequi", "Grupo Exito", "Cencosud", "Tiendas D1", "Alkosto", "Falabella", "Sodimac", "Caracol", "Claro", "ETB",
    'GeoPark', 'Brilla', 'Banco de Occidente', 'Cenit'
    ]
sources = ['larepublica', 'semana', 'eltiempo', 'lasillavacia', 'elcolombiano', 'portafolio', 'elespectador']
sources_links = {
    'semana': 'semana.com', 'larepublica': 'larepublica.co', 'eltiempo': 'eltiempo.com', 'lasillavacia': 'lasillavacia.com',
    'elcolombiano': 'elcolombiano.com', 'portafolio': 'portafolio.co', 'elespectador': 'elespectador.com'
    }

search_results = {}
pbar = tqdm(total=len(companies)*len(sources))

for source in sources:
    aux = {}
    for company in companies:
        pbar.update(1)
        pbar.set_description(f'source:{source}-company:{company}')
        time.sleep(3)
        results = DDGS().text(
            keywords=f'{company} site:{sources_links[source]}',
            max_results=10,
            safesearch='off',
            timelimit='w'
            )
        if len(results):
            aux[company] = [json_['href'] for json_ in results]
    if len(aux):
        search_results[source] = aux
pbar.close()
search_results

source:larepublica-company:Ecopetrol:   7%|▋         | 1/15 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [17]:
from selenium import webdriver
import time

driver = webdriver.Chrome()
driver.get(f'https://duckduckgo.com/?q={company}+site%3A{sources_links[source]}&t=h_&df=w&ia=web')
time.sleep(3) # give page a chance to fully load
content = driver.page_source
driver.quit()

In [34]:
import html2text
string = html2text.html2text(content)

In [40]:
import re
re.findall(f'www.{sources_links[source]}.+\n', string, flags=re.IGNORECASE)

['www.larepublica.co.ico)](/?q=Ecopetrol%20site%3Alarepublica.co&t=h_ "Search domain larepublica.co")[https://www.larepublica.co › economia › ecopetrol-quiere-comprar-activos-de-canacol-3856926](https://www.larepublica.co/economia/ecopetrol-quiere-comprar-activos-de-canacol-3856926)\n',
 'www.larepublica.co/economia/ecopetrol-quiere-comprar-activos-de-\n',
 'www.larepublica.co.ico)](/?q=Ecopetrol%20site%3Alarepublica.co&t=h_ "Search domain larepublica.co")[https://www.larepublica.co › economia › el-estado-colombiano-paga-mas-de-20-billones-anuales-en-subsidio-de-diesel-3877082](https://www.larepublica.co/economia/el-estado-colombiano-paga-mas-de-20-billones-anuales-en-subsidio-de-diesel-3877082)\n',
 'www.larepublica.co/economia/el-estado-colombiano-paga-mas-\n',
 'www.larepublica.co.ico)](/?q=Ecopetrol%20site%3Alarepublica.co&t=h_ "Search domain larepublica.co")[https://www.larepublica.co › gas-natural](https://www.larepublica.co/gas-natural)\n',
 'www.larepublica.co/gas-natural)\n',


In [63]:
content
links = pd.Series(re.findall(f'(<a href=".*?" rel)', content, flags=re.IGNORECASE))
links = links.str.replace('<a href=', '').str.replace('"', '').str.replace(' rel', '').drop_duplicates()

In [77]:
links[links.str.contains(f'www.{sources_links[source]}', case=False)].iloc[2]

'https://www.larepublica.co/gas-natural'