In [46]:
import html2text, re, spacy, tqdm, numpy
from datetime import datetime
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import Html2TextTransformer
from transformers import AutoModel
from numpy.linalg import norm
import pandas as pd

async def do_webscraping(link):
    """Download one page and convert it to text.

    The page is loaded with ``AsyncHtmlLoader``, passed through
    ``Html2TextTransformer`` and the first transformed document is packed
    into a dictionary.

    Args:
        link: URL of the page to download.

    Returns:
        A dict with the keys ``'summary'`` (transformed page content),
        ``'title'`` (metadata title, ``''`` when the metadata has none),
        ``'metadata'`` and ``'clean_content'`` (the transformed content run
        through ``html2text.html2text`` once more); ``None`` when no
        document was produced or when any exception was raised.
    """
    try:
        fetched = AsyncHtmlLoader([link]).load()
        converted = Html2TextTransformer().transform_documents(fetched)

        # Guard clause: nothing usable came back.
        if converted is None or len(converted) == 0:
            return None

        first = converted[0]
        meta = first.metadata
        return {
            'summary': first.page_content,
            'title': meta.get('title', ''),
            'metadata': meta,
            'clean_content': html2text.html2text(first.page_content),
        }
    except Exception as e:
        # Best effort: report the failure and signal "no result" with None.
        print(f"An unexpected error occurred: {e}")
        return None

async def get_news(search_results):
    """Scrape every link in ``search_results`` sequentially.

    Each link is printed before it is fetched. Links for which
    ``do_webscraping`` returns ``None`` are left out of the result.

    Args:
        search_results: iterable of URLs.

    Returns:
        List with the non-``None`` results of ``do_webscraping``, in the
        order the links were given.
    """
    scraped = []
    for url in search_results:
        print(url)
        page = await do_webscraping(url)
        if page is None:
            continue
        scraped.append(page)
    return scraped

async def process_news(structured_response):
    """Split each scraped article into text fragments and keep the long ones.

    For every entry the title is removed from ``'clean_content'``, the
    remainder is split on ``'#'`` and each piece is stripped. A piece is
    kept only when it is longer than 100 characters and the mean length of
    its (stripped) lines is greater than 20.

    Args:
        structured_response: sequence of dicts with at least the keys
            ``'clean_content'`` and ``'title'``.

    Returns:
        Dict mapping each stripped title to a ``pandas.Series`` with the
        fragments that passed both filters. Entries that share a title
        overwrite one another.
    """
    def _mean_line_length(fragment):
        # Average number of characters per stripped line of one fragment.
        return pd.Series(fragment.split('\n')).str.strip().str.len().mean()

    news = {}
    for article in structured_response:
        title = article['title']
        body = article['clean_content'].replace(title, '')

        fragments = pd.Series(re.split('#', body)).str.strip()
        is_long = fragments.str.len() > 100
        fragments = fragments[is_long].reset_index(drop=True)

        line_means = fragments.apply(_mean_line_length)
        news[title.strip()] = fragments[line_means > 20]
    return news

def get_dates(news):
    """Extract one dash-separated date string per article.

    For each Series in ``news`` the first fragment containing a
    ``digits<sep>digits<sep>digits`` pattern (1-4, 1-2 and 1-4 digits, any
    single non-word separator) supplies the date. Its three parts are joined
    with ``'-'``; when the part lengths are not already in non-increasing
    order (e.g. ``01/07/2024``) the parts are reversed first so the longest
    part comes first. Parts are not zero-padded. When no fragment matches,
    today's date formatted as ``YYYY-MM-DD`` is used.

    Args:
        news: dict whose values are ``pandas.Series`` of strings.

    Returns:
        List of date strings, one per value of ``news``, in iteration order.
    """
    # Raw string: '\W' in a normal literal is an invalid escape sequence.
    pattern = r'([0-9]{1,4}\W[0-9]{1,2}\W[0-9]{1,4})'

    dates = []
    for value in news.values():
        # A single extract pass; fragments without a match (or missing
        # fragments) become NaN and are dropped.
        found = value.str.extract(pattern)[0].dropna()
        if len(found):
            parts = re.split(r'\W', found.iloc[0])
            lengths = [len(part) for part in parts]
            if lengths != sorted(lengths, reverse=True):
                # Longest part is not first: flip the order.
                parts.reverse()
            date_ = '-'.join(parts)
        else:
            date_ = datetime.now().strftime('%Y-%m-%d')
        dates.append(date_)
    return dates

In [30]:

search_results = [
  r'https://caracol.com.co/2024/07/01/ante-deficit-ecopetrol-estaria-negociando-para-importar-gas-natural-licuado-desde-eeuu/',
  r'https://www.elcolombiano.com/negocios/fracking-estados-unidos-ecopetrol-negocia-gas-natural-con-estados-unidos-BG24887475',
  r'https://www.lafm.com.co/economia/ecopetrol-esta-en-negociaciones-para-importar-gas-a-estados-unidos',
  r'https://www.pulzo.com/economia/ecopetrol-haria-negocio-con-estados-unidos-crisis-gas-colombia-PP3756548A',
  r'https://www.infobae.com/colombia/2024/06/29/remezon-en-ecopetrol-se-avecinan-cambios-en-la-alta-gerencia-a-partir-del-1-de-julio/',
  r'https://www.eltiempo.com/economia/empresas/los-cambios-en-la-alta-gerencia-de-ecopetrol-que-empezaran-a-regir-a-partir-del-1-de-julio-3357513',
  r'https://www.valoraanalitik.com/ecopetrol-anuncia-nuevos-cambios-en-varias-vicepresidencias/',
  r'https://www.pulzo.com/economia/ecopetrol-anuncio-cuatro-nuevos-cambios-con-cargos-importantes-compania-PP3755005A'
]

structured_response = await get_news(search_results)
news = await process_news(structured_response)

https://caracol.com.co/2024/07/01/ante-deficit-ecopetrol-estaria-negociando-para-importar-gas-natural-licuado-desde-eeuu/


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.55it/s]


https://www.elcolombiano.com/negocios/fracking-estados-unidos-ecopetrol-negocia-gas-natural-con-estados-unidos-BG24887475


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  1.50it/s]


https://www.lafm.com.co/economia/ecopetrol-esta-en-negociaciones-para-importar-gas-a-estados-unidos


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  3.02it/s]


https://www.pulzo.com/economia/ecopetrol-haria-negocio-con-estados-unidos-crisis-gas-colombia-PP3756548A


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.01it/s]


https://www.infobae.com/colombia/2024/06/29/remezon-en-ecopetrol-se-avecinan-cambios-en-la-alta-gerencia-a-partir-del-1-de-julio/


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  1.28it/s]


https://www.eltiempo.com/economia/empresas/los-cambios-en-la-alta-gerencia-de-ecopetrol-que-empezaran-a-regir-a-partir-del-1-de-julio-3357513


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  1.87it/s]


https://www.valoraanalitik.com/ecopetrol-anuncia-nuevos-cambios-en-varias-vicepresidencias/


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.61it/s]


https://www.pulzo.com/economia/ecopetrol-anuncio-cuatro-nuevos-cambios-con-cargos-importantes-compania-PP3755005A


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.07it/s]


In [49]:
class scrap_news:
    """Keep the news fragments that are similar to their article title.

    Similarity is computed with spaCy's ``Doc.similarity`` on the
    ``es_core_news_md`` pipeline. After ``scrap`` has run, ``self.similarity``
    maps each title to its filtered frame and ``self.content`` holds one
    cleaned string per article.
    """

    # Loaded once, when the class is defined, and shared by every instance.
    nlp = spacy.load('es_core_news_md')

    def __init__(self, company=None, source=None):
        """Store the company and source labels used by ``process_results``.

        Both default to ``None`` so the class can also be instantiated
        without arguments.
        """
        self.company, self.source = company, source

    def cos_sim(self, text1, text2):
        """Return the spaCy similarity between two strings."""
        tokens1 = self.nlp(text1)
        tokens2 = self.nlp(text2)
        return tokens1.similarity(tokens2)

    def get_similarity(self, df, title, cutoff=0.4):
        """Score every fragment against ``title`` and drop the low scores.

        Args:
            df: ``pandas.Series`` of text fragments.
            title: article title the fragments are compared with.
            cutoff: fragments must score strictly above this value.

        Returns:
            DataFrame with the fragment column plus a ``SIMILARITY`` column,
            restricted to rows whose similarity exceeds ``cutoff``.
        """
        # The title is the same for every fragment: parse it only once.
        title_doc = self.nlp(title)
        scores = [self.nlp(text).similarity(title_doc) for text in df.tolist()]
        scored = df.to_frame().assign(SIMILARITY=scores)
        return scored[scored.SIMILARITY > cutoff]

    def scrap(self, news):
        """Filter and clean the fragments of every article in ``news``.

        Args:
            news: dict mapping a title to a ``pandas.Series`` of fragments.

        Populates ``self.similarity`` (title -> filtered frame) and
        ``self.content`` (one string per article, fragments joined by four
        newlines after newlines/asterisks are replaced by spaces and runs of
        spaces are collapsed).
        """
        self.similarity, self.content = {}, []
        for key, value in tqdm.tqdm(news.items()):
            self.similarity[key] = self.get_similarity(value, key)
            # First column holds the text whatever the Series was named.
            value = self.similarity[key].iloc[:, 0]
            value = (
                value.str.replace(r'[\n*]', ' ', regex=True)
                .str.replace(' +', ' ', regex=True)
                .str.strip()
            )
            self.content.append('\n\n\n\n'.join(value.tolist()))

    def process_results(self, dates):
        """Build one document dict per scraped article.

        Titles and links are read from the module-level
        ``structured_response`` list by position, so it must be defined and
        aligned with ``self.content`` and ``dates``.

        Args:
            dates: sequence of date strings, one per entry of ``self.content``.

        Returns:
            List of dicts with the keys ``'content'`` and ``'metadata'``
            (``title``, ``source``, ``link``, ``date``, ``company``).
        """
        news_docs = []
        for i, content in enumerate(self.content):
            news_docs.append({
                'content': content,
                'metadata': {
                    'title': structured_response[i]['title'],
                    'source': self.source,
                    'link': structured_response[i]['metadata']['source'],
                    'date': dates[i],
                    'company': self.company
                }
            })
        return news_docs

In [50]:
# Build the scraper, filter/clean the fragments of every article in `news`,
# then extract one date string per article.
# NOTE: `scrap_news()` is called with no arguments, so this cell only runs
# against a `scrap_news` definition whose constructor has no required parameters.
scraper = scrap_news()
scraper.scrap(news)
dates = get_dates(news)

100%|██████████| 8/8 [00:03<00:00,  2.13it/s]


In [51]:
# Show the cleaned text collected by `scraper.scrap` (one joined string per article).
scraper.content

['Colombia busca mitigar la escasez de Gas con un nuevo terminal de importación de GNL Cortesía: Ecopetrol facebook twitter linkedIn whatsapp Juan Manuel CorreaJuanMc9617 01/07/2024 - 11:33 h COT Según el medio estadounidense Bloomberg, especializado en economía a nivel internacional, Ecopetrol está en conversaciones con proveedores de gas natural licuado (GNL) debido a la disminución de la producción de gas doméstico, según fuentes familiarizadas c on el asunto que prefirieron no ser identificadas por tratarse de información confidencial. Según fuentes citadas por Bloomberg, los funcionarios de Ecopetrol están particularmente interesados en el GNL proveniente de Estados Unidos . Esta medida se produce mientras Colombia explora la construcción de su segundo terminal de importación de GNL para prevenir una escasez d e gas pronosticada para los próximos dos años. Aunque Colombia representa una fracción minúscula de las importaciones globales de GNL, su proximidad a Estados Unidos —el may

In [None]:
company = 'Ecopetrol'

# One record per scraped article: cleaned text plus title, source, date and
# company metadata. `structured_response` and `dates` are read by position.
news_docs = [
    {
        'content': content,
        'metadata': {
            'title': structured_response[idx]['title'],
            'source': structured_response[idx]['metadata']['source'],
            'date': dates[idx],
            'company': company,
        },
    }
    for idx, content in enumerate(scraper.content)
]

In [None]:
class scrap_news:
    """Keep the news fragments whose embedding is similar to the title's.

    Embeddings come from the ``encode`` method of the
    ``jinaai/jina-embeddings-v2-base-es`` model. After ``scrap`` has run,
    ``self.similarity`` maps each title to its filtered frame and
    ``self.content`` holds one cleaned string per article.
    """

    # Loaded once, when the class is defined, and shared by every instance.
    # trust_remote_code is needed to use the encode method
    model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-es', trust_remote_code=True)

    def cos_sim(self, a, b):
        """Return the cosine similarity of the two vectors ``a`` and ``b``."""
        return (a @ b.T) / (norm(a)*norm(b))

    def get_similarity(self, df, title, cutoff=0.4):
        """Score every fragment against ``title`` and drop the low scores.

        Args:
            df: ``pandas.Series`` of text fragments.
            title: article title the fragments are compared with.
            cutoff: fragments must score strictly above this value.

        Returns:
            DataFrame with the fragment column plus a ``SIMILARITY`` column,
            restricted to rows whose similarity exceeds ``cutoff``.
        """
        if df.empty:
            # Nothing to score: avoid calling the encoder with an empty batch.
            return df.to_frame().assign(SIMILARITY=[])

        text_vectors = self.model.encode(df.tolist())
        title_vector = self.model.encode(title)
        scores = [self.cos_sim(vector, title_vector) for vector in text_vectors]
        scored = df.to_frame().assign(SIMILARITY=scores)
        return scored[scored.SIMILARITY > cutoff]

    def scrap(self, news):
        """Filter and clean the fragments of every article in ``news``.

        Args:
            news: dict mapping a title to a ``pandas.Series`` of fragments.

        Populates ``self.similarity`` (title -> filtered frame) and
        ``self.content`` (one string per article, fragments joined by four
        newlines after newlines/asterisks are replaced by spaces and runs of
        spaces are collapsed).
        """
        self.similarity, self.content = {}, []
        for key, value in tqdm.tqdm(news.items()):
            self.similarity[key] = self.get_similarity(value, key)
            # First column holds the text whatever the Series was named.
            value = self.similarity[key].iloc[:, 0]
            value = (
                value.str.replace(r'[\n*]', ' ', regex=True)
                .str.replace(' +', ' ', regex=True)
                .str.strip()
            )
            self.content.append('\n\n\n\n'.join(value.tolist()))