In [15]:
import html
import json
import logging
import os

import requests
from bs4 import BeautifulSoup
# Route INFO-and-above log records to bing_log1.txt; filemode "w" truncates the file on each run.
logging.basicConfig(
    filename="bing_log1.txt",
    filemode="w",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

In [16]:
def bing_news_crawler(search_term):
    """Query the Bing News Search API for ``search_term`` and return result records.

    The request asks for items with ``freshness="Day"`` and HTML text
    decorations; markup is stripped from each title and HTML entities are
    unescaped in each description before the record is stored.

    The subscription key is read from the ``BING_SEARCH_V7_SUBSCRIPTION_KEY``
    environment variable so the secret is not stored in the notebook.

    Args:
        search_term: Query string sent as ``q``; also stored in every record
            under the ``"client"`` key.

    Returns:
        list[dict]: One dict per result with the keys ``source``, ``client``,
        ``title``, ``date``, ``url`` and ``description``. An empty list is
        returned (and the problem is logged) when the key is not configured,
        the HTTP request fails, or the response body is not valid JSON.
    """
    search_url = "https://api.bing.microsoft.com/v7.0/news/search"
    subscription_key = os.environ.get("BING_SEARCH_V7_SUBSCRIPTION_KEY")
    if not subscription_key:
        logger.error(
            f"Error fetching data for {search_term}: "
            "BING_SEARCH_V7_SUBSCRIPTION_KEY is not set"
        )
        return []

    headers = {"Ocp-Apim-Subscription-Key": subscription_key}
    params = {"q": search_term, "freshness": "Day", "textDecorations": True, "textFormat": "HTML"}
    try:
        # Without a timeout a stalled connection would block the notebook indefinitely.
        response = requests.get(search_url, headers=headers, params=params, timeout=30)
        response.raise_for_status()
        search_results = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException subclass.
        logger.error(f"Error fetching data for {search_term}: {e}")
        return []

    url_list = []
    for item in search_results.get("value", []):
        # textDecorations/textFormat=HTML can wrap matched terms in tags such as <b>;
        # keep only the visible text of the title.
        name = BeautifulSoup(item.get("name", ""), 'html.parser').get_text()
        url_list.append({
            "source": "Bing News API",
            "client": search_term,
            "title": name,
            "date": item.get("datePublished", ""),
            "url": item.get("url", ""),
            "description": html.unescape(item.get("description", "")),
        })
    return url_list

In [18]:
from openai import AzureOpenAI
# Azure OpenAI connection settings for the chat-completion client used by chatResponses.
api_type = "azure"
base_url = "https://qa.gai.cencora.com/aoai"
api_version = "2024-02-15-preview"
# Read the key from the environment so the secret never lives in the notebook;
# a missing variable fails fast here with a KeyError instead of at the first API call.
api_key = os.environ["AZURE_OPENAI_API_KEY"]
deployment_id = "gpt-4o"
model_name = 'gpt-4o'
oai_client = AzureOpenAI(
    base_url=f"{base_url}/openai/deployments/{deployment_id}",
    api_key=api_key,
    api_version=api_version,
)

def chatResponses(messages):
    """Send ``messages`` through the module-level ``oai_client`` and return the reply text.

    Args:
        messages: List of chat message dicts (``role`` / ``content``) passed
            unchanged to ``chat.completions.create``.

    Returns:
        The ``message.content`` of the first choice in the completion.
    """
    completion = oai_client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=2000,
        temperature=0.1,
        seed=10,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

from langchain import PromptTemplate
# Prompt asking the model to return the article text and a summary as a JSON object.
# `{content}` is the only placeholder; it is filled in via `prompt.format(content=...)`.
template = """Your objective is to extract the news article and news summary from beautiful soup paragraphs content

The content is  
-----
{content}
-----
The output should be structured in JSON format with keys 'article' and 'summary', provide with no extra description.:

"""
# TODO: consider also asking the model for the news relevance in this prompt.
prompt = PromptTemplate(template=template, input_variables=["content"])

In [29]:
from bs4 import BeautifulSoup
import ast
# Browser-like User-Agent header sent with the article-page requests (`requests.get(..., headers=headers)`).
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/108.0.0.0 Safari/537.36"
    ),
}

def _parse_model_reply(reply):
    """Decode the model's reply text into a Python object, tolerating Markdown code fences."""
    cleaned = reply.replace("```json", "").replace("```", "").strip()
    try:
        # The prompt requests JSON, so parse it as JSON: literal_eval rejects
        # true/false/null and JSON-only escapes.
        return json.loads(cleaned)
    except json.JSONDecodeError:
        # Fall back to the original parser so Python-literal style replies
        # (e.g. single-quoted keys) are still accepted.
        return ast.literal_eval(cleaned)


def fetch_data(url):
    """Download ``url`` and ask the chat model for the article text and a summary.

    The page's ``<p>`` elements are inserted into the module-level ``prompt``
    and sent through ``chatResponses``; the reply is expected to be an object
    with the keys ``'article'`` and ``'summary'``.

    Args:
        url: Address of the news page to download.

    Returns:
        tuple: ``(article, summary)`` on success. ``(None, None)`` when the
        response status is not 200, the page has no ``<p>`` elements, or any
        exception occurs (network error, unparsable reply, missing key);
        every failure is logged rather than raised.
    """
    try:
        logger.info(f"Fetching data from {url}")
        # A timeout keeps one unresponsive site from blocking its worker forever.
        response = requests.get(url, headers=headers, timeout=30)
        if response.status_code != 200:
            logger.warning(f"Failed to fetch data from {url}. Status code: {response.status_code}")
            return None, None

        soup = BeautifulSoup(response.text, 'html.parser')
        paragraphs = soup.find_all('p')
        if not paragraphs:
            # Log the URL (like every other message here) instead of the whole page text.
            logger.warning(f"No paragraphs found in data from {url}")
            return None, None

        system_content = "You are an AI assistant that helps in extracting news article and news summary from beautiful soup paragraphs content parsed from HTML document."
        messages = [
            {"role": "system", "content": system_content},
            {"role": "user", "content": prompt.format(content=paragraphs)},
        ]
        contents = _parse_model_reply(chatResponses(messages))
        article, summary = contents['article'], contents['summary']
        logger.info(f"Data fetched successfully from {url}")
        return article, summary
    except Exception as e:
        # Deliberately best-effort: one bad page must not abort the whole batch.
        logger.error(f"Failed to fetch data from {url}. Error: {str(e)}")
        return None, None

In [20]:
# Organisations to search for; each name is passed verbatim to bing_news_crawler.
clients = [
    'HCA Healthcare',
    'Trinity Health',
    'University of California Health',
    'Tenet Healthcare',
]

# Flatten every client's result records into one list.
full_urls = []
for client in clients:
    logger.info(f"Extracting news for {client}")
    full_urls += bing_news_crawler(client)

In [21]:
import re
# Process the results
def process_url(url_data):
    """Enrich one result record in place with the fetched article and summary.

    Args:
        url_data: Record dict containing a ``'url'`` key.

    Returns:
        The same dict, with ``'summary'`` and ``'news'`` set from
        ``fetch_data`` (both are ``None`` when fetching failed).
    """
    article, summary = fetch_data(url_data['url'])
    url_data.update(summary=summary, news=article)
    # TODO: optionally add a 'relevant' flag, e.g. a case-insensitive search
    # for the client name in the article text.
    return url_data

from concurrent.futures import ThreadPoolExecutor
# Futures for the per-record processing jobs, in the same order as full_urls.
results = []

# Run process_url on up to 10 records concurrently; leaving the `with` block
# waits for all submitted jobs to finish.
with ThreadPoolExecutor(max_workers=10) as executor:
    results.extend(executor.submit(process_url, url_data) for url_data in full_urls)


In [22]:
# Unwrap each future into its enriched record, preserving submission order.
all_data = [future.result() for future in results]

In [23]:
# Display the collected records; each dict now also carries 'summary' and 'news'.
all_data

[{'source': 'Bing News API',
  'client': 'HCA Healthcare',
  'title': "El Paso native new CEO at HCA Healthcare's Las Palmas Medical Center in East El Paso",
  'date': '2024-06-12T11:30:44.0000000Z',
  'url': 'https://www.msn.com/en-us/money/companies/el-paso-native-new-ceo-at-hca-healthcare-s-las-palmas-medical-center-in-east-el-paso/ar-BB1o5oNE',
  'description': "El Paso native Alejandro Romero replaces retired Don Karl as CEO of HCA Healthcare's 327-bed Las Palmas Medical Center in West El Paso.",
  'summary': None,
  'news': None},
 {'source': 'Bing News API',
  'client': 'HCA Healthcare',
  'title': 'HCA Houston Healthcare Northwest CEO Scott Davis announces departure from hospital',
  'date': '2024-06-12T13:00:00.0000000Z',
  'url': 'https://communityimpact.com/houston/spring-klein/health-care/2024/06/12/hca-houston-healthcare-northwest-ceo-scott-davis-announces-departure-from-hospital/',
  'description': 'Scott Davis, who has served as the CEO of HCA Houston Healthcare Northwes

In [13]:
import json
# NOTE(review): absolute, user-specific path — consider a location relative to a configurable data directory.
file_path = r"C:\Users\2329791\OneDrive - Cognizant\Desktop\Work\Cencora\cencora_poc_code\bing_data.json"

# Serialise the enriched records as pretty-printed JSON (4-space indent).
with open(file_path, "w") as out_file:
    out_file.write(json.dumps(all_data, indent=4))

In [70]:
# Spot-check: run the fetch/extract pipeline on a single record (index 6 of full_urls).
url= full_urls[6]['url']   
result= fetch_data(url)

In [71]:
# Show the (article, summary) tuple returned by fetch_data.
result

('The University of Houston Andy and Barbara Gessner College of Nursing has earned prestigious national\u202faccreditation from the Society of Simulation in Healthcare. The College is the first and only college in the greater-Houston area to achieve this accreditation. The Methodist Sugar Land Hospital Nursing Simulation Center at UH at Sugar Land and the HCA Houston Healthcare Nursing Simulation Center at UH at Katy feature state-of-the-art mannequins and tools for multi-disciplinary, hospital-based training in a realistic environment. Through this dynamic learning, students practice, observe and are evaluated in various environments such as maternity, hospice, emergency room and bedside care. Beena Joseph, clinical assistant professor/clinical learning coordinator at the Gessner College of Nursing, led the effort to gain accreditation at the two sites. In a recent perioperative nursing course at Gessner College, it was easy to see how the college earned its accreditation. The course 

In [18]:
# Manual debugging: download one article page directly instead of going through fetch_data.
url= full_urls[5]['url']
response = requests.get(url, headers=headers)


In [25]:
# Inspect the result records collected by the crawl.
full_urls

[{'source': 'Bing News API',
  'client': 'HCA Healthcare',
  'title': '<b>HCA</b> shares continue climb to new highs',
  'date': '2024-06-06T13:57:00.0000000Z',
  'url': 'https://www.beckershospitalreview.com/finance/hca-shares-continue-climb-to-new-highs.html',
  'description': 'Nashville, Tenn.-based HCA operates 186 hospitals and 49,588 total licensed beds. In January, its stock price hit an all-time high, soaring past $300 per share following an 8% increase in revenue in 2023. The stock has maintained momentum, hitting $336.91 on June 5 — a year-over-year increase of approximately 24%.'},
 {'source': 'Bing News API',
  'client': 'HCA Healthcare',
  'title': 'CEO named for 2 Oregon hospitals',
  'date': '2024-06-06T17:48:00.0000000Z',
  'url': 'https://www.beckershospitalreview.com/hospital-executive-moves/ceo-named-for-2-oregon-hospitals.html',
  'description': 'Brandon Mencini was named CEO of Rogue Regional Medical Center in Medford, Ore., and Ashland (Ore.) Community Hospital. M

In [29]:
# Parse the downloaded page and collect every <p> element.
soup = BeautifulSoup(response.text, 'html.parser')
paragraphs = soup.find_all('p')

In [67]:
# Manual replay of the prompt-building steps from fetch_data, keeping the raw
# model reply in `reply` so it can be inspected before parsing.
system_content = "You are an AI assistant that helps in extracting news article and news summary from beautiful soup paragraphs content parsed from HTML document."
messages = [{"role": "system", "content": system_content}]
# Fill the template's {content} placeholder with the <p> elements collected earlier.
query = prompt.format(content=paragraphs)
messages.append({"role": "user", "content": query})
reply = chatResponses(messages)
#contents = ast.literal_eval(reply)

In [68]:
# Display the raw model reply; the recorded output shows it wrapped in ```json fences.
reply

'```json\n{\n  "article": "Ina Susan Ryder, 92, of Homosassa, FL, passed away on Thursday, March 30, 2023 at HCA Florida Citrus Hospital in Inverness, FL. A native of Waukegan, IL, she was born May 11, 1930, the only child to Henry and Mary Perkio. Ina moved to Homosassa in 1989 from her native Waukegan and was a faithful and devoted member of First United Methodist Church of Homosassa. Mrs. Ryder pursued her love for the game of golf through her membership in the Ladies Golf Association of Sugarmill Woods, where she served as their Past President and Past Treasurer. Ina also enjoyed dancing and the game of bridge during her life. In addition to her parents, she was preceded in death by her first husband of 49 years, Richard F. “Dick” Anderson in 1998, and a son, Robert Mark Anderson. Ina is survived by her husband of 23 years, Joe Ryder, Homosassa, FL; daughter Linda S. Klonowski (husband David), Woodridge, IL and son Richard Michael Anderson (wife Gloria), Homosassa, FL; stepchildren

In [69]:
# Strip the Markdown code fences, then evaluate the remaining text as a Python literal.
ast.literal_eval(reply.replace("```json", "").replace("```", ""))

{'article': 'Ina Susan Ryder, 92, of Homosassa, FL, passed away on Thursday, March 30, 2023 at HCA Florida Citrus Hospital in Inverness, FL. A native of Waukegan, IL, she was born May 11, 1930, the only child to Henry and Mary Perkio. Ina moved to Homosassa in 1989 from her native Waukegan and was a faithful and devoted member of First United Methodist Church of Homosassa. Mrs. Ryder pursued her love for the game of golf through her membership in the Ladies Golf Association of Sugarmill Woods, where she served as their Past President and Past Treasurer. Ina also enjoyed dancing and the game of bridge during her life. In addition to her parents, she was preceded in death by her first husband of 49 years, Richard F. “Dick” Anderson in 1998, and a son, Robert Mark Anderson. Ina is survived by her husband of 23 years, Joe Ryder, Homosassa, FL; daughter Linda S. Klonowski (husband David), Woodridge, IL and son Richard Michael Anderson (wife Gloria), Homosassa, FL; stepchildren Karen Pattaro

In [45]:
# Same fence-stripping parse as used inside fetch_data (repeated scratch cell).
ast.literal_eval(reply.replace("```json", "").replace("```", ""))

{'article': 'Ina Susan Ryder, 92, of Homosassa, FL, passed away on Thursday, March 30, 2023 at HCA Florida Citrus Hospital in Inverness, FL. A native of Waukegan, IL, she was born May 11, 1930, the only child to Henry and Mary Perkio. Ina moved to Homosassa in 1989 from her native Waukegan and was a faithful and devoted member of First United Methodist Church of Homosassa. Mrs. Ryder pursued her love for the game of golf through her membership in the Ladies Golf Association of Sugarmill Woods, where she served as their Past President and Past Treasurer. Ina also enjoyed dancing and the game of bridge during her life. In addition to her parents, she was preceded in death by her first husband of 49 years, Richard F. “Dick” Anderson in 1998, and a son, Robert Mark Anderson. Ina is survived by her husband of 23 years, Joe Ryder, Homosassa, FL; daughter Linda S. Klonowski (husband David), Woodridge, IL and son Richard Michael Anderson (wife Gloria), Homosassa, FL; stepchildren Karen Pattaro

In [26]:
# Parsing the reply without stripping anything; the recorded output for this
# cell is a SyntaxError (the reply shown earlier starts with a ```json fence).
ast.literal_eval(reply)

SyntaxError: invalid syntax (<unknown>, line 1)

In [None]:
# Manual check against one specific article URL: download it and parse the HTML.
url="https://www.beckershospitalreview.com/hr/u-of-utah-hospital-employees-to-get-parking-stipends.html"
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')