In [10]:
# Downloading libraries for web-scraping
!pip install requests
!pip install beautifulsoup4==4.9.3
!pip install bs4
!pip install html5lib
!pip install num2words

# Importing libraries for web-scraping and tokenization purposes
from bs4 import BeautifulSoup as bs
import requests
import nltk
import pandas as pd
import re
from textblob import TextBlob
from num2words import num2words

# The list of common words such as "a", "an", etc.
nltk.download('stopwords')
# Download a collection of popular resources from the NLTK library
nltk.download('popular', quiet=True)
# Sentence tokenization (Splitting a text into individual senteces)
nltk.download('punkt')
# English vocabulary database
nltk.download('wordnet')
# Used for training language models or evaluating nlp algothms
nltk.download('brown')
# Used for performing sentiment analysis on text
nltk.download('vader_lexicon')

from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer()
analyzer = SentimentIntensityAnalyzer()
stemmer = SnowballStemmer("english")
stop_words = stopwords.words("english")



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [11]:
sentences = []

def scrape_clean (url, div_class,id_name=None):
  # Load the website
  website = requests.get(url).text
  soup = bs(website,'html.parser')

  # Find the div section that is the parent of all paragraphs
  div = soup.find_all('div', attrs={'class':div_class, 'id':id_name})

  # Get the text from paragraphs
  extracted_text = []
  for div in div:
      paragraphs = div.find_all('p')
      for paragraph in paragraphs:
          extracted_text.append(paragraph.get_text(strip=True))  # Remove leading/trailing whitespace

  # Splitting the text into sentences, remove the stopwords and punctuations, and save the cleaned version in a list
  sentences = []
  for text in extracted_text:
    sentence = sent_tokenize(text)
    for item in sentence:
      # Convert to lowercase
      text_lowercase = item.lower()
      # Remove punctuation
      text_without_punctuation = re.sub(r"[^\w\s]", "", text_lowercase)
      # Remove stopwords and stem words
      tokens = word_tokenize(text_without_punctuation)
      new_tokens = []
      for word in tokens:
        if word.isnumeric():
          word = num2words(word)
          new_tokens.append(lemmatizer.lemmatize(word))
          continue
        elif word not in stop_words:
          new_tokens.append(lemmatizer.lemmatize(word))
          continue
      # Join tokens back into a string
      cleaned_text = " ".join(new_tokens)
      sentences.append(cleaned_text)

  return sentences

In [12]:
website1 = 'https://www.pewresearch.org/science/2023/02/22/60-of-americans-would-be-uncomfortable-with-provider-relying-on-ai-in-their-own-health-care/'
class1 = 'post-content'
id1 = None
text1 = scrape_clean(website1, class1, id1)
sentences.extend(text1)

In [13]:
website2 = 'https://translational-medicine.biomedcentral.com/articles/10.1186/s12967-019-02204-y'
class2 ='c-article-section__content'
id2 = 'Sec13-content'
text2 = scrape_clean(website2, class2, id2)
sentences.extend(text2)

In [14]:
website3 = 'https://foreseemed.com/artificial-intelligence-in-healthcare'
class3 = ["sqs-block html-block sqs-block-html","sqs-block html-block sqs-block-html"]
id3 = ["block-yui_3_17_2_1_1659634949407_20116","block-yui_3_17_2_1_1677014147372_139146"]
text3 = scrape_clean(website3, class3,id3)
sentences.extend(text3)

In [15]:
website4 = 'https://bmcmededuc.biomedcentral.com/articles/10.1186/s12909-023-04698-z#Sec7'
class4 = 'c-article-section__content'
id4 = "Sec8-content"
text4 = scrape_clean(website4, class4,id4)
sentences.extend(text4)

In [16]:
website5 = 'https://svn.bmj.com/content/2/4/230'
class5 = 'section'
id5 = "sec-16"
text5 = scrape_clean(website5, class5, id5)
sentences.extend(text5)

In [17]:
website6 = 'https://www.mckinsey.com/industries/healthcare/our-insights/transforming-healthcare-with-ai'
class6 = 'mdc-o-content-body mck-u-dropcap'
id6 = None
text6 = scrape_clean(website6, class6, id6)
sentences.extend(text6)

In [18]:
website7 = 'https://www.technologyreview.com/2020/01/22/276128/how-ai-is-humanizing-health-care/?utm_source=google&utm_medium=search&utm_campaign=acq_UKBLOW&utm_content=DSATOPICS&gad_source=1&gclid=Cj0KCQjw2a6wBhCVARIsABPeH1sNptGNGF9jblaq0_x0nJ3rvDIG9FuuDwKeI0Gnl2zBBurM94Yxpj4aAt5UEALw_wcB'
class7 = 'gutenbergContent__content--109b03a769a11e8ae3acbab352a64269 html_2'
id7 = None
text7 = scrape_clean(website7, class7, id7)
sentences.extend(text7)

In [19]:
website8 = 'https://www.healthcaredive.com/news/artificial-intelligence-AI-healthcare-patients-uncomfortable-Pew-research-center/643429/'
class8 = "medium-10 medium-centered large-12"
id8 = None
text8 = scrape_clean(website8, class8, id8)
sentences.extend(text8)

In [20]:
website9 = 'https://www.healthcaredive.com/news/artificial-intelligence-AI-healthcare-patients-uncomfortable-Pew-research-center/643429/'
class9 = "medium-10 medium-centered large-12"
id9 = None
text9 = scrape_clean(website9, class9, id9)
sentences.extend(text9)

In [21]:
website10 = 'https://www.medpagetoday.com/popmedicine/popmedicine/105330'
class10 = "main-content-region mpt-article-page"
id10 = "js-main-content-region"
text10 = scrape_clean(website10, class10, id10)
sentences.extend(text10)

In [22]:
website11 = 'https://www.medicaleconomics.com/view/patients-don-t-understand-use-of-ai-in-health-care-and-many-don-t-trust-it'
class11 = "blockText_blockContent__TbCXh"
id11 = None
text11 = scrape_clean(website11, class11, id11)
sentences.extend(text11)

In [23]:
website12 = 'https://www.digitalhealth.net/2023/08/patient-data-more-than-half-of-uk-public-dont-trust-ai/'
class12 = "entry-content"
id12 = None
text12 = scrape_clean(website12, class12, id12)
sentences.extend(text12)

In [24]:
website13 = 'https://www.healthcareitnews.com/news/survey-1-3-patients-comfortable-ai-led-primary-care'
class13 = "field-item even"
id13 = None
text13 = scrape_clean(website13, class13, id13)
sentences.extend(text13)

In [25]:
website14 = 'https://www.prnewswire.com/news-releases/a-majority-of-americans-are-optimistic-that-ai-will-improve-healthcare-in-2024-301986068.html'
class14 = "col-lg-10 col-lg-offset-1"
id14 = None
text14 = scrape_clean(website14, class14, id14)
sentences.extend(text14)

In [26]:
website15 = 'https://www.mobihealthnews.com/news/virtual-second-opinions-are-popular-wariness-persists-ai-diagnosis-tools'
class15 = "region region-content"
id15 = None
text15 = scrape_clean(website15, class15, id15)
sentences.extend(text15)

In [27]:
website16 = 'https://mobius.md/2023/12/13/how-do-patients-feel-about-ai-in-healthcare/'
class16 = "entry-content"
id16 = None
text16 = scrape_clean(website16, class16, id16)
sentences.extend(text16)

In [28]:
website17 = 'https://www.urologytimes.com/view/survey-shows-patient-mistrust-of-ai-use-in-health-care'
class17 = "blockText_blockContent__TbCXh"
id17 = None
text17 = scrape_clean(website17, class17, id17)
sentences.extend(text17)

In [29]:
for i in sentences:
  print(f'{i}\n')

pew research center conducted study understand american view artificial intelligence ai us health medicine

analysis surveyed eleven thousand and four u adult dec one thousand, two hundred and eighteen two thousand and twenty-two

everyone took part survey member center american trend panel atp online survey panel recruited national random sampling residential address

way nearly u adult chance selection

survey weighted representative u adult population gender race ethnicity partisan affiliation education category

read theatps methodology

thequestions usedfor report along response andits methodology

part series survey report look increasingrole aiin shaping american life

readpublic awareness artificial intelligence everyday activitiesand american view emerging us artificial intelligence including program generate text art

new pew research center survey explores public view artificial intelligence ai inhealth medicine area american mayincreasingly encountertechnologies thing like 

In [30]:
# Use textblob library to get the polarity of the sentence
def getPolarity(text):
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
      sentiment = 'postive'
    elif polarity < 0:
      sentiment = 'negative'
    else:
      sentiment = 'neutral'
    return sentiment,polarity


public relation

investor relation

seventeen billion, six hundred and thirty-five million, two hundred and sixty-eight thousand, four hundred and seventy-two

seventeen billion, three hundred and sixty-five million, fifty-four thousand, six hundred and twenty-six

source medtronic plc

medtronic plc nyse mdt global leader healthcare technology today announced new data largest headtohead comparative trial

medtronic plc global leader healthcare technology today unveiled topline finding comprehensive survey woman perception

health care hospital

artificial intelligence

medical pharmaceutical

computer electronics

photo jgijamie grillblend imagesgetty image

despite growing demand virtual healthcare solution patient still harbor apprehension use ai medical diagnosis according asurveyfrom clinic cleveland clinic joint venture betweencleveland clinicand telehealth giantamwell

survey found availability virtual second opinion vso service made far likely respondent follow seeking second 

In [31]:
for sentence in sentences:
   sentiment, polarity = getPolarity(sentence)
   print('\n', sentence)
   print(f"{sentiment}, {polarity}")


 pew research center conducted study understand american view artificial intelligence ai us health medicine
negative, -0.2333333333333333

 analysis surveyed eleven thousand and four u adult dec one thousand, two hundred and eighteen two thousand and twenty-two
postive, 0.1

 everyone took part survey member center american trend panel atp online survey panel recruited national random sampling residential address
negative, -0.19999999999999998

 way nearly u adult chance selection
postive, 0.1

 survey weighted representative u adult population gender race ethnicity partisan affiliation education category
postive, 0.1

 read theatps methodology
neutral, 0.0

 thequestions usedfor report along response andits methodology
neutral, 0.0

 part series survey report look increasingrole aiin shaping american life
neutral, 0.0

 readpublic awareness artificial intelligence everyday activitiesand american view emerging us artificial intelligence including program generate text art
negative, -0

In [32]:
data = []
for sentence in sentences:
  sentiment, polarity = getPolarity(sentence)
  new_item = {'sentence' : sentence, 'sentiment' : sentiment, 'category': 'Health Care'}
  data.append(new_item)

df = pd.DataFrame(data)


In [33]:
df.to_csv('healthcare.csv', index = False)