In [1]:
# Downloading libraries for web-scraping
!pip install requests
!pip install beautifulsoup4==4.9.3
!pip install bs4
!pip install html5lib
!pip install num2words
!pip install vaderSentiment

# Importing libraries for web-scraping and tokenization purposes
from bs4 import BeautifulSoup as bs
import requests
import nltk
import pandas as pd
import re
from textblob import TextBlob
from num2words import num2words
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# The list of common words such as "a", "an", etc.
nltk.download('stopwords')
# Download a collection of popular resources from the NLTK library
nltk.download('popular', quiet=True)
# Sentence tokenization (Splitting a text into individual senteces)
nltk.download('punkt')
# English vocabulary database
nltk.download('wordnet')
# Used for training language models or evaluating nlp algothms
nltk.download('brown')
# Used for performing sentiment analysis on text
nltk.download('vader_lexicon')

from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer()
analyzer = SentimentIntensityAnalyzer()
stemmer = SnowballStemmer("english")
stop_words = stopwords.words("english")

Collecting beautifulsoup4==4.9.3
  Downloading beautifulsoup4-4.9.3-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.8/115.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: beautifulsoup4
  Attempting uninstall: beautifulsoup4
    Found existing installation: beautifulsoup4 4.12.3
    Uninstalling beautifulsoup4-4.12.3:
      Successfully uninstalled beautifulsoup4-4.12.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
yfinance 0.2.37 requires beautifulsoup4>=4.11.1, but you have beautifulsoup4 4.9.3 which is incompatible.[0m[31m
[0mSuccessfully installed beautifulsoup4-4.9.3
Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2
Collecting num2words
  Downloading num2words-0.5.13-py3-none-any.whl (1

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [2]:
sentences = []

def scrape_clean (url, div_class,id_name=None):
  # Load the website
  website = requests.get(url).text
  soup = bs(website,'html.parser')

  # Find the div section that is the parent of all paragraphs
  div = soup.find_all('div', attrs={'class':div_class, 'id':id_name})

  # Get the text from paragraphs
  extracted_text = []
  for div in div:
      paragraphs = div.find_all('p')
      for paragraph in paragraphs:
          extracted_text.append(paragraph.get_text(strip=True))  # Remove leading/trailing whitespace

  # Splitting the text into sentences, remove the stopwords and punctuations, and save the cleaned version in a list
  sentences = []
  for text in extracted_text:
    sentence = sent_tokenize(text)
    for item in sentence:
      # Convert to lowercase
      text_lowercase = item.lower()
      # Remove punctuation
      text_without_punctuation = re.sub(r"[^\w\s]", "", text_lowercase)
      # Remove stopwords and stem words
      tokens = word_tokenize(text_without_punctuation)
      new_tokens = []
      for word in tokens:
        if word.isnumeric():
          word = num2words(word)
          new_tokens.append(lemmatizer.lemmatize(word))
          continue
        elif word not in stop_words:
          new_tokens.append(lemmatizer.lemmatize(word))
          continue
      # Join tokens back into a string
      cleaned_text = " ".join(new_tokens)
      sentences.append(cleaned_text)

  return sentences

In [3]:
website1 = 'https://educationhub.blog.gov.uk/2023/12/06/artificial-intelligence-in-schools-everything-you-need-to-know/'
class1 = 'entry-content'
text1 = scrape_clean(website1, class1)
sentences.extend(text1)

In [4]:
website2 = 'https://www.linkedin.com/pulse/review-ai-education-huzaifa-khan-rktgf'
class2 = 'counter-list'
text2 = scrape_clean(website2, class2)
sentences.extend(text2)

In [5]:
website3 = 'https://slidesgo.com/slidesgo-school/news/ai-in-education-survey-exclusive-slidesgo-insights-of-ai-tools-for-education'
class3 = 'counter-list'
id3 = None
text3 = scrape_clean(website3, class3, id3)
sentences.extend(text3)

In [6]:
website4 = 'https://www.insidehighered.com/news/student-success/life-after-college/2024/01/10/survey-college-students-thoughts-ai-and-careers'
class4 = 'node-content'
id4 = None
text4 = scrape_clean(website4, class4, id4)
sentences.extend(text4)

In [7]:
website5 = 'https://today.yougov.com/technology/articles/45607-most-think-schools-should-teach-about-ai-poll'
class5 = 'app-container ng-tns-c2792280337-0 ng-trigger ng-trigger-routeAnimation'
id5 = None
text5 = scrape_clean(website5, class5, id5)
sentences.extend(text5)

In [8]:
website6 = 'https://www.the74million.org/article/how-do-teachers-feel-about-their-jobs-the-impact-of-ai-new-survey-has-answers/'
class6 = 'article_content'
id6 = None
text6 = scrape_clean(website6, class6, id6)
sentences.extend(text6)

In [9]:
website7 = 'https://www.theguardian.com/technology/2024/feb/01/more-than-half-uk-undergraduates-ai-essays-artificial-intelligence'
class7 = 'dcr-lw02qf'
id7 = "maincontent"
text7 = scrape_clean(website7, class7, id7)
sentences.extend(text7)

In [10]:
website8 = 'https://www.forbes.com/sites/nickmorrison/2023/05/31/half-of-teachers-believe-ai-will-change-education-for-the-better/'
class8 = 'article-body fs-article fs-responsive-text current-article'
id8 = None
text8 = scrape_clean(website8, class8, id8)
sentences.extend(text8)

In [11]:
website9 = 'https://www.learner.com/blog/ai-education-survey'
class9 = 'w-node-_6b677d7e-03e7-913b-97b5-19c037c52658-3294f5cd'
id9 = 'content'
text9 = scrape_clean(website9, class9, id9)
sentences.extend(text9)

In [12]:
website10 = 'https://doodlelearning.com/us/math/guides/ai-in-education'
class10 = 'elementor-widget-container'
id10 = None
text10 = scrape_clean(website10, class10, id10)
sentences.extend(text10)

In [13]:
website11 = 'https://www.the74million.org/article/national-chatgpt-survey-teachers-accepting-ai-into-classrooms-workflow-even-more-than-'
class11 = None
id11 = 'content'
text11 = scrape_clean(website11, class11, id11)
sentences.extend(text11)

In [14]:
website12 = 'https://thetutorsassociation.org.uk/2023/09/27/students-begin-to-embrace-ai-for-learning-survey-shows/'
class12 = 'entry-content clear'
id12 = None
text12 = scrape_clean(website12, class12, id12)
sentences.extend(text12)

In [15]:
website13 = 'https://thetutorsassociation.org.uk/2023/09/27/students-begin-to-embrace-ai-for-learning-survey-shows/'
class13 = 'entry-content clear'
id13 = None
text13 = scrape_clean(website13, class13, id13)
sentences.extend(text13)

In [16]:
website14 = 'https://www.fenews.co.uk/skills/new-research-ai-for-education-in-the-uk-does-the-potential-outweigh-the-risk/'
class14 = 'entry-content'
id14 = None
text14 = scrape_clean(website14, class14, id14)
sentences.extend(text14)

In [17]:
website15 = 'https://www.theguardian.com/commentisfree/2023/jul/14/ai-artificial-intelligence-disrupt-education-creativity-critical-thinking'
class15 = 'dcr-lw02qf'
id15 = 'maincontent'
text15 = scrape_clean(website15, class15, id15)
sentences.extend(text15)

In [18]:
for i in sentences:
  print(f'{i}\n')

artificial intelligence ai stay already positive impact across society including education sector

believe could used improve education system example early research suggests could used free teacher time provide personalised support pupil

want make emerging technology need understand risk well opportunity

set develop artificial intelligence education safely explore could benefit teacher young people

investing ai tool gathering evidence teacher pupil using ai already investigating take advantage ai education

first step already

understand education already use generative ai including use produce content text imagery audio launched call evidence gather view educational professional academic edtech sector risk possibility

result show ai already used reduce administrative task optimistic potential

organised twoday hackathon teacher andschool leader across england allowed teacher school leader work data scientist come solution use ai tackle reallife issue like teacher workload

result

In [19]:
# Use Vader library to get the polarity of the sentence
def getPolarity(text):
    polarity = SentimentIntensityAnalyzer().polarity_scores(text)
    if polarity['compound'] > 0:
      sentiment = 'postive'
    elif polarity['compound'] < 0:
      sentiment = 'negative'
    else:
      sentiment = 'neutral'
    return sentiment,polarity

In [20]:
for sentence in sentences:
  sentiment, polarity = getPolarity(sentence)
  print('\n', sentence)
  print(f"{sentiment}, {polarity}")


 artificial intelligence ai stay already positive impact across society including education sector
postive, {'neg': 0.0, 'neu': 0.599, 'pos': 0.401, 'compound': 0.7717}

 believe could used improve education system example early research suggests could used free teacher time provide personalised support pupil
postive, {'neg': 0.0, 'neu': 0.643, 'pos': 0.357, 'compound': 0.836}

 want make emerging technology need understand risk well opportunity
postive, {'neg': 0.158, 'neu': 0.376, 'pos': 0.466, 'compound': 0.4767}

 set develop artificial intelligence education safely explore could benefit teacher young people
postive, {'neg': 0.0, 'neu': 0.492, 'pos': 0.508, 'compound': 0.8519}

 investing ai tool gathering evidence teacher pupil using ai already investigating take advantage ai education
postive, {'neg': 0.0, 'neu': 0.875, 'pos': 0.125, 'compound': 0.25}

 first step already
neutral, {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

 understand education already use generative

In [21]:
data = []
for sentence in sentences:
  sentiment, polarity = getPolarity(sentence)
  new_item = {'sentence' : sentence, 'sentiment' : sentiment, 'category': 'Education'}
  data.append(new_item)

df = pd.DataFrame(data)

In [22]:
df.to_csv('Education dataset (no numbers) - Vader ver.csv', index = False)