## Web scraping with selenium

In [1]:
from datetime import date, datetime, timedelta
import time
import io
import os
from io import BytesIO
# from PIL import Image
import requests
from bs4 import BeautifulSoup
import sys
import logging

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# Logger named after the current module; none of the cells visible here log through it.
log = logging.getLogger(__name__)

## Run service

In [2]:
# Chrome options for the scraping session.
options = webdriver.ChromeOptions()
# Flip to True to run Chrome without a visible window.
HEADLESS = False
if HEADLESS:
    # Command-line switches are used instead of the `options.headless` setter,
    # which newer Selenium releases deprecate.
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
# Raw capabilities dict, kept in case another cell still reads it.
desired = options.to_capabilities()

# Download the chromedriver build that matches your installed Chrome version.
service = Service('./chromedriver.exe')
service.start()

# With HEADLESS set to False a Chrome window is expected to open.
# `options=` replaces the `desired_capabilities=` argument deprecated in Selenium 4.
driver = webdriver.Remote(
    command_executor=service.service_url, options=options)

In [3]:
# Alternative article, left commented out for quick switching:
#URL = "https://www.theguardian.com/australia-news/2022/sep/19/why-many-australian-workers-are-expected-to-call-in-sick-this-friday"
# Article that the rest of the notebook scrapes and summarises.
URL = "https://www.theguardian.com/world/2022/sep/11/margrethe-ii-denmark-jubilee-europe-only-reigning-queen"

## Open page in browser

In [4]:
# Load the article in the Selenium-controlled browser.
driver.get(URL)

In [5]:
# Parse the HTML currently held by the browser, using the 'html.parser' backend.
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [6]:
# Preview only the start of the formatted HTML; the full page would flood the cell output.
print(soup.prettify()[:2000])



## retrieve data from page

In [7]:
# First <article> element of the page; the later extraction cells search inside it.
article = soup.article
if article is None:
    # Stop here with a clear message instead of an AttributeError in a later cell.
    raise RuntimeError("No <article> element found in the page source")

### title

In [8]:
# The headline is the <h1> inside the div marked data-gu-name="headline".
headline_div = article.find("div", attrs={"data-gu-name": "headline"})
title = headline_div.h1.text
print(title)

Margrethe II of Denmark marks jubilee as Europe’s only reigning queen


### main body

In [9]:
# Body paragraphs are the <p> tags carrying this CSS class.
all_paragraphs = article.find_all("p", attrs={"class": "dcr-1vtk2mf"})
# Join the paragraph texts with single spaces, then drop leading and trailing whitespace.
main_body = " ".join(p.text for p in all_paragraphs).strip()
print(main_body)

Queen Margrethe II of Denmark, Europe’s longest serving monarch and only reigning queen after the death of Queen Elizabeth II, has marked her jubilee with a series of low-key celebrations in a show of respect for her third cousin. Margrethe, 82, an immensely popular monarch known for her artistic streak as well as her chain-smoking, is said to have made the decision herself. She had originally been due to mark the 50th anniversary of her accession to the Danish throne in January, but festivities were considerably scaled back because of the pandemic. Full-blown celebrations had been expected to take place across the country this weekend instead, but were then toned down again after the death of Queen Elizabeth on Thursday. The palace said that the decision was taken “at the Queen’s own request”, Danish media reported. In a letter of condolence to King Charles III, Margrethe called Queen Elizabeth “a towering figure among the European monarchs and a great inspiration to us all. We shall 

### images

#### main image

In [10]:
#media = article.find("div", {"data-gu-name" : "media"})
#fig = media.figure
#print(fig.img['src'])
#print(fig.figcaption.get_text())

#### all images

In [11]:
# Save every matching article image to the working directory as 0.jpg, 1.jpg, ...
# `i` ends up holding the number of files written; a later cell uses it to
# rebuild the list of file names, so the name and its final value are kept.
i = 0
for img in article.find_all("img", {"class":"dcr-4zleql"}):
    img_url = img['src']
    print(img_url)
    # A timeout keeps a stalled connection from hanging the notebook.
    response = requests.get(img_url, timeout=30)
    # Fail loudly rather than saving an HTTP error page under a .jpg name.
    response.raise_for_status()
    fname = str(i)+".jpg"
    # The file is opened only after a successful download, so a failed
    # request no longer leaves an empty file behind.
    with open(fname,"wb") as f:
        f.write(response.content)
    i = i + 1

https://i.guim.co.uk/img/media/1e08e1a14abbfac38ba08b89cc3b8d3f5889a19b/0_142_4264_2559/master/4264.jpg?width=465&quality=85&dpr=1&s=none
https://i.guim.co.uk/img/media/485de141fc72363e366fc0e999dfbca2883b11e2/0_0_5175_3450/master/5175.jpg?width=445&quality=85&dpr=1&s=none


In [12]:
import torch
import torchvision
from PIL import Image
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer, PegasusForConditionalGeneration, PegasusTokenizer

In [13]:
# Image captioning: load model, feature extractor and tokenizer from one local checkpoint directory.
IMG_MODEL_DIR = "F:/UTS study/32933 Research Project/hugging face/vit-gpt2-image-captioning"
img_model = VisionEncoderDecoderModel.from_pretrained(IMG_MODEL_DIR)
img_feature_extractor = ViTFeatureExtractor.from_pretrained(IMG_MODEL_DIR)
img_tokenizer = AutoTokenizer.from_pretrained(IMG_MODEL_DIR)

# Run on the GPU when CUDA is available, otherwise on the CPU.
img_device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
img_model.to(img_device)

# Keyword arguments forwarded to `img_model.generate` by predict_step.
max_length = 16
num_beams = 4
gen_kwargs = dict(max_length=max_length, num_beams=num_beams)
def predict_step(image_paths):
    """Generate a caption for each image file.

    Args:
        image_paths: Iterable of image locations accepted by ``Image.open``.

    Returns:
        List of whitespace-stripped caption strings decoded from the model
        output. An empty input returns an empty list without calling the
        feature extractor or the model.
    """
    image_paths = list(image_paths)
    if not image_paths:
        return []

    images = []
    for image_path in image_paths:
        # The context manager closes the underlying file; convert() returns a
        # loaded copy, so the image stays usable after the file is closed.
        with Image.open(image_path) as i_image:
            images.append(i_image.convert(mode="RGB"))

    pixel_values = img_feature_extractor(images=images, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(img_device)

    output_ids = img_model.generate(pixel_values, **gen_kwargs)

    preds = img_tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return [pred.strip() for pred in preds]

In [14]:
# Wrap the article body in a one-element list so it is tokenized as a batch of one.
src_text = [main_body]

In [15]:
# Text summarization: load the Pegasus tokenizer and model from a local checkpoint directory.
SUM_MODEL_DIR = "F:/UTS study/32933 Research Project/hugging face/pegasus-xsum"
sum_device = "cuda" if torch.cuda.is_available() else "cpu"
sum_tokenizer = PegasusTokenizer.from_pretrained(SUM_MODEL_DIR)
sum_model = PegasusForConditionalGeneration.from_pretrained(SUM_MODEL_DIR)
sum_model = sum_model.to(sum_device)

# Tokenize the source text, generate, and decode the result back to strings.
batch = sum_tokenizer(src_text, truncation=True, padding="longest", return_tensors="pt")
batch = batch.to(sum_device)
translated = sum_model.generate(**batch)
tgt_text = sum_tokenizer.batch_decode(translated, skip_special_tokens=True)



### summarized results from main body

In [16]:
# Display the summary generated from the article text alone.
tgt_text

['Queen Margrethe II of Denmark, Europe’s longest serving monarch and only reigning queen after the death of Queen Elizabeth II, has marked her jubilee with a series of low-key celebrations in a show of respect for her third cousin.']

In [17]:
# Rebuild the names "0.jpg", "1.jpg", ... from `i`, the counter left by the image download loop.
images_path = [str(k) + ".jpg" for k in range(i)]
images_path

['0.jpg', '1.jpg']

In [18]:
# Append the image captions to the article summary.
# Only the first element (the single text summary) is kept before appending,
# so re-running this cell does not stack a second copy of the captions.
# NOTE(review): assumes exactly one text summary precedes the captions - adjust if more articles are batched.
tgt_text = tgt_text[:1] + predict_step(images_path)

### summarized results from main body and images

In [19]:
# Display the text summary followed by the image captions.
tgt_text

['Queen Margrethe II of Denmark, Europe’s longest serving monarch and only reigning queen after the death of Queen Elizabeth II, has marked her jubilee with a series of low-key celebrations in a show of respect for her third cousin.',
 'a large group of people dressed in red and white',
 'a number of people dressed in military garb on a pier']

## Combined summarization

In [20]:
# Merge the summary and the captions into one space-separated string for a second summarization pass.
src_text = " ".join(tgt_text).strip()
print(src_text)

Queen Margrethe II of Denmark, Europe’s longest serving monarch and only reigning queen after the death of Queen Elizabeth II, has marked her jubilee with a series of low-key celebrations in a show of respect for her third cousin. a large group of people dressed in red and white a number of people dressed in military garb on a pier


In [21]:
# Second pass: summarize the combined summary-plus-captions text with the same tokenizer and model.
batch = sum_tokenizer(src_text, truncation=True, padding="longest", return_tensors="pt")
batch = batch.to(sum_device)
translated = sum_model.generate(**batch)
final_text = sum_tokenizer.batch_decode(translated, skip_special_tokens=True)

In [22]:
# Display the final summary of the combined text and image captions.
final_text

['Hundreds of people have gathered in the Danish capital, Copenhagen, to celebrate the Diamond Jubilee of Queen Margrethe II.']