In [None]:
# day45_Scraping the Web with BeautifulSoup_240911
# aim: how to make a soup
# beautifulsoup: a module that helps developers to make sense of website
# a libray, beautiful soup that parsing(분석하는) html file
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/

In [104]:
from bs4 import BeautifulSoup
# import lxml # for read XML files

with open("website.html", encoding='utf8') as file:  # utf8 is one way of encoding
    contents = file.read()

soup = BeautifulSoup(contents, "html.parser") # generate an object from a Class

In [106]:
# how to get one tag and its content?
print(soup.title)
print(soup.title.name)
print(soup.title.string)

<title>Angela's Personal Site</title>
title
Angela's Personal Site


In [108]:
# showing the html code with indent
print(soup.prettify())  

<!DOCTYPE html>
<html>
 <head>
  <meta charset="utf-8"/>
  <title>
   Angela's Personal Site
  </title>
 </head>
 <body>
  <h1 id="name">
   Angela Yu
  </h1>
  <p>
   <em>
    Founder of
    <strong>
     <a href="https://www.appbrewery.co/">
      The App Brewery
     </a>
    </strong>
    .
   </em>
  </p>
  <p>
   I am an iOS and Web Developer. I ❤️ coffee and motorcycles.
  </p>
  <hr/>
  <h3 class="heading">
   Books and Teaching
  </h3>
  <ul>
   <li>
    The Complete iOS App Development Bootcamp
   </li>
   <li>
    The Complete Web Development Bootcamp
   </li>
   <li>
    100 Days of Code - The Complete Python Bootcamp
   </li>
  </ul>
  <hr/>
  <h3 class="heading">
   Other Pages
  </h3>
  <a href="https://angelabauer.github.io/cv/hobbies.html">
   My Hobbies
  </a>
  <a href="https://angelabauer.github.io/cv/contact-me.html">
   Contact Me
  </a>
 </body>
</html>



In [110]:
# how to get multiple(all) tags and their contents?
all_anchor_tags = soup.find_all(name="p")  # name = "a" or "p" or "title" etc
print(all_anchor_tags)

[<p><em>Founder of <strong><a href="https://www.appbrewery.co/">The App Brewery</a></strong>.</em></p>, <p>I am an iOS and Web Developer. I ❤️ coffee and motorcycles.</p>]


In [112]:
# What if I want only the text in the anchor tags? soup.find_all(name="p")
for tag in all_anchor_tags:
    print(tag.getText())

Founder of The App Brewery.
I am an iOS and Web Developer. I ❤️ coffee and motorcycles.


In [114]:
all_anchor_tags = soup.find_all(name="a")
for tag in all_anchor_tags:
    print(tag.get("href"))

https://www.appbrewery.co/
https://angelabauer.github.io/cv/hobbies.html
https://angelabauer.github.io/cv/contact-me.html


In [116]:
# What if I want to find particular tag with id in html?
headings = soup.find(name="h1", id="name")
print(headings)

<h1 id="name">Angela Yu</h1>


In [118]:
# What if I want to find particular tag with class in html?
section_heading = soup.find(name="h3", class_="heading")  # cf) class_
print(section_heading)
print(section_heading.get("class"))

<h3 class="heading">Books and Teaching</h3>
['heading']


In [120]:
# how can I find particular contents the html?
# by narrowing the scope. using select_one. it returns first matching item
company_url = soup.select_one(selector="p a")
print(company_url)

<a href="https://www.appbrewery.co/">The App Brewery</a>


In [122]:
name = soup.select_one(selector="#name")  # for id, #id
print(name)

<h1 id="name">Angela Yu</h1>


In [124]:
headings = soup.select(".heading")  # for class, .class_name
print(headings)

[<h3 class="heading">Books and Teaching</h3>, <h3 class="heading">Other Pages</h3>]


In [144]:
# let's scrape data from live website
from bs4 import BeautifulSoup
import requests

response = requests.get("https://news.ycombinator.com/")
yc_web_page = response.text

soup = BeautifulSoup(yc_web_page, "html.parser")
print(soup.title)

<title>Hacker News</title>


In [146]:
article_text = soup.select_one(selector=".titleline").getText()
print(article_text)

My business card runs Linux and Ultrix (2022) (dmitry.gr)


In [148]:
article_tag = soup.find(name="span", class_="titleline").getText()
print(article_tag)

My business card runs Linux and Ultrix (2022) (dmitry.gr)


In [150]:
article_link = soup.find(name="span", class_="titleline").select_one(selector="a").get("href")
print(article_link)
article_upvote = soup.find(name="span", class_="score").getText()
print(article_upvote)

http://dmitry.gr/?r=05.Projects&proj=33.%20LinuxCard
147 points


In [152]:
# use find_all to get all parts in the page
articles = soup.find_all(name="span", class_="titleline")
article_texts = []
article_links = []

for article_tag in articles:
    article_text = article_tag.getText()
    article_texts.append(article_text)
    article_link = article_tag.select_one(selector="a").get("href")
    article_links.append(article_link)

article_upvotes = [int(score.getText().split()[0]) for score in soup.find_all(name="span", class_="score")]

# article_upvotes_num = []
# for sc in article_upvotes:
#     article_upvotes_num.append(int(sc.split()[0]))

print(article_texts)
print(article_links)
print(article_upvotes)
# print(article_upvotes_num)

['My business card runs Linux and Ultrix (2022) (dmitry.gr)', 'Show HN: Konty – A Balsamiq-alternative lo-fi wireframe tool for modern apps (konty.app)', 'A MiniGolf game for Palm OS (ctrl-c.club)', 'The Minneapolis Street Grid: Explained (streets.mn)', 'Be a Thermostat, Not a Thermometer (larahogan.me)', 'My Homelab Setup (arslan.io)', 'Why smart telescopes are the future of astrophotography (2022) (techradar.com)', 'Noisy neighbor detection with eBPF (netflixtechblog.com)', 'We spent $20 to achieve RCE and accidentally became the admins of .mobi (watchtowr.com)', 'Show HN: Simple Alternative to Complex Project Management for Freelancers', 'AppleWatchAmmeter (github.com/jp3141)', 'Transparenttextures.com (transparenttextures.com)', 'ClickHouse Data Modeling for Postgres Users (clickhouse.com)', 'Making progress on side projects with content-driven development (ntietz.com)', 'Show HN: Tune LLaMa3.1 on Google Cloud TPUs (github.com/felafax)', 'The first release candidate of FreeCAD 1.0 

In [156]:
# print by following the largest number order
print(len(article_texts))
print(len(article_links))
article_upvotes.append(100) # score 값이 없는 기사가 있어서 임의로 점수를 하나 추가
print(len(article_upvotes))

30
30
30


In [200]:
import pandas as pd
articles_df = pd.DataFrame({'text':article_texts,
                           'links':article_links,
                           'score':article_upvotes})
print(len(articles_df))

30


In [204]:
articles_df.sort_values(by='score', ascending=False)

Unnamed: 0,text,links,score
8,We spent $20 to achieve RCE and accidentally b...,https://labs.watchtowr.com/we-spent-20-to-achi...,1266
10,AppleWatchAmmeter (github.com/jp3141),https://github.com/jp3141/AppleWatchAmmeter,261
2,A MiniGolf game for Palm OS (ctrl-c.club),https://ctrl-c.club/~captain/posts/2024-08-29-...,205
22,[Deploying a basic React site pt 1] Setting up...,https://jpegsfiles.medium.com/deploying-a-basi...,201
15,The first release candidate of FreeCAD 1.0 is ...,https://blog.freecad.org/2024/09/10/the-first-...,194
17,"David Chang on the long, hard, stupid way (her...",https://herbertlui.net/david-chang-on-the-long...,186
11,Transparenttextures.com (transparenttextures.com),https://www.transparenttextures.com,164
7,Noisy neighbor detection with eBPF (netflixtec...,https://netflixtechblog.com/noisy-neighbor-det...,160
28,Passive damping – Bathroom scales (thinking-ab...,https://thinking-about-science.com/2024/09/08/...,154
0,My business card runs Linux and Ultrix (2022) ...,http://dmitry.gr/?r=05.Projects&proj=33.%20Lin...,147


In [208]:
largest_number = max(article_upvotes)
largest_index = article_upvotes.index(largest_number)
print(largest_index)

8


In [210]:
# Web scraping Ethics
# law on web scraping
# you can't commercialise copyrighted content
# you can'r scrape data behind authentication: data able to get after log-in the site
# Public API first
# Respect the Web Owner
# www.naver.com/robots.txt _show what you can scrap can't
# limit your rate

In [35]:
# fianl project: 100 Greatest Movies _my solution
from bs4 import BeautifulSoup
import requests

response = requests.get("https://web.archive.org/web/20200518073855/https://www.empireonline.com/movies/features/best-movies-2/")
movie_page = response.text

soup = BeautifulSoup(movie_page, "html.parser")
# print(soup.title)

titles = [title.getText() for title in soup.find_all(name="h3", class_="title")]
# print(titles)
    
# Exchanging the order of the list from 100~1 to 1~100
titles_ascending = []
for title in titles:
    titles_ascending.insert(0, title)
# print(titles_ascending)

titles_str = ""
for title in titles_ascending:
    titles_str += title+"\n"
# print(title_str)

with open("movies.txt", mode="w", encoding="utf-8") as data:
    data.write(titles_str)


# Reflection: I could have used [::-1] to exchange the exchange the order of the list
# well done to look for the error and adding 'encoding="utf-8"' in open 

In [41]:
# fianl project: 100 Greatest Movies _teacher solution
import requests
from bs4 import BeautifulSoup

URL = "https://web.archive.org/web/20200518073855/https://www.empireonline.com/movies/features/best-movies-2/"

response = requests.get(URL)
website_html = response.text

soup = BeautifulSoup(website_html, "html.parser")

all_movies = soup.find_all(name="h3", class_="title")

movie_titles = [movie.getText() for movie in all_movies]
movies = movie_titles[::-1]

with open("movies.txt", mode="w", encoding="utf-8") as file:
    for movie in movies:
        file.write(f"{movie}\n")

In [37]:
a = [1,2,3,4,5]
b = a[::-1]
print(b)

[5, 4, 3, 2, 1]
