# Practice Web Scraping

This notebook practices web scraping on the sites recommended in the following article:

[link](https://medium.com/@spaw.co/best-websites-to-practice-web-scraping-9df5d4df4d1)

## Import Library

In [1]:
# %pip install beautifulsoup4 pandas

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

## 1. Books to Scrape

In [3]:
book_url = "https://books.toscrape.com/"

### Extract Categories

In [5]:
book_data = requests.get(book_url)
soup = BeautifulSoup(book_data.content, "html.parser")

In [6]:
categories = soup.find("ul", "nav nav-list").find("ul").find_all("li")

In [7]:
# One {"link", "name"} record per category <li> collected in `categories`.
list_categories = [
    {
        "link": item.find("a").get("href"),
        "name": item.get_text(strip=True),
    }
    for item in categories
]

In [8]:
categories_df = pd.DataFrame(list_categories)

In [9]:
categories_df.head()

Unnamed: 0,link,name
0,catalogue/category/books/travel_2/index.html,Travel
1,catalogue/category/books/mystery_3/index.html,Mystery
2,catalogue/category/books/historical-fiction_4/...,Historical Fiction
3,catalogue/category/books/sequential-art_5/inde...,Sequential Art
4,catalogue/category/books/classics_6/index.html,Classics


### Extract Books

In [10]:
# Collect every book of every category, following each category's pagination.
# Reads `book_url` and `list_categories` defined in earlier cells.
list_books = []
for category in list_categories:
    endpoint = category["link"]
    # A full listing page is assumed to hold 20 books; a shorter page is
    # treated as the last page of the category.
    book_per_page = 20
    page = 1
    while book_per_page == 20:
        # The category link points at ".../index.html" (page 1); later pages
        # are reached by swapping "index" for "page-N".
        page_endpoint = (
            endpoint if page == 1 else endpoint.replace("index", f"page-{page}")
        )
        # rstrip avoids a double slash when book_url already ends with "/".
        book_data = requests.get(f"{book_url.rstrip('/')}/{page_endpoint}")
        if book_data.status_code != 200:
            # No such page (e.g. the previous page was exactly full), so this
            # category is finished. Retrying here would request the same URL
            # forever because neither `page` nor `book_per_page` changes.
            break

        soup = BeautifulSoup(book_data.content, "html.parser")
        books = soup.find("section").find_all("li")
        list_book_in_page = []
        for book in books:
            image_container = book.find("div", "image_container")
            if image_container is None:
                # Not a product entry (the <section> holds other <li> items).
                continue
            product_price = book.find("div", "product_price")
            list_book_in_page.append(
                {
                    "image": image_container.find("img").get("src"),
                    # Second CSS class of the rating <p>, e.g. "Three".
                    "star": book.find("p", "star-rating").get("class")[1],
                    "title": book.find("h3").find("a").get("title"),
                    "price": product_price.find("p", "price_color").text,
                    # Last dash-separated part of the icon's first class,
                    # e.g. "ok" in the sample output.
                    "availability": product_price.find(
                        "p", "instock availability"
                    )
                    .find("i")
                    .get("class")[0]
                    .split("-")[-1],
                    "category": category["name"],
                    "page": page,
                }
            )
        list_books.extend(list_book_in_page)
        book_per_page = len(list_book_in_page)
        page += 1

In [11]:
books_df = pd.DataFrame(list_books)

In [12]:
books_df.head()

Unnamed: 0,image,star,title,price,availability,category,page
0,../../../../media/cache/27/a5/27a53d0bb95bdd88...,Two,It's Only the Himalayas,£45.17,ok,Travel,1
1,../../../../media/cache/57/77/57770cac1628f440...,Four,Full Moon over Noah’s Ark: An Odyssey to Mount...,£49.43,ok,Travel,1
2,../../../../media/cache/9a/7e/9a7e63f12829df4b...,Three,See America: A Celebration of Our National Par...,£48.87,ok,Travel,1
3,../../../../media/cache/d5/bf/d5bf0090470b0b8e...,Two,Vagabonding: An Uncommon Guide to the Art of L...,£36.94,ok,Travel,1
4,../../../../media/cache/98/c2/98c2e95c5fd1a4e7...,Three,Under the Tuscan Sun,£37.33,ok,Travel,1


In [None]:
def extract_tag_content(tag):
    """Recursively convert the element children of ``tag`` into nested dicts.

    Args:
        tag: A parsed element exposing ``contents``. Every element child must
            expose ``name``, ``attrs``, ``contents`` and ``get_text``.

    Returns:
        ``None`` when ``tag`` has no children at all. Otherwise a list with one
        dict per child that has a tag name (children without a name, such as
        bare text nodes, are skipped). Each dict has the keys ``tag``,
        ``content`` (this same structure built from the child),
        ``attributes`` and ``text``. ``text`` is the stripped text for
        ``<p>``, ``<a>`` and ``<h1>``-``<h6>`` children and ``None`` for every
        other tag.
    """
    # Explicit heading names: a substring test such as `"h" in name` would
    # also match th, thead, html, head, hr, ...
    text_tags = ("p", "a", "h1", "h2", "h3", "h4", "h5", "h6")

    if not tag.contents:  # Base case: nothing below this tag
        return None

    return [
        {
            "tag": child.name,
            "content": extract_tag_content(child),
            "attributes": child.attrs,
            "text": (
                child.get_text(strip=True) if child.name in text_tags else None
            ),
        }
        for child in tag.contents
        if child.name  # skip children that are not elements
    ]

## 2. Scrape This Site

In [74]:
url = "https://www.scrapethissite.com"

### 2.1 Extract Page

In [75]:
data = requests.get(f"{url}/pages")
soup = BeautifulSoup(data.content, "html.parser")

In [76]:
# One record per <div class="page"> card: its link target, title and blurb.
pages = []
for card in soup.find_all("div", "page"):
    anchor = card.find("a")
    pages.append(
        {
            "link": anchor["href"],
            "title": anchor.get_text(strip=True),
            "description": card.find("p").get_text(strip=True),
        }
    )

In [77]:
pages_df = pd.DataFrame(pages)

In [78]:
pages_df.head()

Unnamed: 0,link,title,description
0,/pages/simple/,Countries of the World: A Simple Example,A single page that lists information about all...
1,/pages/forms/,"Hockey Teams: Forms, Searching and Pagination",Browse through a database of NHL team stats si...
2,/pages/ajax-javascript/,Oscar Winning Films: AJAX and Javascript,Click through a bunch of great films. Learn ho...
3,/pages/frames/,Turtles All the Way Down: Frames & iFrames,Some older sites might still use frames to bre...
4,/pages/advanced/,Advanced Topics: Real World Challenges You'll ...,"Scraping real websites, you're likely run into..."


### 2.2 Countries of the World: A Simple Example

In [79]:
data = requests.get(f"{url}/pages/simple/")
soup = BeautifulSoup(data.content, "html.parser")

In [80]:
# Parse each country card into a typed record (population as int, area as float).
list_country = []
for card in soup.find_all("div", "col-md-4 country"):
    country_name = card.find("h3").get_text(strip=True)
    capital = card.find("span", "country-capital").get_text(strip=True)
    population_text = card.find("span", "country-population").get_text(strip=True)
    area_text = card.find("span", "country-area").get_text(strip=True)
    list_country.append(
        {
            "name": country_name,
            "capital": capital,
            "population": int(population_text),
            "area": float(area_text),
        }
    )

In [81]:
list_country_df = pd.DataFrame(list_country)

In [82]:
list_country_df.head()

Unnamed: 0,name,capital,population,area
0,Andorra,Andorra la Vella,84000,468.0
1,United Arab Emirates,Abu Dhabi,4975593,82880.0
2,Afghanistan,Kabul,29121286,647500.0
3,Antigua and Barbuda,St. John's,86754,443.0
4,Anguilla,The Valley,13254,102.0


### 2.3 Hockey Teams: Forms, Searching and Pagination

In [136]:
page_num = 1
item_per_page = list_teams_per_page = 100

In [137]:
# (output key, <td> CSS class, converter) for every column of a team row.
# "ot-losses" is kept as text.
# NOTE(review): presumably because that cell can be blank — confirm.
team_fields = (
    ("name", "name", str),
    ("year", "year", int),
    ("wins", "wins", int),
    ("losses", "losses", int),
    ("ot-losses", "ot-losses", str),
    ("win-pct", "pct", float),
    ("gf", "gf", int),
    ("ga", "ga", int),
    ("diff", "diff", int),
)

# Keep requesting pages while the previous one came back full.
# `page_num`, `item_per_page` and `list_teams_per_page` are initialised in the
# preceding cell.
list_teams = []
while list_teams_per_page == item_per_page:
    data = requests.get(
        f"{url}/pages/forms/?page_num={page_num}&per_page={item_per_page}"
    )
    soup = BeautifulSoup(data.content, "html.parser")
    list_teams_page = []
    for row in soup.find_all("tr", "team"):
        list_teams_page.append(
            {
                key: convert(row.find("td", css_class).get_text(strip=True))
                for key, css_class, convert in team_fields
            }
        )
    list_teams += list_teams_page
    list_teams_per_page = len(list_teams_page)
    page_num += 1

In [138]:
list_teams_df = pd.DataFrame(list_teams)

In [139]:
list_teams_df.tail()

Unnamed: 0,name,year,wins,losses,ot-losses,win-pct,gf,ga,diff
577,Tampa Bay Lightning,2011,38,36,8,0.463,235,281,-46
578,Toronto Maple Leafs,2011,35,37,10,0.427,231,264,-33
579,Vancouver Canucks,2011,51,22,9,0.622,249,198,51
580,Washington Capitals,2011,42,32,8,0.512,222,230,-8
581,Winnipeg Jets,2011,37,35,10,0.451,225,246,-21


## 3. Oscar Winning Films: AJAX and Javascript

In [147]:
url = "https://www.scrapethissite.com/pages/ajax-javascript"

### 3.1 Extract Oscar Years

In [148]:
data = requests.get(url)
soup = BeautifulSoup(data.content, "html.parser")

In [149]:
oscar_years = [year.get_text(strip=True) for year in soup.find_all("a", "year-link")]

In [150]:
oscar_years

['2015', '2014', '2013', '2012', '2011', '2010']

### 3.2 Extract Nominated Movies

In [166]:
# Request the JSON payload for each Oscar year and concatenate the results.
list_movies = []
for year in oscar_years:
    data = requests.get(f"{url}/?ajax=true&year={year}")
    movies_for_year = data.json()
    list_movies += movies_for_year

In [168]:
list_movies_df = pd.DataFrame(list_movies)

In [169]:
list_movies_df.head()

Unnamed: 0,title,year,awards,nominations,best_picture
0,Spotlight,2015,2,6,True
1,Mad Max: Fury Road,2015,6,10,
2,The Revenant,2015,3,12,
3,Bridge of Spies,2015,1,6,
4,The Big Short,2015,1,5,


## 4. Turtles All the Way Down: Frames & iFrames

In [200]:
base_url = "https://www.scrapethissite.com"
frame_endpoint = "/pages/frames"

In [201]:
data = requests.get(f"{base_url}{frame_endpoint}")
soup = BeautifulSoup(data.content, "html.parser")

In [202]:
iframe_endpoint = soup.find("iframe").get("src")

In [220]:
data = requests.get(f"{base_url}{iframe_endpoint}")
soup = BeautifulSoup(data.content, "html.parser")

In [221]:
# For every turtle-family card in the iframe, follow its link and take the
# first <p> of the detail page as the description.
list_turtles = []
for turtle in soup.find_all("div", "col-md-4 turtle-family-card"):
    detail_href = turtle.find("a").get("href")
    detail_data = requests.get(f"{base_url}{detail_href}")
    detail_data_soup = BeautifulSoup(detail_data.content, "html.parser")
    first_paragraph = detail_data_soup.find("p")
    list_turtles.append(
        {
            "image": turtle.find("img").get("src"),
            "name": turtle.find("h3").get_text(strip=True),
            "description": first_paragraph.get_text().strip(),
        }
    )

In [224]:
list_turtles_df = pd.DataFrame(list_turtles)

In [225]:
list_turtles_df.head()

Unnamed: 0,image,name,description
0,https://upload.wikimedia.org/wikipedia/commons...,Carettochelyidae,The Carettochelyidae family of turtles — more ...
1,https://upload.wikimedia.org/wikipedia/commons...,Cheloniidae,The Cheloniidae family of turtles — more commo...
2,https://upload.wikimedia.org/wikipedia/commons...,Chelydridae,The Chelydridae family of turtles — more commo...
3,https://upload.wikimedia.org/wikipedia/commons...,Dermatemydidae,The Dermatemydidae family of turtles — more co...
4,https://upload.wikimedia.org/wikipedia/commons...,Dermochelyidae,The Dermochelyidae family of turtles — more co...
