
# Data Scraping
**Libraries:**
- [Pandas](https://pandas.pydata.org/)
- [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)
- [Requests](https://docs.python-requests.org/en/latest/)


1. [Pandas](https://lms.sdmdigital.id/mod/book/view.php?id=21829&chapterid=366)

In [None]:
import pandas as pd
# Load the semicolon-separated CSV file into a DataFrame
data = pd.read_csv('data/data.csv', sep=';')
# Show the first five rows, followed by the (rows, columns) shape
print(data.head(), data.shape, sep='\n')

2. [BeautifulSoup](https://lms.sdmdigital.id/mod/book/view.php?id=21829&chapterid=363)

In [None]:
import requests
from bs4 import BeautifulSoup

# URL of the news article to scrape
url = 'https://www.kompas.com/global/read/2025/04/09/123149070/china-akan-larang-semua-film-dari-as-balas-tarif-impor-104-persen-trump'

# Headers to mimic a browser request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Send a GET request to the URL with headers.
# The timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, headers=headers, timeout=10)

# Only parse the page when the request was successful
if response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the headline element; find() returns None when no such tag exists,
    # so check before reading .text to avoid an AttributeError.
    title_tag = soup.find('h1', class_='read__title')
    if title_tag is None:
        print(f'Error: title element not found at {url}')
    else:
        judul_berita = title_tag.text.strip()

        # Store the title in a single-row DataFrame
        df_berita = pd.DataFrame({
            'judul': [judul_berita]
        })

        # Display the DataFrame
        print(df_berita)
else:
    print(f'Error: {response.status_code}, {response.reason}')

 **Exercise:** [Simple Web Scraping](https://www.scrapethissite.com/pages/simple/)

In [None]:
import csv

url = 'https://www.scrapethissite.com/pages/simple/'
# Send a GET request to the URL; the timeout prevents an indefinite hang
response = requests.get(url, timeout=10)
print(f"URL: {response.url} Status Code: {response.status_code}, Reason: {response.reason}")
# Stop on an HTTP error (4xx/5xx) instead of parsing an error page and
# overwriting the CSV below with an empty table.
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')
country_blocks = soup.find_all('div', class_='col-md-4 country')
print(f"Found {len(country_blocks)} countries.")

# Extract country data: one dict per country block
countries = []
for block in country_blocks:
    name = block.find('h3', class_='country-name').text.strip()
    capital = block.find('span', class_='country-capital').text.strip()
    population = block.find('span', class_='country-population').text.strip()
    area = block.find('span', class_='country-area').text.strip()
    countries.append({
        'name': name,
        'capital': capital,
        'population': population,
        'area': area,
    })

# Convert to DataFrame
df_countries = pd.DataFrame(countries)
# Display the DataFrame
print(df_countries.head(10))

# Save to CSV. All values are kept as the scraped strings, so
# QUOTE_NONNUMERIC ends up quoting every field.
df_countries.to_csv('data/countries.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)


 **Exercise:** [Web Scraping Paginate](https://www.scrapethissite.com/pages/forms/)

In [41]:
base_url = 'https://www.scrapethissite.com/pages/forms/'
# Request the landing page once to report its status; the timeout prevents an indefinite hang
response = requests.get(base_url, timeout=10)
print(f"Base URL: {response.url} Status Code: {response.status_code}, Reason: {response.reason}")

# Output column label -> CSS class of the <td> that holds the value.
# Dict order defines the column order of the resulting DataFrame.
team_columns = {
    'Team Name': 'name',
    'Year': 'year',
    'Wins': 'wins',
    'Losses': 'losses',
    'OT Losses': 'ot-losses',
    'Win %': 'pct',
    'Goal For (GF)': 'gf',
    'Goal Against (GA)': 'ga',
    '+ / -': 'diff',
}

teams = []
for page in range(1, 7):  # Scrape first 6 pages
    page_url = f"{base_url}?page_num={page}&per_page=100"
    response = requests.get(page_url, timeout=10)
    print(f"Page {page} URL: {response.url} Status Code: {response.status_code}, Reason: {response.reason}")
    # Abort on an HTTP error rather than silently collecting partial data
    response.raise_for_status()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    # Find the table; find() returns None if the page has no matching table
    table = soup.find('table', class_='table')
    if table is None:
        print(f"Page {page}: no table found, skipping.")
        continue
    rows = table.find_all('tr', class_='team')

    # Extract team data: one dict per table row
    for row in rows:
        teams.append({
            label: row.find('td', class_=css_class).text.strip()
            for label, css_class in team_columns.items()
        })

# Convert to DataFrame
df_teams = pd.DataFrame(teams)
# Display the DataFrame (the title comes from the last page parsed)
print(f"Title: {soup.title.string}")
print(f"Found {len(df_teams)} teams.")
print(df_teams.head(10))

# Save to CSV
df_teams.to_csv('data/teams.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)
print("Data saved to data/teams.csv")

Base URL: https://www.scrapethissite.com/pages/forms/ Status Code: 200, Reason: OK
Page 1 URL: https://www.scrapethissite.com/pages/forms/?page_num=1&per_page=100 Status Code: 200, Reason: OK
Page 2 URL: https://www.scrapethissite.com/pages/forms/?page_num=2&per_page=100 Status Code: 200, Reason: OK
Page 3 URL: https://www.scrapethissite.com/pages/forms/?page_num=3&per_page=100 Status Code: 200, Reason: OK
Page 4 URL: https://www.scrapethissite.com/pages/forms/?page_num=4&per_page=100 Status Code: 200, Reason: OK
Page 5 URL: https://www.scrapethissite.com/pages/forms/?page_num=5&per_page=100 Status Code: 200, Reason: OK
Page 6 URL: https://www.scrapethissite.com/pages/forms/?page_num=6&per_page=100 Status Code: 200, Reason: OK
Title: Hockey Teams: Forms, Searching and Pagination | Scrape This Site | A public sandbox for learning web scraping
Found 582 teams.
               Team Name  Year Wins Losses OT Losses  Win % Goal For (GF)  \
0          Boston Bruins  1990   44     24          

 **Exercise:** [Web Scraping with API](https://www.scrapethissite.com/pages/api/)