
# Data Scraping
**Libraries:**
- [Pandas](https://pandas.pydata.org/)
- [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)
- [Requests](https://docs.python-requests.org/en/latest/)


1. [Pandas](https://lms.sdmdigital.id/mod/book/view.php?id=21829&chapterid=366)

In [1]:
import pandas as pd
from termcolor import cprint
# Load the semicolon-delimited CSV into a DataFrame.
data = pd.read_csv('data/data_nasabah.csv', sep=';')

# Report the frame's dimensions (rows, columns) in green.
shape_message = f"Data (Rows, Columns): {data.shape}"
cprint(shape_message, "green")

# Preview the top of the frame; head() defaults to the first 5 rows.
preview = data.head()
print(preview)

[32mData (Rows, Columns): (100, 10)[0m
  nasabah_id  umur jenis_kelamin  pendapatan  saldo_rata_rata  \
0       N001    22     Perempuan     5800000          1508000   
1       N002    64     Perempuan     5700000          1254000   
2       N003    27     Perempuan     2950000           590000   
3       N004    34     Perempuan     3100000           186000   
4       N005    45     Laki-Laki     6700000          1474000   

   jumlah_transaksi  jenis_produk  frekuensi_kunjungi_cabang  \
0                19      tabungan                          1   
1                 9  kartu_kredit                          2   
2                12      tabungan                          1   
3                16      deposito                          5   
4                15  kartu_kredit                          3   

  pengguna_mobile_banking  skor_kredit  
0                   TIDAK          900  
1                   TIDAK          900  
2                      YA          500  
3                  

2. [BeautifulSoup](https://lms.sdmdigital.id/mod/book/view.php?id=21829&chapterid=363)

In [2]:
import requests
from bs4 import BeautifulSoup

# URL of the news article to scrape
url = 'https://www.kompas.com/global/read/2025/04/09/123149070/china-akan-larang-semua-film-dari-as-balas-tarif-impor-104-persen-trump'

# Headers to mimic a browser request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Send a GET request to the URL with headers. The timeout (in seconds) keeps
# the cell from hanging indefinitely if the server never answers.
response = requests.get(url, headers=headers, timeout=10)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # The headline is looked up as <h1 class="read__title">. find() returns
    # None when no such element exists, so check before reading its text.
    title_tag = soup.find('h1', class_='read__title')

    if title_tag is not None:
        judul_berita = title_tag.text.strip()

        # Wrap the title in a one-row DataFrame
        df_berita = pd.DataFrame({
            'judul': [judul_berita]
        })

        # Display the DataFrame
        print(df_berita)
    else:
        print('Error: article title (h1.read__title) not found in the page')
else:
    print(f'Error: {response.status_code}, {response.reason}')

                                               judul
0  China Akan Larang Semua Film dari AS, Balas Ta...


 **Exercise:** [Simple Web Scraping](https://www.scrapethissite.com/pages/simple/)

In [3]:

url = 'https://www.scrapethissite.com/pages/simple/'

# Send a GET request to the URL. The timeout (in seconds) keeps the cell from
# hanging indefinitely if the server never answers.
response = requests.get(url, timeout=10)
print(f"URL: {response.url} Status Code: {response.status_code}, Reason: {response.reason}")

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')
country_blocks = soup.find_all('div', class_='col-md-4 country')
cprint(f"Found {len(country_blocks)} countries.", "green")

# Output column -> (tag name, CSS class) of the element holding that value
# inside each country block.
COUNTRY_FIELDS = {
    'name': ('h3', 'country-name'),
    'capital': ('span', 'country-capital'),
    'population': ('span', 'country-population'),
    'area': ('span', 'country-area'),
}


def _field_text(block, tag, css_class):
    """Return the stripped text of the first matching child of ``block``.

    Returns None when the element is absent, so a single malformed country
    block yields a missing value instead of aborting the whole scrape.
    """
    node = block.find(tag, class_=css_class)
    return node.text.strip() if node is not None else None


# Extract country data: one dict per block, keys in COUNTRY_FIELDS order
countries = [
    {
        field: _field_text(block, tag, css_class)
        for field, (tag, css_class) in COUNTRY_FIELDS.items()
    }
    for block in country_blocks
]

# Convert to DataFrame
df_countries = pd.DataFrame(countries)
# Display the DataFrame
print(df_countries.head(10))

# Save to CSV (uncommenting this line also requires `import csv`)
# df_countries.to_csv('data/countries.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)


URL: https://www.scrapethissite.com/pages/simple/ Status Code: 200, Reason: OK
[32mFound 250 countries.[0m
                   name           capital population       area
0               Andorra  Andorra la Vella      84000      468.0
1  United Arab Emirates         Abu Dhabi    4975593    82880.0
2           Afghanistan             Kabul   29121286   647500.0
3   Antigua and Barbuda        St. John's      86754      443.0
4              Anguilla        The Valley      13254      102.0
5               Albania            Tirana    2986952    28748.0
6               Armenia           Yerevan    2968000    29800.0
7                Angola            Luanda   13068161  1246700.0
8            Antarctica              None          0      1.4E7
9             Argentina      Buenos Aires   41343201  2766890.0


 **Exercise:** [Web Scraping with Pagination](https://www.scrapethissite.com/pages/forms/)

In [4]:
base_url = 'https://www.scrapethissite.com/pages/forms/'

# Probe the base URL once and report its status. The timeout (in seconds)
# keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(base_url, timeout=10)
print(f"Base URL: {response.url} Status Code: {response.status_code}, Reason: {response.reason}")

# Output column -> CSS class of the <td> that holds the value in each team row
TEAM_COLUMNS = {
    'Team Name': 'name',
    'Year': 'year',
    'Wins': 'wins',
    'Losses': 'losses',
    'OT Losses': 'ot-losses',
    'Win %': 'pct',
    'Goal For (GF)': 'gf',
    'Goal Against (GA)': 'ga',
    '+ / -': 'diff',
}

teams = []
for page in range(1, 7):  # Scrape first 6 pages
    page_url = f"{base_url}?page_num={page}&per_page=100"
    response = requests.get(page_url, timeout=10)
    print(f"Page {page} URL: {response.url} Status Code: {response.status_code}, Reason: {response.reason}")

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table. find() returns None when it is missing (e.g. an error
    # page); skip that page rather than crash and lose the rows already collected.
    table = soup.find('table', class_='table')
    if table is None:
        cprint(f"Page {page}: no results table found, skipping.", "red")
        continue
    rows = table.find_all('tr', class_='team')

    # Extract team data: one dict per row, keys in TEAM_COLUMNS order
    for row in rows:
        teams.append({
            column: row.find('td', class_=css_class).text.strip()
            for column, css_class in TEAM_COLUMNS.items()
        })

# Convert to DataFrame
df_teams = pd.DataFrame(teams)
# Display the DataFrame; `soup` here is the last page parsed by the loop
print(f"Title: {soup.title.string}")
cprint(f"Found {len(df_teams)} teams.", "green")
print(df_teams.head(10))

# Save to CSV (uncommenting this line also requires `import csv`)
# df_teams.to_csv('data/teams.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)

Base URL: https://www.scrapethissite.com/pages/forms/ Status Code: 200, Reason: OK
Page 1 URL: https://www.scrapethissite.com/pages/forms/?page_num=1&per_page=100 Status Code: 200, Reason: OK
Page 2 URL: https://www.scrapethissite.com/pages/forms/?page_num=2&per_page=100 Status Code: 200, Reason: OK
Page 3 URL: https://www.scrapethissite.com/pages/forms/?page_num=3&per_page=100 Status Code: 200, Reason: OK
Page 4 URL: https://www.scrapethissite.com/pages/forms/?page_num=4&per_page=100 Status Code: 200, Reason: OK
Page 5 URL: https://www.scrapethissite.com/pages/forms/?page_num=5&per_page=100 Status Code: 200, Reason: OK
Page 6 URL: https://www.scrapethissite.com/pages/forms/?page_num=6&per_page=100 Status Code: 200, Reason: OK
Title: Hockey Teams: Forms, Searching and Pagination | Scrape This Site | A public sandbox for learning web scraping
[32mFound 582 teams.[0m
               Team Name  Year Wins Losses OT Losses  Win % Goal For (GF)  \
0          Boston Bruins  1990   44     24 

 **Exercise:** [Web Scraping with API](https://www.scrapethissite.com/pages/ajax-javascript/)

In [5]:
base_url = 'https://www.scrapethissite.com/pages/ajax-javascript/?ajax=true&year=2014'

# The timeout (in seconds) keeps the cell from hanging indefinitely if the
# server never answers.
response = requests.get(base_url, timeout=10)

# Only decode the body as JSON for a successful response; an error page is
# not guaranteed to be JSON and would make response.json() raise.
if response.status_code == 200:
    cprint(f"Base URL: {response.url} Status Code: {response.status_code}, Reason: {response.reason}", "green")

    movies_data = response.json()
    moives_data = movies_data  # misspelled legacy name, kept so existing references still work
    df_movies = pd.DataFrame(movies_data)  # Build a DataFrame from the JSON records

    print(movies_data) # Display JSON data
    # Display the DataFrame (to_string() prints every row and column)
    cprint(f"Found {len(df_movies)} movies.", "green")
    print(df_movies.to_string())
else:
    cprint(f"Error: {response.status_code}, {response.reason}", "red")



[32mBase URL: https://www.scrapethissite.com/pages/ajax-javascript/?ajax=true&year=2014 Status Code: 200, Reason: OK[0m
[{'title': 'Birdman', 'year': 2014, 'awards': 4, 'nominations': 9, 'best_picture': True}, {'title': 'The Grand Budapest Hotel', 'year': 2014, 'awards': 4, 'nominations': 9}, {'title': 'Whiplash', 'year': 2014, 'awards': 3, 'nominations': 5}, {'title': 'The Imitation Game', 'year': 2014, 'awards': 1, 'nominations': 8}, {'title': 'American Sniper', 'year': 2014, 'awards': 1, 'nominations': 6}, {'title': 'Boyhood', 'year': 2014, 'awards': 1, 'nominations': 6}, {'title': 'Interstellar', 'year': 2014, 'awards': 1, 'nominations': 5}, {'title': 'The Theory of Everything', 'year': 2014, 'awards': 1, 'nominations': 5}, {'title': 'Ida', 'year': 2014, 'awards': 1, 'nominations': 2}, {'title': 'Selma', 'year': 2014, 'awards': 1, 'nominations': 2}, {'title': 'Citizenfour', 'year': 2014, 'awards': 1, 'nominations': 1}, {'title': 'Big Hero 6', 'year': 2014, 'awards': 1, 'nominatio

**Exercise:** [Spoofing Header](https://www.scrapethissite.com/pages/advanced/?gotcha=headers)

In [6]:
url = 'https://www.scrapethissite.com/pages/advanced/?gotcha=headers'

# Browser-like request headers sent instead of the requests defaults
headers = {
    'User-Agent' :'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:139.0) Gecko/20100101 Firefox/139.0',
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
}

# The timeout (in seconds) keeps the cell from hanging indefinitely if the
# server never answers.
response = requests.get(url, headers=headers, timeout=10)

if response.status_code == 200:
    cprint(f"URL: {response.url} Status Code: {response.status_code}, Reason: {response.reason}", "green")
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Locate the page <title> and the message container. Either lookup can
    # come back as None, so check both before reading their text.
    title_tag = soup.title
    message_block = soup.find('div', class_ ='col-md-4 col-md-offset-4')

    if title_tag is None or title_tag.string is None or message_block is None:
        cprint("Error: expected <title> or message <div> not found in the page", "red")
    else:
        title = title_tag.string.strip()
        text = message_block.text.strip()

        # Display the title and the message text
        print(f"Title: {title}")
        print(f"Text: {text}")
else:
    cprint(f"Error: {response.status_code}, {response.reason}", "red")

[32mURL: https://www.scrapethissite.com/pages/advanced/?gotcha=headers Status Code: 200, Reason: OK[0m
Title: Advanced Topics: Real World Challenges You'll Encounter | Scrape This Site | A public sandbox for learning web scraping
Text: Headers properly spoofed, request appears to be coming from a browser :)


**Exercise:** [Login Form](https://www.scrapingcourse.com/)

In [7]:
url = 'https://www.scrapingcourse.com/login'

# Form fields submitted to the login endpoint.
# NOTE(review): presumably the practice site's public demo account — confirm.
# Real credentials should come from environment variables or getpass instead.
payload = {
    'email' : 'admin@example.com',
    'password': 'password'
}

# POST the form data. The timeout (in seconds) keeps the cell from hanging
# indefinitely if the server never answers.
response = requests.post(url, data=payload, timeout=10)

# response.url is the final URL after any redirects have been followed
print(f"URL: {response.url}, Status Code: {response.status_code}, Reason: {response.reason}")

print(response.text)  # Display the response text

URL: https://www.scrapingcourse.com:443/dashboard, Status Code: 200, Reason: OK
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta http-equiv="X-UA-Compatible" content="ie=edge">
    <title>Success Page - ScrapingCourse.com</title>

    <!-- Bootstrap CSS -->
    <link href="https://stackpath.bootstrapcdn.com/bootstrap/4.5.2/css/bootstrap.min.css" rel="stylesheet">
    <script src="https://challenges.cloudflare.com/turnstile/v0/api.js" async defer></script>
    <!-- Google tag (gtag.js) -->
    <script async src="https://www.googletagmanager.com/gtag/js?id=G-NZGD14H87G"></script>
    <script>
        window.dataLayer = window.dataLayer || [];
        function gtag(){dataLayer.push(arguments);}
        gtag('js', new Date());
        gtag('config', 'G-NZGD14H87G');
    </script>
    <link rel="preload" as="style" href="https://www.scrapingcourse.com/build/assets/app-5Cdbk7yA.css" /><link

**Exercise:** Login form with CSRF Token

In [30]:
url = 'https://www.scrapingcourse.com/login/csrf'

# Use one Session for both requests so cookies set by the GET are sent back
# with the POST. The session is left open for any later requests.
session = requests.Session()

# Fetch the login page; the timeout (in seconds) prevents an indefinite hang.
response = session.get(url, timeout=10)
soup = BeautifulSoup(response.text, 'html.parser')

# The CSRF token is read from the hidden <input name="_token">. find() returns
# None when the field is missing, so check before reading its value.
token_input = soup.find('input', {'name': '_token'})

if token_input is None or not token_input.get('value'):
    print('Login failed!')
    print("CSRF token input '_token' not found on the login page.")
else:
    csrf_token = token_input['value']
    cprint(csrf_token, 'green')

    payload = {
        'email': 'admin@example.com',
        'password': 'password',
        '_token': csrf_token
    }

    response = session.post(url, data=payload, timeout=10)

    # requests follows redirects, so a rejected login that is sent back to the
    # login page can also end with status 200. Treat the login as successful
    # only when the final URL is the dashboard.
    reached_dashboard = response.url.rstrip('/').endswith('/dashboard')

    if response.status_code == 200 and reached_dashboard:
        print('Login successful!')
        print(f"URL: {response.url}, Status Code: {response.status_code}, Reason: {response.reason}")
    else:
        print('Login failed!')
        print(f"Status Code: {response.status_code}, Reason: {response.reason}")
        print(f"Final URL: {response.url}")


[32mu2A8a4Qcz5oWNqaA575NKbVPyKC4PIIunnlHXDpQ[0m
Login successful!
URL: https://www.scrapingcourse.com:443/dashboard, Status Code: 200, Reason: OK
