## Web Scraping (Teknologi Perekayasaan Data)

## BeautifulSoup + Requests

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
# Browser-like request headers sent along with the page request.
headers = {
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) Chrome/126.0.0.0 Safari/537.36',
    'accept': 'text/html',
}

# Fetch the IMDb chart page; the timeout keeps the cell from hanging forever on a stalled connection.
halaman = requests.get('https://www.imdb.com/chart/top/', headers=headers, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of silently parsing an error page in the cells below.
halaman.raise_for_status()
# Parse the raw response bytes with BeautifulSoup's built-in HTML parser.
halaman_parsed = BeautifulSoup(halaman.content, 'html.parser')

In [None]:
# Preview only the first 500 characters; displaying the whole HTML document floods the cell output.
halaman.text[:500]

In [None]:
# Sanity check: collect every <h3> element found in the parsed page.
test = halaman_parsed.select("h3")

# Only a bare expression on the last line of a cell is displayed; a `True` nested
# inside an `if` block never shows up. True here means no <h3> was found.
len(test) == 0

In [None]:
# Title links: one <a> per movie row in the chart list.
juduls = halaman_parsed.select("ul li.ipc-metadata-list-summary-item a.ipc-title-link-wrapper")

# Show a small sample instead of the full result list (same preview size as the ratings cell).
juduls[:5]

In [None]:
# Rating <span> elements nested inside the list items.
ratings = halaman_parsed.select(
    "ul li span.ipc-rating-star--rating"
)

# Display the first five matches only.
ratings[0:5]

In [None]:
import re

# One <li> element per movie row on the chart page.
movies = halaman_parsed.select("ul li.ipc-metadata-list-summary-item")

# Matches only a leading rank prefix such as "12. ". str.lstrip('0123456789. ') strips a
# *set of characters*, which also eats digits that belong to the title itself
# (e.g. "5. 12 Angry Men" would become "Angry Men").
pola_peringkat = re.compile(r'^\d+\.\s*')

hasil_data = []
for movie in movies:
    teks_judul = movie.select('a.ipc-title-link-wrapper h3')[0].get_text()
    judul = pola_peringkat.sub('', teks_judul)
    # Run the metadata selector once; the first span is read as the year, the second as the duration.
    metadata = movie.select('div.cli-title-metadata span')
    tahun = metadata[0].get_text()
    durasi = metadata[1].get_text()
    rating = movie.select('span.ipc-rating-star--rating')[0].get_text()
    hasil_data.append({"judul": judul, "tahun": tahun, "durasi": durasi, "rating": rating})

hasil_data

In [None]:
# Variant that also scrapes and downloads each movie's poster image.
import re
from os.path  import basename

# One <li> element per movie row on the chart page.
movies = halaman_parsed.select("ul li.ipc-metadata-list-summary-item")

# Matches only a leading rank prefix such as "12. ". str.lstrip('0123456789. ') strips a
# *set of characters*, which also eats digits that belong to the title itself.
pola_peringkat = re.compile(r'^\d+\.\s*')

hasil_data = []
for movie in movies:
    teks_judul = movie.select('a.ipc-title-link-wrapper h3')[0].get_text()
    judul = pola_peringkat.sub('', teks_judul)
    # Run the metadata selector once; the first span is read as the year, the second as the duration.
    metadata = movie.select('div.cli-title-metadata span')
    tahun = metadata[0].get_text()
    durasi = metadata[1].get_text()
    rating = movie.select('span.ipc-rating-star--rating')[0].get_text()
    poster = movie.select('img.ipc-image')[0].get("src")
    hasil_data.append({"judul": judul, "tahun": tahun, "durasi": durasi, "rating": rating, "poster": poster})

    # Download the image before opening the file, so a failed request does not leave an
    # empty or HTML-filled file behind.
    respons_poster = requests.get(poster, timeout=30)
    respons_poster.raise_for_status()
    # NOTE: the target folder must already exist; the file name is the last segment of the poster URL.
    with open('/content/drive/MyDrive/TPD_2025/imdb_images/'+basename(poster), 'wb') as f:
      f.write(respons_poster.content)


In [None]:
import pandas as pd

# Turn the list of per-movie dicts into a table (one row per movie).
hasil_data_df = pd.DataFrame(data=hasil_data)

# Preview the top of the table; head() returns five rows by default.
hasil_data_df.head()

In [None]:
# Persist the scraped table as CSV, without the DataFrame index column.
hasil_data_df.to_csv(
    path_or_buf='/content/drive/MyDrive/TPD_2025/top_movies.csv',
    index=False,
)

## Scrapy

In [None]:
#Scrapy = complete package, tidak seperti BeautifulSoup yang harus juga import requests/selenium/urlib2 untuk "request"
!pip install Scrapy

In [70]:
# Confirm that Scrapy imports correctly and show which version is installed.
import scrapy
print(scrapy.__version__)

2.12.0


In [72]:
import scrapy
import os
# Switch the working directory to the Scrapy workspace folder on Google Drive,
# so the shell commands in the following cells run from there.
# NOTE(review): assumes Drive is already mounted and this folder exists — no mount cell is visible here.
os.chdir('/content/drive/MyDrive/TPD_2025/scrapy_imdb/')

In [73]:
!pwd # print the working directory to confirm we are already in the project folder

/content/drive/MyDrive/TPD_2025/scrapy_imdb


In [None]:
# Generate a new Scrapy project skeleton named "scraping_movies" in the current working directory.
!scrapy startproject scraping_movies

In [None]:
# Create a new spider named "movies": first move into the project's spiders/ folder,
# then generate the spider for the IMDb chart URL.
os.chdir('/content/drive/MyDrive/TPD_2025/scrapy_imdb/scraping_movies/scraping_movies/spiders/')
!scrapy genspider movies https://www.imdb.com/chart/top/

In [None]:
# Run the "movies" spider; without an output option the scraped items only appear in the log.
!scrapy crawl movies

In [None]:
# Run the "movies" spider and export the items to CSV.
# -O overwrites the file; -o appends, so re-running the cell would duplicate rows in top_movies.csv.
!scrapy crawl movies -O top_movies.csv

## Setting Selenium untuk Colab

As of Feb 2023, installing Selenium on Colab requires a workaround, because the Colab system was recently updated from Ubuntu 18.04 to Ubuntu 20.04 LTS.

In [None]:
%%shell
# Installs Chromium, chromedriver and Selenium on the Colab VM.
# Ubuntu no longer distributes chromium-browser outside of snap
#
# Proposed solution: https://askubuntu.com/questions/1204571/how-to-install-chromium-without-snap

# Add debian buster
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# Add keys
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

# Export each key into its own keyring file (referenced by signed-by above).
# --yes lets gpg overwrite an existing keyring, so the cell can be re-run safely.
apt-key export 77E11517 | gpg --yes --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --yes --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --yes --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# Prefer debian repo for chromium* packages only
# Note the double-blank lines between entries
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300


Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF

# Install chromium and chromium-driver
# -y answers the confirmation prompt automatically (there is no interactive terminal in a notebook cell)
apt-get update
apt-get install -y chromium chromium-driver

# Install selenium
pip install selenium

## BeautifulSoup + Selenium

In [99]:
# JALANKAN CODE DI BAWAH UNTUK IMPORT SELENIUM DAN PERSIAPAN DRIVER CHROME

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Use the chromedriver binary at its fixed path on the VM.
service = Service(executable_path=r'/usr/bin/chromedriver')

# Chrome launch flags, applied in this order.
options = webdriver.ChromeOptions()
for flag in (
    '--headless',               # run Chrome without a GUI
    '--no-sandbox',
    '--disable-dev-shm-usage',
):
    options.add_argument(flag)

# Start the browser session shared by the cells below.
driver = webdriver.Chrome(options=options, service=service)

print(driver)

<selenium.webdriver.chrome.webdriver.WebDriver (session="5983a94f516e6d71a94cce3bfdbce1fd")>


In [118]:
# Drive the detik.com search form: open the results for "inflasi", narrow them by
# site and date range, then submit. Statement order matters — each step acts on the live page.
import time
driver.get("https://www.detik.com/search/searchall?query=inflasi")

# choose the site filter - "detikfinance" (option value '29' of the 'kanal' dropdown)
selectsite = Select(driver.find_element(By.ID, 'kanal'))
selectsite.select_by_value('29')

# choose the time filter - "Pilih Tanggal" (custom date range)
selecttime = Select(driver.find_element(By.ID, 'timeselect'))
selecttime.select_by_value('yes')

# fill in the start and end dates; wait up to 10 seconds for each date field to be present in the DOM
wait = WebDriverWait(driver, 10)
starttime = wait.until(EC.presence_of_element_located((By.ID, 'startPicker')))
starttime.send_keys('01/01/2025')  # Start date

endtime = wait.until(EC.presence_of_element_located((By.ID, 'endPicker')))
endtime.send_keys('28/02/2025')  # End date

# Submit the form by pressing Enter in the end-date field
endtime.send_keys(Keys.RETURN)
# Fixed pause, presumably to let the filtered results finish loading before the next cell reads the page
time.sleep(5)


In [119]:
# Scrape the search results from the page currently loaded in the Selenium driver
from bs4 import BeautifulSoup


def _teks_atau_strip(induk, selector):
    """Return the stripped text of the first element under `induk` matching `selector`, or "-" if none matches."""
    elemen = induk.select_one(selector)
    return elemen.get_text(strip=True) if elemen else "-"


# Parse the HTML as rendered by the browser (after the search form was submitted).
halaman_parsed = BeautifulSoup(driver.page_source, 'html.parser')

# One <article> per search result.
beritas = halaman_parsed.select("div.list-content article")

hasil_data_sel = []
for berita in beritas:
    # All three fields use the same "-" fallback, so an article lacking a date or
    # title no longer aborts the whole loop with an IndexError.
    info = _teks_atau_strip(berita, 'div.media__date')
    judul = _teks_atau_strip(berita, 'h3.media__title a')
    isi_text = _teks_atau_strip(berita, 'div.media__desc')

    hasil_data_sel.append({"judul": judul, "info": info, "isi": isi_text})

hasil_data_sel_df = pd.DataFrame(hasil_data_sel)
hasil_data_sel_df

Unnamed: 0,judul,info,isi
0,"Menteri PU Dilema Soal Truk Obesitas, Dilarang...","Jumat, 21 Feb 2025 19:15 WIB",Menteri PU Dody Hanggodo menyatakan larangan t...
1,"Inflasi Terendah Sepanjang Sejarah, Pemerintah...","Jumat, 03 Jan 2025 14:51 WIB",Kemenko Perekonomian melaporkan inflasi Desemb...
2,"Inflasi Tembus Rekor Terendah, Daya Beli Lesu?","Jumat, 03 Jan 2025 12:45 WIB",Peneliti Institute for Demographic and Affluen...
3,Video Kebijakan Penurunan Harga Tiket Pesawat ...,"Sabtu, 04 Jan 2025 09:00 WIB",-
4,"Inflasi RI Terendah Sepanjang Sejarah, Sri Mul...","Senin, 06 Jan 2025 11:50 WIB",BPS mencatat inflasi tahun kalender atau year ...
5,Video: BPS Catat Inflasi di Desember 2024 Capa...,"Kamis, 02 Jan 2025 21:46 WIB","Menurut data Badan Pusat Statistik, inflasi se..."
6,"Bahas Inflasi, Luhut Pesan ke Prabowo Jangan K...","Selasa, 18 Feb 2025 18:48 WIB",Ketua Dewan Ekonomi Nasional (DEN) Luhut Binsa...
7,"Inflasi Januari 0,76%, Terendah Sejak Tahun 2000","Minggu, 09 Feb 2025 20:00 WIB",-
8,Sri Mulyani Sebut Banyak Negara Iri dengan Tin...,"Selasa, 11 Feb 2025 15:53 WIB",Menteri Keuangan Sri Mulyani Indrawati mengkla...
9,"Inflasi Masih di Atas Target, Bank Sentral AS ...","Minggu, 05 Jan 2025 17:00 WIB",The Federal Reserve berupaya menangani inflasi...


## Scraping Data dengan APIs

In [116]:
# Wikipedia: see https://www.mediawiki.org/wiki/API:Main_page for the parameters of each call
import requests
import pandas as pd

url = 'https://en.wikipedia.org/w/api.php'
# Full-text search for "data engineer", answered as JSON.
params = {
        'action': 'query',
        'list':'search',
        'srsearch':'data engineer',
        'format': 'json'
    }
# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, params=params, timeout=30)

if response.status_code == 200:
    # Extract the JSON data from the response
    data = response.json()

    # The API can answer HTTP 200 with an 'error' body instead of 'query';
    # look the results up defensively rather than raising KeyError.
    hasil_cari = data.get('query', {}).get('search')
    if hasil_cari is None:
        print(f"Respons API tidak memuat hasil pencarian: {data.get('error', data)}")
    else:
        dtwiki = pd.json_normalize(hasil_cari)
        # NOTE(review): this writes to 'TPD_scraping', not the 'TPD_2025' folder used by earlier cells — confirm the path.
        dtwiki.to_csv('/content/drive/MyDrive/TPD_scraping/wiki_dataenginner.csv', index=False)

else:
    print(f"Request gagal dengan status {response.status_code}")



In [None]:
# Twitter: see https://developer.twitter.com/en/docs/api-reference-index
import os
from getpass import getpass

import requests

url = "https://api.twitter.com/2/tweets/search/recent"
# Never store the access token in the notebook: read it from the TWITTER_BEARER_TOKEN
# environment variable, or ask for it interactively (input is not echoed or saved).
token = os.environ.get("TWITTER_BEARER_TOKEN") or getpass("Twitter bearer token: ")
headers = {
    "Authorization": f"Bearer {token}"
}
params = {
    "query": "climate", # fill in the search keyword
    "max_results": 10
}

# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, headers=headers, params=params, timeout=30)
if response.status_code == 200:
    # Extract the JSON data from the response
    data = response.json()

    print(data)
else:
    print(f"Request gagal dengan status {response.status_code}")



## PENUGASAN (1 Week Assignment)

1. Buat File .ipynb yang baru dengan nama *Praktikum4_NIM.ipynb*

2. Lakukan web scraping menggunakan **Requests + BeautifulSoup** untuk mengekstrak Tabel **Matching Cryptocurrencies** pada tab **Table View** di Yahoo!Finance  https://finance.yahoo.com/crypto/ (cukup 3 halaman saja - 75 rows). **Hints**: lihat perubahan URL untuk setiap halaman table dan lakukan looping per halaman

3. Lakukan web scraping menggunakan **Scrapy** untuk mengekstrak data negara di https://www.scrapethissite.com/pages/simple/

4. Buat data scraping menggunakan API:
  - Pilih salah satu penyedia data: Twitter/Instagram/Facebook/Google/atau lainnya
  - Sign Up untuk mendapatkan akses autentikasi API (jika diperlukan)
  - Baca dan pelajari dokumentasi API
  - Ambil data dari penyedia data (cukup satu jenis data saja - 1 API method)


---



**Silakan submit di Google Classroom:**
1. File notebook .ipynb
2. File spider Scrapy yang sudah diedit dari no 3
3. Hasil screenshot data CSV hasil dari tugas no 2,3,4 (bukan file datanya)



