# Scrape News Website

In [None]:
!pip install --upgrade gspread
!pip install tldextract

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# HTML parsing and HTTP fetching for the article pages
from bs4 import BeautifulSoup
import requests

import pandas as pd

# Colab sign-in. Presumably this has to run before `default()` is called in a
# later cell so that credentials are available -- TODO confirm.
from google.colab import auth
auth.authenticate_user()

from google.auth import default
# NOTE(review): ServiceAccountCredentials is not referenced in the visible cells.
from oauth2client.service_account import ServiceAccountCredentials
import gspread

from urllib.parse import urlparse

In [None]:
# Set up the Google Sheets API credentials.
# `default()` returns a (credentials, project_id) pair and only the credentials
# are needed here. The second item is bound to a named throwaway rather than
# `_`: assigning `_` at notebook top level makes IPython stop updating its
# last-output variables (`_`, `__`, `___`).
creds, _project_id = default()
client = gspread.authorize(creds)

In [None]:
# Open the worksheet that holds the keyword list and the media tier lists
sheet_url = "..."
sheet_name = "..."
sheet = client.open_by_url(sheet_url).worksheet(sheet_name)

# Columns 1-5 are read in order (keywords, then Tier 1 to Tier 4); the first
# entry of every column is the header and is dropped.
keywords, tier1, tier2, tier3, tier4 = (
    sheet.col_values(column)[1:] for column in range(1, 6)
)
tier4 = sheet.col_values(5)[1:]

In [None]:
# Display the keyword list read from the sheet (nothing is combined here)
keywords

['#TerusBelajar',
 'Adelina Anggraini',
 'Ayo Jadi Guru',
 'Budi Setiawan',
 'Budi Setiawan Muhammad',
 'Bukik Setiawan',
 'Cerita Guru Belajar',
 'Elisabet Indah Susanti',
 'Ilona Christina Kakerissa',
 'Kampus Guru Cikal',
 'Kampus Pemimpin Merdeka',
 'Lingkar Daerah Belajar',
 'Marsaria Primadonna',
 'Merayakan Asesmen Merdeka Belajar',
 'Nur Kholis Makki',
 'Pendidikan Penggerak Merdeka Belajar',
 'Rizqy Rahmat Hani',
 'Rizqy Rahmat Hanni',
 'Siap Kurikulum',
 'Surat Kabar Guru Belajar',
 'Surat Kabar Pemimpin Belajar',
 'Temu Pendidik Nusantara',
 'Temu Pendidik Nusantara 9',
 'TerusBelajar',
 'Wardah Inspiring Teacher',
 'Yayasan Guru Belajar']

In [None]:
# Map each tier label to the list of media names read for that tier
tiers = dict(zip(
    ('Tier 1', 'Tier 2', 'Tier 3', 'Tier 4'),
    (tier1, tier2, tier3, tier4),
))
tiers

{'Tier 1': ['detik.com',
  'Harian Haluan',
  'Harian Silampari',
  'igi.or.id',
  'kompas.com',
  'Koran Fajar',
  'Koran Sindo',
  'Koran Tribun Timur',
  'kumparan.com',
  'Linggau Pos',
  'liputan6.com',
  'Mantra Sukabumi - Pikiran Rakyat',
  'merdeka.com',
  'okezone.com',
  'pergunu.or.id',
  'pikiran-rakyat.com',
  'sindonews.com',
  'suaramerdeka.com',
  'tempo.co',
  'Tribun Muba',
  'tribunnews.com',
  'Website Resmi Disdik',
  'Website Resmi Pemerintah',
  'Website Resmi Sekolah'],
 'Tier 2': ['antaranews.com',
  'inews.id',
  'jawapos.com',
  'medcom.id',
  'mediaindonesia.com',
  'naikpangkat.com',
  'sonora.id',
  'wartaekonomi.co.id'],
 'Tier 3': ['alinea.id',
  'alonesia.com',
  'analisadaily.com',
  'batarapos.com',
  'beritabersatu.com',
  'beritajatim.com',
  'beritaminang.com',
  'beritanasional.id',
  'beritasulsel.com',
  'bidiknasional.id',
  'bloktuban.com',
  'bonepos.com',
  'chanelmuslim.com',
  'duniapendidikan.id',
  'elshinta.com',
  'fajar.co.id',
  'faj

In [None]:
# Open the worksheet that holds the article links
sheet_url = "..."
sheet_name = "..."
spreadsheet = client.open_by_url(sheet_url)
sheet = spreadsheet.worksheet(sheet_name)

# Column 7 holds the input links; its first entry is the header, so drop it
links = sheet.col_values(7)
del links[:1]

print(links)
len(links)

['https://www.detik.com/edu/sekolah/d-6671995/tips-bikin-media-pembelajaran-yang-berdampak-buat-murid-guru-wajib-pahami', 'https://kumparan.com/beritaanaksurabaya/tips-buat-media-pembelajaran-lebih-seru-dari-hobi-murid-20EHXVngE4i', 'https://radarpekalongan.id/media-pembelajaran-yang-menarik/', 'https://tekno.tempo.co/read/1722153/kampus-pemimpin-merdeka-design-thinking-bantu-guru-sesuaikan-kebutuhan-murid', 'https://edukasi.kompas.com/read/2023/04/28/183000271/pengungsi-luar-negeri-di-indonesia-dibekali-pelatihan-menjadi-guru', 'https://kumparan.com/beritaanaksurabaya/21-pengungsi-ikuti-pelatihan-jadi-guru-merdeka-belajar-20Jmd83tsU7', 'https://www.detik.com/edu/edutainment/d-6693985/sejumlah-pengungsi-luar-negeri-dapat-pelatihan-jadi-guru-belajar-apa-saja', 'https://harianhaluan.id/pendidikan/hh-43404/bukik-setiawan-perubahan-pendidikan-berhasil-saat-orangtua-tidak-tanya-lagi-nilai-anak/', 'https://www.detik.com/edu/sekolah/d-6700693/pakar-perubahan-pendidikan-berhasil-saat-ortu-tak-

14

In [None]:
news_titles = []

# Loop through the list of links and extract the news title (first <h1>) from
# each page. Exactly one entry is appended per link, so the list stays aligned
# with `links` even when a page cannot be fetched.
for link in links:
    news_title = ''
    try:
        # The timeout (seconds) keeps one unresponsive site from hanging the cell
        response = requests.get(link, timeout=30)
    except requests.RequestException as error:
        # Network failure or malformed URL: report it and leave the title blank
        print(f'Could not fetch {link}: {error}')
    else:
        if response.ok:
            title_tag = BeautifulSoup(response.text, 'html.parser').find('h1')
            if title_tag:
                news_title = title_tag.text.strip()
        else:
            # Error pages carry headings such as "Access denied" or
            # "403 Forbidden"; those must not be recorded as article titles.
            print(f'No title for {link}: HTTP {response.status_code}')

    # Append the news title to the list
    news_titles.append(news_title)

print(news_titles)
len(news_titles)

['Tips Bikin Media Pembelajaran yang Berdampak buat Murid, Guru Wajib Pahami!', 'Access denied', 'Membuat 5 Media Pembelajaran yang Menarik: Libatkan Murid', 'Kampus Pemimpin Merdeka: Design Thinking Bantu Guru Sesuaikan Kebutuhan Murid', 'Pengungsi Luar Negeri di Indonesia Dibekali Pelatihan Menjadi Guru', 'Access denied', 'Sejumlah Pengungsi Luar Negeri Dapat Pelatihan Jadi Guru, Belajar Apa Saja?', 'Bukik Setiawan: Perubahan Pendidikan Berhasil Saat Orangtua Tidak Tanya Lagi Nilai Anak', 'Pakar: Perubahan Pendidikan Berhasil Saat Ortu Tak Lagi Tanya Nilai Anak', 'Ajak Anak Tingkatkan Literasi Dalam Berkarya Bercerita', 'Atasi Learning Loss, Kemendikbud Rilis Buku Panduan Literasi', '5 Tahapan Design Thinking untuk Membuat Media Pembelajaran Bermakna', '403 Forbidden', 'Kisah Stepanie Arum, Putuskan Pindah Profesi dari Dokter Jadi Guru']


14

In [None]:
news_keywords = []

# Loop through the list of links and find which keywords occur in each page.
# Matching is a case-sensitive substring test against the raw HTML of the
# response. Exactly one entry is appended per link so the list stays aligned
# with `links`.
for link in links:
    html = ''
    try:
        # The timeout (seconds) keeps one unresponsive site from hanging the cell
        response = requests.get(link, timeout=30)
    except requests.RequestException as error:
        # Network failure or malformed URL: report it and record no keywords
        print(f'Could not fetch {link}: {error}')
    else:
        if response.ok:
            html = response.text
        else:
            # Do not search error pages for keywords
            print(f'No keywords for {link}: HTTP {response.status_code}')

    # Blank keywords (empty cells in the sheet column) are skipped because
    # `'' in html` is true for every page.
    found_keywords = [
        keyword for keyword in keywords if keyword and keyword in html
    ]
    news_keyword = ', '.join(found_keywords)

    # Append the comma-separated keywords of this article to the list
    news_keywords.append(news_keyword)

print(news_keywords)
len(news_keywords)

['Kampus Pemimpin Merdeka, Rizqy Rahmat Hani, Yayasan Guru Belajar', '', 'Kampus Pemimpin Merdeka, Rizqy Rahmat Hani', 'Kampus Pemimpin Merdeka, Rizqy Rahmat Hani, Yayasan Guru Belajar', 'Kampus Guru Cikal, Marsaria Primadonna', '', 'Kampus Guru Cikal, Marsaria Primadonna, Yayasan Guru Belajar', 'Bukik Setiawan, Yayasan Guru Belajar', 'Bukik Setiawan, Yayasan Guru Belajar', 'Yayasan Guru Belajar', 'Bukik Setiawan, Yayasan Guru Belajar', 'Kampus Pemimpin Merdeka, Rizqy Rahmat Hani', '', 'Ayo Jadi Guru, Kampus Guru Cikal, Yayasan Guru Belajar']


14

In [None]:
# Labels that registries place directly under a two-letter country-code TLD
# (e.g. "co.id", "or.id", "go.id", "co.uk"). A host under one of them needs its
# last THREE labels to identify the site, not the last two.
SECOND_LEVEL_LABELS = frozenset({
    'ac', 'biz', 'co', 'com', 'desa', 'edu', 'go', 'gov', 'mil', 'my',
    'net', 'or', 'org', 'ponpes', 'sch', 'web',
})


def registered_domain(url):
    """Return the site-level domain of ``url`` (e.g. ``'kompas.com'``).

    Sub-domains such as ``www.`` or ``edukasi.`` are dropped. Hosts that end in
    a second-level label under a country-code TLD keep three labels, so
    ``www.wartaekonomi.co.id`` becomes ``wartaekonomi.co.id`` rather than
    ``co.id``. This is a heuristic based on ``SECOND_LEVEL_LABELS``, not a full
    public-suffix lookup.

    Args:
        url: Absolute URL of an article.

    Returns:
        The lower-cased domain without port, or ``''`` when the URL has no host.
    """
    # `.hostname` (unlike `.netloc`) strips any port/userinfo and lower-cases
    host = urlparse(url).hostname or ''
    labels = host.split('.')
    if len(labels) <= 2:
        return host
    under_second_level = len(labels[-1]) == 2 and labels[-2] in SECOND_LEVEL_LABELS
    keep = 3 if under_second_level else 2
    return '.'.join(labels[-keep:])


# Full host part of every link (kept for inspection)
domains = [urlparse(url).netloc for url in links]

# Get the news website's main domain for every link
news_orgs = [registered_domain(url) for url in links]

print(news_orgs)
len(news_orgs)

['detik.com', 'kumparan.com', 'radarpekalongan.id', 'tempo.co', 'kompas.com', 'kumparan.com', 'detik.com', 'harianhaluan.id', 'detik.com', 'tempo.co', 'tempo.co', 'radarpekalongan.id', 'rotasi.co', 'kompas.com']


14

In [None]:
# Determine the tier of every outlet: the first tier whose list contains the
# outlet wins; outlets found in no tier get an empty string.
news_org_tiers = [
    next((tier for tier, websites in tiers.items() if news_org in websites), '')
    for news_org in news_orgs
]

print(news_org_tiers)
len(news_org_tiers)

['Tier 1', 'Tier 1', 'Tier 4', 'Tier 1', 'Tier 1', 'Tier 1', 'Tier 1', 'Tier 3', 'Tier 1', 'Tier 1', 'Tier 1', 'Tier 4', 'Tier 4', 'Tier 1']


14

In [None]:
news_dates = []

# Loop through the list of links and extract the news date (text of the first
# <time> element) from each page. Exactly one entry is appended per link, so
# the list stays aligned with `links` even when a page cannot be fetched.
for link in links:
    news_date = ""
    try:
        # The timeout (seconds) keeps one unresponsive site from hanging the cell
        response = requests.get(link, timeout=30)
    except requests.RequestException as error:
        # Network failure or malformed URL: report it and leave the date blank
        print(f'Could not fetch {link}: {error}')
    else:
        if response.ok:
            date_tag = BeautifulSoup(response.text, 'html.parser').find('time')
            if date_tag:
                news_date = date_tag.text.strip()
        else:
            # Do not read a date from an error page
            print(f'No date for {link}: HTTP {response.status_code}')

    # Append the news date to the list
    news_dates.append(news_date)

print(news_dates)
len(news_dates)

['', '', '18 Apr 2023', '', '', '', '', '', '', '', '', '4 Mei 2023', '', '']


14

In [None]:
# The output "Link" column is the input link list itself (same list object)
news_links = links

print(news_links)
len(news_links)

['https://www.detik.com/edu/sekolah/d-6671995/tips-bikin-media-pembelajaran-yang-berdampak-buat-murid-guru-wajib-pahami', 'https://kumparan.com/beritaanaksurabaya/tips-buat-media-pembelajaran-lebih-seru-dari-hobi-murid-20EHXVngE4i', 'https://radarpekalongan.id/media-pembelajaran-yang-menarik/', 'https://tekno.tempo.co/read/1722153/kampus-pemimpin-merdeka-design-thinking-bantu-guru-sesuaikan-kebutuhan-murid', 'https://edukasi.kompas.com/read/2023/04/28/183000271/pengungsi-luar-negeri-di-indonesia-dibekali-pelatihan-menjadi-guru', 'https://kumparan.com/beritaanaksurabaya/21-pengungsi-ikuti-pelatihan-jadi-guru-merdeka-belajar-20Jmd83tsU7', 'https://www.detik.com/edu/edutainment/d-6693985/sejumlah-pengungsi-luar-negeri-dapat-pelatihan-jadi-guru-belajar-apa-saja', 'https://harianhaluan.id/pendidikan/hh-43404/bukik-setiawan-perubahan-pendidikan-berhasil-saat-orangtua-tidak-tanya-lagi-nilai-anak/', 'https://www.detik.com/edu/sekolah/d-6700693/pakar-perubahan-pendidikan-berhasil-saat-ortu-tak-

14

In [None]:
# Score awarded for each media tier
TIER_SCORES = {"Tier 1": 1, "Tier 2": 0.75, "Tier 3": 0.5, "Tier 4": 0.25}

# One score per article. An outlet without a tier (empty string) scores 0
# instead of being skipped, so this list always has the same length as
# `news_org_tiers` and can be placed in the same DataFrame.
score_tiers = [TIER_SCORES.get(tier, 0) for tier in news_org_tiers]

print(score_tiers)
len(score_tiers)

[1, 1, 0.25, 1, 1, 1, 1, 0.5, 1, 1, 1, 0.25, 0.25, 1]


14

In [None]:
score_keywords = []

# One score per article, based on how many keywords were found in it.
# The loop variable is deliberately NOT called `keywords`: that name holds the
# keyword list read from the sheet and must not be overwritten.
for keyword_text in news_keywords:
    # ''.split(', ') yields [''], so empty pieces are dropped to count an
    # article without any keyword as 0 rather than 1.
    num_keywords = len([piece for piece in keyword_text.split(', ') if piece])
    if num_keywords >= 3:
        score_keywords.append(1)
    elif num_keywords == 2:
        score_keywords.append(0.75)
    elif num_keywords == 1:
        score_keywords.append(0.5)
    else:
        # NOTE(review): no keyword found -> score 0 (previously these rows were
        # counted as one keyword and scored 0.5) -- confirm the intended value.
        score_keywords.append(0)

print(score_keywords)
len(score_keywords)

[1, 0.5, 0.75, 1, 0.75, 0.5, 1, 0.75, 0.75, 0.5, 0.75, 0.75, 0.5, 1]


14

In [None]:
# Collect the output columns in the order they appear in the sheet. Columns
# given as '' are left blank on every row.
column_values = {
    'Judul': news_titles,
    'Keyword': news_keywords,
    'Nama Media': news_orgs,
    'Tier Media': news_org_tiers,
    'Tanggal': news_dates,
    'Link': news_links,
    'Wartawan': '',
    'Narasumber 1': '',
    'Narasumber 2': '',
    'Sentimen ke YGB/Unit': '',
    'Skor Tier': score_tiers,
    'Skor Lokasi Brand': '',
    'Skor Keyword': score_keywords,
}

# Create a DataFrame with the results
results = pd.DataFrame(column_values)

# Select the columns explicitly in the declared order
results = results[list(column_values)]

results.head()

Unnamed: 0,Judul,Keyword,Nama Media,Tier Media,Tanggal,Link,Wartawan,Narasumber 1,Narasumber 2,Sentimen ke YGB/Unit,Skor Tier,Skor Lokasi Brand,Skor Keyword
0,Tips Bikin Media Pembelajaran yang Berdampak b...,"Kampus Pemimpin Merdeka, Rizqy Rahmat Hani, Ya...",detik.com,Tier 1,,https://www.detik.com/edu/sekolah/d-6671995/ti...,,,,,1.0,,1.0
1,Access denied,,kumparan.com,Tier 1,,https://kumparan.com/beritaanaksurabaya/tips-b...,,,,,1.0,,0.5
2,Membuat 5 Media Pembelajaran yang Menarik: Lib...,"Kampus Pemimpin Merdeka, Rizqy Rahmat Hani",radarpekalongan.id,Tier 4,18 Apr 2023,https://radarpekalongan.id/media-pembelajaran-...,,,,,0.25,,0.75
3,Kampus Pemimpin Merdeka: Design Thinking Bantu...,"Kampus Pemimpin Merdeka, Rizqy Rahmat Hani, Ya...",tempo.co,Tier 1,,https://tekno.tempo.co/read/1722153/kampus-pem...,,,,,1.0,,1.0
4,Pengungsi Luar Negeri di Indonesia Dibekali Pe...,"Kampus Guru Cikal, Marsaria Primadonna",kompas.com,Tier 1,,https://edukasi.kompas.com/read/2023/04/28/183...,,,,,1.0,,0.75


In [None]:
# Convert the results to a list of rows, the shape gspread expects
results_list = results.values.tolist()

# Open the worksheet that receives the results
sheet_url = "https://docs.google.com/spreadsheets/d/1kdvXCtxEIyc1MIoWjrX53881PCcJ49UgVWKAPBirbNE/edit#gid=1018494357"
sheet_name = "Alat Otomasi"
sheet = client.open_by_url(sheet_url).worksheet(sheet_name)

# Clear the contents of cells in the range C2:Z with a single API call, rather
# than downloading every cell and sending each one back empty
sheet.batch_clear(["C2:Z"])

# Write the rows starting at C2. Keyword arguments are used because the
# positional order of `update()` differs between gspread 5.x and 6.x, and the
# installed gspread version is not pinned.
sheet.update(range_name="C2", values=results_list)

{'spreadsheetId': '1kdvXCtxEIyc1MIoWjrX53881PCcJ49UgVWKAPBirbNE',
 'updatedRange': "'Alat Otomasi'!C2:O15",
 'updatedRows': 14,
 'updatedColumns': 13,
 'updatedCells': 182}