# Scrape News Website

In [None]:
!pip install --upgrade gspread
!pip install tldextract

In [None]:
from bs4 import BeautifulSoup
import requests

import pandas as pd

from google.colab import auth
auth.authenticate_user()

from google.auth import default
from oauth2client.service_account import ServiceAccountCredentials
import gspread

from urllib.parse import urlparse

In [None]:
# Set up the Google Sheets API credentials
# default() is unpacked as a 2-tuple; only the first element (the credentials) is used here
creds, _ = default()
# Authorized gspread client used by the cells below to open spreadsheets by URL
client = gspread.authorize(creds)

In [None]:
# Open the worksheet that holds the keyword list and the four tier lists
sheet_url = "..."
sheet_name = "..."
sheet = client.open_by_url(sheet_url).worksheet(sheet_name)

# Columns 1-5 hold, in order: keywords, Tier 1, Tier 2, Tier 3, Tier 4.
# The first row of every column is the header, so it is sliced off.
keywords, tier1, tier2, tier3, tier4 = (
    sheet.col_values(column_number)[1:] for column_number in range(1, 6)
)

In [None]:
# Display the keyword list loaded from the sheet (notebook cell output)
keywords

In [None]:
# Build the tier lookup: tier label -> list of websites read from that tier's column
tier_labels = ('Tier 1', 'Tier 2', 'Tier 3', 'Tier 4')
tiers = dict(zip(tier_labels, (tier1, tier2, tier3, tier4)))
tiers

In [None]:
# Get the sheet object based on the URL and sheet name
# (this rebinds the global `sheet` to the worksheet that holds the article links)
sheet_url = "..."
sheet_name = "..."
spreadsheet = client.open_by_url(sheet_url)
sheet = spreadsheet.worksheet(sheet_name)

# Get the input links from the Google Sheet: column 7, one URL per row
links = sheet.col_values(7)[1:]  # Exclude the header

# Show the links and how many were read (the last expression is the cell output)
print(links)
len(links)

In [None]:
news_titles = []

# Seconds to wait for a site before giving up, so one stalled server cannot hang the whole run
REQUEST_TIMEOUT = 30

# Loop through the list of links and extract the news title from each page
for link in links:
    try:
        response = requests.get(link, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as error:
        # A dead, malformed or blank link must not abort the run. Record an empty
        # title so this list keeps exactly one entry per link.
        print(f"Could not fetch {link!r}: {error}")
        news_titles.append('')
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    # The first <h1> on the page is taken as the title; pages without one get ''
    heading = soup.find('h1')
    news_titles.append(heading.text.strip() if heading else '')

print(news_titles)
len(news_titles)

In [None]:
news_keywords = []

# Seconds to wait for a site before giving up, so one stalled server cannot hang the whole run
REQUEST_TIMEOUT = 30

# Guard against empty entries in the keyword list: '' is a substring of every
# page, so it would be reported as "found" in every article.
search_terms = [keyword for keyword in keywords if keyword]

# Loop through the list of links and find the keywords in the article
for link in links:
    try:
        html = requests.get(link, timeout=REQUEST_TIMEOUT).text
    except requests.RequestException as error:
        # A dead, malformed or blank link must not abort the run. Record an empty
        # entry so this list keeps exactly one entry per link.
        print(f"Could not fetch {link!r}: {error}")
        news_keywords.append('')
        continue

    # Case-sensitive substring match against the raw HTML of the page
    found_keywords = [keyword for keyword in search_terms if keyword in html]

    # Append the comma-separated keywords found on this page to the list
    news_keywords.append(', '.join(found_keywords))

print(news_keywords)
len(news_keywords)

In [None]:
# Labels that commonly sit in front of a two-letter country code as part of the
# public suffix (e.g. "co.id", "co.uk", "com.au"). This is a heuristic, not a
# full public-suffix list.
SECOND_LEVEL_LABELS = {
    'ac', 'co', 'com', 'edu', 'go', 'gov', 'net', 'or', 'org', 'sch', 'web',
}


def registered_domain(host):
    """Return the main domain of *host*, dropping subdomains such as "www".

    Normally the last two labels are kept ("news.detik.com" -> "detik.com").
    When the host ends in a two-letter country code preceded by a common
    second-level label, three labels are kept instead
    ("www.republika.co.id" -> "republika.co.id"). Keeping only two there
    would return the bare suffix "co.id".
    """
    parts = host.split('.')
    if len(parts) <= 2:
        return host
    has_two_part_suffix = len(parts[-1]) == 2 and parts[-2] in SECOND_LEVEL_LABELS
    keep = 3 if has_two_part_suffix else 2
    return '.'.join(parts[-keep:])


# Get the news website's main link
# .hostname (unlike .netloc) excludes any port or credentials and is lower-cased;
# it is None when the link has no host, hence the '' fallback.
domains = [urlparse(url).hostname or '' for url in links]

news_orgs = [registered_domain(domain) for domain in domains]

print(news_orgs)
len(news_orgs)

In [None]:
# Determine the category based on the tiers: for every outlet take the first
# tier whose website list contains it, or '' when no tier lists it.
news_org_tiers = [
    next((label for label, sites in tiers.items() if outlet in sites), None) or ''
    for outlet in news_orgs
]

print(news_org_tiers)
len(news_org_tiers)

In [None]:
news_dates = []

# Seconds to wait for a site before giving up, so one stalled server cannot hang the whole run
REQUEST_TIMEOUT = 30

# Loop through the list of links and extract the news date from each page
for link in links:
    try:
        response = requests.get(link, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as error:
        # A dead, malformed or blank link must not abort the run. Record an empty
        # date so this list keeps exactly one entry per link.
        print(f"Could not fetch {link!r}: {error}")
        news_dates.append('')
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    # The text of the first <time> element is used as the date; '' when the page has none
    date_tag = soup.find('time')

    # Append the news date to the list
    news_dates.append(date_tag.text.strip() if date_tag else '')

print(news_dates)
len(news_dates)

In [None]:
# The link column is reported as-is, so reuse the list of links read from the sheet
# (the former `news_links = []` was a dead store, overwritten on the next line)
news_links = links

print(news_links)
len(news_links)

In [None]:
# Score awarded for each media tier
TIER_SCORES = {"Tier 1": 1, "Tier 2": 0.75, "Tier 3": 0.5, "Tier 4": 0.25}

# Outlets that are in no tier (tier == '') still need a score. Without one this
# list would be shorter than the other columns and the results DataFrame could
# not be built.
UNRANKED_SCORE = 0

# One score per article, in the same order as news_org_tiers
score_tiers = [TIER_SCORES.get(tier, UNRANKED_SCORE) for tier in news_org_tiers]

print(score_tiers)
len(score_tiers)

In [None]:
def keyword_score(joined_keywords):
    """Return the keyword score for one article.

    *joined_keywords* is the ', '-separated string of keywords found in the
    article. Three or more keywords score 1, two score 0.75, one scores 0.5
    and none scores 0.
    """
    # ''.split(', ') yields [''], so empty pieces are dropped before counting.
    # Otherwise an article without any keyword would count as having one.
    num_keywords = len([piece for piece in joined_keywords.split(', ') if piece])
    if num_keywords >= 3:
        return 1
    if num_keywords == 2:
        return 0.75
    if num_keywords == 1:
        return 0.5
    return 0


# One score per article. The iteration variable is deliberately not called
# `keywords`, which would overwrite the global keyword list.
score_keywords = [keyword_score(found) for found in news_keywords]

print(score_keywords)
len(score_keywords)

In [None]:
# Final column layout of the results table
column_order = [
    'Judul', 'Keyword', 'Nama Media', 'Tier Media', 'Tanggal', 'Link',
    'Wartawan', 'Narasumber 1', 'Narasumber 2', 'Sentimen',
    'Skor Tier', 'Skor Lokasi Brand', 'Skor Keyword',
]

# Columns filled from the lists built in the cells above
scraped_columns = {
    'Judul': news_titles,
    'Keyword': news_keywords,
    'Nama Media': news_orgs,
    'Tier Media': news_org_tiers,
    'Tanggal': news_dates,
    'Link': news_links,
    'Skor Tier': score_tiers,
    'Skor Keyword': score_keywords,
}

# Columns this notebook leaves empty ('' in every row)
blank_columns = dict.fromkeys(
    ['Wartawan', 'Narasumber 1', 'Narasumber 2', 'Sentimen', 'Skor Lokasi Brand'],
    '',
)

# Create a DataFrame with the results, then put the columns in their final order
results = pd.DataFrame({**scraped_columns, **blank_columns})[column_order]

results.head()

In [None]:
# Rows of the results table as plain lists, ready to be sent to the sheet
results_list = results.values.tolist()

# Open the worksheet that receives the results
sheet_url = "..."
sheet_name = "..."
sheet = client.open_by_url(sheet_url).worksheet(sheet_name)

# Clear the contents of cells in the range C2:Z with a single request,
# instead of fetching every cell and writing each one back as ""
sheet.batch_clear(["C2:Z"])

# Write the rows starting at C2. Keyword arguments are used because the
# positional order of (range, values) differs between gspread releases.
sheet.update(range_name="C2", values=results_list)