# Install Dependencies

In [None]:
!pip install beautifulsoup4 lxml nltk transformers torch requests

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

# Import Libraries

In [None]:
import bs4 as bs
import urllib.request
import string
import nltk
from nltk.corpus import stopwords
from transformers import pipeline, AutoTokenizer

In [None]:
# Fetch the NLTK "stopwords" corpus, which the text-cleaning step reads from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Web Scraping

In [None]:
# Ask the user for the URL to scrape.
# Strip surrounding whitespace (e.g. a trailing space or newline from
# copy/paste) so it does not end up inside the request URL.
url = input("Please input the URL you want to scrape: ").strip()

Please input the URL you want to scrape: https://www.bbc.com/sport/formula1/articles/c78egyj4ng3o


In [None]:
# Fetch the page and parse its content.
# The response is opened in a `with` block so the connection is always closed,
# and the timeout keeps the cell from hanging on an unresponsive server.
with urllib.request.urlopen(url, timeout=30) as web_scraping:
    content = web_scraping.read()

# Parse the downloaded bytes with BeautifulSoup using the lxml parser.
parsing = bs.BeautifulSoup(content, 'lxml')

In [None]:
# Select the elements holding the main text: the <p>/<li> tags inside the
# 'mw-parser-output' container when the page has one, otherwise every <p> tag.
content_div = parsing.find('div', {'class': 'mw-parser-output'})
if content_div:
    paragraphs = content_div.find_all(['p', 'li'])
else:
    paragraphs = parsing.find_all('p')

# Join the text of all selected elements into one string, separated by spaces.
article_text = " ".join(element.text for element in paragraphs)

# Text Cleaning

In [None]:
# Function that strips punctuation and stop words from a text
def clean_text(text, stop_words=None):
    """Remove ASCII punctuation and stop words from ``text``.

    Args:
        text: The string to clean. Falsy values (``None``, ``""``) yield ``""``.
        stop_words: Optional iterable of words to drop, compared
            case-insensitively against each punctuation-stripped word.
            When omitted, NLTK's English stop-word list is used.

    Returns:
        The remaining words joined by single spaces.
    """
    if not text:
        return ""
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup set once per call instead of fetching and scanning the
    # stop-word list again for every single word.
    excluded = {word.lower() for word in stop_words}
    # Delete every character listed in string.punctuation in one pass.
    text = text.translate(str.maketrans('', '', string.punctuation))
    return ' '.join(word for word in text.split() if word.lower() not in excluded)

# Run the cleaner over the scraped article text
cleaned_text = clean_text(text=article_text)

In [None]:
# Validate the cleaned text before summarizing.
# Raise an exception instead of calling exit(): inside a notebook, exit()
# shuts the kernel down, whereas an exception simply stops execution here and
# keeps the session (and its variables) alive for inspection.
if not cleaned_text:
    raise ValueError("Teks kosong setelah dibersihkan. Tidak dapat diringkas.")

# Require at least 50 whitespace-separated words.
if len(cleaned_text.split()) < 50:
    raise ValueError("Teks terlalu pendek untuk diringkas. Gunakan artikel yang lebih panjang.")

In [None]:
# Cap the text at a maximum of 1024 whitespace-separated words.
# NOTE(review): this counts words, not model tokens - confirm it matches the
# summarizer's input limit.
max_words = 1024
word_list = cleaned_text.split()
cleaned_text = " ".join(word_list[:max_words]) if len(word_list) > max_words else cleaned_text

In [None]:
# Show the first 1000 characters of the scraped text, then the cleaned text.
original_preview = article_text[:1000]
print("Original Text:\n", original_preview)
print("\nCleaned Text:\n", cleaned_text)

Original Text:
 Seven-time champion Hamilton won his first ever sprint race on Saturday Lewis Hamilton hit out at "yapping" critics after taking his first win for Ferrari in the sprint race at the Chinese Grand Prix. The seven-time champion followed up his win in only his second event for his new team with fifth place on the grid for Sunday's main event but said he was "optimistic" of a good result. Hamilton did not identify the people he was referring to but said they "lacked understanding" of how difficult it was to achieve success straight away with a new team. The 40-year-old said: "People just love to be negative at any opportunity. Even with the smallest things, they'll just be negative about it. "That's just the difficult time that we're living in. "I see certain individuals – and again, I don't read the news, but I see bits here and there – see people that I've admired for years just talking out of turn. "Clearly some of them really just making uneducated guesses of what's goin

# Tokenization & Modeling

In [None]:
# Load the summarization model and its tokenizer.
model_name = "facebook/bart-large-cnn"
# Load the tokenizer once with AutoTokenizer ...
tokenizer = AutoTokenizer.from_pretrained(model_name)
# ... and hand it to the pipeline, so the object named `tokenizer` is the one
# the summarizer actually uses instead of an unused duplicate.
summarizer = pipeline("summarization", model=model_name, tokenizer=tokenizer)

In [None]:
# Run the summarization.
# truncation=True makes the pipeline clip inputs that exceed the model's
# maximum input length; the earlier 1024-word cap counts words, not tokens, so
# a long article could otherwise still be too long for the model.
summary = summarizer(
    cleaned_text,
    max_length=150,
    min_length=50,
    do_sample=False,
    truncation=True,
)

# Output

In [None]:
import textwrap

# Helper that wraps text to a fixed line width
def wrap_text(text, width=80):
    """Return ``text`` re-flowed into newline-separated lines of at most ``width`` columns."""
    wrapped_lines = textwrap.wrap(text, width=width)
    return "\n".join(wrapped_lines)

# Print the wrapped start of the article, followed by the wrapped summary.
original_preview = wrap_text(article_text[:1000])
print("\nOriginal Text:\n", original_preview)
summary_text = summary[0]['summary_text']
print("\nSummary:\n", wrap_text(summary_text))


Original Text:
 Seven-time champion Hamilton won his first ever sprint race on Saturday Lewis
Hamilton hit out at "yapping" critics after taking his first win for Ferrari in
the sprint race at the Chinese Grand Prix. The seven-time champion followed up
his win in only his second event for his new team with fifth place on the grid
for Sunday's main event but said he was "optimistic" of a good result. Hamilton
did not identify the people he was referring to but said they "lacked
understanding" of how difficult it was to achieve success straight away with a
new team. The 40-year-old said: "People just love to be negative at any
opportunity. Even with the smallest things, they'll just be negative about it.
"That's just the difficult time that we're living in. "I see certain individuals
– and again, I don't read the news, but I see bits here and there – see people
that I've admired for years just talking out of turn. "Clearly some of them
really just making uneducated guesses of what's goi