<a href="https://colab.research.google.com/github/faisalrizqin/UTS-STKI/blob/main/uts_stki.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
# Import library
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datasets import load_dataset
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

In [14]:
# Download resource NLTK
nltk.download('stopwords')

# Inisialisasi stopwords & stemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Load dataset BBC News dari Hugging Face
df = pd.read_csv("bbc-text.csv")
print("Dataset berhasil dimuat. Jumlah dokumen:", len(df))

df.head()

Dataset berhasil dimuat. Jumlah dokumen: 2225


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [15]:
# Fungsi preprocessing teks
def preprocess(text):
    text = text.lower()                                      # Case folding
    text = re.sub(r'[^a-z\s]', '', text)                     # Remove non-alphabet
    tokens = text.split()                                    # Tokenization
    tokens = [w for w in tokens if w not in stop_words]      # Stopword removal
    tokens = [stemmer.stem(w) for w in tokens]               # Stemming
    return ' '.join(tokens)

# Terapkan preprocessing
df['clean_text'] = df['text'].apply(preprocess)

# Simpan hasil preprocessing menjadi file CSV
df.to_csv("bbc-text-preprocessed.csv", index=False)
print("File 'bbc-text-preprocessed.csv' berhasil disimpan.")

File 'bbc-text-preprocessed.csv' berhasil disimpan.


In [16]:
# -------------------------------------------------------
# Load dataset HASIL PREPROCESSING
# -------------------------------------------------------
df = pd.read_csv("bbc-text-preprocessed.csv")
print("Dataset preprocessed berhasil dimuat. Jumlah dokumen:", len(df))

# Cek 5 baris pertama
df.head()

Dataset preprocessed berhasil dimuat. Jumlah dokumen: 2225


Unnamed: 0,category,text,clean_text
0,tech,tv future in the hands of viewers with home th...,tv futur hand viewer home theatr system plasma...
1,business,worldcom boss left books alone former worldc...,worldcom boss left book alon former worldcom b...
2,sport,tigers wary of farrell gamble leicester say ...,tiger wari farrel gambl leicest say rush make ...
3,sport,yeading face newcastle in fa cup premiership s...,yead face newcastl fa cup premiership side new...
4,entertainment,ocean s twelve raids box office ocean s twelve...,ocean twelv raid box offic ocean twelv crime c...


In [17]:
# -------------------------------------------
# Mulai proses indexing & pencarian (VSM)
# -------------------------------------------

# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['clean_text'])

# Input query
query = "global economic growth"
query_vec = vectorizer.transform([query])

# Hitung cosine similarity
similarity = cosine_similarity(query_vec, tfidf_matrix).flatten()

# Ambil 5 dokumen paling relevan
top_indices = similarity.argsort()[-5:][::-1]

for i in top_indices:
    print(f"\n[Category: {df.iloc[i]['category']}]")
    print(f"Text: {df.iloc[i]['text'][:200]}...")
    print(f"Similarity Score: {similarity[i]:.4f}")


[Category: business]
Text: newest eu members underpin growth the european union s newest members will bolster europe s economic growth in 2005  according to a new report.  the eight central european states which joined the eu l...
Similarity Score: 0.3381

[Category: business]
Text: consumers drive french economy france s economic growth accelerated in the last three months of 2004  driven by consumer spending  a report shows.  gross domestic product (gdp) rose by 0.8% in the fou...
Similarity Score: 0.2461

[Category: business]
Text: singapore growth at 8.1% in 2004 singapore s economy grew by 8.1% in 2004  its best performance since 2000  figures from the trade ministry show.  the advance  the second-fastest in asia after china  ...
Similarity Score: 0.2202

[Category: business]
Text: singapore growth at 8.1% in 2004 singapore s economy grew by 8.1% in 2004  its best performance since 2000  figures from the trade ministry show.  the advance  the second-fastest in asia after china  .