In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import re
import string
import json
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### **LOAD DATASET**

In [2]:
# Location of the dataset file; a later cell opens it and parses one JSON object per line.
data_path = "/content/News_Category_Dataset_v3.json"  # The file must be uploaded to the Colab session first

In [3]:
# Load and preprocess
# The file is read as JSON Lines: every line is parsed as one standalone JSON object.
articles = []
# Decode explicitly as UTF-8 so the result does not depend on the platform's default encoding.
with open(data_path, 'r', encoding='utf-8') as f:
    for line in f:
        try:
            articles.append(json.loads(line))
        except json.JSONDecodeError as e:
            # Best effort: report the malformed line and keep going with the rest of the file.
            print(f"Skipping line due to JSONDecodeError: {e}")

# Build the frame in one chained expression instead of mutating a column-subset
# slice in place; this avoids pandas' SettingWithCopyWarning and keeps the cell
# idempotent when it is re-run.
df = (
    pd.DataFrame(articles)[['headline', 'short_description', 'category']]
    .dropna()
    # `text` is the headline and the short description joined by a single space.
    .assign(text=lambda frame: frame['headline'] + " " + frame['short_description'])
)

Skipping line due to JSONDecodeError: Unterminated string starting at: line 1 column 324 (char 323)


In [4]:
# Text preprocessing
# Cache for stop-word sets so the NLTK corpus is read and converted only once,
# instead of once per token.
_STOPWORD_CACHE = {}
# string.punctuation contains ']', '\\', '^' and '-', which are special inside a
# character class. Without escaping, the sequence '\]' is parsed as an escaped ']'
# and the backslash itself is never removed.
_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")


def clean_text(text, stop_words=None):
    """Lowercase ``text``, strip ASCII punctuation and drop stop words.

    Args:
        text: The string to normalise.
        stop_words: Optional collection of lowercase words to remove. When
            omitted, the NLTK English stop-word list is loaded on first use
            and cached for subsequent calls.

    Returns:
        The remaining whitespace-separated tokens joined by single spaces.
    """
    if stop_words is None:
        if 'english' not in _STOPWORD_CACHE:
            # A frozenset gives constant-time membership tests.
            _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOPWORD_CACHE['english']

    text = _PUNCTUATION_RE.sub("", text.lower())
    return " ".join(token for token in text.split() if token not in stop_words)

In [5]:
# Take a sample of at most 10,000 rows to keep the computation cheap.
SAMPLE_SIZE = 10000
# Clamp to the number of available rows: DataFrame.sample raises ValueError when
# asked for more rows than exist (without replacement), e.g. after lines were
# skipped while loading.
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42).reset_index(drop=True)

# Normalised text used as input for the TF-IDF vectorizer.
df['clean_text'] = df['text'].apply(clean_text)

### Menggunakan TF-IDF

In [11]:
# Turn the cleaned texts into TF-IDF vectors, keeping at most 5000 vocabulary terms.
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = vectorizer.fit_transform(df['clean_text'])

# Pairwise cosine similarity between all articles; with the second argument
# omitted the matrix is compared against itself.
cosine_sim = cosine_similarity(tfidf_matrix)

In [13]:
# Function to get top-N similar articles
def recommend_articles(idx, cosine_sim=cosine_sim, df=df, top_n=5):
    """Print the article at position ``idx`` followed by its most similar articles.

    Args:
        idx: Positional index of the query article. It selects a row of
            ``cosine_sim`` and is used with ``df.iloc``.
        cosine_sim: Pairwise similarity matrix, indexed as ``cosine_sim[idx]``.
            Defaults to the module-level ``cosine_sim`` as it exists when this
            function is defined.
        df: DataFrame providing the ``category``, ``headline`` and
            ``short_description`` columns, accessed by the same positions as
            the rows of ``cosine_sim``.
        top_n: Maximum number of similar articles to print.

    Returns:
        None. The result is printed.
    """
    row = cosine_sim[idx]
    # Resolve negative positions so the self-exclusion below still matches.
    self_pos = idx if idx >= 0 else len(row) + idx

    # Exclude the query article by position instead of assuming it sorts first:
    # with tied scores (duplicate articles) or an all-zero vector, the article
    # itself is not guaranteed to be the top entry.
    candidates = [(pos, score) for pos, score in enumerate(row) if pos != self_pos]
    # Highest similarity first; the sort is stable, so ties keep positional order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    sim_scores = candidates[:max(top_n, 0)]

    source = df.iloc[idx]
    print(f"\n=== Artikel Asli (Index {idx}) ===")
    print(f"Kategori  : {source['category']}")
    print(f"Judul     : {source['headline']}")
    print(f"Deskripsi : {source['short_description']}")

    print(f"\n=== {top_n} Artikel Mirip ===")
    for rank, (rec_idx, score) in enumerate(sim_scores, start=1):
        match = df.iloc[rec_idx]
        print(f"\n[{rank}] Skor Similarity: {score:.4f}")
        print(f"Kategori  : {match['category']}")
        print(f"Judul     : {match['headline']}")
        print(f"Deskripsi : {match['short_description']}")

In [14]:
# Print the article at position 10 together with up to 5 similar articles,
# using the function's default similarity matrix and DataFrame.
recommend_articles(10, top_n=5)


=== Artikel Asli (Index 10) ===
Kategori  : HEALTHY LIVING
Judul     : Expanding the Emergency Room Model: 'Central Care System' Could Help Americans Gain Universal Health Care Access
Deskripsi : 

=== 5 Artikel Mirip ===

[1] Skor Similarity: 0.3097
Kategori  : HEALTHY LIVING
Judul     : Trump Budget Undercuts Fight Against Cancer
Deskripsi : We face a crisis that could put unbearable strains on our health care system.

[2] Skor Similarity: 0.2883
Kategori  : HEALTHY LIVING
Judul     : The Face of Health Care Reform
Deskripsi : 

[3] Skor Similarity: 0.2815
Kategori  : BUSINESS
Judul     : Banking Saves Health Care
Deskripsi : The problem that confronts health care represents a lucrative business opportunity for the industry that does data transaction best: banking.

[4] Skor Similarity: 0.2714
Kategori  : POLITICS
Judul     : GOP Congressman: 'Nobody Dies Because They Don't Have Access To Health Care'
Deskripsi : Factcheck: False.

[5] Skor Similarity: 0.2589
Kategori  : RELIGION
Ju

### Menggunakan BERT

In [6]:
!pip install -U sentence-transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [7]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
# Load the pre-trained 'all-MiniLM-L6-v2' sentence-transformer model.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the raw (uncleaned) article texts into dense embeddings.
embeddings = model.encode(df['text'].tolist(), show_progress_bar=True)

# Pairwise cosine similarity between all embeddings; with the second argument
# omitted the matrix is compared against itself. Note that this rebinds
# `cosine_sim`.
cosine_sim = cosine_similarity(embeddings)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

In [9]:
# Function to get top-N similar articles
def recommend_articles(idx, cosine_sim=cosine_sim, df=df, top_n=5):
    """Print the article at position ``idx`` followed by its most similar articles.

    Args:
        idx: Positional index of the query article. It selects a row of
            ``cosine_sim`` and is used with ``df.iloc``.
        cosine_sim: Pairwise similarity matrix, indexed as ``cosine_sim[idx]``.
            Defaults to the module-level ``cosine_sim`` as it exists when this
            function is defined.
        df: DataFrame providing the ``category``, ``headline`` and
            ``short_description`` columns, accessed by the same positions as
            the rows of ``cosine_sim``.
        top_n: Maximum number of similar articles to print.

    Returns:
        None. The result is printed.
    """
    row = cosine_sim[idx]
    # Resolve negative positions so the self-exclusion below still matches.
    self_pos = idx if idx >= 0 else len(row) + idx

    # Exclude the query article by position instead of assuming it sorts first:
    # with tied scores (duplicate articles), the article itself is not
    # guaranteed to be the top entry.
    candidates = [(pos, score) for pos, score in enumerate(row) if pos != self_pos]
    # Highest similarity first; the sort is stable, so ties keep positional order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    sim_scores = candidates[:max(top_n, 0)]

    source = df.iloc[idx]
    print(f"\n=== Artikel Asli (Index {idx}) ===")
    print(f"Kategori  : {source['category']}")
    print(f"Judul     : {source['headline']}")
    print(f"Deskripsi : {source['short_description']}")

    print(f"\n=== {top_n} Artikel Mirip ===")
    for rank, (rec_idx, score) in enumerate(sim_scores, start=1):
        match = df.iloc[rec_idx]
        print(f"\n[{rank}] Skor Similarity: {score:.4f}")
        print(f"Kategori  : {match['category']}")
        print(f"Judul     : {match['headline']}")
        print(f"Deskripsi : {match['short_description']}")

In [10]:
# Print the article at position 42 together with up to 5 similar articles,
# using the function's default similarity matrix and DataFrame.
recommend_articles(42, top_n=5)


=== Artikel Asli (Index 42) ===
Kategori  : QUEER VOICES
Judul     : Daniela Vega To Be The Oscar's First Openly Transgender Presenter
Deskripsi : And the winner is ... all of us!

=== 5 Artikel Mirip ===

[1] Skor Similarity: 0.5557
Kategori  : QUEER VOICES
Judul     : 'Parks And Rec' Star Natalie Morales Comes Out As Queer
Deskripsi : “I think it’s important that I tell you that this familiar face you see on your TV is the Q part of LGBTQ."

[2] Skor Similarity: 0.5310
Kategori  : QUEER VOICES
Judul     : Cuba Inspires Names Candis Cayne As Honorary LGBT Ambassador To Cuba
Deskripsi : The trans actress and activist will help promote LGBT rights.

[3] Skor Similarity: 0.4790
Kategori  : QUEER VOICES
Judul     : Katie Couric Responds To Controversy Over Invasive Question About Transgender Guest
Deskripsi : "Even if some thought my question was off base, I wanted to make sure my question and Carmen's answer stayed in the show

[4] Skor Similarity: 0.4765
Kategori  : ENTERTAINMENT
Judul