# TF-IDF Processing

## Import Packages

In [1]:
import re
from pymongo import MongoClient
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

## Setup Database Connection

In [2]:
# Open a client for the MongoDB server at localhost:27017 and select the
# 'local' database.
client = MongoClient('mongodb://localhost:27017/')
db = client['local']

# Collection handles: 'news_data' is the input collection, while 'tf_idf' and
# 'feature' receive the computed vectors and the vocabulary respectively.
news_data_collection, tf_idf_collection, feature_collection = (
    db[collection_name]
    for collection_name in ('news_data', 'tf_idf', 'feature')
)

## Read Data from Database

In [3]:
# Fetch every document from the news collection and keep only the fields used
# downstream; a placeholder string is substituted when the title or body key
# is missing from a document.
documents = news_data_collection.find()
sample_news = [
    {
        '_id': record.get('_id'),
        'title': record.get('Judul', 'Judul tidak ditemukan'),
        'content': record.get('Isi Berita', 'Isi tidak ditemukan'),
    }
    for record in documents
]


## Preprocessing Functions

In [4]:
# Sastrawi helpers used by preprocess_text: one stemmer and one stop-word
# remover, each built once from its factory and reused for every document.
stemmer_factory = StemmerFactory()
stopword_factory = StopWordRemoverFactory()
stemmer = stemmer_factory.create_stemmer()
stopword_remover = stopword_factory.create_stop_word_remover()

# Input documents (the same list object as sample_news) and the list that
# collects one token list per processed document.
news_data = sample_news
preprocessed_docs = []

def preprocess_text(text):
    """Turn raw text into a list of cleaned, stemmed tokens.

    Steps, in order: replace every digit and every character that is neither
    a word character nor whitespace with a space, lowercase the result, pass
    it through the module-level ``stopword_remover``, stem it with the
    module-level ``stemmer``, and split on whitespace.

    Args:
        text: The string to process.

    Returns:
        A list of token strings (empty if nothing survives the cleaning).
    """
    without_symbols = re.sub(r'[^\w\s]|[\d]', ' ', text)
    without_stopwords = stopword_remover.remove(without_symbols.lower())
    return stemmer.stem(without_stopwords).split()

def _preprocess_all_documents():
    """Preprocess every document in ``news_data`` into ``preprocessed_docs``.

    For each document the title and content are joined with a single space,
    tokenised with ``preprocess_text``, appended to the module-level
    ``preprocessed_docs`` list, and printed together with the document id.

    The list is emptied first so that calling this function again (for
    example by re-running the notebook cell) rebuilds it instead of appending
    a second copy of every document. Duplicated entries would change the
    fitted IDF weights and leave the list longer than ``news_data``.

    Returns:
        None. The result is stored in ``preprocessed_docs``, in the same
        order as ``news_data``.
    """
    # Clear in place so any existing reference to the list stays valid.
    preprocessed_docs.clear()
    for doc in news_data:
        full_text = f"{doc['title']} {doc['content']}"
        processed_tokens = preprocess_text(full_text)
        preprocessed_docs.append(processed_tokens)
        print(f"Dokumen dengan id {doc['_id']}:", processed_tokens)

## Compute and Process TF-IDF then Save to Database

In [5]:
# Tokenise every document, then fit TF-IDF on the re-joined token strings so
# the vectorizer sees the cleaned, stemmed text.
_preprocess_all_documents()
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([' '.join(doc) for doc in preprocessed_docs])

# Convert to a dense array so each row can be stored as a plain list; the
# column order of every row matches feature_names.
dense_tfidf_matrix = tfidf_matrix.toarray()
feature_names = vectorizer.get_feature_names_out().tolist()

# One record per source document: row i of the matrix belongs to
# sample_news[i] because the documents were preprocessed in that order.
data_to_insert = [
    {
        "Document_id": doc['_id'],
        "tfidf_vector": dense_tfidf_matrix[i].tolist(),
    }
    for i, doc in enumerate(sample_news)
]

try:
    tf_idf_collection.insert_many(data_to_insert)
    feature_collection.insert_one({
        "feature_names": feature_names
    })
except Exception as exc:
    # Catch Exception rather than using a bare except so KeyboardInterrupt and
    # SystemExit still propagate, and report the cause so a failed insert can
    # be diagnosed.
    print("Error while inserting data to database:", exc)
else:
    print("Data inserted successfully")


Dokumen dengan id 6733fa16d440647753304d23: ['bintang', 'voli', 'indonesia', 'karier', 'luar', 'negeri', 'megawati', 'hangestri', 'hingga', 'baru', 'rendy', 'tamamilang', 'ikut', 'empat', 'voli', 'indonesia', 'karier', 'luar', 'negeri', 'mana', 'megawati', 'hangestri', 'hingga', 'paling', 'baru', 'rendy', 'tamamilang', 'baru', 'baru', 'volimania', 'geger', 'kabar', 'gabung', 'main', 'timnas', 'voli', 'indonesia', 'rendy', 'tamamilang', 'sama', 'klub', 'vietnam', 'main', 'jakarta', 'bhayangkara', 'presisi', 'gabung', 'klub', 'ho', 'chi', 'minh', 'city', 'police', 'liga', 'voli', 'vietnam', 'rendy', 'tamamilang', 'bukan', 'satu', 'satu', 'main', 'indonesia', 'abroad', 'karier', 'luar', 'negeri', 'baca', 'ada', 'tiga', 'voli', 'indonesia', 'putus', 'lanjut', 'karier', 'mentas', 'negeri', 'orang', 'lebih', 'lengkap', 'ikut', 'empat', 'voli', 'indonesia', 'yang', 'putus', 'karier', 'luar', 'negeri']
Dokumen dengan id 6733fa16d440647753304d24: ['bahlil', 'minta', 'warga', 'lokal', 'libat', '