In [1]:
import pandas as pd
import re
import csv
from collections import Counter
import math

### DATA PREPARATION

In [2]:
# Load the corpus and build the stopword set from the first column of the stopword CSV.
dataset = pd.read_csv('dataset/dataset.csv')
stopwords = set()
# newline='' lets the csv module handle line endings itself, as its documentation requires.
with open('./dataset/stopwordindonesia.csv', mode='r', newline='') as csv_file:
    for row in csv.reader(csv_file):
        # Blank lines are returned as empty lists; skip them instead of failing on row[0].
        if not row:
            continue
        word = row[0].strip()
        # An empty string can never match a token, so it is not worth storing.
        if word:
            stopwords.add(word)

### EDA

In [3]:
# Remove the unused 'label' column. Rebinding (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run once the column is already gone.
dataset = dataset.drop(columns=['label'], errors='ignore')

### PREPROCESSING

In [4]:
# Text cleaning: lowercase, strip punctuation, tokenize, and remove stopwords
def preprocess(text, stop_words=None):
    """Clean a raw text and return its tokens with stopwords removed.

    Args:
        text: The raw text. Any non-string value (for example the NaN that
            pandas uses for a missing cell, or None) is treated as an empty
            document.
        stop_words: Optional collection of words to drop. When omitted, the
            notebook-level ``stopwords`` set is used.

    Returns:
        list of lowercase tokens, in their original order, with punctuation
        stripped and stopwords removed.
    """
    # Missing values are not strings and have no .lower(); treat them as empty.
    if not isinstance(text, str):
        return []
    if stop_words is None:
        stop_words = stopwords

    # Convert to lowercase.
    text = text.lower()
    # Remove punctuation (anything that is neither a word character nor whitespace).
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize on whitespace and drop stopwords.
    return [word for word in text.split() if word not in stop_words]

# compute TF
def compute_tf(doc):
    """Return the term frequency of each distinct term in ``doc``.

    Args:
        doc: Sequence of tokens for a single document.

    Returns:
        dict mapping term -> (occurrences of the term) / (total tokens in doc).
        An empty document yields an empty dict.
    """
    n_terms = len(doc)
    return {term: freq / n_terms for term, freq in Counter(doc).items()}

# compute IDF
def compute_idf(docs):
    """Compute a smoothed inverse document frequency for every term in ``docs``.

    Args:
        docs: Sequence of documents. Each document is an iterable of hashable
            terms (a token list, or a dict keyed by term). A term counts at
            most once per document.

    Returns:
        dict mapping term -> log(total_docs / (1 + document_frequency)).
        Empty when ``docs`` is empty.
    """
    total_docs = len(docs)

    # One pass over the corpus: count in how many documents each term occurs.
    # set(doc) collapses repeats so a term is counted once per document.
    doc_freq = Counter()
    for doc in docs:
        doc_freq.update(set(doc))

    # The +1 in the denominator is the smoothing term. Note that it makes the
    # value 0 for a term found in all but one document and negative for a term
    # found in every document.
    return {
        word: math.log(total_docs / (1 + count))
        for word, count in doc_freq.items()
    }

# compute TF-IDF
def compute_tfidf(tf, idf):
    """Combine one document's term frequencies with corpus-wide IDF values.

    Args:
        tf: dict mapping term -> term frequency for a single document.
        idf: dict mapping term -> inverse document frequency.

    Returns:
        dict mapping each term of ``tf`` to tf * idf; a term missing from
        ``idf`` is given an IDF of 0.
    """
    return {term: weight * idf.get(term, 0) for term, weight in tf.items()}

# Build a dense vector: one TF-IDF value per vocabulary word (0 if the word is absent)
def to_vector(tfidf_dict, all_words):
    """Lay out a document's TF-IDF weights in the order given by ``all_words``.

    Args:
        tfidf_dict: dict mapping term -> TF-IDF weight for one document.
        all_words: Iterable of vocabulary terms that fixes the vector's order.

    Returns:
        list with one entry per term in ``all_words``; terms missing from
        ``tfidf_dict`` contribute 0.
    """
    vector = []
    for vocab_word in all_words:
        vector.append(tfidf_dict.get(vocab_word, 0))
    return vector

In [5]:
# Tokenize every title, then derive per-document TF, corpus-wide IDF and TF-IDF weights.
preprocessed_text = list(map(preprocess, dataset['title']))
tf_docs = list(map(compute_tf, preprocessed_text))
# The TF dicts are passed as the documents: iterating a dict yields its terms.
idf_docs = compute_idf(tf_docs)
tfidf_docs = [compute_tfidf(doc_tf, idf_docs) for doc_tf in tf_docs]

# A sorted vocabulary gives every vector the same, reproducible column order.
all_words = sorted(idf_docs)
tfidf_vectors = [to_vector(weights, all_words) for weights in tfidf_docs]

In [15]:
# Sanity check: number of TF-IDF vectors (one is built per entry of dataset['title']).
len(tfidf_vectors)

15000