In [35]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import numpy as np

In [38]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [46]:
# Load the dataset
df = pd.read_csv('kamus_clean.csv')

In [47]:
# 1. Tokenization
def tokenize(text):
    return word_tokenize(text.lower())

df['tokens'] = df['TIDAK BAKU'].apply(tokenize)

In [48]:
# 2. Stopword Removal
stop_words = set(stopwords.words('indonesian'))

def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

df['filtered_tokens'] = df['tokens'].apply(remove_stopwords)

In [None]:
# 3. Optimized Stemming
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Caching for stemmed words
stem_cache = {}

def cached_stem(word):
    if word not in stem_cache:
        stem_cache[word] = stemmer.stem(word)
    return stem_cache[word]

def stem_words_cached(tokens):
    return [cached_stem(word) for word in tokens]

# Apply stemming with progress bar
tqdm.pandas(desc="Stemming")
df['stemmed_tokens'] = df['filtered_tokens'].progress_apply(stem_words_cached)


Stemming:   3%|▎         | 495/15022 [01:15<22:55, 10.56it/s]

In [None]:
# 4. Index Formation
index = set()
for tokens in df['stemmed_tokens']:
    index.update(tokens)

In [43]:
# 5. TF-IDF Calculation
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['BAKU'])

In [44]:
# 6. Query Matching
def search_query(query):
    query_vector = tfidf_vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    related_docs_indices = cosine_similarities.argsort()[:-6:-1]
    return df.iloc[related_docs_indices]

In [45]:
# Example usage
print("Tokenized words:")
print(df['tokens'].head())
print("\nFiltered words:")
print(df['filtered_tokens'].head())
print("\nStemmed words:")
print(df['stemmed_tokens'].head())
print("\nIndex (first 10 terms):")
print(list(index)[:10])
print("\nTF-IDF Matrix shape:")
print(tfidf_matrix.shape)
print("\nSearch query example:")
result = search_query("mahasiswa")
print(result[['TIDAK BAKU', 'BAKU']])

Tokenized words:
0         [mangtab]
1     [evolusionis]
2        [abstarck]
3    [sebenenarnya]
4     [kataakanlah]
Name: tokens, dtype: object

Filtered words:
0         [mangtab]
1     [evolusionis]
2        [abstarck]
3    [sebenenarnya]
4     [kataakanlah]
Name: filtered_tokens, dtype: object

Stemmed words:
0         [mangtab]
1     [evolusionis]
2        [abstarck]
3    [sebenenarnya]
4     [kataakanlah]
Name: stemmed_tokens, dtype: object

Index (first 10 terms):
['sebanrnya', 'omdo', 'hubnya', 'sebenenarnya', 'semster', 'gua', 'berkwalitas', 'ntn', 'taeeekkk', 'plis']

TF-IDF Matrix shape:
(42, 38)

Search query example:
   TIDAK BAKU       BAKU
36   mahasiwa  mahasiswa
41     gercep    tangkas
9       kanca     kancah
17      Alloh      Allah
16   intgrasi  integrasi
