# Perbandingan Metode Prepro

# ----------------------------------------------------------------


## Mengunakaan Libary Sastrawi

In [1]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import re

# Teks awal
teks = "Daftar ke KPU, Prabowo: Kita Ingin Pemilu yang Sejuk-Rukun"

# Menghapus karakter non-alfanumerik
teks = re.sub(r'[^a-zA-Z0-9\s]', '', teks)

# Case folding
teks = teks.lower()

# Menghapus stopwords
factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()
teks = stopword.remove(teks)

# Stemming
# factory = StemmerFactory()
# stemmer = factory.create_stemmer()
# teks = stemmer.stem(teks)

# Tokenisasi
tokens = teks.split()

print(tokens)

['daftar', 'kpu', 'prabowo', 'pemilu', 'sejukrukun']


## Mengunakaan Libary NLTK

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import re

# Download resource untuk NLTK
nltk.download('punkt')
nltk.download('stopwords')

# Teks awal
teks = "Daftar ke KPU, Prabowo: Kita Ingin Pemilu yang Sejuk-Rukun"

# Menghapus karakter non-alfanumerik
teks = re.sub(r'[^a-zA-Z0-9\s]', '', teks)

# Case folding
teks = teks.lower()

# Tokenisasi
tokens = word_tokenize(teks)

# Menghapus stopwords
stop_words = set(stopwords.words('indonesian'))
tokens = [word for word in tokens if word not in stop_words]

# Stemming
# factory = StemmerFactory()
# stemmer = factory.create_stemmer()
# stemmed_tokens = [stemmer.stem(token) for token in tokens]

print(tokens)


['daftar', 'kpu', 'prabowo', 'pemilu', 'sejukrukun']


[nltk_data] Downloading package punkt to /home/krisna/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/krisna/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Mengunakaan Libary Spacy

## Mengunakaan Libary Stanza

In [3]:
import stanza
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import re

# Download model Bahasa Indonesia jika belum diunduh
#stanza.download('id')

# Inisialisasi prosesor untuk Bahasa Indonesia
nlp = stanza.Pipeline(lang='id', processors='tokenize')

# Teks awal
teks = "Daftar ke KPU, Prabowo: Kita Ingin Pemilu yang Sejuk-Rukun"

# Menghapus karakter non-alfanumerik
teks = re.sub(r'[^a-zA-Z0-9\s]', '', teks)

# Case folding
teks = teks.lower()

# Tokenisasi dengan Stanza
doc = nlp(teks)
tokens = [word.text for sent in doc.sentences for word in sent.words]

# Menghapus stopwords
factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()
tokens = stopword.remove(' '.join(tokens)).split()

# Stemming
# factory = StemmerFactory()
# stemmer = factory.create_stemmer()
# stemmed_tokens = [stemmer.stem(token) for token in tokens]

print(tokens)


  from .autonotebook import tqdm as notebook_tqdm
2024-03-17 18:01:10 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 373kB [00:00, 10.8MB/s]                    
2024-03-17 18:01:10 INFO: Downloaded file to /home/krisna/stanza_resources/resources.json
2024-03-17 18:01:10 INFO: Loading these models for language: id (Indonesian):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | gsd     |

2024-03-17 18:01:11 INFO: Using device: cpu
2024-03-17 18:01:11 INFO: Loading: tokenize
2024-03-17 18:01:11 INFO: Loading: mwt
2024-03-17 18:01:12 INFO: Done loading processors!


['daftar', 'kpu', 'prabowo', 'pemilu', 'sejukrukun']


In [4]:
import stanza
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

# Download model Bahasa Indonesia jika belum diunduh
#stanza.download('id')

# Inisialisasi prosesor untuk Bahasa Indonesia
nlp = stanza.Pipeline(lang='id', processors='tokenize')

# Teks awal
teks = "Daftar ke KPU, Prabowo: Kita Ingin Pemilu yang Sejuk-Rukun"

# Menghapus karakter non-alfanumerik
teks = re.sub(r'[^a-zA-Z0-9\s]', '', teks)

# Case folding
teks = teks.lower()

# Tokenisasi dengan Stanza
doc = nlp(teks)
tokens = [word.text for sent in doc.sentences for word in sent.words]

# Menghapus stopwords
stop_words = set(stopwords.words('indonesian'))
filtered_tokens = [token for token in tokens if token not in stop_words]

# Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]

print(stemmed_tokens)


2024-03-17 18:01:12 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 373kB [00:00, 15.2MB/s]                    
2024-03-17 18:01:12 INFO: Downloaded file to /home/krisna/stanza_resources/resources.json
2024-03-17 18:01:12 INFO: Loading these models for language: id (Indonesian):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | gsd     |

2024-03-17 18:01:12 INFO: Using device: cpu
2024-03-17 18:01:12 INFO: Loading: tokenize
2024-03-17 18:01:12 INFO: Loading: mwt
2024-03-17 18:01:12 INFO: Done loading processors!


['daftar', 'kpu', 'prabowo', 'pemilu', 'sejukrukun']
