In [1]:
!pip install sastrawi

Collecting sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sastrawi
Successfully installed sastrawi-1.0.1


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

from sklearn.metrics import accuracy_score

# Turn off pandas' chained-assignment check, so no SettingWithCopyWarning is emitted
# when new columns are assigned on the frame later in the notebook.
pd.options.mode.chained_assignment = None
# Seed NumPy's global random generator so steps that draw from it are repeatable
# on a fresh "Restart & Run All".
np.random.seed(0)

In [3]:
# Source of the labelled disaster-report texts (columns used below: 'text', 'label').
DATA_URL = 'https://raw.githubusercontent.com/cryoras/natural_disaster_validation/refs/heads/main/dataset.csv'
# Upper bound on the number of rows kept for the experiment.
SAMPLE_SIZE = 10000
# Explicit seed for the shuffle so re-running this cell alone yields the same rows/order,
# instead of depending on how far the global NumPy generator has advanced.
RANDOM_STATE = 0

df = pd.read_csv(DATA_URL)
# Shuffle and cap the data; clamping n avoids the ValueError that DataFrame.sample raises
# when asked for more rows than exist (sampling without replacement).
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=RANDOM_STATE)

In [4]:
# Preview the first five rows of the sampled frame.
df.head()

Unnamed: 0,text,label
9394,Pembangunan posko baru di Bengkulu selesai min...,not_relevant
898,Gempa 3.3 SR mengguncang Bengkulu pada 2022 08...,gempabumi
2398,Banjir setinggi 2.9 meter melanda Jakarta sela...,banjir
5906,Tanah longsor di Puncak pada 2023 03 05. Area ...,tanah_longsor
2343,Banjir setinggi 2.1 meter melanda Semarang sel...,banjir


In [5]:
# Count rows per class to see how balanced the labels are before any cleaning.
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
not_relevant,2000
gempabumi,2000
banjir,2000
tanah_longsor,2000
tsunami,2000


# Preprocessing data

In [6]:
# Remove rows that are exact duplicates across all columns (the first occurrence is kept).
df = df.drop_duplicates()
# Display (rows, columns) remaining after de-duplication.
df.shape


(7955, 2)

In [7]:
# Re-count rows per class: de-duplication can remove a different number of rows from
# each label, so the class balance has to be checked again.
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
gempabumi,2000
banjir,2000
tanah_longsor,1995
tsunami,1348
not_relevant,612


In [8]:
def cleaningText(t):
  """Strip digits and punctuation from a raw text string.

  Steps, in order: remove runs of ASCII digits, remove every character that is
  neither a word character nor whitespace, turn newlines into spaces, remove any
  remaining ASCII punctuation (this is what drops underscores), and finally trim
  space characters from both ends.

  Args:
    t: The text to clean (a ``str``).

  Returns:
    The cleaned text. Inner whitespace is not collapsed, so removed tokens can
    leave consecutive spaces behind.
  """
  without_digits = re.sub(r'[0-9]+', '', t)
  without_symbols = re.sub(r'[^\w\s]', '', without_digits)
  single_line = without_symbols.replace('\n', ' ')  # newlines become spaces
  punctuation_table = str.maketrans('', '', string.punctuation)  # deletes all ASCII punctuation
  # Only the space character is trimmed; tabs at the edges are kept.
  return single_line.translate(punctuation_table).strip(' ')

def caseFold(t):
  """Return ``t`` converted to lowercase."""
  return t.lower()

def tokenizeText(t):
  """Split a text string into a list of tokens using NLTK's ``word_tokenize``."""
  return word_tokenize(t)

# Lazily built, shared stop-word set (see _load_stopwords).
_STOPWORD_CACHE = None


def _load_stopwords():
    """Build the combined stop-word set once and reuse it on later calls.

    The set is built on first use rather than at definition time, so this cell can
    run before the NLTK 'stopwords' corpus has been downloaded.
    """
    global _STOPWORD_CACHE
    if _STOPWORD_CACHE is None:
        combined = set(stopwords.words('indonesian'))
        combined.update(stopwords.words('english'))
        # Informal fillers added on top of the NLTK lists.
        combined.update(['iya','yaa','gak','nya','na','sih','ku',"di","ga","ya","gaa","loh","kah","woi","woii","woy"])
        _STOPWORD_CACHE = frozenset(combined)
    return _STOPWORD_CACHE


def filteringText(text):
    """Remove stop words from a list of tokens.

    Args:
        text: Iterable of token strings (e.g. the output of ``tokenizeText``).

    Returns:
        A new list with the tokens that are not in the Indonesian/English NLTK
        stop-word lists nor in the extra informal-word list. Matching is exact and
        case-sensitive, and the original token order is preserved.
    """
    listStopwords = _load_stopwords()
    return [txt for txt in text if txt not in listStopwords]

# Lazily created, shared Sastrawi stemmer (see _get_stemmer).
_STEMMER = None


def _get_stemmer():
  """Create the Sastrawi stemmer on first use and hand back the same instance afterwards."""
  global _STEMMER
  if _STEMMER is None:
    _STEMMER = StemmerFactory().create_stemmer()
  return _STEMMER


def stemmingText(t):
  """Stem every whitespace-separated word of ``t`` with the Sastrawi stemmer.

  The stemmer is built once and reused, instead of constructing a new factory and
  stemmer for every row the function is applied to.

  Args:
    t: Text whose words should be reduced to their stems.

  Returns:
    The stemmed words joined back together with single spaces.
  """
  stemmer = _get_stemmer()
  return ' '.join(stemmer.stem(word) for word in t.split())

def toSentence(list_words):
  """Join an iterable of word strings into one space-separated sentence."""
  return ' '.join(list_words)

# Normalisation table: informal / abbreviated term -> canonical replacement.
# Keys are matched case-insensitively and may span several words.
# NOTE(review): the "bpbd" expansion contains capital letters although the pipeline
# applies this step after case folding — confirm whether that is intended.
lang_dict = {
    # Non-standard disaster terms
    "gempa": "gempabumi",
    "gempa tektonik": "gempabumi",
    "gempa bumi": "gempabumi",
    "banjir bandang": "banjir",
    "longsor": "tanah longsor",
    "tsu": "tsunami",
    "angin puting": "angin puting beliung",
    "kebakaran hutan": "karhutla",

    # Emergency abbreviations
    "bpbd": "Badan Penanggulangan Bencana Daerah",
    "posko": "posko bencana",
    "pengungsi": "korban terdampak",
    "mksd": "maksud",
    "jln": "jalan",

    # Popular expressions
    "wih": "waduh",
    "parah": "berat",
    "kacau": "rusak parah",
    "gede": "besar",
    "ampe": "sampai",
    "bnyk": "banyak",
    "krn": "karena",
    "yg": "yang"
}


def _expansion_already_present(lowered_words, start, key_tokens, replacement):
    """Tell whether the text around ``start`` already spells out ``replacement``.

    Some replacements contain their own key ("longsor" -> "tanah longsor",
    "angin puting" -> "angin puting beliung"). Expanding such a key inside text that
    is already in the expanded form would duplicate words, so the caller skips it.
    """
    replacement_tokens = replacement.lower().split()
    span = len(key_tokens)
    for offset in range(len(replacement_tokens) - span + 1):
        if replacement_tokens[offset:offset + span] != key_tokens:
            continue
        begin = start - offset
        if begin < 0:
            continue
        # A window cut short by the end of the text simply compares unequal.
        if lowered_words[begin:begin + len(replacement_tokens)] == replacement_tokens:
            return True
    return False


def fix_slangwords(text):
    """Replace slang terms and abbreviations in ``text`` using ``lang_dict``.

    At each position the longest matching phrase wins, so multi-word keys such as
    "gempa bumi" are replaced as a unit instead of only their first word being
    rewritten. Words without an entry are kept exactly as written.

    Args:
        text: Input string; it is split on whitespace.

    Returns:
        The normalised words joined with single spaces.
    """
    words = text.split()
    lowered = [word.lower() for word in words]
    # Widest phrase (in words) that can match; recomputed so later edits to lang_dict are honoured.
    longest = max(1, max((len(key.split()) for key in lang_dict), default=1))

    fixed_words = []
    pos = 0
    while pos < len(words):
        for size in range(min(longest, len(words) - pos), 0, -1):
            key_tokens = lowered[pos:pos + size]
            replacement = lang_dict.get(' '.join(key_tokens))
            if replacement is None:
                continue
            if _expansion_already_present(lowered, pos, key_tokens, replacement):
                # Already in canonical form: keep the original words untouched.
                fixed_words.extend(words[pos:pos + size])
            else:
                fixed_words.append(replacement)
            pos += size
            break
        else:
            # No phrase starting here is in the table.
            fixed_words.append(words[pos])
            pos += 1

    return ' '.join(fixed_words)

In [None]:
# Fetch the NLTK data the helpers rely on: tokenizer data for word_tokenize and the
# stop-word corpus read by filteringText.
nltk.download('punkt_tab')
nltk.download('stopwords')
# Each stage reads the column written by the previous one, so the order matters.
df['text_clean'] = df['text'].apply(cleaningText)  # strip digits and punctuation
df['text_casefold'] = df['text_clean'].apply(caseFold)  # lowercase
df['text_slang'] = df['text_casefold'].apply(fix_slangwords)  # normalise slang via lang_dict
df['text_tokenize'] = df['text_slang'].apply(tokenizeText)  # string -> list of tokens
df['text_stopword'] = df['text_tokenize'].apply(filteringText)  # drop stop words
df['text_end'] = df['text_stopword'].apply(toSentence)  # list of tokens -> string
df['text_stemmer'] = df['text_end'].apply(stemmingText)  # stem each remaining word


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Preview the frame, now including the intermediate text_* preprocessing columns.
df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map each class name in 'label' to an integer code; the fitted encoder is kept so the
# codes can be translated back to names later (encoder.classes_ / inverse_transform).
encoder = LabelEncoder()
df['label_encoded'] = encoder.fit_transform(df['label'])