In [1]:
!pip install sastrawi

Collecting sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sastrawi
Successfully installed sastrawi-1.0.1


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

from sklearn.metrics import accuracy_score

pd.options.mode.chained_assignment = None
np.random.seed(0)

In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/cryoras/natural_disaster_validation/refs/heads/main/dataset.csv')
df =df.sample(n=10000)

In [4]:
df.head()

Unnamed: 0,text,label
9394,Pembangunan posko baru di Bengkulu selesai min...,not_relevant
898,Gempa 3.3 SR mengguncang Bengkulu pada 2022 08...,gempabumi
2398,Banjir setinggi 2.9 meter melanda Jakarta sela...,banjir
5906,Tanah longsor di Puncak pada 2023 03 05. Area ...,tanah_longsor
2343,Banjir setinggi 2.1 meter melanda Semarang sel...,banjir


In [5]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
not_relevant,2000
gempabumi,2000
banjir,2000
tanah_longsor,2000
tsunami,2000


#preprocessing data

In [6]:
df = df.drop_duplicates()
df.shape


(7955, 2)

In [7]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
gempabumi,2000
banjir,2000
tanah_longsor,1995
tsunami,1348
not_relevant,612


In [8]:
def cleaningText(t):
  t = re.sub(r'[0-9]+', '', t)
  t = re.sub(r'[^\w\s]', '', t)
  t = t.replace('\n', ' ') # mengganti baris baru dengan spasi
  t = t.translate(str.maketrans('', '', string.punctuation)) # menghapus semua tanda baca
  t = t.strip(' ') # menghapus karakter spasi dari kiri dan kanan teks
  return t

def caseFold(t):
  t = t.lower()
  return t

def tokenizeText(t):
  t = word_tokenize(t)
  return t

def filteringText(text): # Menghapus stopwords dalam teks
    listStopwords = set(stopwords.words('indonesian'))
    listStopwords1 = set(stopwords.words('english'))
    listStopwords.update(listStopwords1)
    listStopwords.update(['iya','yaa','gak','nya','na','sih','ku',"di","ga","ya","gaa","loh","kah","woi","woii","woy"])
    filtered = []
    for txt in text:
        if txt not in listStopwords:
            filtered.append(txt)
    text = filtered
    return text

def stemmingText(t):
  factory = StemmerFactory()
  stemmer = factory.create_stemmer()
  words = t.split()
  stemmed_words = [stemmer.stem(word) for word in words]
  stemmed_text = ' '.join(stemmed_words)
  return stemmed_text

def toSentence(list_words):
  sentence = ' '.join(word for word in list_words)
  return sentence

lang_dict = {
    # Istilah darurat tidak baku
    "gempa": "gempabumi",
    "gempa tektonik": "gempabumi",
    "gempa bumi": "gempabumi",
    "banjir bandang": "banjir",
    "longsor": "tanah longsor",
    "tsu": "tsunami",
    "angin puting": "angin puting beliung",
    "kebakaran hutan": "karhutla",

    # Singkatan darurat
    "bpbd": "Badan Penanggulangan Bencana Daerah",
    "posko": "posko bencana",
    "pengungsi": "korban terdampak",
    "mksd": "maksud",
    "jln": "jalan",

    # Ekspresi populer
    "wih": "waduh",
    "parah": "berat",
    "kacau": "rusak parah",
    "gede": "besar",
    "ampe": "sampai",
    "bnyk": "banyak",
    "krn": "karena",
    "yg": "yang"
}

def fix_slangwords(text):
    words = text.split()
    fixed_words = []

    for word in words:
        if word.lower() in lang_dict:
            fixed_words.append(lang_dict[word.lower()])
        else:
            fixed_words.append(word)

    fixed_text = ' '.join(fixed_words)
    return fixed_text

In [11]:
nltk.download('punkt_tab')
nltk.download('stopwords')
df['text_clean'] = df['text'].apply(cleaningText)
df['text_casefold'] = df['text_clean'].apply(caseFold)
df['text_slang'] = df['text_casefold'].apply(fix_slangwords)
df['text_tokenize'] = df['text_slang'].apply(tokenizeText)
df['text_stopword'] = df['text_tokenize'].apply(filteringText)
df['text_end'] = df['text_stopword'].apply(toSentence)



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
df.head()

Unnamed: 0,text,label,text_clean,text_casefold,text_slang,text_tokenize,text_stopword,text_end
9394,Pembangunan posko baru di Bengkulu selesai min...,not_relevant,Pembangunan posko baru di Bengkulu selesai min...,pembangunan posko baru di bengkulu selesai min...,pembangunan posko bencana baru di bengkulu sel...,"[pembangunan, posko, bencana, baru, di, bengku...","[pembangunan, posko, bencana, bengkulu, selesa...",pembangunan posko bencana bengkulu selesai minggu
898,Gempa 3.3 SR mengguncang Bengkulu pada 2022 08...,gempabumi,Gempa SR mengguncang Bengkulu pada fasili...,gempa sr mengguncang bengkulu pada fasili...,gempabumi sr mengguncang bengkulu pada fasilit...,"[gempabumi, sr, mengguncang, bengkulu, pada, f...","[gempabumi, sr, mengguncang, bengkulu, fasilit...",gempabumi sr mengguncang bengkulu fasilitas pu...
2398,Banjir setinggi 2.9 meter melanda Jakarta sela...,banjir,Banjir setinggi meter melanda Jakarta selama ...,banjir setinggi meter melanda jakarta selama ...,banjir setinggi meter melanda jakarta selama j...,"[banjir, setinggi, meter, melanda, jakarta, se...","[banjir, meter, melanda, jakarta, jam, warga, ...",banjir meter melanda jakarta jam warga mengungsi
5906,Tanah longsor di Puncak pada 2023 03 05. Area ...,tanah_longsor,Tanah longsor di Puncak pada Area seluas h...,tanah longsor di puncak pada area seluas h...,tanah tanah longsor di puncak pada area seluas...,"[tanah, tanah, longsor, di, puncak, pada, area...","[tanah, tanah, longsor, puncak, area, seluas, ...",tanah tanah longsor puncak area seluas hektar ...
2343,Banjir setinggi 2.1 meter melanda Semarang sel...,banjir,Banjir setinggi meter melanda Semarang selama...,banjir setinggi meter melanda semarang selama...,banjir setinggi meter melanda semarang selama ...,"[banjir, setinggi, meter, melanda, semarang, s...","[banjir, meter, melanda, semarang, jam, jalan,...",banjir meter melanda semarang jam jalan protok...


In [13]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df['label_encoded'] = encoder.fit_transform(df['label'])

In [26]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
gempabumi,2000
banjir,2000
tanah_longsor,1995
tsunami,1348
not_relevant,612


In [27]:
df['label_encoded'].value_counts()

Unnamed: 0_level_0,count
label_encoded,Unnamed: 1_level_1
1,2000
0,2000
3,1995
4,1348
2,612


# model building

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, precision_score

In [16]:
X = df['text_end']
Y = df['label_encoded']

In [17]:
tfidf = TfidfVectorizer(max_features=200, min_df=17, max_df=0.8 )
X_tfidf = tfidf.fit_transform(X)

In [18]:
features_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names_out())
features_df

Unnamed: 0,aceh,agam,aktivitas,aman,area,badan,bakti,banda,bandung,banjarmasin,...,tanah,tenggara,terdampak,terdeteksi,terendam,tergenang,tsunami,warga,wonosobo,yogyakarta
0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
1,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
2,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.423637,0.0,0.0
3,0.0,0.0,0.0,0.0,0.296595,0.0,0.0,0.0,0.0,0.0,...,0.522072,0.0,0.296595,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
4,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.381255,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7950,0.0,0.0,0.0,0.0,0.298718,0.0,0.0,0.0,0.0,0.0,...,0.525809,0.0,0.298718,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
7951,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.283545,0.000000,0.000000,0.283545,0.000000,0.0,0.0
7952,0.0,0.0,0.0,0.0,0.297066,0.0,0.0,0.0,0.0,0.0,...,0.522900,0.0,0.297066,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
7953,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.426767,0.0,0.0


In [20]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, Y, test_size=0.2, random_state=42)

In [21]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [22]:
naive_bayes = BernoulliNB()
naive_bayes.fit(X_train.toarray(), y_train)
y_pred_train_nb = naive_bayes.predict(X_train.toarray())
y_pred_test_nb = naive_bayes.predict(X_test.toarray())
accuracy_train_nb = accuracy_score(y_pred_train_nb, y_train)
accuracy_test_nb = accuracy_score(y_pred_test_nb, y_test)
print('Naive Bayes - accuracy_train:', accuracy_train_nb)
print('Naive Bayes - accuracy_test:', accuracy_test_nb)

Naive Bayes - accuracy_train: 1.0
Naive Bayes - accuracy_test: 1.0


In [23]:
random_forest = RandomForestClassifier()
random_forest.fit(X_train.toarray(), y_train)
y_pred_train_rf = random_forest.predict(X_train.toarray())
y_pred_test_rf = random_forest.predict(X_test.toarray())
accuracy_train_rf = accuracy_score(y_pred_train_rf, y_train)
accuracy_test_rf = accuracy_score(y_pred_test_rf, y_test)
print('Random Forest - accuracy_train:', accuracy_train_rf)
print('Random Forest - accuracy_test:', accuracy_test_rf)

Random Forest - accuracy_train: 1.0
Random Forest - accuracy_test: 1.0


In [24]:
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train.toarray(), y_train)
y_pred_train_lr = logistic_regression.predict(X_train.toarray())
y_pred_test_lr = logistic_regression.predict(X_test.toarray())
accuracy_train_lr = accuracy_score(y_pred_train_lr, y_train)
accuracy_test_lr = accuracy_score(y_pred_test_lr, y_test)
print('Logistic Regression - accuracy_train:', accuracy_train_lr)
print('Logistic Regression - accuracy_test:', accuracy_test_lr)

Logistic Regression - accuracy_train: 1.0
Logistic Regression - accuracy_test: 1.0


In [25]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train.toarray(), y_train)
y_pred_train_dt = decision_tree.predict(X_train.toarray())
y_pred_test_dt = decision_tree.predict(X_test.toarray())
accuracy_train_dt = accuracy_score(y_pred_train_dt, y_train)
accuracy_test_dt = accuracy_score(y_pred_test_dt, y_test)
print('Decision Tree - accuracy_train:', accuracy_train_dt)
print('Decision Tree - accuracy_test:', accuracy_test_dt)

Decision Tree - accuracy_train: 1.0
Decision Tree - accuracy_test: 1.0


# inference

In [35]:
# Input kalimat baru dari pengguna
kalimat_baru = input("Masukkan kalimat baru: ")

# Melakukan preprocessing pada kalimat baru
kalimat_baru_cleaned = cleaningText(kalimat_baru)
kalimat_baru_casefolded = caseFold(kalimat_baru_cleaned)
kalimat_baru_slangfixed = fix_slangwords(kalimat_baru_casefolded)
kalimat_baru_tokenized = tokenizeText(kalimat_baru_slangfixed)
kalimat_baru_filtered = filteringText(kalimat_baru_tokenized)
kalimat_baru_final = toSentence(kalimat_baru_filtered)

# Menggunakan objek tfidf yang sudah di-fit dari pelatihan sebelumnya
X_kalimat_baru = tfidf.transform([kalimat_baru_final])

# Memperoleh prediksi sentimen kalimat baru
prediksi_sentimen = logistic_regression.predict(X_kalimat_baru)

print(prediksi_sentimen)
# Menampilkan hasil prediksi
# if prediksi_sentimen[0] == 'banjir':
#     print("Sentimen kalimat baru adalah banjir.")
# if prediksi_sentimen[1] == 'gempabumi':
#     print("Sentimen kalimat baru adalah gempabumi.")
# if prediksi_sentimen[2] == 'tidak relevan':
#     print("Sentimen kalimat baru adalah banjir.")
# if prediksi_sentimen[3] == 'tanah_longsor':
#     print("Sentimen kalimat baru adalah tanah_longsor.")
# if prediksi_sentimen[4] == 'tsunami':
#     print("Sentimen kalimat baru adalah tsunami.")

Masukkan kalimat baru: Melalui pemrosesan dan pelatihan data yang ekstensif, model klasifikasi teks dapat mencapai tingkat akurasi yang tinggi dan memungkinkan otomatisasi tugas-tugas yang sebelumnya membutuhkan intervensi manusia. Di masa depan, dengan kemajuan teknologi yang terus berlanjut, kita dapat mengharapkan kemampuan klasifikasi teks yang semakin canggih dan luas, memberikan manfaat yang signifikan dalam berbagai aspek kehidupan.
[2]


# save model

In [None]:
import joblib
# x variabel model
# y nama model
joblib.dump(x,y)