KLASIFIKASI NLP

In [31]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import string

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Download the correct NLTK data: the 'punkt' and 'stopwords' packages
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\Gabriel
[nltk_data]     Azarya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Gabriel
[nltk_data]     Azarya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [32]:
# Load dataset
# Read the tweets CSV (path is relative to the notebook's working directory)
# into `df`, then preview the first rows.
df = pd.read_csv('tweets_selenium1.csv')
df.head()

Unnamed: 0,text
0,Karena industri China pada pindah ke vietnam y...
1,Agak susah bukan percaya informasi tentang Chi...
2,Airlangga Hartarto sebut Indonesia bisa manfaa...
3,@WikiDPR\n #Baleg #OmnibusLaw #Ciptaker Tim Pe...
4,"Akibat Virus Corona, Amerika Dan China Kemungk..."


In [33]:
# Case folding
def case_folding(text):
    """Return ``text`` with every character converted to lowercase."""
    lowered = text.lower()
    return lowered

In [34]:
# Cleaning
def cleaning(text):
    """Strip tweet noise from ``text`` and return the cleaned string.

    Steps, in order:

    1. remove URLs (anything starting with ``http`` up to the next whitespace),
    2. remove ``@mentions`` and ``#hashtags``,
    3. remove digit runs,
    4. replace ASCII punctuation with spaces,
    5. collapse runs of whitespace into single spaces and trim both ends.

    Punctuation is replaced by a space instead of being deleted so that words
    joined only by punctuation (``"amerika,china"``, ``"anak-anak"``) stay
    separate tokens rather than being fused into one unknown word.
    """
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = re.sub(r'@\w+', '', text)  # remove mentions
    text = re.sub(r'#\w+', '', text)  # remove hashtags
    text = re.sub(r'\d+', '', text)  # remove numbers
    # Map every punctuation character to a space (same length on both sides).
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    text = text.translate(punctuation_to_space)
    # split()/join collapses any whitespace run (including newlines) and strips the ends.
    return ' '.join(text.split())

In [35]:
# Tokenization
def tokenization(text):
    """Split ``text`` into tokens with NLTK's built-in ``word_tokenize``.

    Note: a later cell in this notebook defines ``tokenization`` again using a
    plain whitespace split; when the cells run in order, that later definition
    replaces this one.
    """
    tokens = word_tokenize(text)
    return tokens

In [36]:
# Normalization
# Lookup table: informal/abbreviated token -> replacement word.
normalisasi_dict = dict(
    ga='tidak',
    gak='tidak',
    nggak='tidak',
    aja='saja',
    kalo='kalau',
    yg='yang',
    dr='dari',
)

In [37]:
def normalization(tokens):
    """Replace each token that has an entry in ``normalisasi_dict``.

    Tokens without an entry are kept unchanged. Returns a new list with the
    same length and order as ``tokens``.
    """
    normalized = []
    for token in tokens:
        normalized.append(normalisasi_dict.get(token, token))
    return normalized

In [38]:
# Stopword removal
# NLTK's Indonesian stopword list, stored as a set for fast membership tests.
stop_words = set(stopwords.words('indonesian'))

def stopword_removal(tokens):
    """Return the tokens that are not in ``stop_words``, keeping their order."""
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

In [39]:
# Stemming
# Build the Sastrawi stemmer once so every call to `stemming` reuses it.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stemming(tokens):
    """Apply ``stemmer.stem`` to every token and return the results as a list."""
    return list(map(stemmer.stem, tokens))

In [40]:
# Inspect which columns `df` currently has
print(df.columns)

Index(['text'], dtype='object')


In [41]:
import nltk
# NOTE(review): `nltk` is already imported and 'punkt'/'stopwords' are already
# downloaded in the first cell of this notebook; this cell repeats those steps.
nltk.download('punkt')
nltk.download('stopwords')   # if you use stopword_removal
nltk.download('wordnet')     # if you do stemming with WordNetLemmatizer


[nltk_data] Downloading package punkt to C:\Users\Gabriel
[nltk_data]     Azarya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Gabriel
[nltk_data]     Azarya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Gabriel
[nltk_data]     Azarya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [46]:
def tokenization(text):
    """Split ``text`` on whitespace and return the list of tokens.

    A simple tokenizer that does not depend on NLTK's ``punkt_tab`` data.
    """
    tokens = text.split()
    return tokens

def preprocess(text):
    """Run the full text pipeline on one document and return a cleaned string.

    The stages are applied in this order: case folding, cleaning,
    tokenization, normalization, stopword removal and stemming. The resulting
    tokens are joined with single spaces.

    Any exception raised by a stage is caught: the current value of ``text``
    and the error are printed, and an empty string is returned instead.
    """
    try:
        # String-level stages; `text` is rebound so the error message below
        # shows the most recent successfully transformed value.
        text = case_folding(text)
        text = cleaning(text)
        # Token-level stages, applied innermost-first.
        tokens = stemming(stopword_removal(normalization(tokenization(text))))
        return ' '.join(tokens)
    except Exception as e:
        print(f"Error processing text: {text}\n{e}")
        return ''


In [47]:
# Tokenization & Padding
# Build the cleaned-text column first: no visible cell creates
# `preprocessed_text`, so on a fresh kernel the lookups below would raise
# KeyError. Recomputing it here keeps this cell reproducible under
# "Restart & Run All".
df['preprocessed_text'] = df['text'].apply(preprocess)

# Vocabulary is capped via num_words=5000; words outside it map to '<OOV>'.
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(df['preprocessed_text'])
sequences = tokenizer.texts_to_sequences(df['preprocessed_text'])
# Bring every sequence to length 100; shorter ones are zero-padded at the end.
padded_sequences = pad_sequences(sequences, padding='post', maxlen=100)

In [55]:
# Names of the three target classes
label_list = [
    'Pro Amerika',
    'Pro China',
    'Netral',
]

In [57]:
# NOTE(review): this assignment fails with the ValueError shown in the output:
# `label_list` holds only the 3 class names, while `df` has 437 rows. A column
# needs one value per row, so every tweet must receive its own label (e.g.
# from a manually annotated column) before the label-encoding step can run.
# TODO: supply per-row labels.
df['label'] = label_list

ValueError: Length of values (3) does not match length of index (437)

In [53]:
# Inspect which columns `df` currently has
print(df.columns)

Index(['text', 'preprocessed_text'], dtype='object')


In [48]:
# Encode label
# Convert the class names in df['label'] to integer ids.
# NOTE(review): this raises KeyError in the recorded output because `df` has
# no 'label' column at this point (only 'text' and 'preprocessed_text').
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['label'])  # replace with the name of your label column

KeyError: 'label'

In [None]:
# Split data
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Build model
model = Sequential()
# input_dim / input_length mirror the tokenizer's num_words (5000) and the
# padding length (100) used when the sequences were built.
model.add(Embedding(input_dim=5000, output_dim=128, input_length=100))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
# One softmax output unit per distinct encoded label.
model.add(Dense(len(np.unique(labels)), activation='softmax'))

# A Keras model must be compiled before `fit` can be called. `labels` are
# integer class ids produced by LabelEncoder (not one-hot vectors), hence the
# sparse categorical cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train model
# Note: the test split is also passed as the validation data here.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test, y_test),
)

In [None]:
# Evaluate model
# `predict` yields one row of class scores per test sample; the index of the
# largest score in each row is taken as the predicted class id.
y_pred = model.predict(X_test)
y_pred_labels = y_pred.argmax(axis=1)

In [None]:
# Classification report
# Pass the full list of encoded class ids explicitly. Otherwise the report
# infers the classes from y_test / y_pred_labels, and if a class happens to be
# absent from this split the number of rows no longer matches `target_names`
# and scikit-learn raises a ValueError.
class_ids = np.arange(len(label_encoder.classes_))
print(classification_report(
    y_test,
    y_pred_labels,
    labels=class_ids,
    target_names=label_encoder.classes_,
))

In [None]:
# Confusion matrix
# Fix the row/column order to the encoder's class ids so the matrix always has
# one row and one column per class name. Without `labels=`, a class missing
# from this split would shrink the matrix and the tick labels would no longer
# line up with the cells.
class_ids = np.arange(len(label_encoder.classes_))
cm = confusion_matrix(y_test, y_pred_labels, labels=class_ids)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()