# Modelling

Import Library

In [None]:
import pandas as pd
import numpy as np
import joblib
import re

import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from keras.layers import Dense, Embedding, GlobalAveragePooling1D, LSTM, Input, Dropout, Bidirectional
from tensorflow.keras.metrics import Precision, Recall 
from imblearn.over_sampling import RandomOverSampler
from tensorflow.keras.models import load_model


Data Loading

In [3]:
df = pd.read_csv("./output/data_clean.csv")

In [4]:
df['text_final'].isnull().sum()

np.int64(1)

In [5]:
df.dropna(subset=['content'], inplace=True)

In [6]:
df.isna().sum()

content                 0
score                   0
text_clean              0
text_casefoldingText    0
text_slangwords         0
text_tokenizingText     0
text_stopword           0
text_final              0
polarity_score          0
polarity                0
dtype: int64

Laber Encoder

In [7]:
label_encoder = preprocessing.LabelEncoder() 
df['polarity_encode'] = label_encoder.fit_transform(df['polarity']) 

In [8]:
X_input = df['text_final']
y_input = df['polarity_encode']

Splitting

In [9]:
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_input, y_input, test_size=0.2, stratify=y_input, random_state=42)

TF-IDF Vectorizer

In [10]:
tfidf = TfidfVectorizer(
    max_features=5000, 
    min_df=3, 
    max_df=0.85,
    ngram_range=(1,2),
)

X_train_tfidf = tfidf.fit_transform(X_train_tfidf)
X_test_tfidf = tfidf.transform(X_test_tfidf)

### **Model Random Forest & TF-IDF**

Training model

In [11]:
random_forest = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=10,
    min_samples_leaf=4,
    random_state=42,
    max_features='sqrt',
    class_weight='balanced',
)

random_forest.fit(X_train_tfidf.toarray(), y_train_tfidf)

Evaluate Model

In [12]:
y_pred_train_tfidf = random_forest.predict(X_train_tfidf.toarray())
y_pred_test_tfidf = random_forest.predict(X_test_tfidf.toarray())
 
accuracy_train_tfidf = accuracy_score(y_pred_train_tfidf, y_train_tfidf)
accuracy_test_tfidf = accuracy_score(y_pred_test_tfidf, y_test_tfidf)

print('accuracy_train:', accuracy_train_tfidf)
print('accuracy_test:', accuracy_test_tfidf)
print(classification_report(y_test_tfidf, y_pred_test_tfidf))
print(confusion_matrix(y_test_tfidf, y_pred_test_tfidf))

accuracy_train: 0.9093655589123867
accuracy_test: 0.8708333333333333
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1004
           1       0.26      0.27      0.27        55
           2       0.92      0.88      0.90      1341

    accuracy                           0.87      2400
   macro avg       0.68      0.68      0.68      2400
weighted avg       0.87      0.87      0.87      2400

[[ 895   11   98]
 [  29   15   11]
 [ 130   31 1180]]


Save model RF & TF-IDF

In [13]:
joblib.dump(tfidf, './assets/tf_idf.joblib')
joblib.dump(random_forest, './assets/rf_tfidf.joblib')

['./assets/rf_tfidf.joblib']

Testing

In [37]:
loaded_tfidf = joblib.load('./assets/tf_idf.joblib')
loaded_rf = joblib.load('./assets/rf_tfidf.joblib')

label_map = {0: 'NEGATIVE', 1: 'NEUTRAL', 2: 'POSITIVE'}

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

def predict(text):
    clean_input = clean_text(text)
    vec_input = loaded_tfidf.transform([clean_input])
    
    pred_index = loaded_rf.predict(vec_input)[0]
    pred_proba = loaded_rf.predict_proba(vec_input)[0]
    
    result = label_map[pred_index]
    score = pred_proba[pred_index] * 100
    
    return result, score

txt = "Kurang membantu, lambat, tidak efektif"
result, score = predict(txt)
print(f"Analyzed: {result} (Score: {score:.1f}%)")

Analyzed: NEGATIVE (Score: 46.2%)


### **Model SVM & TF-IDF**

Training Model

In [14]:
svm = LinearSVC(random_state=42, C=1.0)

svm.fit(X_train_tfidf, y_train_tfidf)

Evaluate Model

In [15]:
y_train_svm = svm.predict(X_train_tfidf)
y_pred_svm = svm.predict(X_test_tfidf)

acc_train_svm = accuracy_score(y_train_tfidf, y_train_svm)
acc_test_svm = accuracy_score(y_test_tfidf, y_pred_svm)

print(f"train accuracy score: {acc_train_svm}")
print(f"test accuracy_score: {acc_test_svm}")
print(classification_report(y_test_tfidf, y_pred_svm))
print(confusion_matrix(y_test_tfidf, y_pred_svm))

train accuracy score: 0.9929159287425774
test accuracy_score: 0.9325
              precision    recall  f1-score   support

           0       0.91      0.95      0.93      1004
           1       1.00      0.07      0.14        55
           2       0.95      0.95      0.95      1341

    accuracy                           0.93      2400
   macro avg       0.95      0.66      0.67      2400
weighted avg       0.93      0.93      0.92      2400

[[ 955    0   49]
 [  30    4   21]
 [  62    0 1279]]


Save model SVM & TF-IDF

In [16]:
joblib.dump(svm, './assets/svm_tfidf.joblib')

['./assets/svm_tfidf.joblib']

Testing

In [46]:
loaded_svm = joblib.load('./assets/svm_tfidf.joblib')

def predict_svm_tfidf(text):
    clean_input = clean_text(text)
    vec_input = loaded_tfidf.transform([clean_input])

    pred_array = loaded_svm.predict(vec_input)

    pred_index = pred_array[0] 
    result = label_map[pred_index]
    
    decision_function = loaded_svm.decision_function(vec_input)[0]

    score_mentah = decision_function[pred_index] 
    
    return result, score_mentah

txt = "Kurang membantu, lambat, tidak efektif"
result, score = predict_svm_tfidf(txt)

print(f"Analyzed: {result} (Confidence: {score:.2f})")

Analyzed: NEGATIVE (Confidence: 0.63)


### **Model Random Forest & W2VEC**

In [17]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

Tokenize

In [18]:
X_tokenize = [word_tokenize(sentence.lower()) for sentence in X_input]

Splitting Data

In [19]:
X_train_w2vec, X_test_w2vec, y_train_w2vec, y_test_w2vec = train_test_split(X_tokenize, y_input, test_size=0.3, random_state=42)

Word to Vector

In [20]:
model = Word2Vec(sentences=X_train_w2vec, vector_size=100, window=5, min_count=1, workers=4)

def get_sentence_vector(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(100)

X_train_w2v = np.array([get_sentence_vector(text, model) for text in X_train_w2vec])
X_test_w2v = np.array([get_sentence_vector(text, model) for text in X_test_w2vec])

Training Model

In [21]:
random_forest = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=8,
    min_samples_leaf=6,
    random_state=42,
    class_weight='balanced',
)
 
random_forest.fit(X_train_w2v, y_train_w2vec)

Evaluate Model

In [22]:
y_pred_train_rf = random_forest.predict(X_train_w2v)
y_pred_test_rf = random_forest.predict(X_test_w2v)
 
accuracy_train_rf = accuracy_score(y_pred_train_rf, y_train_w2vec)
accuracy_test_rf = accuracy_score(y_pred_test_rf, y_test_w2vec)

print('accuracy_train:', accuracy_train_rf)
print('accuracy_test:', accuracy_test_rf)
print(classification_report(y_test_w2vec, y_pred_test_rf))
print(confusion_matrix(y_test_w2vec, y_pred_test_rf))

accuracy_train: 0.9297535420883438
accuracy_test: 0.8630555555555556
              precision    recall  f1-score   support

           0       0.82      0.89      0.85      1471
           1       0.33      0.03      0.06        89
           2       0.90      0.88      0.89      2040

    accuracy                           0.86      3600
   macro avg       0.68      0.60      0.60      3600
weighted avg       0.85      0.86      0.85      3600

[[1309    1  161]
 [  49    3   37]
 [ 240    5 1795]]


Save model Random Forest & W2Vec

In [23]:
joblib.dump(random_forest, './assets/rf_w2v.joblib')
model.save("./assets/word2vec.model")

Testing

In [47]:
rf_model = joblib.load('./assets/rf_w2v.joblib')
w2v_model = Word2Vec.load("./assets/word2vec.model")

def get_avg_vector(tokens, model):
    vector_size = model.vector_size
    vec = np.zeros(vector_size)
    count = 0
    
    for word in tokens:
        if word in model.wv:
            vec += model.wv[word]
            count += 1
    
    if count > 0:
        vec /= count
        
    return vec.reshape(1, -1)

def predict(text):
    tokens = clean_text(text)
    
    if len(tokens) == 0:
        return "Not Valid Text"
        
    vector_input = get_avg_vector(tokens, w2v_model)
    
    pred_index = rf_model.predict(vector_input)[0]
    pred_proba = rf_model.predict_proba(vector_input)[0]
    
    result = label_map[pred_index]
    confidence = np.max(pred_proba) * 100
    
    return result, confidence

input_text = "Kurang membantu, lambat, tidak efektif"
    
result, confidence = predict(input_text)
print(f"Analyzed: {result} (Score: {confidence:.1f}%)")

Analyzed: NEGATIVE (Score: 88.6%)


### LSTM

Tokenization

In [24]:
sentences = df['text_final'].astype(str).tolist()

tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index 

sequences = tokenizer.texts_to_sequences(sentences)

padded = pad_sequences(sequences, maxlen=100, padding='post', truncating='post')

Save Tokenizer

In [25]:
with open('./assets/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [26]:
dataset = df.values

X_input = np.array(padded).astype('float32')
y_input = np.array(dataset[:,10]).astype('float32')

Random Oversampling

In [None]:
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_input, y_input)

print("Jumlah Data Setelah ROS:", len(X_resampled))

Jumlah Data Setelah ROS: 20118


Data Splitting

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, stratify=y_resampled, test_size=0.3)

Modelling

In [29]:
model = Sequential([
    Input(shape=(100,)),
    Embedding(input_dim=5000, output_dim=128),

    Bidirectional(LSTM(units=64, dropout=0.4, recurrent_dropout=0.2)),

    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(3, activation='softmax')
])

In [30]:
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

Initialize Callback

In [31]:
checkpoint = ModelCheckpoint(
    './assets/best_model.keras',
    monitor='val_accuracy',
    save_best_only=True,
    mode='max'
)

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

Training Model

In [None]:
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=128,
    validation_split=0.2,
    callbacks=[checkpoint, early_stop]
)

Epoch 1/20
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 337ms/step - accuracy: 0.6410 - loss: 0.7898 - val_accuracy: 0.8342 - val_loss: 0.4394
Epoch 2/20
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 306ms/step - accuracy: 0.8958 - loss: 0.3173 - val_accuracy: 0.9322 - val_loss: 0.2092
Epoch 3/20
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 310ms/step - accuracy: 0.9502 - loss: 0.1612 - val_accuracy: 0.9450 - val_loss: 0.1789
Epoch 4/20
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 307ms/step - accuracy: 0.9669 - loss: 0.1102 - val_accuracy: 0.9457 - val_loss: 0.1939
Epoch 5/20
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 321ms/step - accuracy: 0.9767 - loss: 0.0817 - val_accuracy: 0.9610 - val_loss: 0.1397
Epoch 6/20
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 325ms/step - accuracy: 0.9727 - loss: 0.0946 - val_accuracy: 0.9574 - val_loss: 0.1542
Epoch 7/20
[1m89/89[

Evaluation

In [33]:
model.evaluate(X_test, y_test, batch_size=1)

[1m6036/6036[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 7ms/step - accuracy: 0.9529 - loss: 0.1747


[0.1746925264596939, 0.9529489874839783]

Testing

In [None]:
model = load_model('./assets/best_model.keras')

with open('./assets/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

MAX_LEN = 100

def predict(text):
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=MAX_LEN, padding='post', truncating='post')
    result = model.predict(padded, verbose=0)

    label_index = np.argmax(result)
    score = np.max(result)

    _predict = label_map[label_index]
    
    return _predict, score


input_text = "Kurang membantu, lambat, tidak efektif"    
_predict, score = predict(input_text)
print(f"Analyzed: {_predict} (Score: {score*100:.1f}%)")

Analyzed: NEUTRAL (Score: 98.6%)
