### Import Library

In [4]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import pandas as pd
import nltk
import joblib

### Data Loading

In [5]:
df = pd.read_csv("./output/data_clean.csv")
df.sample()

Unnamed: 0,content,score,text_clean,text_casefoldingText,text_slangwords,text_tokenizingText,text_stopword,text_final,polarity_score,polarity
9598,kecewa sekali pesan jam baru ada kabar dari ku...,1,kecewa sekali pesan jam baru ada kabar dari ku...,kecewa sekali pesan jam baru ada kabar dari ku...,kecewa sekali pesan jam baru ada kabar dari ku...,"['kecewa', 'sekali', 'pesan', 'jam', 'baru', '...","['kecewa', 'pesan', 'jam', 'kabar', 'kurir', '...",kecewa pesan jam kabar kurir jam via whatsapp ...,-11,negative


### Data Preprocessing

Drop empty row

In [6]:
df.dropna(subset=['content'], inplace=True)

Encoding label

In [7]:
label_encoder = preprocessing.LabelEncoder() 
df['polarity_encode'] = label_encoder.fit_transform(df['polarity']) 

In [8]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [9]:
X = df['text_final']
y = df['polarity_encode']

X_tokenize = [word_tokenize(sentence.lower()) for sentence in X]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_tokenize, y, test_size=0.3, random_state=42)

Word to Vector 

In [11]:
model = Word2Vec(sentences=X_train, vector_size=100, window=5, min_count=1, workers=4)

def get_sentence_vector(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(100)

In [12]:
X_train_w2v = np.array([get_sentence_vector(text, model) for text in X_train])
X_test_w2v = np.array([get_sentence_vector(text, model) for text in X_test])

### Modelling

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
 
random_forest = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=8,
    min_samples_leaf=6,
    random_state=42,
    class_weight='balanced',
)
 
random_forest.fit(X_train_w2v, y_train)
 
y_pred_train_rf = random_forest.predict(X_train_w2v)
y_pred_test_rf = random_forest.predict(X_test_w2v)
 
accuracy_train_rf = accuracy_score(y_pred_train_rf, y_train)
accuracy_test_rf = accuracy_score(y_pred_test_rf, y_test)

### Evaluation

In [14]:
print('accuracy_train:', accuracy_train_rf)
print('accuracy_test:', accuracy_test_rf)
print(classification_report(y_test, y_pred_test_rf))
print(confusion_matrix(y_test, y_pred_test_rf))

accuracy_train: 0.9297535420883438
accuracy_test: 0.8644444444444445
              precision    recall  f1-score   support

           0       0.82      0.90      0.85      1471
           1       0.27      0.03      0.06        89
           2       0.91      0.88      0.89      2040

    accuracy                           0.86      3600
   macro avg       0.67      0.60      0.60      3600
weighted avg       0.85      0.86      0.86      3600

[[1317    3  151]
 [  50    3   36]
 [ 243    5 1792]]


Save Model

In [None]:
joblib.dump(random_forest, './assets/rf_w2v.joblib')
model.save("./assets/word2vec.model")