<a href="https://colab.research.google.com/github/elangbijak4/LLM-SLM-Examples/blob/main/Analisis_sentimen_Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
import nltk

# Fetch the tokenizer data used by word_tokenize. Newer NLTK releases load
# the 'punkt_tab' resource instead of the pickled 'punkt' models, so both are
# requested to keep the notebook working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

# Example corpus of (sentence, sentiment label) pairs: 1 = positive, 0 = negative.
corpus = [
    ("The movie was fantastic!", 1),
    ("I hated the film.", 0),
    ("It was a wonderful experience.", 1),
    ("The plot was very boring.", 0),
    ("Amazing performances by the actors!", 1),
    ("I did not like the movie at all.", 0),
]

# Pre-processing: lowercase each sentence and split it into tokens, keeping
# the label next to its token list.
tokenized_corpus = [(word_tokenize(sentence.lower()), label) for sentence, label in corpus]

# Train the Word2Vec model (CBOW, since sg=0) on the token lists only.
# A fixed seed together with a single worker thread makes the learned
# embeddings repeatable between runs; without them every execution yields
# different vectors and therefore different downstream results.
# NOTE(review): gensim also documents PYTHONHASHSEED as relevant for
# reproducibility across interpreter launches — set it if that matters.
sentences = [tokens for tokens, _ in tokenized_corpus]
word2vec_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=1,
    sg=0,
    seed=42,
    workers=1,
)

def get_sentence_vector(sentence_tokens, model):
    """Return the average Word2Vec embedding of a tokenized sentence.

    Args:
        sentence_tokens: Iterable of token strings making up the sentence.
        model: Object exposing ``wv`` (supports ``in`` and item lookup by
            token) and ``vector_size``.

    Returns:
        The element-wise mean of the embeddings of all tokens found in
        ``model.wv``. Tokens missing from the vocabulary are skipped. If no
        token is known, a zero vector of length ``model.vector_size`` is
        returned instead.
    """
    known_embeddings = []
    for token in sentence_tokens:
        if token in model.wv:
            known_embeddings.append(model.wv[token])

    if known_embeddings:
        return np.mean(known_embeddings, axis=0)
    # Nothing recognisable in the sentence: fall back to an all-zero vector.
    return np.zeros(model.vector_size)

# Convert the corpus into a feature matrix X (one averaged embedding per
# sentence) and a label vector y.
token_lists, labels = zip(*tokenized_corpus)
X = np.array([get_sentence_vector(tokens, word2vec_model) for tokens in token_lists])
y = np.array(labels)

# Split the data into a training set and a held-out test set (20% for testing);
# the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the classifier (here: logistic regression) on the training portion.
classifier = LogisticRegression().fit(X_train, y_train)

# Predict sentiment for the held-out sentences.
y_pred = classifier.predict(X_test)

# Evaluate: share of test sentences whose predicted label matches the true one.
accuracy = accuracy_score(y_test, y_pred)
print(f"Akurasi: {accuracy * 100:.2f}%")

# Example: run the same pipeline (lowercase -> tokenize -> average embedding)
# on a new sentence and classify it.
new_sentence = "The movie was not good."
new_tokens = word_tokenize(new_sentence.lower())
new_vector = get_sentence_vector(new_tokens, word2vec_model)
predicted_sentiment = classifier.predict([new_vector])[0]
sentiment_name = 'Positif' if predicted_sentiment == 1 else 'Negatif'
print(f"Sentimen prediksi untuk kalimat '{new_sentence}': {sentiment_name}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Akurasi: 100.00%
Sentimen prediksi untuk kalimat 'The movie was not good.': Negatif
