# Word Embbeded

In [5]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

import nltk

nltk.download('punkt')

text_data = {
    'Saya suka makan bakso',
    'Bakso enak dan lezat',
    'Makanan favorit saya adalah nasi goreng',
    'Nasi goreng pedas adalah makanan favorit saya',
    'Saya suka makanan manis seperti es krim'
}

tokenized_data = [word_tokenize(sentence.lower()) for sentence in text_data]

model = Word2Vec(sentences=tokenized_data, vector_size=100, window=5, min_count=1, workers=4)

word_vectors = model.wv

similar_words = word_vectors.most_similar('bakso', topn=3)
print("Kata-kata yang mirip dengan bakso: ", similar_words)

vector = word_vectors['bakso']
print("Vektor untuk bakso", vector)


Kata-kata yang mirip dengan bakso:  [('seperti', 0.2528931498527527), ('nasi', 0.1701788604259491), ('enak', 0.15016479790210724)]
Vektor untuk bakso [-0.00713902  0.00124103 -0.00717672 -0.00224462  0.0037193   0.00583312
  0.00119818  0.00210273 -0.00411039  0.00722533 -0.00630704  0.00464722
 -0.00821997  0.00203647 -0.00497705 -0.00424769 -0.00310898  0.00565521
  0.0057984  -0.00497465  0.00077333 -0.00849578  0.00780981  0.00925729
 -0.00274233  0.00080022  0.00074665  0.00547788 -0.00860608  0.00058446
  0.00686942  0.00223159  0.00112468 -0.00932216  0.00848237 -0.00626413
 -0.00299237  0.00349379 -0.00077263  0.00141129  0.00178199 -0.0068289
 -0.00972481  0.00904058  0.00619805 -0.00691293  0.00340348  0.00020606
  0.00475375 -0.00711994  0.00402695  0.00434743  0.00995737 -0.00447374
 -0.00138926 -0.00731732 -0.00969783 -0.00908026 -0.00102275 -0.00650329
  0.00484973 -0.00616403  0.00251919  0.00073944 -0.00339215 -0.00097922
  0.00997913  0.00914589 -0.00446183  0.00908303

[nltk_data] Downloading package punkt to /Users/fepriyadi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Term Frequency-Inverse Document Frequency (TF-IDF)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

documents = [ "Saya suka makan bakso",
             "Bakso enak dan lezat",
             "Makanan favorit saya adalah nasi goreng",
             "Nasi goreng pedas adalah makanan favorit saya",
             "Saya suka makanan manis seperti es krim"
             ]

tfid_vectorizer = TfidfVectorizer()
tfidf_matrix = tfid_vectorizer.fit_transform(documents)

print("Vocabulary: ", tfid_vectorizer.vocabulary_)
print("TF-IDF Matrix")
print(tfidf_matrix.toarray())

Vocabulary:  {'saya': 14, 'suka': 16, 'makan': 9, 'bakso': 1, 'enak': 3, 'dan': 2, 'lezat': 8, 'makanan': 10, 'favorit': 5, 'adalah': 0, 'nasi': 12, 'goreng': 6, 'pedas': 13, 'manis': 11, 'seperti': 15, 'es': 4, 'krim': 7}
TF-IDF Matrix
[[0.         0.49851188 0.         0.         0.         0.
  0.         0.         0.         0.61789262 0.         0.
  0.         0.         0.34810993 0.         0.49851188]
 [0.         0.42224214 0.52335825 0.52335825 0.         0.
  0.         0.         0.52335825 0.         0.         0.
  0.         0.         0.         0.         0.        ]
 [0.43951606 0.         0.         0.         0.         0.43951606
  0.43951606 0.         0.         0.         0.36483803 0.
  0.43951606 0.         0.30691325 0.         0.        ]
 [0.38596041 0.         0.         0.         0.         0.38596041
  0.38596041 0.         0.         0.         0.320382   0.
  0.38596041 0.47838798 0.26951544 0.         0.        ]
 [0.         0.         0.         

# Bag of Words (BoW)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

documents = [
    "Ini adalah contoh dokumen pertama",
    "Ini adalah contoh dokumen kedua",
    "Ini adalah contoh dokumen ketiga",
    "Ini adalah contoh contoh contoh"
]

vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(documents)
bow_matrix.toarray()

features = vectorizer.get_feature_names_out()
print("Matrix Bow")
print(bow_matrix.toarray())

print("\nDaftar Fitur")
print(features)

Matrix Bow
[[1 1 1 1 0 0 1]
 [1 1 1 1 1 0 0]
 [1 1 1 1 0 1 0]
 [1 3 0 1 0 0 0]]

Daftar Fitur
['adalah' 'contoh' 'dokumen' 'ini' 'kedua' 'ketiga' 'pertama']


# N-gram

In [7]:
from nltk.util import ngrams

sentences = [
    "Saya suka makan bakso enak di warung dekat rumah.",
    "Nasi goreng adalah salah satu makanan favorit saya.",
    "Es krim coklat sangat lezat dan menyegarkan.",
    "Saat hari hujan, saya suka minum teh hangat.",
    "Pemandangan pegunungan di pagi hari sangat indah.",
    "Bola basket adalah olahraga favorit saya sejak kecil."
]

for sentence in sentences:
    words = sentence.split()
    unigrams = list(ngrams(words, 1))
    bigrams = list(ngrams(words, 2))
    trigrams = list(ngrams(words, 3))

    print("\nKalimat: ", sentence)
    print("1-gram:")
    for gram in unigrams:
        print(gram)
    
    print("2-gram:")
    for gram in bigrams:
        print(gram)

    print("3-gram:")
    for gram in trigrams:
        print(gram)




Kalimat:  Saya suka makan bakso enak di warung dekat rumah.
1-gram:
('Saya',)
('suka',)
('makan',)
('bakso',)
('enak',)
('di',)
('warung',)
('dekat',)
('rumah.',)
2-gram:
('Saya', 'suka')
('suka', 'makan')
('makan', 'bakso')
('bakso', 'enak')
('enak', 'di')
('di', 'warung')
('warung', 'dekat')
('dekat', 'rumah.')
3-gram:
('Saya', 'suka', 'makan')
('suka', 'makan', 'bakso')
('makan', 'bakso', 'enak')
('bakso', 'enak', 'di')
('enak', 'di', 'warung')
('di', 'warung', 'dekat')
('warung', 'dekat', 'rumah.')

Kalimat:  Nasi goreng adalah salah satu makanan favorit saya.
1-gram:
('Nasi',)
('goreng',)
('adalah',)
('salah',)
('satu',)
('makanan',)
('favorit',)
('saya.',)
2-gram:
('Nasi', 'goreng')
('goreng', 'adalah')
('adalah', 'salah')
('salah', 'satu')
('satu', 'makanan')
('makanan', 'favorit')
('favorit', 'saya.')
3-gram:
('Nasi', 'goreng', 'adalah')
('goreng', 'adalah', 'salah')
('adalah', 'salah', 'satu')
('salah', 'satu', 'makanan')
('satu', 'makanan', 'favorit')
('makanan', 'favorit', 