In [3]:
import numpy as np
import pandas as pd
import collections
from collections import defaultdict 

## Read File

In [4]:
filename = "Word2Vec/Sample Feature Extraction.csv"

In [5]:
df = pd.read_csv(filename, header=0)
df.head(5)

Unnamed: 0,Berita,Label,Category_id
0,"['faktafakta', 'santri', 'tewas', 'saksi', 'sa...",teman,3
1,"['astri', 'culik', 'anak', 'majikan', 'bandung...",pengasuh,1
2,"['polisi', 'keras', 'sma', 'bus', 'serpong', '...",teman,3
3,"['tega', 'bima', 'pukulgigit', 'anak', 'gegara...",keluarga,0
4,"['polisi', 'tangkap', 'pria', 'duga', 'cabul',...",orang asing,4


### Parse the strings into lists

In [6]:
import ast
df['Berita'] = df['Berita'].apply(ast.literal_eval)

df['Berita'].head()

0    [faktafakta, santri, tewas, saksi, sangka, pon...
1    [astri, culik, anak, majikan, bandung, rp, jut...
2    [polisi, keras, sma, bus, serpong, duga, kali,...
3    [tega, bima, pukulgigit, anak, gegara, kesal, ...
4    [polisi, tangkap, pria, duga, cabul, anak, sd,...
Name: Berita, dtype: object

### Flatten the tokens (Make single list)

In [7]:
flattened_tokens = [token for tokens in df['Berita'] for token in tokens]
print(flattened_tokens)


['faktafakta', 'santri', 'tewas', 'saksi', 'sangka', 'ponpes', 'izin', 'santri', 'usia', 'bintang', 'balqis', 'maulana', 'pondok', 'pesantren', 'tartilul', 'quran', 'pptq', 'alhanifiyyah', 'mojo', 'kabupaten', 'diri', 'tewas', 'aniaya', 'santri', 'dasar', 'periksa', 'polisi', 'bintang', 'aniaya', 'tewas', 'februari', 'februari', 'februariberikut', 'faktafakta', 'aniaya', 'bintang', 'rangkum', 'cnn', 'indonesia', 'polisi', 'tetap', 'orang', 'sangka', 'mn', 'sidoarjo', 'ma', 'nganjuk', 'af', 'denpasar', 'ak', 'kota', 'surabaya', 'kena', 'pasal', 'ayat', 'uu', 'nomor', 'lindung', 'anak', 'pasal', 'kuhp', 'pasal', 'kuhp', 'ancam', 'hukum', 'maksimal', 'penjarapengacara', 'laku', 'aniaya', 'santri', 'diri', 'tewas', 'rini', 'puspitasari', 'pelakua', 'aniaya', 'korbang', 'susah', 'nasihat', 'bintang', 'perkara', 'salat', 'jemaah', 'bintang', 'sembuh', 'sakit', 'sekolah', 'salat', 'jemaah', 'kamar', 'info', 'ak', 'af', 'sepupu', 'tegur', 'si', 'bintang', 'salat', 'bintang', 'nyambung', 'rabu'

## Create vocabulary mapping from words to unique ids

In [8]:
def create_vocabulary(tokens):
    word_count = defaultdict(int)
    for token in tokens:
        word_count[token] += 1
    word_to_id = {word: i for i, (word, _) in enumerate(word_count.items())}
    id_to_word = {i: word for word, i in word_to_id.items()}
    return word_to_id, id_to_word

In [9]:
word_to_id, id_to_word = create_vocabulary(flattened_tokens)
word_to_id

{'faktafakta': 0,
 'santri': 1,
 'tewas': 2,
 'saksi': 3,
 'sangka': 4,
 'ponpes': 5,
 'izin': 6,
 'usia': 7,
 'bintang': 8,
 'balqis': 9,
 'maulana': 10,
 'pondok': 11,
 'pesantren': 12,
 'tartilul': 13,
 'quran': 14,
 'pptq': 15,
 'alhanifiyyah': 16,
 'mojo': 17,
 'kabupaten': 18,
 'diri': 19,
 'aniaya': 20,
 'dasar': 21,
 'periksa': 22,
 'polisi': 23,
 'februari': 24,
 'februariberikut': 25,
 'rangkum': 26,
 'cnn': 27,
 'indonesia': 28,
 'tetap': 29,
 'orang': 30,
 'mn': 31,
 'sidoarjo': 32,
 'ma': 33,
 'nganjuk': 34,
 'af': 35,
 'denpasar': 36,
 'ak': 37,
 'kota': 38,
 'surabaya': 39,
 'kena': 40,
 'pasal': 41,
 'ayat': 42,
 'uu': 43,
 'nomor': 44,
 'lindung': 45,
 'anak': 46,
 'kuhp': 47,
 'ancam': 48,
 'hukum': 49,
 'maksimal': 50,
 'penjarapengacara': 51,
 'laku': 52,
 'rini': 53,
 'puspitasari': 54,
 'pelakua': 55,
 'korbang': 56,
 'susah': 57,
 'nasihat': 58,
 'perkara': 59,
 'salat': 60,
 'jemaah': 61,
 'sembuh': 62,
 'sakit': 63,
 'sekolah': 64,
 'kamar': 65,
 'info': 66,
 '

## Generate Training Data

<p> Melihat kata target dan kata konteksnya. Dia berlaku iterasi jadi kata target bisa juga menjadi kata konteks </p>

In [10]:
def generate_training_data(data, word_to_id, window_size):
    """Generate training data as context-target pairs."""
    training_data = []
    for tokens in data:
        for i, word in enumerate(tokens):
            start = max(0, i - window_size)
            end = min(len(tokens), i + window_size + 1)
            for j in range(start, end):
                if i != j:
                    training_data.append((word_to_id[word], word_to_id[tokens[j]]))
    return training_data

In [11]:
window_size = 2
training_data = generate_training_data(df['Berita'], word_to_id, window_size)
training_data

[(0, 1),
 (0, 2),
 (1, 0),
 (1, 2),
 (1, 3),
 (2, 0),
 (2, 1),
 (2, 3),
 (2, 4),
 (3, 1),
 (3, 2),
 (3, 4),
 (3, 5),
 (4, 2),
 (4, 3),
 (4, 5),
 (4, 6),
 (5, 3),
 (5, 4),
 (5, 6),
 (5, 1),
 (6, 4),
 (6, 5),
 (6, 1),
 (6, 7),
 (1, 5),
 (1, 6),
 (1, 7),
 (1, 8),
 (7, 6),
 (7, 1),
 (7, 8),
 (7, 9),
 (8, 1),
 (8, 7),
 (8, 9),
 (8, 10),
 (9, 7),
 (9, 8),
 (9, 10),
 (9, 11),
 (10, 8),
 (10, 9),
 (10, 11),
 (10, 12),
 (11, 9),
 (11, 10),
 (11, 12),
 (11, 13),
 (12, 10),
 (12, 11),
 (12, 13),
 (12, 14),
 (13, 11),
 (13, 12),
 (13, 14),
 (13, 15),
 (14, 12),
 (14, 13),
 (14, 15),
 (14, 16),
 (15, 13),
 (15, 14),
 (15, 16),
 (15, 17),
 (16, 14),
 (16, 15),
 (16, 17),
 (16, 18),
 (17, 15),
 (17, 16),
 (17, 18),
 (17, 19),
 (18, 16),
 (18, 17),
 (18, 19),
 (18, 2),
 (19, 17),
 (19, 18),
 (19, 2),
 (19, 20),
 (2, 18),
 (2, 19),
 (2, 20),
 (2, 1),
 (20, 19),
 (20, 2),
 (20, 1),
 (20, 21),
 (1, 2),
 (1, 20),
 (1, 21),
 (1, 22),
 (21, 20),
 (21, 1),
 (21, 22),
 (21, 23),
 (22, 1),
 (22, 21),
 (22, 23)

### Initialize weights for the Word2Vec model

In [12]:
def initialize_weights(vocab_size, embedding_dim):
    W1 = np.random.rand(vocab_size, embedding_dim)  # Input weights
    W2 = np.random.rand(embedding_dim, vocab_size)  # Output weights
    return W1, W2

In [13]:
W1, W2 = initialize_weights(len(word_to_id), embedding_dim=100)

In [14]:
W1.shape

(381, 100)

In [15]:
W2.shape

(100, 381)

In [16]:
def one_hot_encode(id, vocab_size):
    """Perform one-hot encoding for a given ID."""
    res = [0] * vocab_size
    res[id] = 1
    return res

In [17]:
def softmax(x):
    """Compute the softmax of a vector."""
    exp_x = np.exp(x - np.max(x))  # Subtract max(x) to improve stability
    return exp_x / np.sum(exp_x)

In [18]:
def forward_propagation(word, W1, W2):
    """Perform forward propagation."""
    """
    Perform the forward propagation step.
    Args:
    - word: index of the target word
    - W1: input-to-hidden weight matrix
    - W2: hidden-to-output weight matrix
    
    Returns:
    - hidden: hidden layer activation (embedding vector)
    - predicted: predicted probability distribution (output layer)
    """

    # Hidden 1
    input_layer = one_hot_encode(word, len(W1))
    
    # A1 = W1[word]           # Shape (100, )
    A1 = np.dot(input_layer, W1)

    # Hidden 2 (Output)
    A2 = np.dot(A1, W2)     # Shape (381, )
    Z = softmax(A2)         # Shape (381, )

    return A1, A2, Z

In [19]:
def backward_propagation(word, context, A1, Z, W1, W2, learning_rate):
    """Perform backward propagation"""
    """
    Perform the backward propagation step and update weights.
    Args:
    - word: index of the target word
    - context: index of the context word
    - hidden: hidden layer activation (embedding vector)
    - predicted: predicted probability distribution (output layer)
    - W1: input-to-hidden weight matrix
    - W2: hidden-to-output weight matrix
    - learning_rate: learning rate for gradient updates
    
    Returns:
    - Updated W1 and W2
    """

    target = one_hot_encode(context, len(W1))

    dA2 = Z - target
    dW2 = np.outer(A1, dA2)      # Shape (100, 381)
    dA1 = np.dot(dA2, W2.T)     # Shape (100,)
    dW1 = np.outer(dA1, one_hot_encode(word, len(W1))).T # (100,381)

    """Update weights"""
    W1 -= learning_rate * dW1
    W2 -= learning_rate * dW2
    return W1, W2

In [24]:
def train_word2vec(training_data, W1, W2, learning_rate, epochs):
    """Train the Word2Vec model using the provided training data."""
    """
    Train the Word2Vec model using forward and backward propagation.
    Args:
    - training_data: list of (target, context) pairs
    - W1: input-to-hidden weight matrix
    - W2: hidden-to-output weight matrix
    - learning_rate: learning rate for gradient updates
    - epochs: number of training iterations
    
    Returns:
    - Trained W1 and W2
    """
    for epoch in range(epochs):
        total_loss = 0  # Initialize total loss for the epoch
        for word, context in training_data:
            input_layer = one_hot_encode(word, len(W1))  # Ensure NumPy array
            # Forward pass
            A1, A2, Z = forward_propagation(word, W1, W2)
                        
            # Compute loss (cross-entropy loss)
            target = one_hot_encode(context, len(W1))
            loss = -np.sum(target * np.log(Z + 1e-9))  # Adding 1e-9 for numerical stability
            total_loss += loss
            
            # Backward pass
            W1, W2 = backward_propagation(word, context, A1, Z, W1, W2, learning_rate)
            # print(word, context, input_layer, target, loss)
        
        # Print loss for the current epoch
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")
    
    return W1, W2

In [25]:
train_word2vec(training_data, W1, W2, learning_rate=0.1, epochs=5)

135 345 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

(array([[ 0.54782209,  0.12487386,  0.48021872, ...,  0.29307181,
          0.65547957,  0.42369441],
        [ 0.06226542,  0.27234275,  0.16238663, ...,  0.36362408,
          0.63745867, -0.01023069],
        [ 0.49752807,  0.37392496,  0.37883103, ...,  0.5607267 ,
          0.87210891,  0.18028128],
        ...,
        [ 0.56189147,  0.8655217 ,  0.09555241, ...,  0.29058151,
          0.93020055, -0.2135157 ],
        [ 0.46334433,  0.77203873,  0.12708131, ...,  0.18274218,
          0.94995951, -0.21703221],
        [ 0.44888111, -0.22574506,  0.13099161, ...,  0.65474122,
          0.74218039,  0.27663945]]),
 array([[ 0.57289699,  0.94254185,  0.68172689, ...,  1.04703741,
          0.35552107,  0.27922696],
        [ 1.05178285,  0.46996043,  0.34307319, ...,  0.67253877,
          0.42134248,  0.85507469],
        [ 0.75011922,  0.68234466,  0.92660513, ...,  0.31514159,
         -0.06807719,  0.16076308],
        ...,
        [ 0.90097949,  0.87519882,  0.60875211, ...,  

In [20]:
import pandas as pd

def train_word2vec(training_data, W1, W2, learning_rate, epochs):
    """Train the Word2Vec model using the provided training data."""
    log_data = []  # List to store log data for each training step
    for epoch in range(epochs):
        total_loss = 0  # Initialize total loss for the epoch
        for word, context in training_data:
            # Forward pass
            input_layer = one_hot_encode(word, len(W1))  # Ensure NumPy array
            A1, A2, Z = forward_propagation(word, W1, W2)
            
            # Compute loss (cross-entropy loss)
            target = one_hot_encode(context, len(W1))
            loss = -np.sum(target * np.log(Z + 1e-9))  # Adding 1e-9 for numerical stability
            total_loss += loss
            
            # Backward pass
            W1, W2 = backward_propagation(word, context, A1, Z, W1, W2, learning_rate)
            
            # Log the data
            log_data.append({
                'epoch': epoch + 1,
                'word': word,
                'context': context,
                'input_layer': input_layer,
                'A1': A1.tolist(),  # Convert numpy array to list for saving
                'A2': A2.tolist(),
                'Z': Z.tolist(),
                'W1': W1.tolist(),  # You may want to save only a subset of W1
                'W2': W2.tolist(),   # You may want to save only a subset of W2
                'target': target,
                'loss': loss
            })
        
        # Print loss for the current epoch
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")
    
    # Convert log data to DataFrame and save to CSV
    log_df = pd.DataFrame(log_data)
    log_df.to_csv('training_log.csv', index=False)
    
    return W1, W2

In [21]:
train_word2vec(training_data, W1, W2, learning_rate=0.1, epochs=10)

In [None]:
import pickle

# Simpan word embeddings dan word_to_id ke file
with open("word_embeddings.pkl", "wb") as f:
    pickle.dump({'W1': W1, 'word_to_id': word_to_id}, f)

In [21]:
# Load word embeddings
with open("word_embeddings.pkl", "rb") as f:
    data = pickle.load(f)
    W1 = data['W1']
    word_to_id = data['word_to_id']

In [22]:
def text_to_vector(text, W1, word_to_id):
    vectors = []
    for word in text:
        if word in word_to_id:  # Cek apakah kata ada dalam indeks (word_to_id)
            idx = word_to_id[word]  # Ambil indeks kata
            vectors.append(W1[idx])  # Ambil vektor embedding dari W1
    if vectors:  # Jika ada vektor yang diambil
        return np.mean(vectors, axis=0)  # Hitung rata-rata vektor
    else:
        return np.zeros(W1.shape[1])  # Return vektor nol jika kata tidak ditemukan


In [23]:
# Contoh konversi teks berita ke vektor
df['vector'] = df['Berita'].apply(lambda x: text_to_vector(x, W1, word_to_id))
print(df['vector'].head())

0    [0.25225140609069235, 0.28106427943395607, 0.1...
1    [0.20986075928799947, 0.1570422318589323, 0.32...
2    [0.26285828669792183, 0.25489374277488064, 0.3...
3    [0.29659347250854273, 0.23533209298224644, 0.3...
4    [0.29148650175932234, 0.2623562864145001, 0.27...
Name: vector, dtype: object


In [29]:
df['vector'][2]

array([0.26285829, 0.25489374, 0.31899058, 0.23683798, 0.23992023,
       0.26367723, 0.33280472, 0.25915192, 0.39331957, 0.24158967,
       0.21514879, 0.27816849, 0.22649596, 0.22184882, 0.28352007,
       0.3466886 , 0.26155716, 0.26406822, 0.27088016, 0.33209688,
       0.26597628, 0.36803344, 0.25694654, 0.18857838, 0.27406567,
       0.20057812, 0.32358393, 0.32888544, 0.30292681, 0.31073885,
       0.33904889, 0.22703675, 0.32405234, 0.26377988, 0.22460401,
       0.31005022, 0.26700485, 0.3299351 , 0.24439866, 0.2819761 ,
       0.22409343, 0.27337568, 0.26353495, 0.17137004, 0.27989938,
       0.28319918, 0.30863095, 0.20011149, 0.23083898, 0.2314717 ,
       0.27856928, 0.297812  , 0.29085809, 0.2435252 , 0.23518438,
       0.12770614, 0.33681627, 0.27662726, 0.16527968, 0.32556671,
       0.37219301, 0.22490185, 0.35766824, 0.18952202, 0.39089057,
       0.24906507, 0.30969457, 0.35026465, 0.25074594, 0.25872628,
       0.23394036, 0.11882992, 0.20765482, 0.20590907, 0.19500