In [1]:
import numpy as np
import pandas as pd
import collections
from collections import defaultdict 

## Read File

In [22]:
filename = "Preprocessing/Sample Dataset Preprocessing.csv"

In [23]:
df = pd.read_csv(filename, header=0)
df.head(10)

Unnamed: 0,Berita,Label,Category_id
0,"['perkosa', 'anak', 'kandung', 'pria', 'dompu'...",keluarga,0
1,"['pria', 'cabul', 'anak', 'lelaki', 'rumdis', ...",orang asing,4
2,"['muncul', 'klarifikasi', 'duga', 'salur', 'pe...",pengasuh,1
3,"['heboh', 'video', 'gadis', 'ditelanjangidipuk...",orang asing,4
4,"['satria', 'mahathir', 'cogil', 'sangka', 'ker...",teman,3
5,"['yasin', 'ringkus', 'lapor', 'istri', 'perkos...",keluarga,0
6,"['viral', 'ayah', 'probolinggo', 'banting', 'a...",keluarga,0
7,"['video', 'keras', 'murid', 'smp', 'cilacap', ...",teman,3
8,"['guruortu', 'bikin', 'petisi', 'kepala', 'sma...",tenaga pendidik,2
9,"['polisi', 'periksa', 'duga', 'laku', 'rundung...",teman,3


In [24]:
df['Berita'].apply(type).value_counts()

Berita
<class 'str'>    49
Name: count, dtype: int64

In [25]:
df.dtypes

Berita         object
Label          object
Category_id     int64
dtype: object

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Berita       49 non-null     object
 1   Label        49 non-null     object
 2   Category_id  49 non-null     int64 
dtypes: int64(1), object(2)
memory usage: 1.3+ KB


### Parse the strings into lists

In [27]:
import ast
df['Berita'] = df['Berita'].apply(ast.literal_eval)

df['Berita'].head()

0    [perkosa, anak, kandung, pria, dompu, tangkap,...
1    [pria, cabul, anak, lelaki, rumdis, wabup, lan...
2    [muncul, klarifikasi, duga, salur, pengasuh, a...
3    [heboh, video, gadis, ditelanjangidipukul, lak...
4    [satria, mahathir, cogil, sangka, keroyok, ana...
Name: Berita, dtype: object

In [28]:
df['Berita'].apply(type).value_counts()

Berita
<class 'list'>    49
Name: count, dtype: int64

### Flatten the tokens (Make single list)

In [29]:
flattened_tokens = [token for tokens in df['Berita'] for token in tokens]
print(flattened_tokens)


['perkosa', 'anak', 'kandung', 'pria', 'dompu', 'tangkap', 'pria', 'inisial', 'am', 'kabupaten', 'dompu', 'nusa', 'tenggara', 'barat', 'ntb', 'tangkap', 'duga', 'perkosa', 'anak', 'kandung', 'tangkap', 'polisi', 'lapor', 'warga', 'kasi', 'humas', 'polres', 'dompu', 'ipda', 'zuharis', 'am', 'tangkap', 'rumah', 'tangkap', 'am', 'polisi', 'malam', 'iya', 'tangkap', 'malam', 'rabu', 'zuharis', 'detikbali', 'kamis', 'zuharis', 'rinci', 'kronologi', 'aksi', 'bejat', 'pria', 'sebut', 'tugas', 'susun', 'lapor', 'tunggu', 'lapor', 'lengkap', 'ya', 'salah', 'warga', 'rumah', 'korban', 'ahmad', 'tangkap', 'am', 'warga', 'heboh', 'warga', 'geram', 'am', 'tega', 'buat', 'asusila', 'anak', 'kandung', 'heboh', 'malam', 'warga', 'kerumun', 'cepat', 'aman', 'polisi', 'laku', 'am', 'milik', 'orang', 'anak', 'perempuan', 'aksi', 'bejat', 'am', 'rumahnyaada', 'anak', 'istri', 'habis', 'pikir', 'jadi', 'pria', 'cabul', 'anak', 'lelaki', 'rumdis', 'wabup', 'langkat', 'adik', 'anggota', 'dprd', 'pria', 'inis

## Create vocabulary mapping from words to unique ids

In [6]:
def create_vocabulary(tokens):
    word_count = defaultdict(int)
    for token in tokens:
        word_count[token] += 1
    word_to_id = {word: i for i, (word, _) in enumerate(word_count.items())}
    id_to_word = {i: word for word, i in word_to_id.items()}
    return word_to_id, id_to_word

In [8]:
word_to_id, id_to_word = create_vocabulary(flattened_tokens)
word_to_id

{'perkosa': 0,
 'anak': 1,
 'kandung': 2,
 'pria': 3,
 'dompu': 4,
 'tangkap': 5,
 'inisial': 6,
 'am': 7,
 'kabupaten': 8,
 'nusa': 9,
 'tenggara': 10,
 'barat': 11,
 'ntb': 12,
 'duga': 13,
 'polisi': 14,
 'lapor': 15,
 'warga': 16,
 'kasi': 17,
 'humas': 18,
 'polres': 19,
 'ipda': 20,
 'zuharis': 21,
 'rumah': 22,
 'malam': 23,
 'iya': 24,
 'rabu': 25,
 'detikbali': 26,
 'kamis': 27,
 'rinci': 28,
 'kronologi': 29,
 'aksi': 30,
 'bejat': 31,
 'sebut': 32,
 'tugas': 33,
 'susun': 34,
 'tunggu': 35,
 'lengkap': 36,
 'ya': 37,
 'salah': 38,
 'korban': 39,
 'ahmad': 40,
 'heboh': 41,
 'geram': 42,
 'tega': 43,
 'buat': 44,
 'asusila': 45,
 'kerumun': 46,
 'cepat': 47,
 'aman': 48,
 'laku': 49,
 'milik': 50,
 'orang': 51,
 'perempuan': 52,
 'rumahnyaada': 53,
 'istri': 54,
 'habis': 55,
 'pikir': 56,
 'jadi': 57,
 'cabul': 58,
 'lelaki': 59,
 'rumdis': 60,
 'wabup': 61,
 'langkat': 62,
 'adik': 63,
 'anggota': 64,
 'dprd': 65,
 'zs': 66,
 'leceh': 67,
 'seksual': 68,
 'dinas': 69,
 'wak

## Generate Training Data

<p> Melihat kata y dan kata konteksnya. Dia berlaku iterasi jadi kata y bisa juga menjadi kata konteks </p>

In [8]:
def generate_training_data(data, word_to_id, window_size):
    """Generate training data as context-y pairs."""
    training_data = []
    for tokens in data:
        for i, word in enumerate(tokens):
            start = max(0, i - window_size)
            end = min(len(tokens), i + window_size + 1)
            for j in range(start, end):
                if i != j:
                    training_data.append((word_to_id[word], word_to_id[tokens[j]]))
    return training_data

In [9]:
window_size = 2
training_data = generate_training_data(df['Berita'], word_to_id, window_size)
training_data

[(0, 1),
 (0, 2),
 (1, 0),
 (1, 2),
 (1, 3),
 (2, 0),
 (2, 1),
 (2, 3),
 (2, 4),
 (3, 1),
 (3, 2),
 (3, 4),
 (3, 5),
 (4, 2),
 (4, 3),
 (4, 5),
 (4, 3),
 (5, 3),
 (5, 4),
 (5, 3),
 (5, 6),
 (3, 4),
 (3, 5),
 (3, 6),
 (3, 7),
 (6, 5),
 (6, 3),
 (6, 7),
 (6, 8),
 (7, 3),
 (7, 6),
 (7, 8),
 (7, 4),
 (8, 6),
 (8, 7),
 (8, 4),
 (8, 9),
 (4, 7),
 (4, 8),
 (4, 9),
 (4, 10),
 (9, 8),
 (9, 4),
 (9, 10),
 (9, 11),
 (10, 4),
 (10, 9),
 (10, 11),
 (10, 12),
 (11, 9),
 (11, 10),
 (11, 12),
 (11, 5),
 (12, 10),
 (12, 11),
 (12, 5),
 (12, 13),
 (5, 11),
 (5, 12),
 (5, 13),
 (5, 0),
 (13, 12),
 (13, 5),
 (13, 0),
 (13, 1),
 (0, 5),
 (0, 13),
 (0, 1),
 (0, 2),
 (1, 13),
 (1, 0),
 (1, 2),
 (1, 5),
 (2, 0),
 (2, 1),
 (2, 5),
 (2, 14),
 (5, 1),
 (5, 2),
 (5, 14),
 (5, 15),
 (14, 2),
 (14, 5),
 (14, 15),
 (14, 16),
 (15, 5),
 (15, 14),
 (15, 16),
 (15, 17),
 (16, 14),
 (16, 15),
 (16, 17),
 (16, 18),
 (17, 15),
 (17, 16),
 (17, 18),
 (17, 19),
 (18, 16),
 (18, 17),
 (18, 19),
 (18, 4),
 (19, 17),
 (19, 18)

### Initialize weights for the Word2Vec model

In [10]:
def initialize_weights(vocab_size, embedding_dim):
    w = np.random.rand(vocab_size, embedding_dim)  # Input weights
    v = np.random.rand(embedding_dim, vocab_size)  # Output weights
    return w, v

In [11]:
w, v = initialize_weights(len(word_to_id), embedding_dim=100)

In [12]:
w.shape

(1867, 100)

In [13]:
v.shape

(100, 1867)

In [14]:
def one_hot_encode(id, vocab_size):
    """Perform one-hot encoding for a given ID."""
    res = [0] * vocab_size
    res[id] = 1
    return res

In [15]:
def softmax(x):
    """Compute the softmax of a vector."""
    exp_x = np.exp(x - np.max(x))  # Subtract max(x) to improve stability
    return exp_x / np.sum(exp_x)

## Neural Network word2Vec

In [16]:
def forward_propagation(word, w, v):
    """Perform forward propagation."""
    """
    Perform the forward propagation step.
    Args:
    - word: index of the y word
    - w: input-to-hidden weight matrix
    - v: hidden-to-output weight matrix
    
    Returns:
    - hidden: hidden layer activation (embedding vector)
    - predicted: predicted probability distribution (output layer)
    """

    # Input Layer
    X = one_hot_encode(word, len(w))
    
    # Hidden Layer          # Shape (100, )
    A = np.dot(X, w)

    # Hidden 2 (Output)
    B = np.dot(A, v)     # Shape (381, )
    Z = softmax(B)         # Shape (381, )

    return A, B, Z

In [17]:
def backward_propagation(word, context, A, Z, w, v, learning_rate):
    """Perform backward propagation"""
    """
    Perform the backward propagation step and update weights.
    Args:
    - word: index of the y word
    - context: index of the context word
    - hidden: hidden layer activation (embedding vector)
    - predicted: predicted probability distribution (output layer)
    - w: input-to-hidden weight matrix
    - v: hidden-to-output weight matrix
    - learning_rate: learning rate for gradient updates
    
    Returns:
    - Updated w and v
    """

    y = one_hot_encode(context, len(w))

    dB = Z - y
    dv = np.outer(A, dB)      # Shape (100, 381)
    dA = np.dot(dB, v.T)     # Shape (100,)
    dw = np.outer(dA, one_hot_encode(word, len(w))).T # (100,381)

    """Update weights"""
    w -= learning_rate * dw
    v -= learning_rate * dv
    return w, v

In [18]:
def train_word2vec(training_data, w, v, learning_rate, epochs):
    """Train the Word2Vec model using the provided training data."""
    """
    Train the Word2Vec model using forward and backward propagation.
    Args:
    - training_data: list of (y, context) pairs
    - w: input-to-hidden weight matrix
    - v: hidden-to-output weight matrix
    - learning_rate: learning rate for gradient updates
    - epochs: number of training iterations
    
    Returns:
    - Trained w and v
    """
    for epoch in range(epochs):
        total_loss = 0  # Initialize total loss for the epoch
        for word, context in training_data:
            # Forward pass
            A, B, Z = forward_propagation(word, w, v)
            
            # Compute loss (cross-entropy loss)
            y = one_hot_encode(context, len(w))
            L = -np.sum(y * np.log(Z + 1e-9))  # Adding 1e-9 for numerical stability
            total_loss += L
            
            # Backward pass
            w, v = backward_propagation(word, context, A, Z, w, v, learning_rate)
        
        # Print loss for the current epoch
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")
    
    return w, v

In [19]:
train_word2vec(training_data, w, v, learning_rate=0.01, epochs=10)

Epoch 1/10, Loss: 223009.8803
Epoch 2/10, Loss: 208914.4440
Epoch 3/10, Loss: 202410.6566


KeyboardInterrupt: 

In [20]:
#w

array([[ 0.33873711, -0.13408298,  0.00607258, ..., -0.01951817,
         0.22225208, -0.49043355],
       [ 0.11687252,  0.2034469 ,  0.26760142, ..., -0.05551917,
         0.01819217, -0.23282326],
       [ 0.54133126,  0.00577759,  0.53093181, ...,  0.0988413 ,
        -0.24090315, -0.35099139],
       ...,
       [ 0.7348749 ,  0.50175242,  0.37687409, ...,  0.39741863,
         0.385695  ,  0.05497061],
       [ 0.5520787 ,  0.09674647,  0.52441286, ...,  0.79849027,
         0.68788005,  0.17259819],
       [ 0.43049079,  0.14151292,  0.52916242, ...,  0.22206568,
         0.58070342,  0.89463913]])

In [21]:
#v

array([[ 0.35032911,  0.87536514,  0.34931059, ...,  1.10182228,
         0.95541663,  0.96919229],
       [ 0.40360829,  0.42553825,  0.25976093, ..., -0.22064363,
         0.01718495,  0.62859583],
       [ 0.44722416,  0.95264522,  0.49084048, ...,  0.18837631,
         0.0229247 ,  0.23258476],
       ...,
       [ 0.31209222,  0.17208014,  0.4420775 , ...,  0.84851963,
         1.03898032, -0.13980799],
       [ 0.58351097,  0.15011712,  0.50890513, ...,  0.2980816 ,
        -0.28580361,  0.61069594],
       [ 0.15186599,  0.38157011,  0.1528174 , ...,  0.61187116,
         0.73713281,  1.07906275]])

In [30]:
def aggregate_vectors_per_document(data, w, word_to_id):
    V = []
    for tokens in data:
        token_indices = [word_to_id[word] for word in tokens if word in word_to_id] #id setiap kata per dokumen
        #print("Token indices:", token_indices)
        doc_vector = np.mean(w[token_indices], axis=0) #rata-rata vektor per dokumen 
        V.append(doc_vector)
    return np.array(V)

# Hitung vektor dokumen
V = aggregate_vectors_per_document(df['Berita'], w, word_to_id)

array([[0.35328689, 0.38188585, 0.36088204, 0.3926816 , 0.34539149,
        0.28964305, 0.34815066, 0.39979946, 0.35722227, 0.31163451,
        0.34769791, 0.37611829, 0.41046073, 0.3152673 , 0.32829925,
        0.36282047, 0.38433664, 0.38134284, 0.33544798, 0.38105525,
        0.32940771, 0.35073623, 0.31419185, 0.33986089, 0.37987728,
        0.37810579, 0.26429667, 0.35449624, 0.3658832 , 0.32561927,
        0.40146074, 0.3263327 , 0.38280165, 0.32384681, 0.38186593,
        0.36804312, 0.42102392, 0.34924557, 0.30808403, 0.29533136,
        0.37098147, 0.32230914, 0.4049664 , 0.36153792, 0.33069211,
        0.31732435, 0.33248184, 0.31004436, 0.36084785, 0.39417641,
        0.34363397, 0.31197689, 0.24124766, 0.36411797, 0.28853914,
        0.31743158, 0.36168025, 0.3819469 , 0.34319048, 0.31775364,
        0.36563914, 0.35415113, 0.30222021, 0.30889231, 0.39477149,
        0.38752017, 0.30915789, 0.31086074, 0.363947  , 0.37514209,
        0.37588064, 0.33927625, 0.29478871, 0.39

In [31]:
# Tambahkan hasil agregasi ke dataframe dalam bentuk list agar bisa disimpan ke CSV
df['Word2Vec Vector'] = V.tolist()

In [32]:
df.head()

Unnamed: 0,Berita,Label,Category_id,Word2Vec Vector
0,"[faktafakta, santri, tewas, saksi, sangka, pon...",teman,3,"[0.3532868892949142, 0.3818858491610933, 0.360..."
1,"[astri, culik, anak, majikan, bandung, rp, jut...",pengasuh,1,"[0.3049016176906463, 0.29333553840422477, 0.27..."
2,"[polisi, keras, sma, bus, serpong, duga, kali,...",teman,3,"[0.32533741227239577, 0.36108487987479615, 0.3..."
3,"[tega, bima, pukulgigit, anak, gegara, kesal, ...",keluarga,0,"[0.2558586191227341, 0.4227631065065994, 0.374..."
4,"[polisi, tangkap, pria, duga, cabul, anak, sd,...",orang asing,4,"[0.2781720919008791, 0.36719086104651916, 0.44..."


In [33]:
# Simpan ke dalam file CSV
output_file = 'Word2vec Vector.csv'
df.to_csv(output_file, index=False)