In [1]:
import nltk
from nltk.tokenize import word_tokenize
from datetime import datetime

# Preprocessing step: tokenize each corpus line and collect skip-gram
# (target, context) word pairs. No further text cleaning is applied here.

# Maximum number of corpus lines to read; the same value is reused later in
# the filename of the saved model.
INDO4B_DATASET_SIZE = 500000

# Number of neighbouring words taken on each side of the target word.
window_size = 2

# Corpus location, and how often (in lines) a timing message is printed.
INDO4B_CORPUS_PATH = '/home/jupyter-23521027/refresh-bert/data/indo4b/dataset_all_uncased_blankline.txt'
PROGRESS_INTERVAL = 100000

# Every (target_word, context_word) tuple found in the lines that were read.
word_pairs = []

# The encoding is passed explicitly so that decoding does not depend on the
# locale of the machine running the notebook.
# NOTE(review): UTF-8 is assumed for this corpus -- confirm against the data.
with open(INDO4B_CORPUS_PATH, 'r', encoding='utf-8') as file:
    # Start of the current timing interval; reset after every progress message.
    now = datetime.now()

    # Count lines from 1 so the counter can be compared directly with the
    # progress interval and the dataset size limit.
    for line_number, line in enumerate(file, start=1):
        # Tokenize the line into words
        words = word_tokenize(line)
        n_words = len(words)

        # Pair each word with every neighbour inside the context window.
        for i, target_word in enumerate(words):
            # Clamp the window to the line once instead of re-testing the
            # bounds for every candidate position.
            first = max(0, i - window_size)
            last = min(n_words, i + window_size + 1)
            for j in range(first, last):
                if j != i:
                    word_pairs.append((target_word, words[j]))

        # The counter in this message is a number of lines read so far.
        if line_number % PROGRESS_INTERVAL == 0:
            print(f"processed {line_number} files in : {datetime.now() - now}")
            now = datetime.now()

        # Stop once the configured number of lines has been consumed.
        if line_number == INDO4B_DATASET_SIZE:
            break

# Build the vocabulary and the word <-> index lookups from `word_pairs`, a list
# of (target_word, context_word) tuples.

# Collect every distinct word in a single pass, without first materialising
# one list of all targets and another of all contexts.
words = {word for pair in word_pairs for word in pair}

# Assign indices in sorted order. Iterating a set of strings directly can give
# a different order on each interpreter run (string hashing is randomised by
# default), which would make the word <-> index assignment impossible to
# reproduce after a kernel restart.
index_to_word = dict(enumerate(sorted(words)))
word_to_index = {word: index for index, word in index_to_word.items()}

# Convert word pairs to numerical representations
numeric_pairs = [
    (word_to_index[target], word_to_index[context])
    for target, context in word_pairs
]

processed 100000 files in : 0:00:12.910917
processed 200000 files in : 0:00:12.783126
processed 300000 files in : 0:00:12.846023
processed 400000 files in : 0:00:12.658142
processed 500000 files in : 0:00:12.468404


In [2]:
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, Dense, Flatten, Activation

# Split the (target_index, context_index) tuples into two parallel NumPy
# arrays: position k of `target_words` and position k of `context_words`
# come from the same pair.
target_words = np.array([target for target, _ in numeric_pairs])
context_words = np.array([context for _, context in numeric_pairs])

In [9]:
import os

# Expose only the CUDA device with index 1 to this process.
# NOTE(review): keras is imported in an earlier cell; this variable is only
# honoured if it is set before the GPU runtime is initialised -- confirm that
# the setting actually takes effect in this execution order.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [10]:
# Skip-gram network: a single word index goes in, a softmax distribution over
# the whole vocabulary comes out.
vocab_size = len(word_to_index)
embedding_dim = 100

model = Sequential([
    # Lookup table with one `embedding_dim`-sized vector per vocabulary word;
    # each input sample is a sequence of length 1 (one target-word index).
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=1,
        embeddings_initializer="glorot_uniform",
    ),
    # Two small tanh hidden layers between the embedding and the output.
    Dense(units=50, activation='tanh'),
    Dense(units=50, activation='tanh'),
    # One output unit per vocabulary word.
    Dense(units=vocab_size, activation='softmax'),
])

# The sparse variant of the loss is used because the training targets are
# integer word indices rather than one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [11]:
# Train the network to predict each context-word index from its target-word
# index: 5 passes over all pairs, 1028 pairs per batch.
# NOTE(review): 1028 may have been intended as 1024 -- confirm.
model.fit(
    x=target_words,
    y=context_words,
    batch_size=1028,
    epochs=5,
)

Epoch 1/5
    6/24510 [..............................] - ETA: 29:53:12 - loss: 12.0277

KeyboardInterrupt: 

In [None]:
from keras.models import save_model
# Write the model to disk as a .h5 file; the filename records how many corpus
# lines (INDO4B_DATASET_SIZE) the training pairs were built from.
save_model(
    model,
    f'/home/jupyter-23521027/refresh-bert/data/indo4b/skipgram_model_{INDO4B_DATASET_SIZE}.h5',
)
