In [21]:
from datetime import datetime
import numpy as np
from sentence_transformers import SentenceTransformer, util

def create_word2vec_word2idx(vector_file, report_every=100000):
    """Build a word-to-index vocabulary from a text word-vector file.

    The first whitespace-separated token of every non-blank line is taken as
    a word. Indices 0 and 1 are reserved for ``"_PAD"`` and ``"_UNK"``; every
    other word is numbered from 2 upwards in order of first appearance, and
    repeated words keep the index of their first occurrence.

    Args:
        vector_file: Path to a UTF-8 text file with one entry per line, the
            word being the first token on the line.
        report_every: Print a progress message each time this many lines
            have been read. ``0`` or ``None`` disables progress output.

    Returns:
        dict mapping each word (str) to its integer index.
    """
    word2idx = {
        "_PAD": 0,
        "_UNK": 1,
    }
    next_idx = 2

    with open(vector_file, 'r', encoding='utf-8') as file:
        batch_start = datetime.now()

        # NOTE(review): no line is treated as a header. If the file starts
        # with a "<count> <dim>" style header, its first token is indexed
        # like any other word. Left as-is so existing indices do not shift;
        # confirm against whatever consumes these indices.
        for line_no, line in enumerate(file, start=1):
            tokens = line.strip().split()

            # A blank line has no first token; skip it explicitly rather
            # than relying on a catch-all exception handler.
            if tokens and tokens[0] not in word2idx:
                word2idx[tokens[0]] = next_idx
                next_idx += 1

            # Progress is keyed on the number of lines actually read, so the
            # message is accurate and is emitted exactly once per milestone,
            # even when the milestone line is blank or a duplicate word.
            if report_every and line_no % report_every == 0:
                print(f"processed {line_no} lines in : {datetime.now() - batch_start}")
                batch_start = datetime.now()

    return word2idx

# Build the vocabulary from the word-vector text file.
vector_file = '/home/jupyter-23521027/refresh-bert/data/indo4b/fasttext.4B.id.300.epoch5.uncased.vec'
word2idx = create_word2vec_word2idx(vector_file=vector_file)

# Load the SentenceTransformer checkpoint onto the first CUDA device; later
# cells call `model.encode(...)` on it.
model = SentenceTransformer(
    '/home/jupyter-23521027/refresh-bert/data/sbert/indobert-large-p1_dense_trained-epoch14',
    device='cuda:0',
)

processed 100000 lines in : 0:00:01.483051
processed 200000 lines in : 0:00:01.504671
processed 300000 lines in : 0:00:01.557410
processed 400000 lines in : 0:00:01.536786
processed 500000 lines in : 0:00:01.534796
processed 600000 lines in : 0:00:01.540017
processed 700000 lines in : 0:00:01.603163
processed 800000 lines in : 0:00:01.558970
processed 900000 lines in : 0:00:01.567573
processed 1000000 lines in : 0:00:01.559157
processed 1100000 lines in : 0:00:01.572110
processed 1200000 lines in : 0:00:01.581173
processed 1300000 lines in : 0:00:01.586955
processed 1400000 lines in : 0:00:01.657445
processed 1500000 lines in : 0:00:01.557385
processed 1600000 lines in : 0:00:01.578946
processed 1700000 lines in : 0:00:01.576346
processed 1800000 lines in : 0:00:01.578137
processed 1900000 lines in : 0:00:01.601763
processed 2000000 lines in : 0:00:01.576514
processed 2100000 lines in : 0:00:01.597541
processed 2200000 lines in : 0:00:01.635089
processed 2300000 lines in : 0:00:01.6032

In [13]:
import pandas as pd

def prepare_data(
    sent_tokenized,
    output_dir="/home/jupyter-23521027/refresh-bert/data/preprocessed-input-directory",
    vocab=None,
    encoder=None,
):
    """Write the word-index and sentence-embedding files for one document.

    Two text files are (re)written inside ``output_dir``:

    * ``liputan6.inference.doc`` - a ``liputan6-1`` header line, then one line
      per sentence holding the vocabulary index of every whitespace-separated,
      lower-cased word (each index followed by a space), then a blank line.
    * ``liputan6.inference.sbert`` - the same header, then one line per
      sentence holding the leading components of that sentence's embedding
      (each value followed by a space), then a blank line.

    Only the first 100 sentences are used, and only the first 250 components
    of each embedding are kept.

    Args:
        sent_tokenized: Sequence of sentence strings for a single document.
        output_dir: Directory receiving both output files. Defaults to the
            location this notebook has always written to.
        vocab: Word-to-index mapping containing a ``"_UNK"`` entry. Defaults
            to the notebook-level ``word2idx``.
        encoder: Object exposing ``encode(text, convert_to_numpy=True)`` whose
            result has ``tolist()``. Defaults to the notebook-level ``model``.

    Returns:
        None. The result is the two files on disk.

    Raises:
        KeyError: If a word is missing from ``vocab`` and ``vocab`` has no
            ``"_UNK"`` entry.
    """
    import os  # function-scope import keeps the cell self-contained

    MAX_DOC_LENGTH = 100
    MAX_SENT_EMBEDDING_LENGTH = 250

    # Fall back to the notebook globals only when the caller did not supply
    # replacements, so a plain `prepare_data(sentences)` behaves as before.
    if vocab is None:
        vocab = word2idx
    if encoder is None:
        encoder = model

    doc_id = 1  # this function always describes a single document
    header = f"liputan6-{doc_id}\n"
    sentences = list(sent_tokenized[:MAX_DOC_LENGTH])

    doc_path = os.path.join(output_dir, "liputan6.inference.doc")
    # `with` guarantees the handle is closed even if a lookup or write fails.
    with open(doc_path, 'w', encoding='utf-8') as doc_file:
        doc_file.write(header)
        for sentence in sentences:
            for word in sentence.split():
                # Only an out-of-vocabulary word maps to _UNK; any other
                # failure is a real error and is allowed to propagate.
                try:
                    index = vocab[word.lower()]
                except KeyError:
                    index = vocab["_UNK"]
                doc_file.write(f"{index} ")
            doc_file.write("\n")
        # Blank line closes the document record.
        doc_file.write("\n")

    sbert_path = os.path.join(output_dir, "liputan6.inference.sbert")
    with open(sbert_path, 'w', encoding='utf-8') as sbert_file:
        sbert_file.write(header)
        for sent_txt in sentences:
            embedding = encoder.encode(sent_txt, convert_to_numpy=True).tolist()
            for element in embedding[:MAX_SENT_EMBEDDING_LENGTH]:
                sbert_file.write(str(element) + " ")
            sbert_file.write("\n")
        sbert_file.write("\n")

In [20]:
import pandas as pd

# Load one document for inference; every line of the file is treated as one
# sentence after surrounding whitespace is stripped.
file_path = "/home/jupyter-23521027/refresh-bert/data/Liputan6-Filtered-TokenizedSegmented/inference/1.mainbody"
# Decode explicitly as UTF-8 (matching how the vector file is read) instead of
# depending on the kernel's locale default.
with open(file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()
    sent_tokenized = [line.strip() for line in lines]
    print(sent_tokenized)

# Writes the .doc and .sbert inference inputs for this document.
prepare_data(sent_tokenized)

['Salah satu warga Banyuwangi, Jeni (50) mengaku, beberapa hari tidak bisa memasak karena tidak ada LPG 3 kilogram .', 'Dia terpaksa membeli masakan siap saji untuk makan setiap hari .', '" Kenapa ini gas ini kok bisa langka seperti ini . Saya sudah tiga hari tidak masak karena tidak ada gas , masak di Banyuwangi terjadi Kelangkaan seperti ini," sesal Jeni sembari mengantre mendapatkan gas LPG 3 kilogram pada Senin 24 Juli 2023.', 'Kata Jeni, selain langka, harga gas 3 kilogram di tingkat pangkalan juga naik.', 'Dari yang harga normalnya Rp 18 ribu per tabung, saat ini sudah mencapai Rp 22 ribu hingga Rp 25 ribu per tabung.', '"Saya tidak mempermasalahkan kenaikan itu, tapi yang penting barangnya ada. Naik tapi barangnya tidak ada ini kan justru menyulitkan kita, karena mau masak tidak bisa, kembali menggunakan kayu juga tidak bisa, siapa yang jualan kayu sekarang," tegasnya.', 'Masalah kelangkaan LPG 3 kg ini sampai di telinga Presiden Joko Widodo atau Jokowi.', 'Dia pun kemudian mene