In [13]:
from datetime import datetime
import numpy as np

def create_word2vec_word2idx(vector_file):
    """Load a text-format word-vector file into two lookup tables.

    Each data line is expected to be ``<word> <float> <float> ...`` separated
    by whitespace. A leading header line of the form ``<count> <dim>`` (two
    integers, as written by fastText ``.vec`` files) is skipped so that it is
    not mistaken for a vocabulary entry.

    Args:
        vector_file: Path to the UTF-8 encoded vector file.

    Returns:
        A tuple ``(word2vec, word2idx)``:
        * ``word2vec`` maps each word to its vector as a list of floats
          (for a repeated word the last occurrence wins).
        * ``word2idx`` maps each word to an integer index. ``"_PAD"`` is 0,
          ``"_UNK"`` is 1, and file words are numbered from 2 in order of
          first appearance.

    Blank lines are ignored. Lines whose vector components cannot be parsed
    as floats are reported and skipped.
    """
    word2vec = {}
    word2idx = {
        "_PAD": 0,
        "_UNK": 1,
    }
    idx = 2

    with open(vector_file, 'r', encoding='utf-8') as file:
        now = datetime.now()

        for line_no, raw_line in enumerate(file, start=1):
            fields = raw_line.split()

            # Only the very first line can be the "<count> <dim>" header.
            is_header = (
                line_no == 1
                and len(fields) == 2
                and fields[0].isdigit()
                and fields[1].isdigit()
            )

            if fields and not is_header:
                word = fields[0]
                try:
                    vector = [float(val) for val in fields[1:]]
                except ValueError as e:
                    # Report by line number: the word alone may be ambiguous.
                    print(f"skipping line {line_no} ({word!r}): {e}")
                else:
                    # Store word-to-vector mapping
                    word2vec[word] = vector

                    # Create word-to-index mapping (first occurrence wins)
                    if word not in word2idx:
                        word2idx[word] = idx
                        idx += 1

            # Progress is keyed on the line counter so that each milestone is
            # printed exactly once, even when lines are skipped or repeated.
            if line_no % 100000 == 0:
                print(f"processed {line_no} lines in : {datetime.now() - now}")
                now = datetime.now()

    return word2vec, word2idx

In [None]:
# Path to the pretrained word-vector file that is parsed below.
vector_file = '/home/jupyter-23521027/refresh-bert/data/indo4b/fasttext.4B.id.300.epoch5.uncased.vec'
# Parse the file once and keep both lookups as notebook globals:
# word2vec (word -> vector) and word2idx (word -> integer index).
# word2idx is the table the liputan6.train.doc writer cell reads from.
word2vec, word2idx = create_word2vec_word2idx(vector_file)

In [23]:
import os
import json
import pandas as pd

# Directory holding one JSON file per training article.
folder_path = '/home/jupyter-23521027/refresh-bert/data/liputan6_data/canonical/train'

json_data_list = []
# Scan the directory a single time; the listing is reused for the whole loop.
file_list = os.listdir(folder_path)

for file_name in file_list:
    file_path = os.path.join(folder_path, file_name)

    # Skip sub-directories and anything else that is not a regular file.
    if os.path.isfile(file_path):
        # Decode explicitly as UTF-8 so the result does not depend on the
        # kernel's locale (matches how the vector file is opened).
        with open(file_path, 'r', encoding='utf-8') as file:
            json_data = json.load(file)
            json_data_list.append(json_data)

# One row per article; columns come from the keys of each JSON object.
df = pd.DataFrame(json_data_list)
# 'clean_article' is a list of token lists; 'sentences' stores each sentence
# as a single string with its tokens joined by one space.
df['sentences'] = df['clean_article'].apply(lambda x: [' '.join(sentence) for sentence in x])
df

Unnamed: 0,id,url,clean_article,clean_summary,extractive_summary,sentences
0,48951,https://www.liputan6.com/news/read/48951/polis...,"[[Liputan6, ., com, ,, Lamongan, :, Tim, Inves...","[[Senjata, api, dan, ratusan, butir, peluru, m...","[1, 2]","[Liputan6 . com , Lamongan : Tim Investigasi B..."
1,124973,https://www.liputan6.com/news/read/124973/gent...,"[[Liputan6, ., com, ,, Depok, :, Abdul, Madjid...","[[Genta, buatan, Abdul, Madjid, bisa, melahirk...","[0, 1]","[Liputan6 . com , Depok : Abdul Madjid punya k..."
2,298677,https://www.liputan6.com/news/read/298677/wasp...,"[[Liputan6, ., com, ,, Bekasi, :, Aparat, Kepo...","[[Berhati-hatilah, Anda, bila, bepergian, ke, ...","[0, 5]","[Liputan6 . com , Bekasi : Aparat Kepolisian R..."
3,40569,https://www.liputan6.com/news/read/40569/imbau...,"[[Liputan6, ., com, ,, Denpasar, :, Imbauan, M...","[[Peringatan, untuk, tidak, pergi, ke, Indones...","[5, 8]","[Liputan6 . com , Denpasar : Imbauan Menteri L..."
4,242869,https://www.liputan6.com/news/read/242869/rapa...,"[[Liputan6, ., com, ,, Jakarta, :, Gempa, kuat...","[[Rapat, pleno, KPU, terhenti, ketika, gempa, ...","[0, 4]","[Liputan6 . com , Jakarta : Gempa kuat yang me..."
...,...,...,...,...,...,...
193878,164143,https://www.liputan6.com/news/read/164143/tahu...,"[[Liputan6, ., com, ,, Jakarta, :, Warga, yang...","[[Dirjen, Pajak, membuat, aturan, setiap, oran...","[2, 4]","[Liputan6 . com , Jakarta : Warga yang belum m..."
193879,150247,https://www.liputan6.com/news/read/150247/mass...,"[[Liputan6, ., com, ,, Bogor, :, Ribuan, umat,...","[[Ribuan, umat, Islam, menyerbu, vila, milik, ...","[0, 1]","[Liputan6 . com , Bogor : Ribuan umat Islam me..."
193880,204203,https://www.liputan6.com/news/read/204203/bate...,"[[Chelsea, melayangkan, keberatannya, kepada, ...","[[Ketua, Leeds, United, ,, Ken, Bates, berhasi...","[0, 13]",[Chelsea melayangkan keberatannya kepada Asosi...
193881,159188,https://www.liputan6.com/news/read/159188/peny...,"[[Liputan6, ., com, ,, Denpasar, :, Kebakaran,...","[[Kebakaran, toko, sepatu, di, Denpasar, ,, Ba...","[0, 3]","[Liputan6 . com , Denpasar : Kebakaran toko se..."


In [24]:
from datetime import datetime

# Maximum number of sentences written per document.
MAX_DOC_LENGTH = 100
DATASET_SIZE = 999999
docs = []

now = datetime.now()

# Output layout, per document:
#   liputan6-<id>
#   <idx> <idx> ... (one line per sentence, each index followed by a space)
#   <blank line>
# The `with` block guarantees the file is flushed and closed even if a row
# raises part-way through.
with open("/home/jupyter-23521027/refresh-bert/data/preprocessed-input-directory/liputan6.train.doc", 'w') as file:
    for i, row in df.iterrows():
        file.write(f'liputan6-{str(row["id"])}' + "\n")

        sentences = list(row["sentences"][:MAX_DOC_LENGTH])
        for sentence in sentences:
            # df['sentences'] holds space-joined strings, so iterating a
            # sentence directly would yield single characters. Split back
            # into tokens; token lists are accepted unchanged.
            words = sentence.split() if isinstance(sentence, str) else sentence
            for word in words:
                # Out-of-vocabulary tokens map to the _UNK index.
                # NOTE(review): the vector file name says "uncased" but tokens
                # are looked up as-is — confirm whether lowercasing is wanted.
                index = str(word2idx.get(word, word2idx["_UNK"]))
                file.write(index + " ")

            file.write("\n")

        if (i + 1) % 10000 == 0:
            print(f"processing {i + 1} with time: {datetime.now() - now}")
            now = datetime.now()

        # Blank line separates consecutive documents.
        file.write("\n")

processing 10000 with time: 0:00:08.089907
processing 20000 with time: 0:00:08.033513
processing 30000 with time: 0:00:08.009633
processing 40000 with time: 0:00:08.023161
processing 50000 with time: 0:00:08.003744
processing 60000 with time: 0:00:07.985763
processing 70000 with time: 0:00:08.002104
processing 80000 with time: 0:00:07.951278
processing 90000 with time: 0:00:08.121879
processing 100000 with time: 0:00:08.041578
processing 110000 with time: 0:00:07.938214
processing 120000 with time: 0:00:07.884338
processing 130000 with time: 0:00:08.110205
processing 140000 with time: 0:00:07.950020
processing 150000 with time: 0:00:08.004892
processing 160000 with time: 0:00:08.070434
processing 170000 with time: 0:00:08.079337
processing 180000 with time: 0:00:08.058899
processing 190000 with time: 0:00:08.000445


In [13]:
def prepare_vocab_embeddingdict(
    wordembed_filename='/home/jupyter-23521027/refresh-bert/data/indo4b/fasttext.4B.id.300.epoch5.uncased.vec',
):
    """Build a word -> index vocabulary from a text-format embedding file.

    Args:
        wordembed_filename: Path to the UTF-8 encoded embedding file. Each
            data line starts with the word, followed by whitespace-separated
            values. An optional first line whose first token is an integer
            and which has at most two tokens is treated as a header and is
            not added to the vocabulary. Defaults to the path this notebook
            has always used.

    Returns:
        dict mapping each word to an integer index. ``"_PAD"`` is 0,
        ``"_UNK"`` is 1, and words from the file are numbered from 2 in file
        order. If a word occurs more than once, the index of its last
        occurrence is kept. Blank lines are skipped and consume no index.
    """
    vocab_dict = {}

    # Add padding
    vocab_dict["_PAD"] = 0
    # Add UNK
    vocab_dict["_UNK"] = 1

    # Read word embedding file
    print("Reading pretrained word embeddings file: %s" % wordembed_filename)

    # linecount counts the header slot plus every word line consumed so far,
    # so the word on the k-th word line receives index k + 1.
    linecount = 0
    skipped = 0
    with open(wordembed_filename, "r", encoding="utf-8") as fembedd:
        for line in fembedd:
            linedata = line.split()
            if not linedata:
                # Blank line: nothing to index.
                skipped += 1
                continue

            if linecount == 0:
                # Reserve the header slot whether or not a header is present,
                # so the first word always receives index 2.
                linecount = 1
                if len(linedata) <= 2:
                    try:
                        int(linedata[0])
                    except ValueError:
                        pass  # not a header; fall through and index the word
                    else:
                        continue

            vocab_dict[linedata[0]] = linecount + 1
            linecount += 1

            if linecount % 500000 == 0:
                print(str(linecount) + " ...")

    if skipped:
        print("Skipped %d blank lines" % skipped)
    print("Size of vocab: %d (_PAD:0, _UNK:1)" % len(vocab_dict))

    return vocab_dict

In [14]:
# Build the word -> index vocabulary (with _PAD = 0 and _UNK = 1) and keep it
# as a notebook global; the liputan6.test.doc writer cell reads from it.
vocab_dict = prepare_vocab_embeddingdict()

Reading pretrained word embeddings file: /home/jupyter-23521027/refresh-bert/data/indo4b/fasttext.4B.id.300.epoch5.uncased.vec
500000 ...
1000000 ...
1500000 ...
2000000 ...
2500000 ...
3000000 ...
3500000 ...
4000000 ...
Size of vocab: 4277095 (_PAD:0, _UNK:1)


In [19]:
import os
import json
import pandas as pd

# Directory holding one JSON file per test article.
folder_path = '/home/jupyter-23521027/refresh-bert/data/liputan6_data/canonical/test'

json_data_list = []
# Scan the directory a single time; the listing is reused for the whole loop.
file_list = os.listdir(folder_path)

for file_name in file_list:
    file_path = os.path.join(folder_path, file_name)

    # Skip sub-directories and anything else that is not a regular file.
    if os.path.isfile(file_path):
        # Decode explicitly as UTF-8 so the result does not depend on the
        # kernel's locale.
        with open(file_path, 'r', encoding='utf-8') as file:
            json_data = json.load(file)
            json_data_list.append(json_data)

# One row per article; columns come from the keys of each JSON object.
# NOTE: this rebinds the notebook-global `df`, replacing any frame loaded by
# an earlier cell.
df = pd.DataFrame(json_data_list)
# 'clean_article' is a list of token lists; 'sentences' stores each sentence
# as a single string with its tokens joined by one space.
df['sentences'] = df['clean_article'].apply(lambda x: [' '.join(sentence) for sentence in x])
df

Unnamed: 0,id,url,clean_article,clean_summary,extractive_summary,sentences
0,19962,https://www.liputan6.com/news/read/19962/gas-b...,"[[Liputan6, ., com, ,, Jakarta, :, Untuk, meme...","[[Pertamina, akan, menyalurkan, gas, alam, dar...","[0, 4]","[Liputan6 . com , Jakarta : Untuk memenuhi ken..."
1,23319,https://www.liputan6.com/news/read/23319/pelaj...,"[[Liputan6, ., com, ,, Yogyakarta, :, Sekitar,...","[[Ratusan, siswa, Sekolah, Menangah, Umum, 6, ...","[1, 2]","[Liputan6 . com , Yogyakarta : Sekitar 700 sis..."
2,18513,https://www.liputan6.com/news/read/18513/bamba...,"[[Liputan6, ., com, ,, Jakarta, :, Ekspektasi,...","[[Tim, Ekonomi, Pemerintah, yang, sedang, beru...","[0, 6]","[Liputan6 . com , Jakarta : Ekspektasi pasar y..."
3,23951,https://www.liputan6.com/news/read/23951/bengk...,"[[Liputan6, ., com, ,, Jakarta, :, Perampokan,...","[[Bengkel, Toyota, Astra, dan, Indomaret, di, ...","[0, 1]","[Liputan6 . com , Jakarta : Perampokan bersenj..."
4,13167,https://www.liputan6.com/news/read/13167/bentr...,"[[Liputan6, ., com, ,, Jakarta, :, Bentrokan, ...","[[Bentrokan, antara, mahasiswa, satu, kampus, ...","[0, 7]","[Liputan6 . com , Jakarta : Bentrokan antara m..."
...,...,...,...,...,...,...
10967,15906,https://www.liputan6.com/news/read/15906/soepa...,"[[Liputan6, ., com, ,, Jakarta, :, Ketua, DPR,...","[[Kepergian, Baharuddin, Lopa, membuat, Akbar,...","[0, 3]","[Liputan6 . com , Jakarta : Ketua DPR Akbar Ta..."
10968,19241,https://www.liputan6.com/news/read/19241/golka...,"[[Liputan6, ., com, ,, Jakarta, :, Partai, Gol...","[[Ketua, Umum, DPP, Partai, Golkar, Akbar, Tan...","[2, 3]","[Liputan6 . com , Jakarta : Partai Golongan Ka..."
10969,20408,https://www.liputan6.com/news/read/20408/pemer...,"[[Liputan6, ., com, ,, Jakarta, :, Pemerintah,...","[[Pemerintah, akan, memberlakukan, sistem, UMR...","[1, 4, 5]","[Liputan6 . com , Jakarta : Pemerintah berenca..."
10970,26145,https://www.liputan6.com/news/read/26145/pemer...,"[[Liputan6, ., com, ,, Medan, :, Pemerintah, t...","[[Biro, perjalanan, haji, yang, menggunakan, p...","[1, 5]","[Liputan6 . com , Medan : Pemerintah tetap tak..."


In [20]:
from datetime import datetime

# Maximum number of sentences written per document.
MAX_DOC_LENGTH = 100
DATASET_SIZE = 999999
docs = []

now = datetime.now()

# Output layout, per document:
#   liputan6-<id>
#   <idx> <idx> ... (one line per sentence, each index followed by a space)
#   <blank line>
# The `with` block guarantees the file is flushed and closed even if a row
# raises part-way through.
with open("/home/jupyter-23521027/refresh-bert/data/preprocessed-input-directory/liputan6.test.doc", 'w') as file:
    for i, row in df.iterrows():
        file.write(f'liputan6-{str(row["id"])}' + "\n")

        sentences = list(row["sentences"][:MAX_DOC_LENGTH])
        for sentence in sentences:
            # df['sentences'] holds space-joined strings, so iterating a
            # sentence directly would yield single characters. Split back
            # into tokens; token lists are accepted unchanged.
            words = sentence.split() if isinstance(sentence, str) else sentence
            for word in words:
                # Out-of-vocabulary tokens map to the _UNK index.
                # NOTE(review): the vector file name says "uncased" but tokens
                # are looked up as-is — confirm whether lowercasing is wanted.
                index = str(vocab_dict.get(word, vocab_dict["_UNK"]))
                file.write(index + " ")

            file.write("\n")

        if (i + 1) % 10000 == 0:
            print(f"processing {i + 1} with time: {datetime.now() - now}")
            now = datetime.now()

        # Blank line separates consecutive documents.
        file.write("\n")

processing 10000 with time: 0:00:07.910342
