In [12]:
from datetime import datetime
import numpy as np

def create_word2vec_word2idx(vector_file):
    word2vec = {}
    word2idx = {
        "_PAD": 0,
        "_UNK": 1,
    }
    idx = 2

    with open(vector_file, 'r', encoding='utf-8') as file:
        now = datetime.now()
        
        for line in file:
            try:
                line = line.strip().split()
                word = line[0]
                vector = [float(val) for val in line[1:]]

                # Store word-to-vector mapping
                word2vec[word] = vector

                # Create word-to-index mapping
                if word not in word2idx:
                    word2idx[word] = idx
                    idx += 1
            except Exception as e:
                continue

            if ( idx + 1 ) % 100000 == 0:
                print(f"processed {idx+1} lines in : {datetime.now() - now}")
                now = datetime.now()
            
    return word2vec, word2idx

In [13]:
vector_file = '/home/jupyter-23521027/refresh-bert/data/indo4b/fasttext.4B.id.300.epoch5.uncased.vec'
word2vec, word2idx = create_word2vec_word2idx(vector_file)

processed 100000 lines in : 0:00:07.976401
processed 200000 lines in : 0:00:07.840785
processed 300000 lines in : 0:00:07.939245
processed 400000 lines in : 0:00:11.714102
processed 500000 lines in : 0:00:07.842369
processed 600000 lines in : 0:00:07.912034
processed 700000 lines in : 0:00:08.199151
processed 800000 lines in : 0:00:08.059702
processed 900000 lines in : 0:00:08.793836
processed 1000000 lines in : 0:00:08.694127
processed 1100000 lines in : 0:00:07.842651
processed 1200000 lines in : 0:00:07.753648
processed 1300000 lines in : 0:00:07.812415
processed 1400000 lines in : 0:00:07.804120
processed 1500000 lines in : 0:00:16.303724
processed 1600000 lines in : 0:00:07.521860
processed 1700000 lines in : 0:00:07.515390
processed 1800000 lines in : 0:00:07.502304
processed 1900000 lines in : 0:00:07.547782
processed 2000000 lines in : 0:00:07.505438
processed 2100000 lines in : 0:00:07.655357
processed 2200000 lines in : 0:00:07.544271
processed 2300000 lines in : 0:00:07.6759

In [29]:
import os
import json
import pandas as pd

folder_path = '/home/jupyter-23521027/refresh-bert/data/liputan6_data/canonical/train' 

file_list = os.listdir(folder_path)
json_data_list = []
file_list = os.listdir(folder_path)

for file_name in file_list:
    file_path = os.path.join(folder_path, file_name) 

    if os.path.isfile(file_path):
        with open(file_path, 'r') as file:
            json_data = json.load(file)
            json_data_list.append(json_data)

df = pd.DataFrame(json_data_list)
df['sentences'] = df['clean_article'].apply(lambda x: [' '.join(sentence) for sentence in x])
df

Unnamed: 0,id,url,clean_article,clean_summary,extractive_summary,sentences
0,48951,https://www.liputan6.com/news/read/48951/polis...,"[[Liputan6, ., com, ,, Lamongan, :, Tim, Inves...","[[Senjata, api, dan, ratusan, butir, peluru, m...","[1, 2]","[Liputan6 . com , Lamongan : Tim Investigasi B..."
1,124973,https://www.liputan6.com/news/read/124973/gent...,"[[Liputan6, ., com, ,, Depok, :, Abdul, Madjid...","[[Genta, buatan, Abdul, Madjid, bisa, melahirk...","[0, 1]","[Liputan6 . com , Depok : Abdul Madjid punya k..."
2,298677,https://www.liputan6.com/news/read/298677/wasp...,"[[Liputan6, ., com, ,, Bekasi, :, Aparat, Kepo...","[[Berhati-hatilah, Anda, bila, bepergian, ke, ...","[0, 5]","[Liputan6 . com , Bekasi : Aparat Kepolisian R..."
3,40569,https://www.liputan6.com/news/read/40569/imbau...,"[[Liputan6, ., com, ,, Denpasar, :, Imbauan, M...","[[Peringatan, untuk, tidak, pergi, ke, Indones...","[5, 8]","[Liputan6 . com , Denpasar : Imbauan Menteri L..."
4,242869,https://www.liputan6.com/news/read/242869/rapa...,"[[Liputan6, ., com, ,, Jakarta, :, Gempa, kuat...","[[Rapat, pleno, KPU, terhenti, ketika, gempa, ...","[0, 4]","[Liputan6 . com , Jakarta : Gempa kuat yang me..."
...,...,...,...,...,...,...
193878,164143,https://www.liputan6.com/news/read/164143/tahu...,"[[Liputan6, ., com, ,, Jakarta, :, Warga, yang...","[[Dirjen, Pajak, membuat, aturan, setiap, oran...","[2, 4]","[Liputan6 . com , Jakarta : Warga yang belum m..."
193879,150247,https://www.liputan6.com/news/read/150247/mass...,"[[Liputan6, ., com, ,, Bogor, :, Ribuan, umat,...","[[Ribuan, umat, Islam, menyerbu, vila, milik, ...","[0, 1]","[Liputan6 . com , Bogor : Ribuan umat Islam me..."
193880,204203,https://www.liputan6.com/news/read/204203/bate...,"[[Chelsea, melayangkan, keberatannya, kepada, ...","[[Ketua, Leeds, United, ,, Ken, Bates, berhasi...","[0, 13]",[Chelsea melayangkan keberatannya kepada Asosi...
193881,159188,https://www.liputan6.com/news/read/159188/peny...,"[[Liputan6, ., com, ,, Denpasar, :, Kebakaran,...","[[Kebakaran, toko, sepatu, di, Denpasar, ,, Ba...","[0, 3]","[Liputan6 . com , Denpasar : Kebakaran toko se..."


In [27]:
for i, row in df.iterrows():
    sentences = list(row["sentences"][:200])
    for sentence in sentences:
        for word in sentence.split():
            print(word)
            try:
                index = str(word2idx[word.lower()])
            except Exception as KeyError:
                index = str(word2idx["_UNK"])
            print(index)
            break

    break

Liputan6
71000
S
1823
Bimantoro
78860
Demikian
452
Dia
49
Menurut
202


In [30]:
from datetime import datetime

MAX_DOC_LENGTH = 100
DATASET_SIZE = 999999
docs = []

now = datetime.now()
file = open("/home/jupyter-23521027/refresh-bert/data/preprocessed-input-directory/liputan6.training.doc" , 'w')

for i, row in df.iterrows():
    file.write(f'liputan6-{str(row["id"])}' + "\n")

    sentences = list(row["sentences"][:MAX_DOC_LENGTH])
    for sentence in sentences:
        for word in sentence.split():
            try:
                index = str(word2idx[word.lower()])
            except Exception as KeyError:
                index = str(word2idx["_UNK"])
            file.write(index + " ")
            
        file.write("\n")

    if (i + 1) % 10000 == 0:
        print(f"processing {i + 1} with time: {datetime.now() - now}")
        now = datetime.now()

    file.write("\n")
    
file.close()

processing 10000 with time: 0:00:02.625798
processing 20000 with time: 0:00:02.369671
processing 30000 with time: 0:00:02.686024
processing 40000 with time: 0:00:02.446972
processing 50000 with time: 0:00:02.367008
processing 60000 with time: 0:00:02.287152
processing 70000 with time: 0:00:02.316060
processing 80000 with time: 0:00:02.299353
processing 90000 with time: 0:00:02.354883
processing 100000 with time: 0:00:02.330369
processing 110000 with time: 0:00:02.306927
processing 120000 with time: 0:00:02.267288
processing 130000 with time: 0:00:02.299044
processing 140000 with time: 0:00:02.293668
processing 150000 with time: 0:00:02.302514
processing 160000 with time: 0:00:02.328968
processing 170000 with time: 0:00:02.345570
processing 180000 with time: 0:00:02.294996
processing 190000 with time: 0:00:02.272586


In [31]:
import os
import json
import pandas as pd

folder_path = '/home/jupyter-23521027/refresh-bert/data/liputan6_data/canonical/test' 

file_list = os.listdir(folder_path)
json_data_list = []
file_list = os.listdir(folder_path)

for file_name in file_list:
    file_path = os.path.join(folder_path, file_name) 

    if os.path.isfile(file_path):
        with open(file_path, 'r') as file:
            json_data = json.load(file)
            json_data_list.append(json_data)

df = pd.DataFrame(json_data_list)
df['sentences'] = df['clean_article'].apply(lambda x: [' '.join(sentence) for sentence in x])
df

Unnamed: 0,id,url,clean_article,clean_summary,extractive_summary,sentences
0,19962,https://www.liputan6.com/news/read/19962/gas-b...,"[[Liputan6, ., com, ,, Jakarta, :, Untuk, meme...","[[Pertamina, akan, menyalurkan, gas, alam, dar...","[0, 4]","[Liputan6 . com , Jakarta : Untuk memenuhi ken..."
1,23319,https://www.liputan6.com/news/read/23319/pelaj...,"[[Liputan6, ., com, ,, Yogyakarta, :, Sekitar,...","[[Ratusan, siswa, Sekolah, Menangah, Umum, 6, ...","[1, 2]","[Liputan6 . com , Yogyakarta : Sekitar 700 sis..."
2,18513,https://www.liputan6.com/news/read/18513/bamba...,"[[Liputan6, ., com, ,, Jakarta, :, Ekspektasi,...","[[Tim, Ekonomi, Pemerintah, yang, sedang, beru...","[0, 6]","[Liputan6 . com , Jakarta : Ekspektasi pasar y..."
3,23951,https://www.liputan6.com/news/read/23951/bengk...,"[[Liputan6, ., com, ,, Jakarta, :, Perampokan,...","[[Bengkel, Toyota, Astra, dan, Indomaret, di, ...","[0, 1]","[Liputan6 . com , Jakarta : Perampokan bersenj..."
4,13167,https://www.liputan6.com/news/read/13167/bentr...,"[[Liputan6, ., com, ,, Jakarta, :, Bentrokan, ...","[[Bentrokan, antara, mahasiswa, satu, kampus, ...","[0, 7]","[Liputan6 . com , Jakarta : Bentrokan antara m..."
...,...,...,...,...,...,...
10967,15906,https://www.liputan6.com/news/read/15906/soepa...,"[[Liputan6, ., com, ,, Jakarta, :, Ketua, DPR,...","[[Kepergian, Baharuddin, Lopa, membuat, Akbar,...","[0, 3]","[Liputan6 . com , Jakarta : Ketua DPR Akbar Ta..."
10968,19241,https://www.liputan6.com/news/read/19241/golka...,"[[Liputan6, ., com, ,, Jakarta, :, Partai, Gol...","[[Ketua, Umum, DPP, Partai, Golkar, Akbar, Tan...","[2, 3]","[Liputan6 . com , Jakarta : Partai Golongan Ka..."
10969,20408,https://www.liputan6.com/news/read/20408/pemer...,"[[Liputan6, ., com, ,, Jakarta, :, Pemerintah,...","[[Pemerintah, akan, memberlakukan, sistem, UMR...","[1, 4, 5]","[Liputan6 . com , Jakarta : Pemerintah berenca..."
10970,26145,https://www.liputan6.com/news/read/26145/pemer...,"[[Liputan6, ., com, ,, Medan, :, Pemerintah, t...","[[Biro, perjalanan, haji, yang, menggunakan, p...","[1, 5]","[Liputan6 . com , Medan : Pemerintah tetap tak..."


In [32]:
from datetime import datetime

MAX_DOC_LENGTH = 100
DATASET_SIZE = 999999
docs = []

now = datetime.now()
file = open("/home/jupyter-23521027/refresh-bert/data/preprocessed-input-directory/liputan6.test.doc" , 'w')

for i, row in df.iterrows():
    file.write(f'liputan6-{str(row["id"])}' + "\n")

    sentences = list(row["sentences"][:MAX_DOC_LENGTH])
    for sentence in sentences:
        for word in sentence.split():
            try:
                index = str(word2idx[word.lower()])
            except Exception as KeyError:
                index = str(word2idx["_UNK"])
            file.write(index + " ")
            
        file.write("\n")

    if (i + 1) % 10000 == 0:
        print(f"processing {i + 1} with time: {datetime.now() - now}")
        now = datetime.now()

    file.write("\n")
    
file.close()

processing 10000 with time: 0:00:02.115055


In [21]:
len(vocab_dict)

4277095

In [33]:
import os
import json
import pandas as pd

folder_path = '/home/jupyter-23521027/refresh-bert/data/liputan6_data/canonical/dev' 

file_list = os.listdir(folder_path)
json_data_list = []
file_list = os.listdir(folder_path)

for file_name in file_list:
    file_path = os.path.join(folder_path, file_name) 

    if os.path.isfile(file_path):
        with open(file_path, 'r') as file:
            json_data = json.load(file)
            json_data_list.append(json_data)

df = pd.DataFrame(json_data_list)
df['sentences'] = df['clean_article'].apply(lambda x: [' '.join(sentence) for sentence in x])
df

Unnamed: 0,id,url,clean_article,clean_summary,extractive_summary,sentences
0,8279,https://www.liputan6.com/news/read/8279/enam-t...,"[[Liputan6, ., com, ,, Waringin, :, Kepala, Po...","[[Polri, telah, meringkus, enam, tersangka, pe...","[2, 4]","[Liputan6 . com , Waringin : Kepala Polri Jend..."
1,1774,https://www.liputan6.com/news/read/1774/bantua...,"[[Liputan6, ., com, ,, Kupang, :, Subsidi, dan...","[[Alokasi, dana, subsidi, BBM, bagi, masyaraka...","[0, 1]","[Liputan6 . com , Kupang : Subsidi dana bahan ..."
2,11207,https://www.liputan6.com/news/read/11207/kapol...,"[[Liputan6, ., com, ,, Jakarta, :, Kepala, Kep...","[[Kapolri, meminta, tanggal, pelaksanaan, isti...","[1, 4]","[Liputan6 . com , Jakarta : Kepala Kepolisian ..."
3,2125,https://www.liputan6.com/news/read/2125/komnas...,"[[Liputan6, ., com, ,, Jakarta, :, Komisi, Nas...","[[Untuk, mencari, fakta, atas, Insiden, Wamena...","[0, 2]","[Liputan6 . com , Jakarta : Komisi Nasional Ha..."
4,421,https://www.liputan6.com/news/read/421/uang-me...,"[[Liputan6, ., com, ,, Jakarta, :, Beragam, ge...","[[Soal, money, politics, yang, mewarnai, kerus...","[1, 3]","[Liputan6 . com , Jakarta : Beragam gejolak ti..."
...,...,...,...,...,...,...
10967,7782,https://www.liputan6.com/news/read/7782/sri-so...,"[[Liputan6, ., com, ,, Jakarta, :, Aksi, massa...","[[Aksi, kekerasan, yang, akhir-akhir, ini, mar...","[0, 4]","[Liputan6 . com , Jakarta : Aksi massa yang ke..."
10968,5241,https://www.liputan6.com/news/read/5241/bi-dim...,"[[Liputan6, ., com, ,, Jakarta, :, Perbankan, ...","[[Bank, Indonesia, diminta, meninjau, kembali,...","[0, 10]","[Liputan6 . com , Jakarta : Perbankan nasional..."
10969,8517,https://www.liputan6.com/news/read/8517/komisi...,"[[Liputan6, ., com, ,, Jakarta, :, Pengamat, p...","[[Pengamat, penyiaran, lebih, setuju, bila, Ko...","[0, 1]","[Liputan6 . com , Jakarta : Pengamat pers Atma..."
10970,11954,https://www.liputan6.com/news/read/11954/api-m...,"[[Liputan6, ., com, ,, Jakarta, :, Puluhan, ru...","[[Kobaran, api, melalap, puluhan, rumah, dan, ...","[1, 2]","[Liputan6 . com , Jakarta : Puluhan rumah di r..."


In [34]:
from datetime import datetime

MAX_DOC_LENGTH = 100
DATASET_SIZE = 999999
docs = []

now = datetime.now()
file = open("/home/jupyter-23521027/refresh-bert/data/preprocessed-input-directory/liputan6.validation.doc" , 'w')

for i, row in df.iterrows():
    file.write(f'liputan6-{str(row["id"])}' + "\n")

    sentences = list(row["sentences"][:MAX_DOC_LENGTH])
    for sentence in sentences:
        for word in sentence.split():
            try:
                index = str(word2idx[word.lower()])
            except Exception as KeyError:
                index = str(word2idx["_UNK"])
            file.write(index + " ")
            
        file.write("\n")

    if (i + 1) % 10000 == 0:
        print(f"processing {i + 1} with time: {datetime.now() - now}")
        now = datetime.now()

    file.write("\n")
    
file.close()

processing 10000 with time: 0:00:02.165036
