# Dataset Load: NERGRIT

In [1]:
from datasets import load_dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nergrit_dataset = load_dataset("id_nergrit_corpus", 'ner')
nergrit_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 12532
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 2399
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 2521
    })
})

In [3]:
test = nergrit_dataset['test']
train = nergrit_dataset['train']
validation = nergrit_dataset['validation']

tokens = []
ner_tags = []
id_data = []

for dataset in [train, test, validation]:
    tokens.extend(dataset['tokens'])
    ner_tags.extend(dataset['ner_tags'])
    id_data.extend(dataset['id'])

nergrit_tokens_combined = tokens.copy()

dataset = {"id":id_data, "tokens":tokens, "ner_tags":ner_tags}

dataset.pop('id')
    
dataset = pd.DataFrame(dataset)

In [4]:
# Hapus data duplikat dataset

new_dataset = pd.DataFrame()

new_dataset['tokens'] = dataset['tokens'].apply(tuple)
new_dataset['ner_tags'] = dataset['ner_tags'].apply(tuple)

new_dataset = new_dataset.drop_duplicates()
new_dataset = new_dataset.reset_index(drop=True)
new_dataset

Unnamed: 0,tokens,ner_tags
0,"(Indonesia, mengekspor, produk, industri, skal...","(4, 38, 38, 38, 38, 38, 38, 4, 23, 38, 38, 38,..."
1,"(Martahan, Sohuturon, ,, CNN, Indonesia, |, Ra...","(12, 31, 38, 11, 30, 38, 1, 20, 20, 20, 20, 20..."
2,"(Polri, Awasi, Peredaran, Bahan, ', The, Mothe...","(9, 38, 38, 38, 38, 14, 33, 33, 33, 38, 9, 38,..."
3,"(Jakarta, ,, CNN, Indonesia, --, Kepolisian, N...","(4, 38, 11, 30, 38, 9, 28, 28, 28, 38, 9, 38, ..."
4,"(Kepala, Divisi, Hubungan, Masyarakat, (, Kadi...","(38, 11, 30, 30, 38, 11, 30, 38, 9, 38, 38, 38..."
...,...,...
16823,"("", Namun, ,, jumlahnya, jauh, lebih, rendah, ...","(38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 0..."
16824,"("", Dua, kematian, lainnya, melibatkan, pengem...","(38, 0, 38, 38, 38, 38, 14, 38, 38, 38, 38, 38..."
16825,"(Dia, mengatakan, tidak, ada, kematian, yang, ...","(38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 1..."
16826,"(Sementara, pada, tahun, 2017, ,, ada, dua, da...","(38, 38, 38, 1, 38, 38, 0, 38, 0, 38, 38, 38, ..."


In [5]:
# tokens_cari = ['Gempa', 'bumi', 'mengguncang']
# hasil_pencarian = new_dataset[new_dataset['tokens'].apply(lambda x: set(tokens_cari).issubset(x))]
# hasil_pencarian

print(new_dataset.iloc[136]['tokens'])
print(new_dataset.iloc[136]['ner_tags'])

# Contoh DataFrame
# data = {'Nama': ['John', 'Jane', 'Doe', 'Alice'],
#         'Deskripsi': ['Gempa bumi berkekuatan 5,1 skala richer mengguncang wilayah Klungkung',
#                       'Cuaca cerah hari ini',
#                       'Berita terbaru tentang olahraga',
#                       'Klub buku membaca novel bersama']}
# df = pd.DataFrame(data)

# # Teks yang ingin dicari
# teks_cari = 'Gempa bumi'

# # Mencari baris yang mengandung teks tertentu pada sebagian data kolom 'Deskripsi'
# hasil_pencarian = df[df['Deskripsi'].str.contains(teks_cari, case=False, regex=True)]

# # Menampilkan hasil pencarian
# print("Hasil Pencarian:")
# print(hasil_pencarian)

('Jakarta', '-', 'Gempa', 'bumi', 'berkekuatan', '5', ',', '1', 'skala', 'Richter', 'mengguncang', 'wilayah', 'Klungkung', ',', 'Bali', '.', 'Gempa', 'tersebut', 'berpusat', 'di', 'laut', '.')
(4, 38, 38, 38, 38, 15, 34, 34, 34, 34, 38, 38, 7, 38, 4, 38, 38, 38, 38, 38, 38, 38)


In [6]:
from sklearn.model_selection import train_test_split

In [7]:
train_dataset, test_dataset = train_test_split(new_dataset, test_size=0.2, random_state=11)

print(len(train_dataset))
print(len(test_dataset))

13462
3366


# Tokenization

In [8]:
import nltk
from nltk.tokenize import word_tokenize

In [9]:
for i, sen in enumerate(dataset['tokens']):
    dataset['tokens'][i] = word_tokenize(sen)

TypeError: expected string or bytes-like object, got 'list'

In [None]:
# digunakan untuk menggabungkan pronouns 's, dolar ($), @, dan hashtag (#)
problematic_data_indices = []

for i,sen in enumerate(dataset['tokens']):
    if len(dataset['tokens'][i]) != len(dataset['ner_tags'][i]):
        problematic_data_indices.append(i)

for i in problematic_data_indices:
    for j, token in enumerate(dataset['tokens'][i]):
        if token == "'s" or token == "$" or token == "..." or token == '`' or token == "'m" or token == "," or token == ".":
            dataset['tokens'][i][j-1] += dataset['tokens'][i][j]
            dataset['tokens'][i].pop(j)
        
        if token == "#" or token == "@":
            dataset['tokens'][i][j] += dataset['tokens'][i][j+1]
            dataset['tokens'][i].pop(j+1)
            
        if (token == "No" or token == "NO") and dataset['tokens'][i][j+1] == ".":
            dataset['tokens'][i][j] += dataset['tokens'][i][j+1]
            dataset['tokens'][i].pop(j+1)
            
        if token == "Mr" and dataset['tokens'][i][j+1] == ".":
            dataset['tokens'][i][j] += dataset['tokens'][i][j+1]
            dataset['tokens'][i].pop(j+1)
        
        if token == "‚" and dataset['tokens'][i][j+1] == ".":
            dataset['tokens'][i][j] += dataset['tokens'][i][j+1]
            dataset['tokens'][i].pop(j+1)

In [None]:
type(dataset['tokens'][0])

In [None]:
# print(len(dataset['tokens'][0]))
# print(len(dataset['ner_tags'][0]))

# print(dataset['tokens'][0])
# print(nergrit_dataset['train']['tokens'][0])

for i,sen in enumerate(dataset['tokens']):
    if type(dataset['tokens'][i]) == 'float':
        print(i)
        break
    if len(dataset['tokens'][i]) != len(dataset['ner_tags'][i]):
        print(f'not same {i}')

In [None]:
for i,token in enumerate(dataset['tokens'][10451]):
    if token != nergrit_tokens_combined[10451][i]:
        if nergrit_tokens_combined[10451][i] == '-"': continue
        print(f'indes:{i}')
        print(dataset['tokens'][10451][i-2:i+3])
        print(nergrit_tokens_combined[10451][i-2:i+3])
        break
        
print(dataset['tokens'][10451][:i+3])
print(nergrit_tokens_combined[10451][:i+3])

In [14]:
def extract_entity_label(dataset,isCount=False,entity_extracted=2):
    unique_labels = desc_labels(dataset)
    entity_label = {i:0 for i in range(len(unique_labels))} if isCount else {i:[] for i in range(len(unique_labels))}
    
    dataset = dataset['train']
    sentence_data = [seq['tokens'] for seq in dataset]
    tokens = [i for seq in dataset for i in seq['tokens']]
    labels = [i for seq in dataset for i in seq['ner_tags']]
    
    # print(tokens)

    for i, token in enumerate(tokens):
        if(isCount): entity_label[labels[i]] += 1 
        else:
            if(len(entity_label[labels[i]])<entity_extracted):
                entity_label[labels[i]].append(token)
                    
    entity_label_key_updated = {unique_labels[i]:entity_label[i] for i in range(len(unique_labels))}
    
        
    return entity_label_key_updated

def extract_entity_on_sentence(dataset,sample=2):
    unique_labels = desc_labels(dataset)
    
    dataset = dataset['train']
    sentence_data = [seq['tokens'] for seq in dataset]
    label_data = [seq['ner_tags'] for seq in dataset]
    
    label_count = {i:0 for i in range(len(unique_labels))}
    output = []
    
    for i, ner_tags in enumerate(label_data):
        if(len(ner_tags)>15): continue
        is_save = False
        for l in ner_tags:
            if(label_count[l]<sample):
                label_count[l] += 1
                is_save = True
        
        if(is_save):
            output.append([' '.join(sentence_data[i]),[unique_labels[j] for j in label_data[i]]])
            
    return output

In [15]:
exxtracted = extract_entity_on_sentence(nergrit_dataset)

In [12]:
def desc_labels(dataset):
    label_names = dataset['train'].info.features['ner_tags'].feature.names
    return {i:token for i,token in enumerate(label_names)}

In [13]:
def dataset_info(dataset):
    info = {
        "label_map": desc_labels(dataset),
        "samples": extract_entity_label(nergrit_dataset,entity_extracted=3)
    }
    return info

In [19]:
desc_labels(nergrit_dataset)

t1 = ('Gempa', 'bumi', 'berkekuatan', '5', ',', '1', 'skala', 'Richter', 'mengguncang', 'wilayah', 'Klungkung', ',', 'Bali', '.')
t2 = (38, 38, 38, 15, 34, 34, 34, 34, 38, 38, 7, 38, 4, 38)

print(len(t1), len(t2))

14 14


In [34]:
extract_entity_label(nergrit_dataset,True)

{'B-CRD': 3430,
 'B-DAT': 3588,
 'B-EVT': 1400,
 'B-FAC': 481,
 'B-GPE': 6451,
 'B-LAN': 12,
 'B-LAW': 314,
 'B-LOC': 2843,
 'B-MON': 1292,
 'B-NOR': 3893,
 'B-ORD': 571,
 'B-ORG': 4169,
 'B-PER': 6720,
 'B-PRC': 746,
 'B-PRD': 4142,
 'B-QTY': 1509,
 'B-REG': 356,
 'B-TIM': 811,
 'B-WOA': 197,
 'I-CRD': 1395,
 'I-DAT': 9906,
 'I-EVT': 2387,
 'I-FAC': 929,
 'I-GPE': 2314,
 'I-LAN': 11,
 'I-LAW': 1350,
 'I-LOC': 4529,
 'I-MON': 3251,
 'I-NOR': 5544,
 'I-ORD': 41,
 'I-ORG': 5425,
 'I-PER': 4357,
 'I-PRC': 1472,
 'I-PRD': 4182,
 'I-QTY': 2446,
 'I-REG': 176,
 'I-TIM': 2360,
 'I-WOA': 368,
 'O': 213820}

In [35]:
dataset_info(nergrit_dataset)

{'label_map': {0: 'B-CRD',
  1: 'B-DAT',
  2: 'B-EVT',
  3: 'B-FAC',
  4: 'B-GPE',
  5: 'B-LAN',
  6: 'B-LAW',
  7: 'B-LOC',
  8: 'B-MON',
  9: 'B-NOR',
  10: 'B-ORD',
  11: 'B-ORG',
  12: 'B-PER',
  13: 'B-PRC',
  14: 'B-PRD',
  15: 'B-QTY',
  16: 'B-REG',
  17: 'B-TIM',
  18: 'B-WOA',
  19: 'I-CRD',
  20: 'I-DAT',
  21: 'I-EVT',
  22: 'I-FAC',
  23: 'I-GPE',
  24: 'I-LAN',
  25: 'I-LAW',
  26: 'I-LOC',
  27: 'I-MON',
  28: 'I-NOR',
  29: 'I-ORD',
  30: 'I-ORG',
  31: 'I-PER',
  32: 'I-PRC',
  33: 'I-PRD',
  34: 'I-QTY',
  35: 'I-REG',
  36: 'I-TIM',
  37: 'I-WOA',
  38: 'O'},
 'samples': {'B-CRD': ['empat', 'enam', 'satu'],
  'B-DAT': ['Selasa', 'Selasa', 'Rabu'],
  'B-EVT': ['Final', 'Final', 'final'],
  'B-FAC': ['Pelabuhan', 'Pelabuhan', 'Masjid'],
  'B-GPE': ['Indonesia', 'Amerika', 'Jakarta'],
  'B-LAN': ['bahasa', 'bahasa', 'bahasa'],
  'B-LAW': ['surat', 'surat', 'US'],
  'B-LOC': ['Jakarta', 'JICT', 'West'],
  'B-MON': ['USD', 'Rp', 'Rp'],
  'B-NOR': ['Menteri', 'Menteri', 'M

In [None]:
nergrit_dataset['train']['tokens'][140]

# FASTEXT

In [6]:
import fasttext
import fasttext.util

In [7]:
id_fasttext = fasttext.load_model('./cc.id.300.bin')



In [23]:
id_fasttext.get_nearest_neighbors('sya')

[(0.6171797513961792, 'irnya'),
 (0.5662643909454346, 'patotoe'),
 (0.5593110918998718, 'ir-sya'),
 (0.5553608536720276, 'Sya'),
 (0.5534719824790955, ',,saya'),
 (0.5462336540222168, 'Patotoe'),
 (0.5404911637306213, ',saya'),
 (0.5337669253349304, 'seblmnya'),
 (0.5295220613479614, 'riat'),
 (0.5264296531677246, 'riah')]

In [None]:
print(f'origin dimension: {id_fasttext.get_dimension()}')

fasttext.util.reduce_model(id_fasttext, 2)

print(f'reduced dimension: {id_fasttext.get_dimension()}')

In [10]:
tes = ['sudah', 'dicoba', 'dan', 'tet', '##ep', 'gak', 'bisa', 'mba', ',', 'ada', 'not', '##if', '##2', 'tidak', 'jelas', 'seperti', 'itu']

for i in tes:
    print(id_fasttext[i])

[0.24763355 0.04528186]
[0.15636204 0.02645837]
[0.29079488 0.04939094]
[0.09388968 0.46004158]
[-0.04135711 -0.00143757]
[0.39425156 0.28984338]
[0.3626357  0.05012034]
[0.13619876 0.3875976 ]
[-0.13530862  0.7744079 ]
[0.45093042 0.14592652]
[-0.01554188  0.40266582]
[0.00469462 0.09509537]
[0.03605656 0.12732781]
[0.28490165 0.06851771]
[0.21356207 0.00209849]
[0.11995699 0.01652862]
[0.46251014 0.05710749]


In [27]:
id_fasttext['jdhersomvf']

array([ 0.00320197,  0.04431295,  0.00596271,  0.0003198 , -0.00154018,
       -0.00504316,  0.01854377,  0.00140952,  0.00416551, -0.0064503 ,
        0.0017719 , -0.01546858, -0.00787772,  0.01578196, -0.0111641 ,
        0.02190386, -0.00518977, -0.00415276, -0.02037244, -0.01143345,
        0.00754148,  0.00148768, -0.00179641, -0.01104866,  0.01832668,
       -0.00375121,  0.00497834, -0.0064133 , -0.02198054,  0.0020031 ,
        0.00191203, -0.00624121,  0.00103681, -0.00464204,  0.00391369,
       -0.01440342,  0.01218161,  0.01042238, -0.00031999,  0.0007526 ,
        0.00047804,  0.01641218, -0.01223699,  0.00625934,  0.0068442 ,
       -0.00454213,  0.01949731,  0.002113  ,  0.00531257, -0.0149791 ,
        0.00276029,  0.02538211, -0.00658982,  0.01531404, -0.01573758,
       -0.02086379,  0.01706518,  0.01133318, -0.00160001, -0.01392083,
        0.00370123,  0.00080782,  0.00645986, -0.00170984,  0.00422715,
        0.00382178,  0.00219005, -0.00318533, -0.00171774,  0.00