In [2]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=6a271194fc661ad4f75275e8d7944232ac281c0513b23db2d0ef2d29f0e00f4a
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [3]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from seqeval.metrics import classification_report
import numpy as np

In [4]:
url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vQy1ex2WmsP990kpSkyvem5B0Ai8oWJqWZpg73yfMMTNx84fximvMBb715Gnl8rPBgYNlIqwZcvRSyr/pub?output=csv"
df = pd.read_csv(url)

In [5]:
value_counts = df["entity"].value_counts()
print(value_counts)

entity
O            141399
LOC           14426
PER           10232
EVIDENCE       8852
NOR            8438
DAT            7039
CRIMETYPE      5804
LAW            1940
Evidence         33
ORG              26
PRO              11
AGE               7
2                 2
1                 2
5                 2
CRIME             1
REL               1
Name: count, dtype: int64


In [6]:
entities_to_drop = ['PRO', 'AGE', 'REL', 'CRIME', '2', '1', '5']
df = df[~df['entity'].isin(entities_to_drop)]

replace_map = {
    'ORG': 'NOR',
    'Evidence': 'EVIDENCE',
}
df['entity'] = df['entity'].replace(replace_map)

In [7]:
value_counts = df["entity"].value_counts()
print(value_counts)

entity
O            141399
LOC           14426
PER           10232
EVIDENCE       8885
NOR            8464
DAT            7039
CRIMETYPE      5804
LAW            1940
Name: count, dtype: int64


In [8]:
bio_labels = []
prev_id, prev_entity = None, None

for i, row in df.iterrows():
    curr_id = row["id"]
    curr_entity = row["entity"]

    if curr_entity == "O":
        bio = "O"
    else:
        if curr_id != prev_id or curr_entity != prev_entity:
            bio = f"B-{curr_entity}"
        else:
            bio = f"I-{curr_entity}"

    bio_labels.append(bio)
    prev_id, prev_entity = curr_id, curr_entity

df["bio_label"] = bio_labels
print(df.head(20))

       id        token     entity    bio_label
0   D0001       Pelaku          O            O
1   D0001   pembunuhan  CRIMETYPE  B-CRIMETYPE
2   D0001          dan          O            O
3   D0001  pemerkosaan  CRIMETYPE  B-CRIMETYPE
4   D0001           AA        PER        B-PER
5   D0001           14        DAT        B-DAT
6   D0001      seorang          O            O
7   D0001       remaja          O            O
8   D0001        putri          O            O
9   D0001           di          O            O
10  D0001      Kuburan        LOC        B-LOC
11  D0001         Cina        LOC        I-LOC
12  D0001    Palembang        LOC        I-LOC
13  D0001        sudah          O            O
14  D0001    ditangkap          O            O
15  D0001          dan          O            O
16  D0001   ditetapkan          O            O
17  D0001    tersangka          O            O
18  D0001      Keempat          O            O
19  D0001    tersangka          O            O


In [9]:
value_counts = df["bio_label"].value_counts()
print(value_counts)

bio_label
O              141399
I-LOC            8191
B-PER            7264
B-LOC            6235
B-NOR            5121
B-CRIMETYPE      4967
B-EVIDENCE       4882
I-EVIDENCE       4003
I-DAT            3895
I-NOR            3343
B-DAT            3144
I-PER            2968
I-LAW            1342
I-CRIMETYPE       837
B-LAW             598
B-nan              10
Name: count, dtype: int64


In [10]:
nan_rows = df[df['entity'].isnull()]
if not nan_rows.empty:
    print("Penyebab 'B-nan': Ditemukan baris dengan entitas kosong (NaN).")
    print(nan_rows)
    df.dropna(subset=['entity'], inplace=True)
    print("\nInfo: Baris dengan entitas kosong telah dihapus.\n")

Penyebab 'B-nan': Ditemukan baris dengan entitas kosong (NaN).
             id token entity bio_label
5918        NaN   NaN    NaN     B-nan
6336      D0023   NaN    NaN     B-nan
6344      D0023   NaN    NaN     B-nan
29100    akibat     O    NaN     B-nan
29101     debat     O    NaN     B-nan
29102     mulut     O    NaN     B-nan
29103    antara     O    NaN     B-nan
29104      Dedi   PER    NaN     B-nan
29108   membuat     O    NaN     B-nan
170589    D0531   dan    NaN     B-nan

Info: Baris dengan entitas kosong telah dihapus.



In [11]:
value_counts = df["bio_label"].value_counts()
print(value_counts)

bio_label
O              141399
I-LOC            8191
B-PER            7264
B-LOC            6235
B-NOR            5121
B-CRIMETYPE      4967
B-EVIDENCE       4882
I-EVIDENCE       4003
I-DAT            3895
I-NOR            3343
B-DAT            3144
I-PER            2968
I-LAW            1342
I-CRIMETYPE       837
B-LAW             598
Name: count, dtype: int64


In [12]:
grouped = df.groupby("id")
sentences = grouped["token"].apply(list).tolist()
labels = grouped["bio_label"].apply(list).tolist()

In [13]:
sentences = []
labels = []
current_tokens = []
current_labels = []

for token, label in zip(df['token'], df['bio_label']):
    current_tokens.append(token)
    current_labels.append(label)

    if len(current_tokens) >= 10:
        sentences.append(current_tokens)
        labels.append(current_labels)
        current_tokens = []
        current_labels = []

if current_tokens:
    sentences.append(current_tokens)
    labels.append(current_labels)

In [14]:
label_list = sorted(set(label for seq in labels for label in seq))
label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for l, i in label2id.items()}
encoded_labels = [[label2id[tag] for tag in seq] for seq in labels]

In [15]:
label_list = sorted(set(label for seq in labels for label in seq))
label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = {idx: label for label, idx in label2id.items()}

encoded_labels = [[label2id[tag] for tag in seq] for seq in labels]

In [16]:
tokenizer = AutoTokenizer.from_pretrained("indolem/indobertweet-base-uncased")

def tokenize_and_align_labels(batch):
    tokenized_inputs = tokenizer(
        batch["tokens"],
        truncation=True,
        max_length=128,
        is_split_into_words=True,
        padding=True,  # Tambahkan padding
        return_tensors=None
    )

    labels_batch = batch["ner_tags"]
    aligned_labels_batch = []

    for i, labels in enumerate(labels_batch):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned_labels = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                aligned_labels.append(-100)
            elif word_idx != previous_word_idx:
                aligned_labels.append(labels[word_idx])
            else:
                aligned_labels.append(
                    labels[word_idx] if label_list[labels[word_idx]].startswith("I-") else -100
                )
            previous_word_idx = word_idx

        aligned_labels += [-100] * (len(tokenized_inputs["input_ids"][i]) - len(aligned_labels))
        aligned_labels_batch.append(aligned_labels)

    tokenized_inputs["labels"] = aligned_labels_batch
    return tokenized_inputs

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [17]:
clean_sentences = []
clean_labels = []

for toks, lbls in zip(sentences, encoded_labels):
    if toks and lbls and len(toks) == len(lbls):
        if all(isinstance(t, str) and isinstance(l, int) for t, l in zip(toks, lbls)):
            clean_sentences.append(toks)
            clean_labels.append(lbls)

from datasets import Dataset
dataset = Dataset.from_dict({
    "tokens": clean_sentences,
    "ner_tags": clean_labels
})

dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [18]:
model = AutoModelForTokenClassification.from_pretrained(
    "indolem/indobertweet-base-uncased",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

In [27]:
def compute_metrics(p, threshold=0.6):
    preds, labels = p
    probs = np.max(preds, axis=2)
    pred_ids = np.argmax(preds, axis=2)

    true_preds = []
    true_labels = []

    for pred_seq, label_seq, prob_seq in zip(pred_ids, labels, probs):
        pred_labels = []
        label_labels = []
        for p, l, prob in zip(pred_seq, label_seq, prob_seq):
            if l != -100:
                label_str = id2label[l]
                pred_str = id2label[p] if prob >= threshold else "O"
                pred_labels.append(pred_str)
                label_labels.append(label_str)
        true_preds.append(pred_labels)
        true_labels.append(label_labels)

    report = classification_report(true_labels, true_preds, output_dict=True)

    results = {}
    for entity, metrics in report.items():
        if isinstance(metrics, dict):
            cleaned_entity_name = entity.replace(' ', '_')
            for metric_name, value in metrics.items():
                results[f"{cleaned_entity_name}_{metric_name}"] = value

    return results


In [29]:
args = TrainingArguments(
    output_dir="./ner-crime-newest",
    run_name="ner-crime-newest",
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    do_eval=True,
    save_steps=500,
    eval_steps=500,
    logging_steps=100,
    report_to="none"  
)

In [30]:
tokenized_train = dataset["train"].map(tokenize_and_align_labels, batched=True)
tokenized_eval = dataset["test"].map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/15852 [00:00<?, ? examples/s]

Map:   0%|          | 0/3964 [00:00<?, ? examples/s]

In [31]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [32]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [33]:
trainer.train()



Step,Training Loss
100,0.1639
200,0.1353
300,0.1099
400,0.0998
500,0.1037
600,0.0853
700,0.1004




TrainOutput(global_step=744, training_loss=0.11405374285995319, metrics={'train_runtime': 281.1493, 'train_samples_per_second': 169.149, 'train_steps_per_second': 2.646, 'total_flos': 1062726937466040.0, 'train_loss': 0.11405374285995319, 'epoch': 3.0})

In [34]:
predictions, labels, _ = trainer.predict(tokenized_eval)
metrics = compute_metrics((predictions, labels), threshold=0.6)
print(metrics)



{'CRIMETYPE_precision': 0.7655417406749556, 'CRIMETYPE_recall': 0.8336557059961315, 'CRIMETYPE_f1-score': 0.7981481481481482, 'CRIMETYPE_support': 1034, 'DAT_precision': 0.7399103139013453, 'DAT_recall': 0.7432432432432432, 'DAT_f1-score': 0.7415730337078653, 'DAT_support': 666, 'EVIDENCE_precision': 0.6530958439355385, 'EVIDENCE_recall': 0.7142857142857143, 'EVIDENCE_f1-score': 0.6823216659282233, 'EVIDENCE_support': 1078, 'LAW_precision': 0.5727272727272728, 'LAW_recall': 0.45323741007194246, 'LAW_f1-score': 0.5060240963855421, 'LAW_support': 139, 'LOC_precision': 0.7114624505928854, 'LOC_recall': 0.7714285714285715, 'LOC_f1-score': 0.7402330363262509, 'LOC_support': 1400, 'NOR_precision': 0.661498708010336, 'NOR_recall': 0.7045871559633028, 'NOR_f1-score': 0.6823633940470902, 'NOR_support': 1090, 'PER_precision': 0.8824675324675325, 'PER_recall': 0.8790426908150065, 'PER_f1-score': 0.880751782242385, 'PER_support': 1546, 'micro_avg_precision': 0.7390113651923866, 'micro_avg_recall':

In [35]:
model.save_pretrained("./ner_crime_model")
tokenizer.save_pretrained("./ner_crime_model")

('./ner_crime_model/tokenizer_config.json',
 './ner_crime_model/special_tokens_map.json',
 './ner_crime_model/vocab.txt',
 './ner_crime_model/added_tokens.json',
 './ner_crime_model/tokenizer.json')

In [36]:
import shutil
shutil.make_archive("ner_crime_model", 'zip', "./ner_crime_model")

'/kaggle/working/ner_crime_model.zip'

In [37]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [38]:
from huggingface_hub import create_repo, upload_folder

repo_name = "ner-crime-newest"  
username = "fairuuz"  
full_repo_name = f"{username}/{repo_name}"

create_repo(full_repo_name, private=False, exist_ok=True)

upload_folder(
    repo_id=full_repo_name,
    folder_path="ner_crime_model",
    path_in_repo=".",  
    commit_message="feat: upload newest ner crime model"
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/fairuuz/ner-crime-newest/commit/88b4cb933b8742a5189d2270c0d428a10a445806', commit_message='feat: upload newest ner crime model', commit_description='', oid='88b4cb933b8742a5189d2270c0d428a10a445806', pr_url=None, repo_url=RepoUrl('https://huggingface.co/fairuuz/ner-crime-newest', endpoint='https://huggingface.co', repo_type='model', repo_id='fairuuz/ner-crime-newest'), pr_revision=None, pr_num=None)