In [1]:
import pandas as pd

url = 'https://raw.githubusercontent.com/dayangalyssa/InfoKilat/main/data/preprocessed_sentences.csv'
df = pd.read_csv(url)

df.head(50)

Unnamed: 0,original_sentence,processed_sentence,document_id
0,"TEMPO.CO, Jakarta - Anggota Komisi VII DPR RI ...",tempo co jakarta - anggota komisi vii dpr ri ...,0
1,"""Turut berbela sungkawa terhadap masyarakat si...",bela sungkawa masyarakat sipil dampak daerah...,0
2,"Pertamina harus tanggung jawab,"" kata dia dala...",pertamina tanggung terang jakarta sabtu 4 m...,0
3,TBBM Plumpang dinilai sebagai salah satu termi...,tbbm plumpang nilai salah terminal bbm penting...,0
4,"""Plumpang menyuplai sekitar 20 persen kebutuha...",plumpang suplai 20 persen butuh bbm hari 25 ...,0
5,Maka tindakan selanjutnya adalah bagaimana mem...,tindak suplai bbm ganggu kata rofik singgung...,0
6,"Terhitung 2 tahun terakhir ini, kata dia, suda...",hitung 2 5 fasilitas migas milik pertamina a...,0
7,"""Menyayangkan terjadinya musibah kebakaran ini...",sayang musibah bakar insiden bakar kali fasi...,0
8,Pertamina harus benahi sistem pengamanan dan S...,pertamina benah sistem aman sop dia rofik ha...,0
9,Menurutnya pembenahan diperlukan agar Pertamin...,turut benah pertamina abai aspek aman fasilita...,0


In [2]:
# Cek jumlah dokumen unik
num_docs = df['document_id'].nunique()
print(f"Jumlah document_id unik: {num_docs}")

Jumlah document_id unik: 50


In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Ambil hanya document_id = 0
doc_df = df[df['document_id'] == 0].reset_index(drop=True)

# Ekstrak teks per kalimat
kalimat_list = doc_df['processed_sentence'].tolist()

# Inisialisasi CountVectorizer
vectorizer = CountVectorizer()

# Hitung term frequency per kalimat
X = vectorizer.fit_transform(kalimat_list)

# Ubah jadi DataFrame
tf_df = pd.DataFrame(X.toarray().T,  # transpose agar rows = terms, columns = kalimat
                     index=vectorizer.get_feature_names_out(),
                     columns=[f"Kalimat {i+1}" for i in range(len(kalimat_list))])

# Tambahkan kolom tf total
tf_df["tf"] = tf_df.sum(axis=1)

# Urutkan berdasarkan tf tertinggi (opsional)
tf_df = tf_df.sort_values("tf", ascending=False)

# Tampilkan hasil
tf_df.head()


Unnamed: 0,Kalimat 1,Kalimat 2,Kalimat 3,Kalimat 4,Kalimat 5,Kalimat 6,Kalimat 7,Kalimat 8,Kalimat 9,Kalimat 10,Kalimat 11,tf
pertamina,0,0,2,1,1,1,1,0,2,1,3,12
bakar,1,0,2,1,0,0,1,2,0,0,1,8
aman,0,0,1,1,0,0,0,0,3,1,2,8
bbm,0,0,1,1,1,1,0,0,0,0,2,6
plumpang,1,0,1,1,1,0,0,0,0,0,1,5


In [4]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

tf_dict = {}

for doc_id in range(51):
    doc_df = df[df['document_id'] == doc_id].reset_index(drop=True)

    if doc_df.empty:
        continue

    # Drop NaN di kolom processed_sentence dan pastikan semuanya string
    kalimat_list = doc_df['processed_sentence'].dropna().astype(str).tolist()

    # Skip jika tidak ada kalimat valid
    if len(kalimat_list) == 0:
        continue

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(kalimat_list)

    tf_df = pd.DataFrame(X.toarray().T,
                         index=vectorizer.get_feature_names_out(),
                         columns=[f"Kalimat {i+1}" for i in range(len(kalimat_list))])

    tf_df["tf"] = tf_df.sum(axis=1)
    tf_df = tf_df.sort_values("tf", ascending=False)

    tf_dict[doc_id] = tf_df


In [5]:
display(tf_dict[0])

Unnamed: 0,Kalimat 1,Kalimat 2,Kalimat 3,Kalimat 4,Kalimat 5,Kalimat 6,Kalimat 7,Kalimat 8,Kalimat 9,Kalimat 10,Kalimat 11,tf
pertamina,0,0,2,1,1,1,1,0,2,1,3,12
bakar,1,0,2,1,0,0,1,2,0,0,1,8
aman,0,0,1,1,0,0,0,0,3,1,2,8
bbm,0,0,1,1,1,1,0,0,0,0,2,6
plumpang,1,0,1,1,1,0,0,0,0,0,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...
ujar,0,0,0,0,0,0,0,0,0,0,1,1
ulang,0,0,0,0,0,0,0,1,0,0,0,1
utara,1,0,0,0,0,0,0,0,0,0,0,1
vii,1,0,0,0,0,0,0,0,0,0,0,1


In [6]:
import numpy as np
import warnings

# Nonaktifkan warning untuk applymap
warnings.filterwarnings("ignore", category=FutureWarning)

# Dictionary baru untuk menyimpan hasil weight_tf
weight_tf_dict = {}

for doc_id, tf_df in tf_dict.items():
    # Salin tf_df tanpa kolom "tf"
    tf_only = tf_df.drop(columns=["tf"]).copy()

    # Hitung weight_tf: 1 + log10(tf), jika tf > 0
    weight_tf = tf_only.applymap(lambda x: 1 + np.log10(x) if x > 0 else 0)

    # Tambahkan baris ws (jumlah bobot per kalimat)
    weight_tf.loc["Ws"] = weight_tf.sum(axis=0)

    # (Opsional) Tambahkan kolom W_tf (jumlah bobot per term)
    weight_tf["W_tf"] = weight_tf.sum(axis=1)

    # Simpan ke dictionary
    weight_tf_dict[doc_id] = weight_tf

# Contoh tampilkan hasil untuk document_id = 4
display(weight_tf_dict[1].round(4))


Unnamed: 0,Kalimat 1,Kalimat 2,Kalimat 3,Kalimat 4,Kalimat 5,Kalimat 6,Kalimat 7,Kalimat 8,Kalimat 9,Kalimat 10,Kalimat 11,Kalimat 12,Kalimat 13,W_tf
bakar,1.0000,1.0,0.0,0.0000,1.000,1.4771,1.0,0.0,1.0000,0.000,1.0,0.0,0.000,7.4771
jakarta,1.3010,0.0,0.0,1.0000,1.000,1.0000,0.0,0.0,1.0000,0.000,1.0,0.0,1.000,7.3010
presiden,1.3010,0.0,1.0,1.3010,1.000,0.0000,0.0,0.0,0.0000,0.000,0.0,0.0,0.000,4.6021
warga,0.0000,1.0,0.0,0.0000,1.000,1.0000,1.0,0.0,0.0000,1.000,1.0,0.0,0.000,6.0000
plumpang,1.0000,0.0,1.0,0.0000,1.000,0.0000,0.0,0.0,1.0000,0.000,0.0,0.0,1.000,5.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
wakil,1.0000,0.0,0.0,0.0000,0.000,0.0000,0.0,0.0,0.0000,0.000,0.0,0.0,0.000,1.0000
wapres,0.0000,0.0,0.0,1.0000,0.000,0.0000,0.0,0.0,0.0000,0.000,0.0,0.0,0.000,1.0000
wib,0.0000,0.0,0.0,0.0000,1.000,0.0000,0.0,0.0,0.0000,0.000,0.0,0.0,0.000,1.0000
widodo,1.0000,0.0,0.0,0.0000,0.000,0.0000,0.0,0.0,0.0000,0.000,0.0,0.0,0.000,1.0000


In [7]:
# Dictionary untuk menyimpan ringkasan 3 kalimat asli teratas per dokumen
top3_summary_dict = {}

for doc_id in range(51):
    if doc_id not in weight_tf_dict:
        continue

    weight_tf = weight_tf_dict[doc_id]

    # Ambil baris Ws dan urutkan dari yang terbesar
    ws_series = weight_tf.loc["Ws"].drop("W_tf").sort_values(ascending=False)

    # Ambil 3 kalimat teratas berdasarkan nilai Ws
    top_kalimat_cols = ws_series.head(3).index.tolist()

    # Urutkan kalimat-kalimat ini berdasarkan indeks kalimat (misal 'Kalimat 2' → 2)
    sorted_kalimat_cols = sorted(top_kalimat_cols, key=lambda x: int(x.split()[-1]))

    # Ambil dataframe asli untuk document_id ini
    doc_df = df[df['document_id'] == doc_id].reset_index(drop=True)

    # Ambil kalimat asli berdasarkan indeks kalimat
    selected_original_sentences = []
    for kal_col in sorted_kalimat_cols:
        kal_index = int(kal_col.split()[-1]) - 1  # 'Kalimat 2' -> index 1
        if kal_index < len(doc_df):
            selected_original_sentences.append(doc_df.loc[kal_index, 'original_sentence'])

    # Simpan hasil ringkasan
    top3_summary_dict[doc_id] = selected_original_sentences


In [8]:
# Contoh output: ringkasan untuk document_id = 0
print("Ringkasan 3 kalimat teratas untuk document_id 0:\n")
for i, kalimat in enumerate(top3_summary_dict[1], 1):
    print(f"{i}. {kalimat}")

Ringkasan 3 kalimat teratas untuk document_id 0:

1. Tapi, Presiden sudah berkoordinasi dengan Wapres yang akan meninjau hari ini," ujar Deputi Bidang Protokol, Pers, dan Media Sekretariat Presiden Bey Machmudin saat dihubungi, Sabtu, 4 Februari 2023.Selain memerintahkan Ma'ruf Amin, Bey menyebut Jokowi juga telah memberikan arahan kepada Kapolri Jenderal Listyo Sigit Prabowo, Menteri BUMN Erick Thohir, dan Penjabat Gubernur DKI Jakarta Heru Budi Hartono soal kunjungan ke lokasi.
2. "Intinya Presiden minta untuk mengutamakan evakuasi korban dan penanganan warga terdampak," kata Bey.Kronologi KebakaranDepo Pertamina di Plumpang, Jakarta Utara, terbakar pada Jumat malam, 3 Maret 2023, pukul 20.20 WIB.
3. "Penyebab kejadian masih dalam proses investigasi," kata Area Manager Communication, Relation & CSR Pertamina Patra Niaga Regional Jawa Bagian Barat Eko Kristiawan.Eko juga menambahkan pihaknya saat ini fokus pada penanganan kebakaran pipa penerimaan BBM di Integrated Terminal BBM Jakart

#Implementasi Model

In [9]:
!pip install transformers sentencepiece --quiet

In [10]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load tokenizer dan model
tokenizer = AutoTokenizer.from_pretrained("cahya/bert2bert-indonesian-summarization")
model = AutoModelForSeq2SeqLM.from_pretrained("cahya/bert2bert-indonesian-summarization")

# Fungsi untuk merangkum teks
def summarize(text):
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Contoh penggunaan
text = "Masukkan teks panjang dalam bahasa Indonesia di sini untuk dirangkum."
print(summarize(text))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.09k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/230k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/999M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/999M [00:00<?, ?B/s]

masukkan teks panjang dalam bahasa indonesia di bahasa indonesia, rendarizetta merilis buku terbaru dalam bahasa inggris. dalam buku ini, summarize mengangkat isu tanah air di tanah air yang sedang berbenah.


In [11]:
def ringkas_teks_indo(text, max_input_length=512, max_output_length=100):
    # Tambah prefix khusus untuk tugas summarization
    input_text = "ringkasan: " + text
    inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=max_input_length, truncation=True)

    summary_ids = model.generate(inputs, max_length=max_output_length, min_length=20, length_penalty=2.0, num_beams=4, early_stopping=True)

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [12]:
import re

# Bersihkan teks
def bersihkan(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'<.*?>', '', text)
    return text.strip()

# Simpan hasil ringkasan
indot5_summary_dict = {}

for doc_id in df['document_id'].unique():
    doc_text = ' '.join(df[df['document_id'] == doc_id]['original_sentence'].dropna().astype(str).tolist())
    doc_text = bersihkan(doc_text)

    if len(doc_text) < 50:
        continue

    try:
        summary = ringkas_teks_indo(doc_text[:1024])  # batas aman
        indot5_summary_dict[doc_id] = summary
    except Exception as e:
        indot5_summary_dict[doc_id] = "(Ringkasan gagal)"

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [13]:
indot5_summary_df = pd.DataFrame.from_dict(indot5_summary_dict, orient='index', columns=['indot5_summary'])
indot5_summary_df.reset_index(inplace=True)
indot5_summary_df.rename(columns={'index': 'document_id'}, inplace=True)

display(indot5_summary_df.head(100))


Unnamed: 0,document_id,indot5_summary
0,0,anggota komisi vii dpr ri rofik hananto menyay...
1,1,"kebakaran depo pertamina di plumpang, jakarta ..."
2,2,delegasi paguyuban first travel indonesia mene...
3,3,tim kedokteran dan kesehatan ( dokkes ) polri ...
4,4,ketua mpr ri sekaligus ketua umum ikatan motor...
5,5,wakil presiden ma ruf meninjau lokasi kebakara...
6,6,menteri keuangan ri sri mulyani indrawati berk...
7,7,keputusan pengadilan negeri jakarta pusat meng...
8,8,markas besar kepolisian republik indonesia tur...
9,9,"kawasan penyangga depo pertamina plumpang, jak..."


In [14]:
indot5_summary_df.to_csv('indot5_summary.csv', index=False)
full_df = pd.read_csv('indot5_summary.csv')
print(full_df)


    document_id                                     indot5_summary
0             0  anggota komisi vii dpr ri rofik hananto menyay...
1             1  kebakaran depo pertamina di plumpang, jakarta ...
2             2  delegasi paguyuban first travel indonesia mene...
3             3  tim kedokteran dan kesehatan ( dokkes ) polri ...
4             4  ketua mpr ri sekaligus ketua umum ikatan motor...
5             5  wakil presiden ma ruf meninjau lokasi kebakara...
6             6  menteri keuangan ri sri mulyani indrawati berk...
7             7  keputusan pengadilan negeri jakarta pusat meng...
8             8  markas besar kepolisian republik indonesia tur...
9             9  kawasan penyangga depo pertamina plumpang, jak...
10           10  presiden joko widodo memerintahkan menteri bum...
11           11  komisi yudisial diminta menggali motif hakim p...
12           12  ketua umum partai nasional demokrat surya palo...
13           13  ketua mpr ri yandri susanto optimis kpu dapat

## Evaluasi

In [15]:
!pip install -q scikit-learn pandas

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
manual_url = "https://docs.google.com/spreadsheets/d/1zcNv1AcX6HwV6Bsj_si15dez7PAKJWi6Up9z9Yy9XeM/export?format=csv"
manual_df = pd.read_csv(manual_url)
manual_df.columns

Index(['document_id', 'manual_summary'], dtype='object')

In [17]:
# Pastikan document_id bertipe string
manual_df['document_id'] = manual_df['document_id'].astype(str)
full_df['document_id'] = full_df['document_id'].astype(str)

# Merge berdasarkan document_id
merged_df = pd.merge(full_df, manual_df[['document_id', 'manual_summary']], on='document_id', how='inner')

In [18]:
def get_similarity(row):
    texts = [str(row['indot5_summary']), str(row['manual_summary'])]
    tfidf = TfidfVectorizer().fit_transform(texts)
    return cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]

merged_df['cosine_similarity'] = merged_df.apply(get_similarity, axis=1)

In [19]:
merged_df[['document_id', 'indot5_summary', 'manual_summary', 'cosine_similarity']].head(50)

Unnamed: 0,document_id,indot5_summary,manual_summary,cosine_similarity
0,0,anggota komisi vii dpr ri rofik hananto menyay...,Anggota Komisi VII DPR RI Rofik Hananto menyay...,0.712392
1,1,"kebakaran depo pertamina di plumpang, jakarta ...",Presiden Joko Widodo atau Jokowi memerintahkan...,0.233199
2,2,delegasi paguyuban first travel indonesia mene...,Wakil Ketua MPR RI Dr. H. M. Hidayat Nur Wahid...,0.207144
3,3,tim kedokteran dan kesehatan ( dokkes ) polri ...,Tim Kedokteran dan Kesehatan (Dokkes) Polri te...,1.0
4,4,ketua mpr ri sekaligus ketua umum ikatan motor...,Ketua MPR RI sekaligus Ketua Umum Ikatan Motor...,1.0
5,5,wakil presiden ma ruf meninjau lokasi kebakara...,"Pada 3 Maret 2023, kebakaran hebat melanda Dep...",0.405408
6,6,menteri keuangan ri sri mulyani indrawati berk...,Memperingati 14 tahun PT Sarana Multi Infrastr...,0.310782
7,7,keputusan pengadilan negeri jakarta pusat meng...,Pengadilan Negeri atau PN Jakarta Pusat memutu...,0.15464
8,8,markas besar kepolisian republik indonesia tur...,Markas Besar Kepolisian Republik Indonesia tur...,0.593011
9,9,"kawasan penyangga depo pertamina plumpang, jak...",Buffer zone atau kawasan penyangga yang memisa...,0.516214


In [20]:
average_cosine = merged_df['cosine_similarity'].mean()
average_cosine = average_cosine*100
print(f"Average Cosine Similarity: {average_cosine:.0f}%")


Average Cosine Similarity: 54%
