In [None]:
import pandas as pd

# Raw GitHub location of the pre-processed sentence corpus
url = 'https://raw.githubusercontent.com/dayangalyssa/InfoKilat/main/data/preprocessed_sentences.csv'

# Load the CSV straight from the URL into the working DataFrame
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first ten rows
df.head(n=10)

Unnamed: 0,original_sentence,processed_sentence,document_id
0,"TEMPO.CO, Jakarta - Anggota Komisi VII DPR RI ...",tempo co jakarta - anggota komisi vii dpr ri ...,0
1,"""Turut berbela sungkawa terhadap masyarakat si...",bela sungkawa masyarakat sipil dampak daerah...,0
2,"Pertamina harus tanggung jawab,"" kata dia dala...",pertamina tanggung terang jakarta sabtu 4 m...,0
3,TBBM Plumpang dinilai sebagai salah satu termi...,tbbm plumpang nilai salah terminal bbm penting...,0
4,"""Plumpang menyuplai sekitar 20 persen kebutuha...",plumpang suplai 20 persen butuh bbm hari 25 ...,0
5,Maka tindakan selanjutnya adalah bagaimana mem...,tindak suplai bbm ganggu kata rofik singgung...,0
6,"Terhitung 2 tahun terakhir ini, kata dia, suda...",hitung 2 5 fasilitas migas milik pertamina a...,0
7,"""Menyayangkan terjadinya musibah kebakaran ini...",sayang musibah bakar insiden bakar kali fasi...,0
8,Pertamina harus benahi sistem pengamanan dan S...,pertamina benah sistem aman sop dia rofik ha...,0
9,Menurutnya pembenahan diperlukan agar Pertamin...,turut benah pertamina abai aspek aman fasilita...,0


In [None]:
# Count how many distinct document ids the corpus contains
# (missing ids are not counted, which is nunique's default behaviour)
num_docs = df['document_id'].nunique(dropna=True)
print(f"Jumlah document_id unik: {num_docs}")

Jumlah document_id unik: 50


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Build a term-frequency table for a single document (document_id = 0):
# rows are vocabulary terms, columns are the document's sentences.
doc_df = df[df['document_id'] == 0].reset_index(drop=True)

# Sentences whose processed text is missing are dropped and the rest are cast
# to str, because CountVectorizer rejects NaN / non-string documents.
kalimat_list = doc_df['processed_sentence'].dropna().astype(str).tolist()

# Count raw term occurrences per sentence (sparse matrix: sentences x terms)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(kalimat_list)

# Transpose so that rows = terms and columns = sentences ("Kalimat 1..N")
tf_df = pd.DataFrame(X.toarray().T,
                     index=vectorizer.get_feature_names_out(),
                     columns=[f"Kalimat {i+1}" for i in range(len(kalimat_list))])

# Total frequency of each term across all sentences of the document
tf_df["tf"] = tf_df.sum(axis=1)

# Most frequent terms first
tf_df = tf_df.sort_values("tf", ascending=False)

# Show the top rows
tf_df.head()


Unnamed: 0,Kalimat 1,Kalimat 2,Kalimat 3,Kalimat 4,Kalimat 5,Kalimat 6,Kalimat 7,Kalimat 8,Kalimat 9,Kalimat 10,Kalimat 11,tf
pertamina,0,0,2,1,1,1,1,0,2,1,3,12
bakar,1,0,2,1,0,0,1,2,0,0,1,8
aman,0,0,1,1,0,0,0,0,3,1,2,8
bbm,0,0,1,1,1,1,0,0,0,0,2,6
plumpang,1,0,1,1,1,0,0,0,0,0,1,5


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Term-frequency table for every document, keyed by document_id.
# Each table has one row per term, one "Kalimat N" column per usable sentence
# and a final "tf" column holding the term's total count in the document.
tf_dict = {}

# Iterate over the ids actually present in the data (ascending, as groupby
# sorts its keys) instead of a hard-coded range, so documents whose id falls
# outside a fixed bound are not silently skipped.
for doc_id, doc_df in df.groupby('document_id'):
    doc_df = doc_df.reset_index(drop=True)

    # Drop NaN in processed_sentence and make sure every entry is a string.
    # NOTE: "Kalimat N" therefore numbers the non-NaN sentences only.
    kalimat_list = doc_df['processed_sentence'].dropna().astype(str).tolist()

    # Skip documents that have no valid sentence at all
    if len(kalimat_list) == 0:
        continue

    # Count raw term occurrences per sentence (sparse matrix: sentences x terms)
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(kalimat_list)

    # Transpose so that rows = terms and columns = sentences
    tf_df = pd.DataFrame(X.toarray().T,
                         index=vectorizer.get_feature_names_out(),
                         columns=[f"Kalimat {i+1}" for i in range(len(kalimat_list))])

    # Total frequency per term, most frequent terms first
    tf_df["tf"] = tf_df.sum(axis=1)
    tf_df = tf_df.sort_values("tf", ascending=False)

    tf_dict[doc_id] = tf_df


In [None]:
# Show the term-frequency table stored for document_id 0
display(tf_dict[0])

Unnamed: 0,Kalimat 1,Kalimat 2,Kalimat 3,Kalimat 4,Kalimat 5,Kalimat 6,Kalimat 7,Kalimat 8,Kalimat 9,Kalimat 10,Kalimat 11,tf
pertamina,0,0,2,1,1,1,1,0,2,1,3,12
bakar,1,0,2,1,0,0,1,2,0,0,1,8
aman,0,0,1,1,0,0,0,0,3,1,2,8
bbm,0,0,1,1,1,1,0,0,0,0,2,6
plumpang,1,0,1,1,1,0,0,0,0,0,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...
ujar,0,0,0,0,0,0,0,0,0,0,1,1
ulang,0,0,0,0,0,0,0,1,0,0,0,1
utara,1,0,0,0,0,0,0,0,0,0,0,1
vii,1,0,0,0,0,0,0,0,0,0,0,1


In [None]:
import numpy as np
import warnings

# Log-scaled term weights for every document, keyed by document_id.
weight_tf_dict = {}

for doc_id, tf_df in tf_dict.items():
    # Keep only the per-sentence counts; the aggregate "tf" column is excluded
    tf_only = tf_df.drop(columns=["tf"])

    # weight = 1 + log10(tf) where tf > 0, otherwise 0.
    # Zero counts are masked to NaN first so log10 never receives 0, and the
    # masked cells are then filled back with 0. This is fully vectorised, so
    # no element-wise applymap call (deprecated in pandas 2.1) is needed and
    # no warning filter has to be installed.
    weight_tf = (1 + np.log10(tf_only.where(tf_only > 0))).fillna(0.0)

    # Add the "Ws" row: total weight of each sentence
    weight_tf.loc["Ws"] = weight_tf.sum(axis=0)

    # (Optional) add the "W_tf" column: total weight of each term
    # (for the "Ws" row this is the sum of all sentence totals)
    weight_tf["W_tf"] = weight_tf.sum(axis=1)

    # Store the result for this document
    weight_tf_dict[doc_id] = weight_tf

# Example: show the weights for document_id = 1
display(weight_tf_dict[1].round(4))


Unnamed: 0,Kalimat 1,Kalimat 2,Kalimat 3,Kalimat 4,Kalimat 5,Kalimat 6,Kalimat 7,Kalimat 8,Kalimat 9,Kalimat 10,Kalimat 11,Kalimat 12,Kalimat 13,W_tf
bakar,1.0000,1.0,0.0,0.0000,1.000,1.4771,1.0,0.0,1.0000,0.000,1.0,0.0,0.000,7.4771
jakarta,1.3010,0.0,0.0,1.0000,1.000,1.0000,0.0,0.0,1.0000,0.000,1.0,0.0,1.000,7.3010
presiden,1.3010,0.0,1.0,1.3010,1.000,0.0000,0.0,0.0,0.0000,0.000,0.0,0.0,0.000,4.6021
warga,0.0000,1.0,0.0,0.0000,1.000,1.0000,1.0,0.0,0.0000,1.000,1.0,0.0,0.000,6.0000
plumpang,1.0000,0.0,1.0,0.0000,1.000,0.0000,0.0,0.0,1.0000,0.000,0.0,0.0,1.000,5.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
wakil,1.0000,0.0,0.0,0.0000,0.000,0.0000,0.0,0.0,0.0000,0.000,0.0,0.0,0.000,1.0000
wapres,0.0000,0.0,0.0,1.0000,0.000,0.0000,0.0,0.0,0.0000,0.000,0.0,0.0,0.000,1.0000
wib,0.0000,0.0,0.0,0.0000,1.000,0.0000,0.0,0.0,0.0000,0.000,0.0,0.0,0.000,1.0000
widodo,1.0000,0.0,0.0,0.0000,0.000,0.0000,0.0,0.0,0.0000,0.000,0.0,0.0,0.000,1.0000


In [None]:
# Extractive summary per document: the three highest-scoring original
# sentences, listed in the order they appear in the document.
top3_summary_dict = {}

# Walk the documents that actually have weights rather than a fixed id range
for doc_id, weight_tf in weight_tf_dict.items():
    # Sentence scores: the "Ws" row without the "W_tf" total, highest first
    ws_series = weight_tf.loc["Ws"].drop("W_tf").sort_values(ascending=False)

    # Column labels of the three best-scoring sentences
    top_kalimat_cols = ws_series.head(3).index.tolist()

    # 'Kalimat 2' -> zero-based position 1; sorted to restore document order
    top_positions = sorted(int(col.split()[-1]) - 1 for col in top_kalimat_cols)

    # The "Kalimat N" labels number only the sentences whose
    # processed_sentence is not NaN (they were dropped before vectorising),
    # so the same rows must be dropped here. Otherwise every sentence after a
    # NaN row would map to the wrong original sentence.
    doc_df = df[(df['document_id'] == doc_id)
                & df['processed_sentence'].notna()].reset_index(drop=True)

    # Fetch the original (unprocessed) text of the selected sentences
    selected_original_sentences = [
        doc_df.loc[pos, 'original_sentence']
        for pos in top_positions
        if pos < len(doc_df)  # defensive: ignore positions with no matching row
    ]

    # Store the summary for this document
    top3_summary_dict[doc_id] = selected_original_sentences


In [None]:
# Example output: the summary of one document. The id lives in a single
# variable so the printed header always names the document actually shown.
example_doc_id = 1
print(f"Ringkasan 3 kalimat teratas untuk document_id {example_doc_id}:\n")
for i, kalimat in enumerate(top3_summary_dict[example_doc_id], 1):
    print(f"{i}. {kalimat}")

Ringkasan 3 kalimat teratas untuk document_id 0:

1. Tapi, Presiden sudah berkoordinasi dengan Wapres yang akan meninjau hari ini," ujar Deputi Bidang Protokol, Pers, dan Media Sekretariat Presiden Bey Machmudin saat dihubungi, Sabtu, 4 Februari 2023.Selain memerintahkan Ma'ruf Amin, Bey menyebut Jokowi juga telah memberikan arahan kepada Kapolri Jenderal Listyo Sigit Prabowo, Menteri BUMN Erick Thohir, dan Penjabat Gubernur DKI Jakarta Heru Budi Hartono soal kunjungan ke lokasi.
2. "Intinya Presiden minta untuk mengutamakan evakuasi korban dan penanganan warga terdampak," kata Bey.Kronologi KebakaranDepo Pertamina di Plumpang, Jakarta Utara, terbakar pada Jumat malam, 3 Maret 2023, pukul 20.20 WIB.
3. "Penyebab kejadian masih dalam proses investigasi," kata Area Manager Communication, Relation & CSR Pertamina Patra Niaga Regional Jawa Bagian Barat Eko Kristiawan.Eko juga menambahkan pihaknya saat ini fokus pada penanganan kebakaran pipa penerimaan BBM di Integrated Terminal BBM Jakart