In [27]:
import pandas as pd

# Remote CSV with the preprocessed sentences used throughout this notebook.
url = 'https://raw.githubusercontent.com/dayangalyssa/InfoKilat/main/data/preprocessed_sentences.csv'

# Read the whole file into a DataFrame straight from the URL.
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first ten rows to sanity-check the columns.
df.head(n=10)

Unnamed: 0,original_sentence,processed_sentence,document_id
0,"TEMPO.CO, Jakarta - Anggota Komisi VII DPR RI ...",tempo co jakarta - anggota komisi vii dpr ri ...,0
1,"""Turut berbela sungkawa terhadap masyarakat si...",bela sungkawa masyarakat sipil dampak daerah...,0
2,"Pertamina harus tanggung jawab,"" kata dia dala...",pertamina tanggung terang jakarta sabtu 4 m...,0
3,TBBM Plumpang dinilai sebagai salah satu termi...,tbbm plumpang nilai salah terminal bbm penting...,0
4,"""Plumpang menyuplai sekitar 20 persen kebutuha...",plumpang suplai 20 persen butuh bbm hari 25 ...,0
5,Maka tindakan selanjutnya adalah bagaimana mem...,tindak suplai bbm ganggu kata rofik singgung...,0
6,"Terhitung 2 tahun terakhir ini, kata dia, suda...",hitung 2 5 fasilitas migas milik pertamina a...,0
7,"""Menyayangkan terjadinya musibah kebakaran ini...",sayang musibah bakar insiden bakar kali fasi...,0
8,Pertamina harus benahi sistem pengamanan dan S...,pertamina benah sistem aman sop dia rofik ha...,0
9,Menurutnya pembenahan diperlukan agar Pertamin...,turut benah pertamina abai aspek aman fasilita...,0


In [28]:
# Count how many distinct document ids the dataset contains
# (missing ids are not counted).
num_docs = df['document_id'].nunique(dropna=True)
print(f"Jumlah document_id unik: {num_docs}")

Jumlah document_id unik: 50


In [29]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Document whose per-sentence term frequencies are inspected in this cell.
TF_DOC_ID = 4

# Keep only the sentences that belong to that document
doc_df = df[df['document_id'] == TF_DOC_ID].reset_index(drop=True)

# Extract the preprocessed text, one entry per sentence. Missing values are
# dropped and the rest cast to str (same cleaning as the TF-IDF cell), because
# CountVectorizer cannot tokenize NaN.
kalimat_list = doc_df['processed_sentence'].dropna().astype(str).tolist()

# Initialise CountVectorizer
vectorizer = CountVectorizer()

# Count term occurrences per sentence
X = vectorizer.fit_transform(kalimat_list)

# Convert to a DataFrame; transpose so that rows = terms, columns = sentences
tf_df = pd.DataFrame(X.toarray().T,
                     index=vectorizer.get_feature_names_out(),
                     columns=[f"Kalimat {i+1}" for i in range(len(kalimat_list))])

# Add a column with the total frequency of each term across all sentences
tf_df["tf"] = tf_df.sum(axis=1)

# Order terms from most to least frequent
tf_df = tf_df.sort_values("tf", ascending=False)

# Show the most frequent terms
tf_df.head()


Unnamed: 0,Kalimat 1,Kalimat 2,Kalimat 3,Kalimat 4,Kalimat 5,Kalimat 6,Kalimat 7,Kalimat 8,Kalimat 9,Kalimat 10,Kalimat 11,Kalimat 12,Kalimat 13,Kalimat 14,Kalimat 15,tf
hdci,2,0,3,2,0,1,1,1,1,1,1,0,2,0,0,15
ketua,4,0,1,1,0,2,0,0,1,0,2,0,0,0,0,11
bamsoet,1,0,2,1,0,0,0,0,1,0,1,0,0,0,0,6
2023,1,0,4,0,0,0,0,0,0,0,0,0,0,0,0,5
masyarakat,0,1,0,0,0,0,1,0,0,1,1,1,0,0,0,5


In [30]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Per-document TF-IDF tables, keyed by document_id
tfidf_dict = {}

# Iterate over the ids actually present in the data instead of a hard-coded
# range, so documents are never skipped when the dataset grows or ids are sparse.
for doc_id in sorted(df['document_id'].dropna().unique().tolist()):
    # Select the sentences of the current document
    doc_df = df[df['document_id'] == doc_id].reset_index(drop=True)

    # Take the preprocessed sentences, discarding NaN and forcing str
    kalimat_list = doc_df['processed_sentence'].dropna().astype(str).tolist()

    # Nothing to vectorize for this document
    if len(kalimat_list) == 0:
        continue

    # TF-IDF vectorizer with raw weights (norm=None disables normalisation)
    vectorizer = TfidfVectorizer(use_idf=True, norm=None)

    # Fit on this document only: each sentence is treated as one "document"
    X = vectorizer.fit_transform(kalimat_list)

    # Convert to a DataFrame (rows = terms, columns = sentences)
    tfidf_df = pd.DataFrame(
        X.toarray().T,
        index=vectorizer.get_feature_names_out(),
        columns=[f"Kalimat {i+1}" for i in range(len(kalimat_list))]
    )

    # Total TF-IDF weight of each term across all sentences (optional)
    tfidf_df["W_tfidf"] = tfidf_df.sum(axis=1)

    # Store the table for this document
    tfidf_dict[doc_id] = tfidf_df

# Example: show the result for document_id = 0
display(tfidf_dict[0])

Unnamed: 0,Kalimat 1,Kalimat 2,Kalimat 3,Kalimat 4,Kalimat 5,Kalimat 6,Kalimat 7,Kalimat 8,Kalimat 9,Kalimat 10,Kalimat 11,W_tfidf
20,0.000000,0.0,0.000000,0.0,2.386294,0.000000,0.0,0.000000,0.0,0.0,2.386294,4.772589
2022,0.000000,0.0,0.000000,0.0,0.000000,2.791759,0.0,0.000000,0.0,0.0,0.000000,2.791759
2023,2.791759,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,2.791759
2024,0.000000,0.0,2.791759,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,2.791759
25,0.000000,0.0,0.000000,0.0,2.791759,0.000000,0.0,0.000000,0.0,0.0,0.000000,2.791759
...,...,...,...,...,...,...,...,...,...,...,...,...
ujar,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,2.791759,2.791759
ulang,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,2.791759,0.0,0.0,0.000000,2.791759
utara,2.791759,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,2.791759
vii,2.791759,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,2.791759


In [31]:
# Render the table stored under key 0 of tfidf_dict.
display(tfidf_dict[0])

Unnamed: 0,Kalimat 1,Kalimat 2,Kalimat 3,Kalimat 4,Kalimat 5,Kalimat 6,Kalimat 7,Kalimat 8,Kalimat 9,Kalimat 10,Kalimat 11,W_tfidf
20,0.000000,0.0,0.000000,0.0,2.386294,0.000000,0.0,0.000000,0.0,0.0,2.386294,4.772589
2022,0.000000,0.0,0.000000,0.0,0.000000,2.791759,0.0,0.000000,0.0,0.0,0.000000,2.791759
2023,2.791759,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,2.791759
2024,0.000000,0.0,2.791759,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,2.791759
25,0.000000,0.0,0.000000,0.0,2.791759,0.000000,0.0,0.000000,0.0,0.0,0.000000,2.791759
...,...,...,...,...,...,...,...,...,...,...,...,...
ujar,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,2.791759,2.791759
ulang,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,2.791759,0.0,0.0,0.000000,2.791759
utara,2.791759,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,2.791759
vii,2.791759,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,2.791759


In [32]:
# Maps document_id -> list of original sentences selected for the summary
tfidf_summary_dict = {}

for doc_id, tfidf_df in tfidf_dict.items():
    # Sentence score (Ws): sum of the TF-IDF weights of all terms per sentence.
    # The per-term total column is not a sentence, so it is excluded first.
    ws_series = tfidf_df.drop(columns=["W_tfidf"]).sum(axis=0)

    # Labels of the 3 highest-scoring sentences
    top_kalimat_cols = ws_series.sort_values(ascending=False).head(3).index.tolist()

    # Turn "Kalimat N" labels into 0-based positions, restored to document order
    top_positions = sorted(int(label.split()[-1]) - 1 for label in top_kalimat_cols)

    # The "Kalimat N" columns were numbered after rows with a missing
    # processed_sentence had been dropped, so the same filter must be applied
    # here; otherwise position N would point at the wrong original sentence.
    doc_df = (
        df[df['document_id'] == doc_id]
        .dropna(subset=['processed_sentence'])
        .reset_index(drop=True)
    )

    # Look up the original (unprocessed) text of each selected sentence
    tfidf_summary_dict[doc_id] = [
        doc_df.loc[pos, 'original_sentence']
        for pos in top_positions
        if pos < len(doc_df)
    ]

# Example: print the summary for document_id = 0
print("Ringkasan 3 kalimat teratas (TF-IDF) untuk document_id 0:\n")
for i, kalimat in enumerate(tfidf_summary_dict[0], 1):
    print(f"{i}. {kalimat}")

Ringkasan 3 kalimat teratas (TF-IDF) untuk document_id 0:

1. TEMPO.CO, Jakarta - Anggota Komisi VII DPR RI Rofik Hananto menyayangkan terjadinya insiden kebakaran yang disebabkan oleh bocornya depo Plumpang, Jakarta Utara, pada Jumat, 3 Maret 2023.
2. Pertamina harus tanggung jawab," kata dia dalam keterangan di Jakarta Sabtu, 4 Maret 2024.Rofik mengatakan Pertamina serta pihak terkait harus memastikan keselamatan dan keamanan warga yang tinggal di sekitar lokasi.Pipa BBM yang terbakar itu merupakan bagian dari Terminal Bahan Bakar Minyak (TBBM) Plumpang.
3. "Investigasi menyeluruh dan tuntas, serta meminta komitmen Pertamina memperbaiki sistem keamanan kilang minyak maupun depo BBM, seringnya kebakaran terjadi mengindikasikan Pertamina abai terhadap pengamanan kilang," ujarnya.Pilihan Editor:Profil Depo Pertamina Plumpang, Pemasok 20 Persen Pasokan BBM di Seluruh Indonesia
