In [27]:
import os
import json
import pandas as pd

folder_path = '/home/jupyter-23521027/refresh-bert/data/liputan6_data/canonical/test' 

file_list = os.listdir(folder_path)

In [28]:
json_data_list = []

file_list = os.listdir(folder_path)

for file_name in file_list:
    file_path = os.path.join(folder_path, file_name)  
    
    if os.path.isfile(file_path):
        with open(file_path, 'r') as file:
            json_data = json.load(file)
            
            json_data_list.append(json_data)

df = pd.DataFrame(json_data_list)
df['sentences'] = df['clean_article'].apply(lambda x: [' '.join(sentence) for sentence in x])
df.drop('clean_article', axis=1, inplace=True)
df['summary_sentences'] = df['clean_summary'].apply(lambda x: [' '.join(sentence) for sentence in x])
df['summary_string'] = df['summary_sentences'].apply(lambda x: ' '.join(x))
df.drop('clean_summary', axis=1, inplace=True)
df.drop('summary_sentences', axis=1, inplace=True)

In [29]:
df.tail(2)

Unnamed: 0,id,url,extractive_summary,sentences,summary_string
10970,26145,https://www.liputan6.com/news/read/26145/pemer...,"[1, 5]","[Liputan6 . com , Medan : Pemerintah tetap tak...",Biro perjalanan haji yang menggunakan paspor h...
10971,26102,https://www.liputan6.com/news/read/26102/eks-t...,"[0, 1]","[Liputan6 . com , Jakarta : Bekas tahanan poli...",Forum Eks Tahanan Politik meminta Presiden Meg...


In [30]:
# https://github.com/google-research/google-research/blob/master/rouge/rouge_scorer.py
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(
    'IDX Incubator \xa0  mengadakan sesi diskusi dengan tema “ Technology vs Humanity ” ',
    'IDX Incubator adakan sesi diskusi dengan tema Technology vs Humanity'
)
print(scores)

{'rouge1': Score(precision=0.9, recall=0.9, fmeasure=0.9), 'rouge2': Score(precision=0.7777777777777778, recall=0.7777777777777778, fmeasure=0.7777777777777778), 'rougeL': Score(precision=0.9, recall=0.9, fmeasure=0.9)}


In [31]:
len(df)

10972

In [32]:
from datetime import datetime

DATASET_SIZE = len(df)
all_document_sentences_f1_rouge_scores = []

now = datetime.now()
df = df.head(DATASET_SIZE)

for i, row in df.iterrows():
    document_sentences_f1_rouge_scores = []
    for sentence in row["sentences"]:
        scores = scorer.score(sentence, row["summary_string"])
        rouge1_f1 = scores["rouge1"].fmeasure
        rouge2_f1 = scores["rouge2"].fmeasure
        rougeL_f1 = scores["rougeL"].fmeasure
        rouge_score = (rouge1_f1 + rouge2_f1 + rougeL_f1) / 3.0
        document_sentences_f1_rouge_scores.append(rouge_score)
        
    if (i + 1) % 5000 == 0:
        print(f"processing {i + 1} with time: {datetime.now() - now}")
        now = datetime.now()
        
    all_document_sentences_f1_rouge_scores.append(document_sentences_f1_rouge_scores)

df["document_sentences_f1_rouge_scores"] = all_document_sentences_f1_rouge_scores
df.tail(5)

processing 5000 with time: 0:00:44.215940
processing 10000 with time: 0:00:44.448461


Unnamed: 0,id,url,extractive_summary,sentences,summary_string,document_sentences_f1_rouge_scores
10967,15906,https://www.liputan6.com/news/read/15906/soepa...,"[0, 3]","[Liputan6 . com , Jakarta : Ketua DPR Akbar Ta...",Kepergian Baharuddin Lopa membuat Akbar Tandju...,"[0.3679653679653679, 0.0634920634920635, 0.028..."
10968,19241,https://www.liputan6.com/news/read/19241/golka...,"[2, 3]","[Liputan6 . com , Jakarta : Partai Golongan Ka...",Ketua Umum DPP Partai Golkar Akbar Tandjung me...,"[0.27696078431372545, 0.0963924963924964, 0.30..."
10969,20408,https://www.liputan6.com/news/read/20408/pemer...,"[1, 4, 5]","[Liputan6 . com , Jakarta : Pemerintah berenca...",Pemerintah akan memberlakukan sistem UMR baru ...,"[0.1414141414141414, 0.16091954022988506, 0.23..."
10970,26145,https://www.liputan6.com/news/read/26145/pemer...,"[1, 5]","[Liputan6 . com , Medan : Pemerintah tetap tak...",Biro perjalanan haji yang menggunakan paspor h...,"[0.17769607843137256, 0.7280701754385964, 0.0,..."
10971,26102,https://www.liputan6.com/news/read/26102/eks-t...,"[0, 1]","[Liputan6 . com , Jakarta : Bekas tahanan poli...",Forum Eks Tahanan Politik meminta Presiden Meg...,"[0.22331154684095858, 0.22549019607843138, 0.2..."


In [33]:
all_top_10_sentences = [] 
for i, row in df.iterrows():
    document_sentences_and_scores = [] 
    for sentences_and_scores in zip(
        row["sentences"], 
        row["document_sentences_f1_rouge_scores"], 
        [x+1 for x in range(len(row["sentences"]))]
    ):
        document_sentences_and_scores.append(sentences_and_scores)

    top_10_sentences = sorted(document_sentences_and_scores, key=lambda x: x[1], reverse=True)[:10]
    all_top_10_sentences.append([(x[0],x[2]) for x in top_10_sentences])

df["top_10_sentences"] = all_top_10_sentences
df.head(1)

Unnamed: 0,id,url,extractive_summary,sentences,summary_string,document_sentences_f1_rouge_scores,top_10_sentences
0,19962,https://www.liputan6.com/news/read/19962/gas-b...,"[0, 4]","[Liputan6 . com , Jakarta : Untuk memenuhi ken...",Pertamina akan menyalurkan gas alam dari Sumat...,"[0.30026455026455023, 0.05925925925925926, 0.1...",[(Proyek transmisi gas ini untuk mendukung keb...


In [34]:
import itertools

df_candidate_summary = pd.DataFrame()

all_candidate_summary_txt = []
all_candidate_summary_ids_txt = []
all_document_length = [] 
for i, row in df.iterrows():
    document_candidate_summaries = []
    document_candidate_summaries_idx = []

    for itemcount in [1,2,3]:
        candidate_summaries = list(itertools.combinations(list(row["top_10_sentences"]), itemcount))
        for candidate_summary in candidate_summaries:
            candidate_summary_txt = " ".join([val[0] for val in candidate_summary])
            candidate_summary_ids = " ".join([str(val[1]) for val in candidate_summary])

            document_candidate_summaries.append(candidate_summary_txt)
            document_candidate_summaries_idx.append(candidate_summary_ids)

    all_candidate_summary_txt.append(document_candidate_summaries)
    all_candidate_summary_ids_txt.append(document_candidate_summaries_idx)
    all_document_length.append(len(row["sentences"]))

df_candidate_summary["candidate_summaries"] = all_candidate_summary_txt
df_candidate_summary["candidate_summary_ids"] = all_candidate_summary_ids_txt
df_candidate_summary["document_length"] = all_document_length
df_candidate_summary["summary"] = df["summary_string"]
df_candidate_summary.head(1)

Unnamed: 0,candidate_summaries,candidate_summary_ids,document_length,summary
0,[Proyek transmisi gas ini untuk mendukung kebi...,"[5, 1, 7, 9, 8, 3, 10, 6, 4, 2, 5 1, 5 7, 5 9,...",10,Pertamina akan menyalurkan gas alam dari Sumat...


In [35]:
# input: list of kalimat dalam dokumen dan summary
# output: list of scores 

documents = list(
    zip(
        df_candidate_summary["candidate_summaries"], 
        df_candidate_summary["summary"]
    )
)

def cal_rouge(document):
    candidate_summaries = document[0]
    summary = document[1]
    candidate_summary_scores = []
    for candidate_summary in candidate_summaries:
        scores = scorer.score(candidate_summary, summary)
        rouge1_f1 = scores["rouge1"].fmeasure
        rouge2_f1 = scores["rouge2"].fmeasure
        rougeL_f1 = scores["rougeL"].fmeasure
        rouge_score = (rouge1_f1 + rouge2_f1 + rougeL_f1) / 3.0
        candidate_summary_scores.append(rouge_score)
        
    return candidate_summary_scores


In [36]:
import multiprocess
from multiprocess import Pool
import tqdm

cores = 4
print(f"multiprocessing with {cores} cores")


with Pool(processes=cores) as pool:
    results = list(tqdm.tqdm(pool.imap_unordered(cal_rouge, documents), total=len(documents)))

df_candidate_summary["candidate_summary_scores"] = results
df_candidate_summary.tail(5)

multiprocessing with 4 cores


100%|██████████| 10972/10972 [09:35<00:00, 19.05it/s]


Unnamed: 0,candidate_summaries,candidate_summary_ids,document_length,summary,candidate_summary_scores
10967,"[Liputan6 . com , Jakarta : Ketua DPR Akbar Ta...","[1, 5, 4, 8, 6, 7, 2, 3, 1 5, 1 4, 1 8, 1 6, 1...",8,Kepergian Baharuddin Lopa membuat Akbar Tandju...,"[0.3679653679653679, 0.21014492753623193, 0.11..."
10968,[Hal itu dikemukakan Ketua Umum Dewan Pimpinan...,"[4, 3, 1, 8, 7, 10, 11, 12, 13, 2, 4 3, 4 1, 4...",13,Ketua Umum DPP Partai Golkar Akbar Tandjung me...,"[0.2309941520467836, 0.19285714285714287, 0.15..."
10969,"[Menurut Yacob , sistem UMR baru itu akan dise...","[5, 3, 6, 2, 1, 7, 4, 8, 5 3, 5 6, 5 2, 5 1, 5...",8,Pemerintah akan memberlakukan sistem UMR baru ...,"[0.29314888010540185, 0.22549019607843138, 0.2..."
10970,[Pemerintah akan menindak tegas biro perjalana...,"[2, 4, 5, 1, 6, 7, 8, 3, 2 4, 2 5, 2 1, 2 6, 2...",8,Biro perjalanan haji yang menggunakan paspor h...,"[0.3835263835263835, 0.3002898550724638, 0.276..."
10971,"[Itulah sebabnya , Sumardi mengatakan mereka m...","[7, 2, 1, 3, 6, 5, 4, 7 2, 7 1, 7 3, 7 6, 7 5,...",7,Forum Eks Tahanan Politik meminta Presiden Meg...,"[0.7280701754385964, 0.23097769439232851, 0.22..."


In [37]:
all_top_10_candidates = [] 
for i, row in df_candidate_summary.iterrows():
    document_candidates_and_scores = [] 
    for candidates_and_scores in zip(
        row["candidate_summary_ids"], 
        row["candidate_summary_scores"], 
    ):
        document_candidates_and_scores.append(candidates_and_scores)

    top_10_candidates = sorted(document_candidates_and_scores, key=lambda x: x[1], reverse=True)[:10]
    all_top_10_candidates.append(top_10_candidates)

df_candidate_summary["top_10_candidates"] = all_top_10_candidates
df_candidate_summary.head(2)

Unnamed: 0,candidate_summaries,candidate_summary_ids,document_length,summary,candidate_summary_scores,top_10_candidates
0,[Proyek transmisi gas ini untuk mendukung kebi...,"[5, 1, 7, 9, 8, 3, 10, 6, 4, 2, 5 1, 5 7, 5 9,...",10,Pertamina akan menyalurkan gas alam dari Sumat...,"[0.6027667984189723, 0.21929824561403508, 0.10...","[(5, 0.6027667984189723), (3, 0.55745967741935..."
1,"[Liputan6 . com , Yogyakarta : Sekitar 700 sis...","[1, 2, 3, 8, 7, 5, 9, 6, 10, 4, 1 2, 1 3, 1 8,...",10,Ratusan siswa Sekolah Menangah Umum 6 Yogyakar...,"[0.29337029337029336, 0.22331154684095858, 0.1...","[(1 2, 0.3430308293530178), (1 8, 0.3169318041..."


In [38]:
df_candidate_summary["id"] = df["id"]

with open(f"/home/jupyter-23521027/refresh-bert/data/preprocessed-input-directory/liputan6.test.label.multipleoracle", "w") as file:
    for i, row in df_candidate_summary.iterrows():
        file.write(f"liputan6-{row['id']}\n")
        file.write(f"{row['document_length']}\n")
        for candidate_summary_tuple in row["top_10_candidates"]:
            file.write(f"{candidate_summary_tuple[0]} {str(candidate_summary_tuple[1])[:14]}\n")
        file.write("\n")

In [39]:
!cat /home/jupyter-23521027/refresh-bert/data/preprocessed-input-directory/liputan6.test.label.multipleoracle | tail -c 100

0.572450363793
2 3 6 0.527635327635
7 3 6 0.511396883717
2 6 4 0.509469696969
7 3 5 0.504232804232



In [40]:
with open("/home/jupyter-23521027/refresh-bert/data/preprocessed-input-directory/liputan6.test.label.multipleoracle", 'r') as file:
    # Read the contents of the file
    text = file.read()
    # Count the occurrences of the word
    count = text.count("liputan6")

print(count)

10972
