In [1]:
!ls ../data/indosum

CHANGELOG.txt  dev.03.jsonl   test.02.jsonl  train.01.jsonl  train.05.jsonl
README.txt     dev.04.jsonl   test.03.jsonl  train.02.jsonl
dev.01.jsonl   dev.05.jsonl   test.04.jsonl  train.03.jsonl
dev.02.jsonl   test.01.jsonl  test.05.jsonl  train.04.jsonl


In [3]:
import jsonlines
import glob

lines = []
for f in glob.glob("../data/indosum/train.*.jsonl"):
    with jsonlines.open(f) as infile:
        for obj in infile:
            lines.append(obj)

import pandas as pd

df = pd.DataFrame(lines)
df.head()

df_indosum = pd.DataFrame()

all_document_sentences = []
all_document_summary = [] 
for i, row in df.iterrows():
    # create document sentences
    document_sentences = []
    for j, sentence in enumerate(row['paragraphs']):
        document_sentences.append(" ".join(sentence[0]))
    all_document_sentences.append(document_sentences)

    # create document summary
    summaries = ""
    for j, sentence in enumerate(row['summary']):
        summaries += " ".join(sentence)
        summaries += " "
    all_document_summary.append(summaries)

    if i == 100:
        break

df_indosum["document_sentences"] = all_document_sentences
df_indosum["summary"] = all_document_summary
df_indosum['source'] = "liputan6" 

df_indosum.tail(10)

Unnamed: 0,document_sentences,summary,source
91,"[Jakarta , CNN Indonesia - - Pemerintah akan m...",Pemerintah akan mengevaluasi hitungan Dana Des...,liputan6
92,[Napoli melalui leg pertama fase play-off Liga...,Napoli melalui leg pertama fase play-off Liga ...,liputan6
93,"[Cristiano Ronaldo tersenyum ., Tidak ada yang...",Ronaldo baru saja dikaruniai anak kembar . Ked...,liputan6
94,[Startup fintech Cashlez mengumumkan perolehan...,Startup fintech Cashlez memperoleh dana seri A...,liputan6
95,"[Jakarta , CNN Indonesia - - Departemen Keuang...",Departemen Keuangan Amerika Serikat menjatuhka...,liputan6
96,"[Jakarta , CNN Indonesia - - Amerika Serikat ,...","Perwakilan Amerika Serikat , Australia dan Jep...",liputan6
97,"[Saat hendak berlibur ke negara orang , pentin...","Saat hendak berlibur ke negara orang , penting...",liputan6
98,"[Tahun ini , nama Jim Parsons menduduki posisi...","Tahun ini , nama Jim Parsons menduduki posisi ...",liputan6
99,"[Liga Champions musim 2016 / 17 telah usai ., ...",Liga Champions musim 2016 / 17 telah usai . Na...,liputan6
100,[Jakarta ( ANTARA News ) - Gubernur Daerah Khu...,Gubernur Daerah Khusus Indonesia ( DKI ) Jakar...,liputan6


### Calculate sentence-wise ROUGE score

In [4]:
# https://github.com/google-research/google-research/blob/master/rouge/rouge_scorer.py
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(
    'IDX Incubator \xa0  mengadakan sesi diskusi dengan tema “ Technology vs Humanity ” ',
    'IDX Incubator adakan sesi diskusi dengan tema Technology vs Humanity'
)
print(scores)

{'rouge1': Score(precision=0.9, recall=0.9, fmeasure=0.9), 'rouge2': Score(precision=0.7777777777777778, recall=0.7777777777777778, fmeasure=0.7777777777777778), 'rougeL': Score(precision=0.9, recall=0.9, fmeasure=0.9)}


In [5]:
all_document_sentences_f1_rouge_scores = []
for i, row in df_indosum.iterrows():
    document_sentences_f1_rouge_scores = []
    for sentence in row["document_sentences"]:
        scores = scorer.score(sentence, row["summary"])
        rouge1_f1 = scores["rouge1"].fmeasure
        rouge2_f1 = scores["rouge2"].fmeasure
        rougeL_f1 = scores["rougeL"].fmeasure
        rouge_score = (rouge1_f1 + rouge2_f1 + rougeL_f1) / 3.0
        document_sentences_f1_rouge_scores.append(rouge_score)
    all_document_sentences_f1_rouge_scores.append(document_sentences_f1_rouge_scores)

df_indosum["document_sentences_f1_rouge_scores"] = all_document_sentences_f1_rouge_scores
df_indosum.tail(5)

Unnamed: 0,document_sentences,summary,source,document_sentences_f1_rouge_scores
96,"[Jakarta , CNN Indonesia - - Amerika Serikat ,...","Perwakilan Amerika Serikat , Australia dan Jep...",liputan6,"[0.6999691643539933, 0.575880758807588, 0.1498..."
97,"[Saat hendak berlibur ke negara orang , pentin...","Saat hendak berlibur ke negara orang , penting...",liputan6,"[0.27774733637747334, 0.14082503556187767, 0.0..."
98,"[Tahun ini , nama Jim Parsons menduduki posisi...","Tahun ini , nama Jim Parsons menduduki posisi ...",liputan6,"[0.4254479697517672, 0.5374911410347272, 0.139..."
99,"[Liga Champions musim 2016 / 17 telah usai ., ...",Liga Champions musim 2016 / 17 telah usai . Na...,liputan6,"[0.19786096256684493, 0.5262531607913133, 0.03..."
100,[Jakarta ( ANTARA News ) - Gubernur Daerah Khu...,Gubernur Daerah Khusus Indonesia ( DKI ) Jakar...,liputan6,"[0.5545893719806764, 0.18046407507856557, 0.54..."


### Sort sentence-wise ROUGE score with highest score on top

In [6]:
df_indosum.head(1).iloc[0]["summary"]

'IDX Incubator \xa0  mengadakan sesi diskusi dengan tema “ Technology vs Humanity ” . Dalam diskusi ini dihadirkan dua narasumber , yakni Wakil Ketua Komite Tetap KADIN Indonesia Kevin Wu dan Managing Director Samsung R&D Indonesia Alfred Boediman . Dalam paparannya , Alfred meyakini bahwa Internet of Things ( IoT ) dan Artificial Intelligence ( AI ) akan menjadi signifikan ke depannya , karena menjadi fondasi utama perangkat pintar yang mendukung pelayanan publik . '

In [7]:
all_top_10_sentences = [] 
for i, row in df_indosum.iterrows():
    document_sentences_and_scores = [] 
    for sentences_and_scores in zip(
        row["document_sentences"], 
        row["document_sentences_f1_rouge_scores"], 
        [x+1 for x in range(len(row["document_sentences"]))]
    ):
        document_sentences_and_scores.append(sentences_and_scores)

    top_10_sentences = sorted(document_sentences_and_scores, key=lambda x: x[1], reverse=True)[:10]
    all_top_10_sentences.append([(x[0],x[2]) for x in top_10_sentences])

df_indosum["top_10_sentences"] = all_top_10_sentences
df_indosum.head(1)

Unnamed: 0,document_sentences,summary,source,document_sentences_f1_rouge_scores,top_10_sentences
0,[IDX Incubator kembali mengadakan sesi diskusi...,IDX Incubator mengadakan sesi diskusi denga...,liputan6,"[0.13588588588588588, 0.06741573033707865, 0.0...",[(IDX Incubator kembali mengadakan sesi diskus...


In [8]:
df_indosum.head(1).iloc[0]["top_10_sentences"][:2]

[('IDX Incubator kembali mengadakan sesi diskusi teknologi dan startup untuk kali kedua .',
  1),
 ('Dari sisi pemanfaatannya kedua pemateri meyakini bahwa AI akan memberikan banyak dampak baik .',
  9)]


### Make candidate summary by combination of (1, 2, 3) sentence length created from 10 sentences with highest ROUGE score

In [9]:
import itertools

df_indosum_candidate_summary = pd.DataFrame()

all_candidate_summary_txt = []
all_candidate_summary_ids_txt = []
all_document_length = [] 
for i, row in df_indosum.iterrows():
    document_candidate_summaries = []
    document_candidate_summaries_idx = []

    for itemcount in [1,2,3]:
        candidate_summaries = list(itertools.combinations(list(row["top_10_sentences"]), itemcount))
        for candidate_summary in candidate_summaries:
            candidate_summary_txt = " ".join([val[0] for val in candidate_summary])
            candidate_summary_ids = " ".join([str(val[1]) for val in candidate_summary])

            document_candidate_summaries.append(candidate_summary_txt)
            document_candidate_summaries_idx.append(candidate_summary_ids)

    all_candidate_summary_txt.append(document_candidate_summaries)
    all_candidate_summary_ids_txt.append(document_candidate_summaries_idx)
    all_document_length.append(len(row["document_sentences"]))

df_indosum_candidate_summary["candidate_summaries"] = all_candidate_summary_txt
df_indosum_candidate_summary["candidate_summary_ids"] = all_candidate_summary_ids_txt
df_indosum_candidate_summary["document_length"] = all_document_length
df_indosum_candidate_summary["summary"] = df_indosum["summary"]
df_indosum_candidate_summary.head(1)

Unnamed: 0,candidate_summaries,candidate_summary_ids,document_length,summary
0,[IDX Incubator kembali mengadakan sesi diskusi...,"[1, 9, 7, 2, 10, 8, 3, 6, 5, 11, 1 9, 1 7, 1 2...",11,IDX Incubator mengadakan sesi diskusi denga...


In [10]:
df_indosum_candidate_summary.head(1).iloc[0]["summary"]

'IDX Incubator \xa0  mengadakan sesi diskusi dengan tema “ Technology vs Humanity ” . Dalam diskusi ini dihadirkan dua narasumber , yakni Wakil Ketua Komite Tetap KADIN Indonesia Kevin Wu dan Managing Director Samsung R&D Indonesia Alfred Boediman . Dalam paparannya , Alfred meyakini bahwa Internet of Things ( IoT ) dan Artificial Intelligence ( AI ) akan menjadi signifikan ke depannya , karena menjadi fondasi utama perangkat pintar yang mendukung pelayanan publik . '

In [11]:
df_indosum_candidate_summary.head(1).iloc[0]["candidate_summaries"][:2]

['IDX Incubator kembali mengadakan sesi diskusi teknologi dan startup untuk kali kedua .',
 'Dari sisi pemanfaatannya kedua pemateri meyakini bahwa AI akan memberikan banyak dampak baik .']

### Calculate summary-wise ROUGE score

In [13]:
import multiprocessing

cores = multiprocessing.cpu_count() # Count the number of cores in a computer
print(f"multiprocessing with {cores} cores")

# input: list of kalimat dalam dokumen dan summary
# output: list of scores 

documents = list(
    zip(
        df_indosum_candidate_summary["candidate_summaries"], 
        df_indosum_candidate_summary["summary"]
    )
)

def cal_rouge(document):
    candidate_summaries = document[0]
    summary = document[1]
    candidate_summary_scores = []
    for i, candidate_summary in enumerate(candidate_summaries):
        scores = scorer.score(candidate_summary, summary)
        rouge1_f1 = scores["rouge1"].fmeasure
        rouge2_f1 = scores["rouge2"].fmeasure
        rougeL_f1 = scores["rougeL"].fmeasure
        rouge_score = (rouge1_f1 + rouge2_f1 + rougeL_f1) / 3.0
        candidate_summary_scores.append(rouge_score)

    return candidate_summary_scores

from multiprocessing import Pool

with Pool(processes=multiprocessing.cpu_count()) as pool:
    results = pool.map(cal_rouge, documents)

df_indosum_candidate_summary["candidate_summary_scores"] = results
df_indosum_candidate_summary.tail(5)

multiprocessing with 4 cores


Unnamed: 0,candidate_summaries,candidate_summary_ids,document_length,summary,candidate_summary_scores
96,"[Jakarta , CNN Indonesia - - Amerika Serikat ,...","[1, 2, 3, 6, 5, 7, 4, 8, 1 2, 1 3, 1 6, 1 5, 1...",8,"Perwakilan Amerika Serikat , Australia dan Jep...","[0.6999691643539933, 0.575880758807588, 0.1498..."
97,"[Selain cuaca , kemeriahan festival khas yang ...","[5, 1, 6, 2, 11, 16, 20, 9, 13, 14, 5 1, 5 6, ...",21,"Saat hendak berlibur ke negara orang , penting...","[0.34471313418681837, 0.27774733637747334, 0.2..."
98,"[Bahkan kabarnya , Parsons dibayar sebesar 1 j...","[2, 1, 5, 8, 6, 4, 3, 7, 2 1, 2 5, 2 8, 2 6, 2...",8,"Tahun ini , nama Jim Parsons menduduki posisi ...","[0.5374911410347272, 0.4254479697517672, 0.317..."
99,[Keberhasilan Real Madrid menjungkalkan Juvent...,"[2, 9, 1, 6, 13, 8, 4, 12, 7, 3, 2 9, 2 1, 2 6...",15,Liga Champions musim 2016 / 17 telah usai . Na...,"[0.5262531607913133, 0.33152501506931886, 0.19..."
100,[Jakarta ( ANTARA News ) - Gubernur Daerah Khu...,"[1, 3, 5, 2, 7, 8, 4, 6, 1 3, 1 5, 1 2, 1 7, 1...",8,Gubernur Daerah Khusus Indonesia ( DKI ) Jakar...,"[0.5545893719806764, 0.5439411536972512, 0.417..."


### Sort candidate summary by score with highest score on top

In [None]:
all_top_10_candidates = [] 
for i, row in df_indosum_candidate_summary.iterrows():
    document_candidates_and_scores = [] 
    for candidates_and_scores in zip(
        row["candidate_summary_ids"], 
        row["candidate_summary_scores"], 
    ):
        document_candidates_and_scores.append(candidates_and_scores)

    top_10_candidates = sorted(document_candidates_and_scores, key=lambda x: x[1], reverse=True)[:10]
    all_top_10_candidates.append(top_10_candidates)

df_indosum_candidate_summary["top_10_candidates"] = all_top_10_candidates
df_indosum_candidate_summary.head(1)

    

Unnamed: 0,candidate_summaries,candidate_summary_ids,document_length,summary,candidate_summary_scores,top_10_candidates
0,[IDX Incubator kembali mengadakan sesi diskusi...,"[1, 9, 7, 2, 10, 8, 3, 6, 5, 11, 1 9, 1 7, 1 2...",11,IDX Incubator mengadakan sesi diskusi denga...,"[0.13588588588588588, 0.08937595129375951, 0.0...","[(1 9 7, 0.20734693877551022), (1 9, 0.1924723..."


In [None]:
with open("../data/indosum/indosum.train.label.multipleoracle", "w") as file:
    for i, row in df_indosum_candidate_summary.iterrows():
        file.write(f"indosum-{i}\n")
        file.write(f"{row['document_length']}\n")
        for candidate_summary_tuple in row["top_10_candidates"]:
            file.write(f"{candidate_summary_tuple[0]} {str(candidate_summary_tuple[1])[:14]}\n")
        file.write("\n")