In [1]:
!ls ../data/indosum

CHANGELOG.txt  dev.03.jsonl   test.02.jsonl  train.01.jsonl  train.05.jsonl
README.txt     dev.04.jsonl   test.03.jsonl  train.02.jsonl
dev.01.jsonl   dev.05.jsonl   test.04.jsonl  train.03.jsonl
dev.02.jsonl   test.01.jsonl  test.05.jsonl  train.04.jsonl


In [2]:
import jsonlines
import glob

lines = []
for f in glob.glob("../data/indosum/train.*.jsonl"):
    with jsonlines.open(f) as infile:
        for obj in infile:
            lines.append(obj)

import pandas as pd

df = pd.DataFrame(lines)
df.head()

df_indosum = pd.DataFrame()

all_document_sentences = []
all_document_summary = [] 
for i, row in df.iterrows():
    # create document sentences
    document_sentences = []
    for j, sentence in enumerate(row['paragraphs']):
        document_sentences.append(" ".join(sentence[0]))
    all_document_sentences.append(document_sentences)

    # create document summary
    summaries = ""
    for j, sentence in enumerate(row['summary']):
        summaries += " ".join(sentence)
        summaries += " "
    all_document_summary.append(summaries)

    if i == 10:
        break

df_indosum["document_sentences"] = all_document_sentences
df_indosum["summary"] = all_document_summary
df_indosum['source'] = "liputan6" 

df_indosum.tail(10)

Unnamed: 0,document_sentences,summary,source
1,"[JUARA.NET - Pebalap Inggris Raya , Lewis Hami...","Pebalap Inggris Raya , Lewis Hamilton , dan Da...",liputan6
2,"[Jakarta , CNN Indonesia - - Kabar reuni Spice...",Kabar reuni Spice Girls sudah santer beredar s...,liputan6
3,"[Jakarta , CNN Indonesia - - Konsep Kampung Pe...",Konsep Kampung Pelangi dinilai sukses memberi ...,liputan6
4,"[Jakarta , CNN Indonesia - - Setelah sempat me...","Album erbaru EXO , ' Universe ' , langsung men...",liputan6
5,"[Jakarta , CNN Indonesia - - Live Nation , pro...",Promotor Live Nation akan membawa Bryan Adams ...,liputan6
6,"[Merdeka.com - N""Golo Kante mengungkapkan pern...","N""Golo Kante mengungkapkan pernah mendapat taw...",liputan6
7,"[Jakarta , CNN Indonesia - - Perjuangan Michae...",Perjuangan Michael Nyqvist melawan kanker paru...,liputan6
8,"[Selama ini , Latte Art memang identik dengan ...","Selama ini , Latte Art memang identik dengan k...",liputan6
9,[Pelatih Borneo FC Iwan Setiawan sempat melont...,Pelatih Borneo FC Iwan Setiawan sempat melonta...,liputan6
10,"[Jakarta , CNN Indonesia - - Eks pimpinan Dewa...",Eks pimpinan Dewan Perwakilan Daerah Laode Ida...,liputan6


### Calculate sentence-wise ROUGE score

In [3]:
# https://github.com/google-research/google-research/blob/master/rouge/rouge_scorer.py
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(
    'The quick brown fox jumps over the lazy dog',
    'The quick brown dog jumps on the log.'
)
print(scores)

{'rouge1': Score(precision=0.75, recall=0.6666666666666666, fmeasure=0.7058823529411765), 'rouge2': Score(precision=0.2857142857142857, recall=0.25, fmeasure=0.26666666666666666), 'rougeL': Score(precision=0.625, recall=0.5555555555555556, fmeasure=0.5882352941176471)}


In [4]:
all_document_sentences_f1_rouge_scores = []
for i, row in df_indosum.iterrows():
    document_sentences_f1_rouge_scores = []
    for sentence in row["document_sentences"]:
        scores = scorer.score(sentence, row["summary"])
        rouge1_f1 = scores["rouge1"].fmeasure
        rouge2_f1 = scores["rouge2"].fmeasure
        rougeL_f1 = scores["rougeL"].fmeasure
        rouge_score = (rouge1_f1 + rouge2_f1 + rougeL_f1) / 3.0
        document_sentences_f1_rouge_scores.append(rouge_score)
    all_document_sentences_f1_rouge_scores.append(document_sentences_f1_rouge_scores)

df_indosum["document_sentences_f1_rouge_scores"] = all_document_sentences_f1_rouge_scores
df_indosum.tail(5)

Unnamed: 0,document_sentences,summary,source,document_sentences_f1_rouge_scores
6,"[Merdeka.com - N""Golo Kante mengungkapkan pern...","N""Golo Kante mengungkapkan pernah mendapat taw...",liputan6,"[0.5232804232804235, 0.34767025089605735, 0.18..."
7,"[Jakarta , CNN Indonesia - - Perjuangan Michae...",Perjuangan Michael Nyqvist melawan kanker paru...,liputan6,"[0.24985994397759104, 0.28698752228164, 0.1323..."
8,"[Selama ini , Latte Art memang identik dengan ...","Selama ini , Latte Art memang identik dengan k...",liputan6,"[0.2309988518943743, 0.1154347033390634, 0.020..."
9,[Pelatih Borneo FC Iwan Setiawan sempat melont...,Pelatih Borneo FC Iwan Setiawan sempat melonta...,liputan6,"[0.5297979797979798, 0.1500711237553343, 0.132..."
10,"[Jakarta , CNN Indonesia - - Eks pimpinan Dewa...",Eks pimpinan Dewan Perwakilan Daerah Laode Ida...,liputan6,"[0.6553418803418803, 0.4777561468022207, 0.021..."


### Sort sentence-wise ROUGE score with highest score on top

In [5]:
df_indosum.head(1).iloc[0]["summary"]

'IDX Incubator \xa0  mengadakan sesi diskusi dengan tema “ Technology vs Humanity ” . Dalam diskusi ini dihadirkan dua narasumber , yakni Wakil Ketua Komite Tetap KADIN Indonesia Kevin Wu dan Managing Director Samsung R&D Indonesia Alfred Boediman . Dalam paparannya , Alfred meyakini bahwa Internet of Things ( IoT ) dan Artificial Intelligence ( AI ) akan menjadi signifikan ke depannya , karena menjadi fondasi utama perangkat pintar yang mendukung pelayanan publik . '

In [6]:
all_top_10_sentences = [] 
for i, row in df_indosum.iterrows():
    document_sentences_and_scores = [] 
    for sentences_and_scores in zip(
        row["document_sentences"], 
        row["document_sentences_f1_rouge_scores"], 
        [x+1 for x in range(len(row["document_sentences"]))]
    ):
        document_sentences_and_scores.append(sentences_and_scores)

    top_10_sentences = sorted(document_sentences_and_scores, key=lambda x: x[1], reverse=True)[:10]
    all_top_10_sentences.append([(x[0],x[2]) for x in top_10_sentences])

df_indosum["top_10_sentences"] = all_top_10_sentences
df_indosum.head(1)

Unnamed: 0,document_sentences,summary,source,document_sentences_f1_rouge_scores,top_10_sentences
0,[IDX Incubator kembali mengadakan sesi diskusi...,IDX Incubator mengadakan sesi diskusi denga...,liputan6,"[0.13588588588588588, 0.06741573033707865, 0.0...",[(IDX Incubator kembali mengadakan sesi diskus...


In [10]:
df_indosum.head(1).iloc[0]["top_10_sentences"][:2]

[('IDX Incubator kembali mengadakan sesi diskusi teknologi dan startup untuk kali kedua .',
  1),
 ('Dari sisi pemanfaatannya kedua pemateri meyakini bahwa AI akan memberikan banyak dampak baik .',
  9)]


### Make candidate summary by combination of (1, 2, 3) sentence length created from 10 sentences with highest ROUGE score

In [18]:
import itertools

df_indosum_candidate_summary = pd.DataFrame()

all_candidate_summary_txt = []
all_candidate_summary_ids_txt = []
all_document_length = [] 
for i, row in df_indosum.iterrows():
    document_candidate_summaries = []
    document_candidate_summaries_idx = []

    for itemcount in [2,3]:
        candidate_summaries = list(itertools.combinations(list(row["top_10_sentences"]), itemcount))
        for candidate_summary in candidate_summaries:
            candidate_summary_txt = " ".join([val[0] for val in candidate_summary])
            candidate_summary_ids = " ".join([str(val[1]) for val in candidate_summary])

            document_candidate_summaries.append(candidate_summary_txt)
            document_candidate_summaries_idx.append(candidate_summary_ids)

    all_candidate_summary_txt.append(document_candidate_summaries)
    all_candidate_summary_ids_txt.append(document_candidate_summaries_idx)
    all_document_length.append(len(row["document_sentences"]))

df_indosum_candidate_summary["candidate_summaries"] = all_candidate_summary_txt
df_indosum_candidate_summary["candidate_summary_ids"] = all_candidate_summary_ids_txt
df_indosum_candidate_summary["document_length"] = all_document_length
df_indosum_candidate_summary["summary"] = df_indosum["summary"]
df_indosum_candidate_summary.head(1)

Unnamed: 0,candidate_summaries,candidate_summary_ids,document_length,summary
0,[IDX Incubator kembali mengadakan sesi diskusi...,"[1 9, 1 7, 1 2, 1 10, 1 8, 1 3, 1 6, 1 5, 1 11...",11,IDX Incubator mengadakan sesi diskusi denga...


In [19]:
df_indosum_candidate_summary.head(1).iloc[0]["summary"]

'IDX Incubator \xa0  mengadakan sesi diskusi dengan tema “ Technology vs Humanity ” . Dalam diskusi ini dihadirkan dua narasumber , yakni Wakil Ketua Komite Tetap KADIN Indonesia Kevin Wu dan Managing Director Samsung R&D Indonesia Alfred Boediman . Dalam paparannya , Alfred meyakini bahwa Internet of Things ( IoT ) dan Artificial Intelligence ( AI ) akan menjadi signifikan ke depannya , karena menjadi fondasi utama perangkat pintar yang mendukung pelayanan publik . '

In [20]:
df_indosum_candidate_summary.head(1).iloc[0]["candidate_summaries"][:2]

['IDX Incubator kembali mengadakan sesi diskusi teknologi dan startup untuk kali kedua . Dari sisi pemanfaatannya kedua pemateri meyakini bahwa AI akan memberikan banyak dampak baik .',
 'IDX Incubator kembali mengadakan sesi diskusi teknologi dan startup untuk kali kedua . Produk AI juga menjadi salah satu tren yang ada saat ini di Indonesia .']

### Calculate summary-wise ROUGE score

In [21]:
all_candidate_summary_scores = []
for i, row in df_indosum_candidate_summary.iterrows():
    candidate_summary_scores = []
    for candidate_summary in row["candidate_summaries"]:
        scores = scorer.score(candidate_summary, row["summary"])
        rouge1_f1 = scores["rouge1"].fmeasure
        rouge2_f1 = scores["rouge2"].fmeasure
        rougeL_f1 = scores["rougeL"].fmeasure
        rouge_score = (rouge1_f1 + rouge2_f1 + rougeL_f1) / 3.0
        candidate_summary_scores.append(rouge_score)
    all_candidate_summary_scores.append(candidate_summary_scores)

df_indosum_candidate_summary["candidate_summary_scores"] = all_candidate_summary_scores
df_indosum_candidate_summary.head(1)

Unnamed: 0,candidate_summaries,candidate_summary_ids,document_length,summary,candidate_summary_scores
0,[IDX Incubator kembali mengadakan sesi diskusi...,"[1 9, 1 7, 1 2, 1 10, 1 8, 1 3, 1 6, 1 5, 1 11...",11,IDX Incubator mengadakan sesi diskusi denga...,"[0.19247239125535268, 0.1767861167455488, 0.15..."


### Sort candidate summary by score with highest score on top

In [23]:
all_top_10_candidates = [] 
for i, row in df_indosum_candidate_summary.iterrows():
    document_candidates_and_scores = [] 
    for candidates_and_scores in zip(
        row["candidate_summary_ids"], 
        row["candidate_summary_scores"], 
    ):
        document_candidates_and_scores.append(candidates_and_scores)

    top_10_candidates = sorted(document_candidates_and_scores, key=lambda x: x[1], reverse=True)[:10]
    all_top_10_candidates.append(top_10_candidates)

df_indosum_candidate_summary["top_10_candidates"] = all_top_10_candidates
df_indosum_candidate_summary.head(1)

    

Unnamed: 0,candidate_summaries,candidate_summary_ids,document_length,summary,candidate_summary_scores,top_10_candidates
0,[IDX Incubator kembali mengadakan sesi diskusi...,"[1 9, 1 7, 1 2, 1 10, 1 8, 1 3, 1 6, 1 5, 1 11...",11,IDX Incubator mengadakan sesi diskusi denga...,"[0.19247239125535268, 0.1767861167455488, 0.15...","[(1 9 7, 0.20734693877551022), (1 9, 0.1924723..."


In [26]:
with open("../data/indosum/indosum.train.label.multipleoracle", "w") as file:
    file.write("Hello, world!\n")
    # for i, row in df_indosum_candidate_summary.iterrows():

    