In [1]:
!ls ../data/indosum

CHANGELOG.txt  dev.03.jsonl   test.02.jsonl  train.01.jsonl train.05.jsonl
README.txt     dev.04.jsonl   test.03.jsonl  train.02.jsonl
dev.01.jsonl   dev.05.jsonl   test.04.jsonl  train.03.jsonl
dev.02.jsonl   test.01.jsonl  test.05.jsonl  train.04.jsonl


In [16]:
import jsonlines
import glob

lines = []
for f in glob.glob("../data/indosum/train.*.jsonl"):
    with jsonlines.open(f) as infile:
        for obj in infile:
            lines.append(obj)

import pandas as pd

df = pd.DataFrame(lines)
df.head()

df_indosum = pd.DataFrame()

all_document_sentences = []
all_document_summary = [] 
for i, row in df.iterrows():
    # create document sentences
    document_sentences = []
    for j, sentence in enumerate(row['paragraphs']):
        document_sentences.append(" ".join(sentence[0]))
    all_document_sentences.append(document_sentences)

    # create document summary
    summaries = ""
    for j, sentence in enumerate(row['summary']):
        summaries += " ".join(sentence)
        summaries += " "
    all_document_summary.append(summaries)

    if i == 39999:
        break

df_indosum["document_sentences"] = all_document_sentences
df_indosum["summary"] = all_document_summary
df_indosum['source'] = "liputan6" 
''
print(len(df_indosum))

40000


### Calculate sentence-wise ROUGE score

In [17]:
# https://github.com/google-research/google-research/blob/master/rouge/rouge_scorer.py
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(
    'IDX Incubator \xa0  mengadakan sesi diskusi dengan tema “ Technology vs Humanity ” ',
    'IDX Incubator adakan sesi diskusi dengan tema Technology vs Humanity'
)
print(scores)

{'rouge1': Score(precision=0.9, recall=0.9, fmeasure=0.9), 'rouge2': Score(precision=0.7777777777777778, recall=0.7777777777777778, fmeasure=0.7777777777777778), 'rougeL': Score(precision=0.9, recall=0.9, fmeasure=0.9)}


In [18]:
all_document_sentences_f1_rouge_scores = []
for i, row in df_indosum.iterrows():
    document_sentences_f1_rouge_scores = []
    for sentence in row["document_sentences"]:
        scores = scorer.score(sentence, row["summary"])
        rouge1_f1 = scores["rouge1"].fmeasure
        rouge2_f1 = scores["rouge2"].fmeasure
        rougeL_f1 = scores["rougeL"].fmeasure
        rouge_score = (rouge1_f1 + rouge2_f1 + rougeL_f1) / 3.0
        document_sentences_f1_rouge_scores.append(rouge_score)
    all_document_sentences_f1_rouge_scores.append(document_sentences_f1_rouge_scores)

df_indosum["document_sentences_f1_rouge_scores"] = all_document_sentences_f1_rouge_scores
df_indosum.tail(5)

Unnamed: 0,document_sentences,summary,source,document_sentences_f1_rouge_scores
39995,"[Jakarta , CNN Indonesia - - Pebalap Movistar ...","Pebalap Movistar Yamaha , Valentino Rossi , me...",liputan6,"[0.45098361554057753, 0.26115575966322235, 0.1..."
39996,[Cagub dan cawagub DKI terpilih Anies Baswedan...,Cagub dan cawagub DKI terpilih Anies Baswedan ...,liputan6,"[0.5701365817644888, 0.05882352941176472, 0.55..."
39997,"[Jakarta , CNN Indonesia - - Usai memerankan s...",Usai memerankan sepasang kekasih dalam film le...,liputan6,"[0.509009009009009, 0.5908408408408408, 0.1489..."
39998,[Sedikitnya 12 orang tewas saat sebuah bus jat...,Sedikitnya 12 orang tewas saat sebuah bus jatu...,liputan6,"[0.44238975817923176, 0.2891117321497068, 0.44..."
39999,"[Jakarta , CNN Indonesia - - Hasil undian Pra ...",Laga Piala Asia U - 23 akan segera dimulai . I...,liputan6,"[0.10597439544807967, 0.14022196708763876, 0.0..."


### Sort sentence-wise ROUGE score with highest score on top

In [19]:
df_indosum.head(1).iloc[0]["summary"]

'Setelah selesai membentuk \xa0 holding \xa0 BUMN pertambangan , kini pemerintah tengah menggarap holding - holding lainnya , salah satunya \xa0 holding \xa0 BUMN jasa keuangan . Direktur Utama BRI , Suprajarto , berharap pembentukan \xa0 holding \xa0 BUMN jasa keuangan segera terealisasi . Jika \xa0 holding \xa0 jasa keuangan cepat terbentuk , pengadaan ATM bisa jauh lebih efisien secara teknologi dan biaya . Promosi pun bisa dilakukan bersama dengan bank-bank negara lainnya atau Himpunan Bank - bank Milik Negara . '

In [20]:
all_top_10_sentences = [] 
for i, row in df_indosum.iterrows():
    document_sentences_and_scores = [] 
    for sentences_and_scores in zip(
        row["document_sentences"], 
        row["document_sentences_f1_rouge_scores"], 
        [x+1 for x in range(len(row["document_sentences"]))]
    ):
        document_sentences_and_scores.append(sentences_and_scores)

    top_10_sentences = sorted(document_sentences_and_scores, key=lambda x: x[1], reverse=True)[:10]
    all_top_10_sentences.append([(x[0],x[2]) for x in top_10_sentences])

df_indosum["top_10_sentences"] = all_top_10_sentences
df_indosum.head(1)

Unnamed: 0,document_sentences,summary,source,document_sentences_f1_rouge_scores,top_10_sentences
0,[Setelah selesai membentuk holding BUMN pertam...,Setelah selesai membentuk holding BUMN per...,liputan6,"[0.45894308943089435, 0.0905349794238683, 0.11...",[(Setelah selesai membentuk holding BUMN perta...


In [21]:
df_indosum.head(1).iloc[0]["top_10_sentences"][:2]

[('Setelah selesai membentuk holding BUMN pertambangan , kini pemerintah tengah menggarap holding - holding lainnya , salah satunya holding BUMN jasa keuangan .',
  1),
 ('Orang nomor satu BRI ini mengatakan , jika holding jasa keuangan cepat terbentuk , pengadaan ATM bisa jauh lebih efisien secara teknologi dan biaya .',
  6)]


### Make candidate summary by combination of (1, 2, 3) sentence length created from 10 sentences with highest ROUGE score

In [22]:
import itertools

df_indosum_candidate_summary = pd.DataFrame()

all_candidate_summary_txt = []
all_candidate_summary_ids_txt = []
all_document_length = [] 
for i, row in df_indosum.iterrows():
    document_candidate_summaries = []
    document_candidate_summaries_idx = []

    for itemcount in [1,2,3]:
        candidate_summaries = list(itertools.combinations(list(row["top_10_sentences"]), itemcount))
        for candidate_summary in candidate_summaries:
            candidate_summary_txt = " ".join([val[0] for val in candidate_summary])
            candidate_summary_ids = " ".join([str(val[1]) for val in candidate_summary])

            document_candidate_summaries.append(candidate_summary_txt)
            document_candidate_summaries_idx.append(candidate_summary_ids)

    all_candidate_summary_txt.append(document_candidate_summaries)
    all_candidate_summary_ids_txt.append(document_candidate_summaries_idx)
    all_document_length.append(len(row["document_sentences"]))

df_indosum_candidate_summary["candidate_summaries"] = all_candidate_summary_txt
df_indosum_candidate_summary["candidate_summary_ids"] = all_candidate_summary_ids_txt
df_indosum_candidate_summary["document_length"] = all_document_length
df_indosum_candidate_summary["summary"] = df_indosum["summary"]
df_indosum_candidate_summary.head(1)

Unnamed: 0,candidate_summaries,candidate_summary_ids,document_length,summary
0,[Setelah selesai membentuk holding BUMN pertam...,"[1, 6, 4, 3, 9, 8, 2, 5, 7, 10, 1 6, 1 4, 1 3,...",10,Setelah selesai membentuk holding BUMN per...


In [23]:
df_indosum_candidate_summary.head(1).iloc[0]["summary"]

'Setelah selesai membentuk \xa0 holding \xa0 BUMN pertambangan , kini pemerintah tengah menggarap holding - holding lainnya , salah satunya \xa0 holding \xa0 BUMN jasa keuangan . Direktur Utama BRI , Suprajarto , berharap pembentukan \xa0 holding \xa0 BUMN jasa keuangan segera terealisasi . Jika \xa0 holding \xa0 jasa keuangan cepat terbentuk , pengadaan ATM bisa jauh lebih efisien secara teknologi dan biaya . Promosi pun bisa dilakukan bersama dengan bank-bank negara lainnya atau Himpunan Bank - bank Milik Negara . '

In [24]:
df_indosum_candidate_summary.head(1).iloc[0]["candidate_summaries"][:2]

['Setelah selesai membentuk holding BUMN pertambangan , kini pemerintah tengah menggarap holding - holding lainnya , salah satunya holding BUMN jasa keuangan .',
 'Orang nomor satu BRI ini mengatakan , jika holding jasa keuangan cepat terbentuk , pengadaan ATM bisa jauh lebih efisien secara teknologi dan biaya .']

### Calculate summary-wise ROUGE score

In [25]:
# input: list of kalimat dalam dokumen dan summary
# output: list of scores 

documents = list(
    zip(
        df_indosum_candidate_summary["candidate_summaries"], 
        df_indosum_candidate_summary["summary"]
    )
)

def cal_rouge(document):
    candidate_summaries = document[0]
    summary = document[1]
    candidate_summary_scores = []
    for candidate_summary in candidate_summaries:
        scores = scorer.score(candidate_summary, summary)
        rouge1_f1 = scores["rouge1"].fmeasure
        rouge2_f1 = scores["rouge2"].fmeasure
        rougeL_f1 = scores["rougeL"].fmeasure
        rouge_score = (rouge1_f1 + rouge2_f1 + rougeL_f1) / 3.0
        candidate_summary_scores.append(rouge_score)
        
    return candidate_summary_scores


In [26]:
import multiprocess
from multiprocess import Pool
import tqdm

cores = multiprocess.cpu_count() # Count the number of cores in a computer
print(f"multiprocessing with {cores} cores")


with Pool(processes=cores) as pool:
    results = list(tqdm.tqdm(pool.imap_unordered(cal_rouge, documents), total=len(documents)))

df_indosum_candidate_summary["candidate_summary_scores"] = results
df_indosum_candidate_summary.tail(5)

multiprocessing with 8 cores


100%|██████████| 40000/40000 [39:35<00:00, 16.84it/s] 


Unnamed: 0,candidate_summaries,candidate_summary_ids,document_length,summary,candidate_summary_scores
39995,"[Jakarta , CNN Indonesia - - Pebalap Movistar ...","[1, 2, 6, 3, 4, 7, 5, 1 2, 1 6, 1 3, 1 4, 1 7,...",7,"Pebalap Movistar Yamaha , Valentino Rossi , me...","[0.5676190476190478, 0.17828282828282827, 0.16..."
39996,[Cagub dan cawagub DKI terpilih Anies Baswedan...,"[1, 3, 7, 17, 6, 14, 4, 11, 10, 13, 1 3, 1 7, ...",17,Cagub dan cawagub DKI terpilih Anies Baswedan ...,"[0.48900432900432894, 0.20005772005772005, 0.1..."
39997,"[Seperti baru-baru ini , Leo dan Kate melelang...","[2, 1, 10, 3, 4, 6, 11, 8, 5, 9, 2 1, 2 10, 2 ...",13,Usai memerankan sepasang kekasih dalam film le...,"[0.6193602693602694, 0.48268106162843, 0.23598..."
39998,[Sedikitnya 12 orang tewas saat sebuah bus jat...,"[1, 3, 2, 4, 5, 1 3, 1 2, 1 4, 1 5, 3 2, 3 4, ...",5,Sedikitnya 12 orang tewas saat sebuah bus jatu...,"[0.5701365817644888, 0.5556907820599504, 0.117..."
39999,[Tim - tim yang akan lolos adalah juara setiap...,"[8, 7, 9, 2, 4, 5, 1, 6, 3, 8 7, 8 9, 8 2, 8 4...",9,Laga Piala Asia U - 23 akan segera dimulai . I...,"[0.5908408408408408, 0.509009009009009, 0.1567..."


### Sort candidate summary by score with highest score on top

In [27]:
all_top_10_candidates = [] 
for i, row in df_indosum_candidate_summary.iterrows():
    document_candidates_and_scores = [] 
    for candidates_and_scores in zip(
        row["candidate_summary_ids"], 
        row["candidate_summary_scores"], 
    ):
        document_candidates_and_scores.append(candidates_and_scores)

    top_10_candidates = sorted(document_candidates_and_scores, key=lambda x: x[1], reverse=True)[:10]
    all_top_10_candidates.append(top_10_candidates)

df_indosum_candidate_summary["top_10_candidates"] = all_top_10_candidates
df_indosum_candidate_summary.head(10)

    

Unnamed: 0,candidate_summaries,candidate_summary_ids,document_length,summary,candidate_summary_scores,top_10_candidates
0,[Setelah selesai membentuk holding BUMN pertam...,"[1, 6, 4, 3, 9, 8, 2, 5, 7, 10, 1 6, 1 4, 1 3,...",10,Setelah selesai membentuk holding BUMN per...,"[0.6502659574468085, 0.5077519379844961, 0.165...","[(2, 0.9033703487239432), (6 8, 0.757366771159..."
1,"[Jakarta , CNN Indonesia - - Presiden Joko Wid...","[1, 5, 3, 2, 4, 6, 7, 8, 1 5, 1 3, 1 2, 1 4, 1...",8,Presiden Joko Widodo menyatakan sebidang tanah...,"[0.4912280701754386, 0.3214695752009185, 0.205...","[(1 5, 0.6714882259244909), (1 5 8, 0.64943639..."
2,"[Sementara itu , kepada petugas tersangka meng...","[8, 1, 9, 6, 5, 2, 7, 4, 12, 10, 8 1, 8 9, 8 6...",12,Polres Tanjungbalai menggagalkan upaya perdaga...,"[0.575154101469891, 0.4607370248890604, 0.1524...","[(7, 0.6933031503489908), (1 5, 0.665183207239..."
3,[Juergen Klopp yakin kalau tim asuhannya itu b...,"[3, 1, 2, 4, 6, 7, 5, 8, 9, 3 1, 3 2, 3 4, 3 6...",9,"Pelatih Liverpool FC , Juergen Klopp , optimis...","[0.4119402985074627, 0.3625559625559626, 0.214...","[(3 1, 0.6466107731930517), (3 1 9, 0.63051518..."
4,[Kebahagiaan tengah melingkupi hati pasangan D...,"[1, 2, 4, 6, 5, 12, 9, 11, 10, 3, 1 2, 1 4, 1 ...",12,Kebahagiaan tengah melingkupi hati pasangan Di...,"[0.4878048780487805, 0.43094916779127307, 0.11...","[(10, 0.7373404007067373), (5 9, 0.67032163742..."
5,[INDIA – Presiden Jokowi tiba di India setelah...,"[1, 2, 4, 3, 5, 6, 1 2, 1 4, 1 3, 1 5, 1 6, 2 ...",6,Presiden Jokowi tiba di India setelah menempuh...,"[0.30756302521008405, 0.15156794425087106, 0.0...","[(1 4, 0.326984126984127), (1 2, 0.32491982644..."
6,[Pagi ini Stasiun Klender terbakar dan menyeba...,"[1, 3, 2, 7, 9, 8, 6, 4, 5, 1 3, 1 2, 1 7, 1 9...",9,Pagi ini Stasiun Klender terbakar dan menyebab...,"[0.46279942279942277, 0.37150308202939786, 0.3...","[(1 3 7, 0.7100301659125189), (1 2 7, 0.705782..."
7,"[Jalan Raya Porong di Kabupaten Sidoarjo , Jaw...","[1, 2, 4, 6, 8, 7, 3, 5, 1 2, 1 4, 1 6, 1 8, 1...",8,"Jalan Raya Porong di Kabupaten Sidoarjo , tere...","[0.2671554440800052, 0.19316239316239314, 0.13...","[(2 4 8, 0.4307866440294284), (1 6, 0.38787878..."
8,[Bermain di Stadion Hoa Xuan Stadium pada Rabu...,"[2, 1, 4, 5, 3, 6, 2 1, 2 4, 2 5, 2 3, 2 6, 1 ...",6,Timnas Indonesia U - 16 melanjutkan hasil apik...,"[0.7848809677558225, 0.5090090090090089, 0.153...","[(2 5, 0.8683300465860698), (2, 0.784880967755..."
9,"[Dari dalam negeri , IHSG berhasil melanjutkan...","[5, 1, 3, 2, 9, 8, 7, 4, 10, 6, 5 1, 5 3, 5 2,...",13,Indeks Harga Saham Gabungan ( IHSG ) telah dip...,"[0.45894308943089435, 0.38714859437751, 0.2685...","[(5 1 3, 0.7036136768757091), (5 1, 0.67722473..."


In [28]:
with open(f"../data/indosum/indosum{len(df_indosum_candidate_summary)}.train.label.multipleoracle", "w") as file:
    for i, row in df_indosum_candidate_summary.iterrows():
        file.write(f"indosum-{i}\n")
        file.write(f"{row['document_length']}\n")
        for candidate_summary_tuple in row["top_10_candidates"]:
            file.write(f"{candidate_summary_tuple[0]} {str(candidate_summary_tuple[1])[:14]}\n")
        file.write("\n")

In [29]:
df_indosum_candidate_summary.to_pickle(f"../data/indosum/df_indosum_candidate_summary_{len(df_indosum_candidate_summary)}.pickle")

In [30]:
df_indosum_candidate_summary

Unnamed: 0,candidate_summaries,candidate_summary_ids,document_length,summary,candidate_summary_scores,top_10_candidates
0,[Setelah selesai membentuk holding BUMN pertam...,"[1, 6, 4, 3, 9, 8, 2, 5, 7, 10, 1 6, 1 4, 1 3,...",10,Setelah selesai membentuk holding BUMN per...,"[0.6502659574468085, 0.5077519379844961, 0.165...","[(2, 0.9033703487239432), (6 8, 0.757366771159..."
1,"[Jakarta , CNN Indonesia - - Presiden Joko Wid...","[1, 5, 3, 2, 4, 6, 7, 8, 1 5, 1 3, 1 2, 1 4, 1...",8,Presiden Joko Widodo menyatakan sebidang tanah...,"[0.4912280701754386, 0.3214695752009185, 0.205...","[(1 5, 0.6714882259244909), (1 5 8, 0.64943639..."
2,"[Sementara itu , kepada petugas tersangka meng...","[8, 1, 9, 6, 5, 2, 7, 4, 12, 10, 8 1, 8 9, 8 6...",12,Polres Tanjungbalai menggagalkan upaya perdaga...,"[0.575154101469891, 0.4607370248890604, 0.1524...","[(7, 0.6933031503489908), (1 5, 0.665183207239..."
3,[Juergen Klopp yakin kalau tim asuhannya itu b...,"[3, 1, 2, 4, 6, 7, 5, 8, 9, 3 1, 3 2, 3 4, 3 6...",9,"Pelatih Liverpool FC , Juergen Klopp , optimis...","[0.4119402985074627, 0.3625559625559626, 0.214...","[(3 1, 0.6466107731930517), (3 1 9, 0.63051518..."
4,[Kebahagiaan tengah melingkupi hati pasangan D...,"[1, 2, 4, 6, 5, 12, 9, 11, 10, 3, 1 2, 1 4, 1 ...",12,Kebahagiaan tengah melingkupi hati pasangan Di...,"[0.4878048780487805, 0.43094916779127307, 0.11...","[(10, 0.7373404007067373), (5 9, 0.67032163742..."
...,...,...,...,...,...,...
39995,"[Jakarta , CNN Indonesia - - Pebalap Movistar ...","[1, 2, 6, 3, 4, 7, 5, 1 2, 1 6, 1 3, 1 4, 1 7,...",7,"Pebalap Movistar Yamaha , Valentino Rossi , me...","[0.5676190476190478, 0.17828282828282827, 0.16...","[(1 5, 0.5782576596530085), (1, 0.567619047619..."
39996,[Cagub dan cawagub DKI terpilih Anies Baswedan...,"[1, 3, 7, 17, 6, 14, 4, 11, 10, 13, 1 3, 1 7, ...",17,Cagub dan cawagub DKI terpilih Anies Baswedan ...,"[0.48900432900432894, 0.20005772005772005, 0.1...","[(1 3, 0.515688445921004), (11 10, 0.494699113..."
39997,"[Seperti baru-baru ini , Leo dan Kate melelang...","[2, 1, 10, 3, 4, 6, 11, 8, 5, 9, 2 1, 2 10, 2 ...",13,Usai memerankan sepasang kekasih dalam film le...,"[0.6193602693602694, 0.48268106162843, 0.23598...","[(2 1, 0.8552973791763124), (2 1 3, 0.83807219..."
39998,[Sedikitnya 12 orang tewas saat sebuah bus jat...,"[1, 3, 2, 4, 5, 1 3, 1 2, 1 4, 1 5, 3 2, 3 4, ...",5,Sedikitnya 12 orang tewas saat sebuah bus jatu...,"[0.5701365817644888, 0.5556907820599504, 0.117...","[(3 4, 0.8662467777512025), (1 3 5, 0.57725786..."


### Get top candidate's average sentence length

In [32]:
sentence_length_sum = 0
total_sentence = 0
for i, row in df_indosum_candidate_summary.iterrows():
    top_10_candidates = row["top_10_candidates"]
    top_candidate_sentences_idx = top_10_candidates[0][0]
    top_candidate_summary_id_idx = row["candidate_summary_ids"].index(top_candidate_sentences_idx)


    top_candidate_txt = row["candidate_summaries"][top_candidate_summary_id_idx]
    top_candidate_sentences = top_candidate_txt.split(".")

    for sentence in top_candidate_sentences:
        total_sentence += 1
        sentence_length_sum += len(sentence.split(" "))

print(f"average token per sentence of top candidates for every document: {sentence_length_sum/total_sentence}")

average token per sentence of top candidates for every document: 15.593156955543487
