In [1]:
import pandas as pd
import random
import re
import math
import numpy as np
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

#bsoup
from urllib.request import urlopen
from bs4 import BeautifulSoup

# import StemmerFactory class
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Build one Sastrawi stemmer through its factory; this module-level `stemmer`
# is the object get_clean_corpus() calls `.stem()` on for every sentence.
factory = StemmerFactory()
stemmer = factory.create_stemmer()
from IPython.display import Markdown, display

In [2]:
#this function does cleaning, tokenize, remove stopwords, and stemming
def get_clean_corpus(corpus, stopwords):
    """Split raw text into sentences and build a cleaned version of each one.

    Per sentence the pipeline is: word-tokenize, drop URL-like tokens, replace
    symbols with spaces, stem the re-joined text with the module-level
    ``stemmer``, tokenize again, then drop stopwords and all-digit tokens.

    Parameters
    ----------
    corpus : str
        Raw document text.
    stopwords : iterable of str
        Words to remove after stemming.

    Returns
    -------
    pandas.DataFrame
        One row per sentence with columns:
        ``dokumen`` (original sentence), ``clean_corpus`` (cleaned tokens
        joined by single spaces) and ``terms`` (cleaned tokens, de-duplicated,
        first-occurrence order kept).
    """
    # A set gives O(1) membership tests instead of scanning a list per token.
    stopword_set = set(stopwords)

    # Tokens that *start* with "word.word" or "word.word.word" are treated as URLs.
    url_pattern = re.compile(r"\w+(?:(\.(\w+)\.(\w+)))|\w+(?:(\.(\w+)))")
    symbol_pattern = re.compile(r"[\-\+\=\:\;\"\\\@\[\]\,_!;.':#$%^&*()<>?/\|}{~:]")

    # segmentation into sentences
    sentences = sent_tokenize(corpus)
    result = pd.DataFrame(sentences, columns=['dokumen'])

    clean_corpus = []
    unique_terms = []
    for sentence in sentences:
        raw_tokens = [str(tok) for tok in word_tokenize(sentence)]

        # deleting url
        without_urls = [tok for tok in raw_tokens if not url_pattern.match(tok)]

        # deleting symbol (replaced by a space so glued words are separated)
        without_symbols = [symbol_pattern.sub(" ", tok) for tok in without_urls]

        # stemming
        stemmed_sentence = stemmer.stem(" ".join(without_symbols))

        # Delete stopwords and numbers in one pass. Filtering into a new list
        # avoids deleting from the list being iterated, which skipped items
        # and could remove a neighbouring token by mistake.
        tokens = [word for word in word_tokenize(stemmed_sentence)
                  if word not in stopword_set and not word.isdigit()]

        clean_corpus.append(" ".join(tokens))
        # dict.fromkeys de-duplicates while preserving order.
        unique_terms.append(list(dict.fromkeys(tokens)))

    result['clean_corpus'] = clean_corpus
    result['terms'] = unique_terms

    return result

In [3]:
#this function calculates term weighting
def get_term_weighting_score(cleaning_result):
    """Build the term/sentence frequency table and the DF/IDF table.

    Parameters
    ----------
    cleaning_result : pandas.DataFrame
        Must contain a ``terms`` column holding one list of terms per sentence
        (as produced by ``get_clean_corpus``).

    Returns
    -------
    terms_frequency : pandas.DataFrame
        Rows are the sorted vocabulary (index named ``terms``); columns are
        the sentence numbers as strings (``'1'``, ``'2'``, ...). Each cell is
        how many times the term occurs in that sentence's ``terms`` list.
        NOTE: get_clean_corpus stores de-duplicated terms, so with that input
        every cell is 0 or 1.
    df_idf : pandas.DataFrame
        Same index, with ``df_term`` (row sum of ``terms_frequency``) and
        ``idf_term`` = log10((N + 1) / df_term), N being the number of
        sentences.
    """
    sentence_terms = [list(sentence) for sentence in cleaning_result['terms']]

    # getting all the terms: a set removes duplicates without the quadratic
    # "not in list" scan, sorted() gives the same ordering as list.sort().
    terms = sorted({term for sentence in sentence_terms for term in sentence})

    # getting frequency for every sentence; build every column first and
    # create the frame once instead of inserting columns one at a time.
    frequency_columns = {
        str(position): [sentence.count(term) for term in terms]
        for position, sentence in enumerate(sentence_terms, start=1)
    }
    terms_frequency = pd.DataFrame(frequency_columns,
                                   index=pd.Index(terms, name='terms'))

    # getting df for every term (the index, already named 'terms', is reused)
    df_idf = terms_frequency.sum(axis=1).to_frame('df_term')

    # getting idf for every term; math.log(x, 10) is kept so the scores are
    # bit-for-bit what the original per-row loop produced.
    N = len(terms_frequency.columns)
    df_idf['idf_term'] = [math.log((N + 1) / df_term, 10)
                          for df_term in df_idf['df_term']]

    return terms_frequency, df_idf

In [4]:
class Sentence:
    """One sentence of the document, used as a node of the ranking graph.

    Attributes
    ----------
    id : sentence number; also used (as a string) as the column label when
        looking up term frequencies.
    full_sentence : original sentence text.
    clean_sentence : cleaned sentence text; its whitespace-split length is
        stored in ``sentence_len``.
    tokens : terms used as the BM25 query.
    list_bm25 : dict mapping another sentence's id to the BM25 score of this
        sentence's tokens against that sentence.
    pagerank_score / pagerank_score_new : current score and the score computed
        by the latest ``calculate_new_pagerank`` call.
    """

    # Kept from the original: seeds NumPy's global generator when the class
    # body runs. It does NOT influence the stdlib ``random`` module.
    np.random.seed(0)

    # Dedicated seeded generator for the initial PageRank scores, so a fresh
    # run of the notebook starts from the same values without touching the
    # global ``random`` state.
    _rng = random.Random(0)

    def __init__(self, id, full_sentence, clean_sentence, tokens):
        self.list_bm25 = {}
        self.pagerank_score = self._rng.random()
        self.id = id
        self.full_sentence = full_sentence
        self.clean_sentence = clean_sentence
        self.tokens = tokens
        self.sentence_len = len(clean_sentence.split())
        self.pagerank_score_new = 0

    def calculate_bm25(self, raw_frequency, idf, doc, slen_ave, k1=1.2, b=0.75):
        """Score this sentence's tokens against ``doc`` and store the result.

        Parameters
        ----------
        raw_frequency : DataFrame indexed by term, one column per sentence id
            (column labels are the ids as strings).
        idf : DataFrame indexed by term with an ``idf_term`` column.
        doc : Sentence being scored against.
        slen_ave : average sentence length used for length normalisation.
        k1, b : BM25 parameters (defaults are the values originally hard-coded).

        The sum over all query tokens is written to ``self.list_bm25[doc.id]``.
        """
        # Length normalisation depends only on ``doc``; compute it once.
        length_norm = k1 * ((1 - b) + (b * (doc.sentence_len / slen_ave)))
        doc_column = str(doc.id)

        total_bm25 = 0
        for query in self.tokens:
            tf = raw_frequency.at[str(query), doc_column]
            idff = idf.at[str(query), 'idf_term']
            total_bm25 += idff * ((k1 + 1) * tf) / (length_norm + tf)
        self.list_bm25[doc.id] = total_bm25

    def calculate_new_pagerank(self, doc, d=0.85):
        """Compute the next PageRank value from the other sentences in ``doc``.

        Parameters
        ----------
        doc : iterable of Sentence objects forming the graph (may include self).
        d : damping factor (default is the value originally hard-coded).

        The result is stored in ``self.pagerank_score_new``;
        ``self.pagerank_score`` is left untouched so all nodes can be updated
        from the same previous iteration.
        """
        sum_InVi = 0
        for item in doc:
            # Compare ids by value; identity (`is`) is only reliable for the
            # small integers CPython happens to cache.
            if self.id == item.id:
                continue
            total_Wjk = sum(item.list_bm25.values())
            if total_Wjk == 0:
                # A neighbour without any outgoing weight contributes nothing
                # (and would otherwise cause a division by zero).
                continue
            Wji = self.list_bm25[item.id]
            sum_InVi += Wji / total_Wjk * item.pagerank_score
        self.pagerank_score_new = (1 - d) + (d * sum_InVi)

In [5]:
class Graph:
    """Rank the sentences of a cleaned document and pick a summary.

    Construction runs the whole pipeline: build ``Sentence`` nodes, compute
    pairwise BM25 weights, set aside sentences with no positive weight, iterate
    PageRank, and keep the top-ranked sentences in document order.

    Parameters
    ----------
    result_doc : DataFrame with ``dokumen``, ``clean_corpus`` and ``terms``
        columns (one row per sentence).
    raw_frequency : term-by-sentence frequency DataFrame whose columns are the
        sentence numbers as strings.
    idf : DataFrame with ``df_term`` and ``idf_term`` columns indexed by term.
    cr : compression rate; ``ceil(number_of_sentences * cr)`` sentences are
        requested for the summary.

    Attributes
    ----------
    doc : ranked Sentence objects (outliers removed).
    doc_utuh : an independent, unscored Sentence object for every row.
    outlier : Sentence objects whose summed BM25 weight is <= 0.
    slen_ave : average cleaned-sentence length.
    summarize : list of the selected original sentences, in document order.
    """

    # reference: https://networkx.github.io/documentation/networkx-1.10/reference/generated/networkx.algorithms.link_analysis.pagerank_alg.pagerank.html
    PAGERANK_ITERATIONS = 100

    def __init__(self, result_doc, raw_frequency, idf, cr):
        self.raw_frequency = raw_frequency
        self.idf = idf.drop(columns=['df_term'])
        self.result_doc = result_doc
        self.total_doc = len(result_doc['dokumen'])
        self.slen_ave = 0
        self.summarize = []
        self.doc = []
        self.outlier = []
        self.compression_rate = cr
        self.doc_utuh = []

        self._build_sentences()
        self._compute_average_length()
        self._compute_bm25()
        self._separate_outliers()
        self._run_pagerank()
        self.summarize = self._select_summary()

    def _build_sentences(self):
        """Create the Sentence objects for ``doc`` and ``doc_utuh``."""
        # Ids are 1-based row positions so that str(id) always matches the
        # column labels of the frequency table, whatever the frame's index is.
        rows = self.result_doc.iterrows()
        for position, (_, item) in enumerate(rows, start=1):
            for target in (self.doc, self.doc_utuh):
                target.append(Sentence(position, item['dokumen'],
                                       item['clean_corpus'], item['terms']))

    def _compute_average_length(self):
        """Store the mean cleaned-sentence length (0 for an empty document)."""
        if self.doc:
            total_len = sum(item.sentence_len for item in self.doc)
            self.slen_ave = total_len / len(self.doc)

    def _compute_bm25(self):
        """Fill every sentence's ``list_bm25`` with its score against each other sentence."""
        for item in self.doc:
            for item2 in self.doc:
                # Value comparison: `is not` on ints tests object identity.
                if item.id != item2.id:
                    item.calculate_bm25(raw_frequency=self.raw_frequency,
                                        idf=self.idf, doc=item2,
                                        slen_ave=self.slen_ave)

    def _separate_outliers(self):
        """Move sentences whose total BM25 weight is <= 0 from ``doc`` to ``outlier``."""
        # Partition into new lists; popping from the list being enumerated
        # skips the element that follows each removed one.
        kept = []
        for item in self.doc:
            if sum(item.list_bm25.values()) <= 0:
                self.outlier.append(item)
            else:
                kept.append(item)
        self.doc = kept

    def _run_pagerank(self):
        """Iterate PageRank a fixed number of times over the remaining sentences."""
        for _ in range(self.PAGERANK_ITERATIONS):
            for item in self.doc:
                item.calculate_new_pagerank(self.doc)

            # update pagerank score only after every node has been recomputed
            for item in self.doc:
                item.pagerank_score = item.pagerank_score_new

    def _select_summary(self):
        """Return the top-ranked original sentences in document order."""
        wanted = math.ceil(self.total_doc * self.compression_rate)
        ranked = sorted(self.doc, key=lambda x: x.pagerank_score, reverse=True)
        # Slicing tolerates `wanted` exceeding the number of ranked sentences
        # (possible once outliers have been removed).
        chosen = sorted(ranked[:wanted], key=lambda x: x.id)
        return [item.full_sentence for item in chosen]

In [6]:
def get_document(url_bbcnews):
    """Download a news page and extract its body text and headline.

    The headline is read from ``<h1 class="story-body__h1">`` and the body
    from every ``<p>`` inside ``<div class="story-body__inner">``.

    Parameters
    ----------
    url_bbcnews : str
        URL of the article page.

    Returns
    -------
    (document, title) : tuple of str
        ``document`` is the paragraph texts, each prefixed with one space.
        If the page cannot be fetched, or the expected elements are missing,
        the sentinel pair ``("error", "none")`` is returned instead.
    """
    try:
        # The context manager closes the HTTP response once it has been read.
        with urlopen(url_bbcnews) as page_news:
            html = page_news.read()
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        return "error", "none"

    beautysoup = BeautifulSoup(html, 'html.parser')
    title_tag = beautysoup.find('h1', {"class": "story-body__h1"})
    text_news = beautysoup.find('div', {"class": "story-body__inner"})
    if title_tag is None or text_news is None:
        # Page layout is not the expected one; report it through the same
        # sentinel instead of failing with AttributeError on None.
        return "error", "none"

    document = ''.join(' ' + data.text for data in text_news.find_all('p'))

    return document, title_tag.text

# Main system

In [7]:
# print("Choose compression rate:\n 1. 5% \n 2. 10% \n 3. 20% \n 4. 30%")

In [8]:
# Interactive driver: ask for an article URL and a compression rate, then
# print the article title and its extractive summary.
# Example URL: https://www.bbc.com/indonesia/indonesia-50038237

# Menu option -> fraction of sentences kept in the summary.
COMPRESSION_RATES = {"1": 0.05, "2": 0.1, "3": 0.2, "4": 0.3}

document2 = input("URL berita BBC Indonesia: ")

print("Jenis compression rate:\n 1. 5% \n 2. 10% \n 3. 20% \n 4. 30%")
cr2 = input("Pilih jenis compression rate: ")

if cr2 in COMPRESSION_RATES:
    # `with` guarantees the stopword file is closed after reading.
    with open("../stopword_list_tala.txt", "r") as stopword:
        stopwords = stopword.read().split("\n")
    document3, title = get_document(document2)

    # get_document signals failure with the ("error", "none") sentinel pair.
    if document3 == "error" or title == "none":
        display(Markdown('**sorry, i cant access the url**'))
    else:
        cleaning_result2 = get_clean_corpus(corpus=document3, stopwords=stopwords)
        terms_frequency2, df_idf2 = get_term_weighting_score(cleaning_result=cleaning_result2)

        crate = COMPRESSION_RATES[cr2]

        percobaan2 = Graph(result_doc=cleaning_result2, raw_frequency=terms_frequency2,
                           idf=df_idf2, cr=float(crate))
        display(Markdown('**Judul berita:**'))
        print(title)

        # Every selected sentence is followed by one space.
        summarize = "".join(ringkasan + " " for ringkasan in percobaan2.summarize)

        display(Markdown('**Hasil ringkasan:**'))
        print(summarize)
else:
    display(Markdown('**sorry, wrong input**'))


URL berita BBC Indonesia: https://www.bbc.com/indonesia/indonesia-50038237
Jenis compression rate:
 1. 5% 
 2. 10% 
 3. 20% 
 4. 30%
Pilih jenis compression rate: 4


**Judul berita:**

Asap Palembang: Kabut selimuti ibu kota Sumatera Selatan, siswa sekolah diliburkan


**Hasil ringkasan:**

 Sejumlah sekolah di Kota Palembang, Sumatera Selatan, memutuskan memulangkan siswa-siswa mereka lantaran kabut asap semakin tebal menyelimuti kota tersebut. "Pagi ini kami memulangkan siswa karena melihat kabut asap yang tebal dan berdampak buruk terhadap siswa, oleh karenanya atas instruksi Kadiknas Kota Palembang melalui pesan WA Grup meminta siswa dipulangkan dan belajar di rumah masing-masing saja," jelas Siti kepada radio Elshinta. Akan tetapi, sebagaimana dipaparkan Kepala Dinas Pendidikan Sumatera Selatan, Widodo, kegiatan belajar mengajar di daerah yang tidak terdampak kabut asap tetap berlangsung. "Daerah yang tidak terdampak kabut asap tetap normal tetap belajar, untuk daerah yang terkategori sedang tetap belajar namun jam masuk sekolah diundur dan kami himbau memakai masker, bagi daerah terkategori parah maka siswa diberikan tugas dengan memaksimalkan kelas daring," kata Widodo kepada Antara. "Melalui pesan digital, Kepala Dinas Pendidikan Kota Palembang menginstruksikan ke

In [9]:
cleaning_result2

Unnamed: 0,dokumen,clean_corpus,terms
0,"Sejumlah sekolah di Kota Palembang, Sumatera ...",sekolah kota palembang sumatera selatan putus ...,"[sekolah, kota, palembang, sumatera, selatan, ..."
1,pada Senin (14/10) pagi.,senin pagi,"[senin, pagi]"
2,"Kepala SMP Negeri 7 Palembang, Siti Zubaida, m...",kepala smp negeri palembang siti zubaida putus...,"[kepala, smp, negeri, palembang, siti, zubaida..."
3,"""Pagi ini kami memulangkan siswa karena meliha...",pagi pulang siswa lihat kabut asap tebal dampa...,"[pagi, pulang, siswa, lihat, kabut, asap, teba..."
4,Hal ini diamini Kepala Dinas Pendidikan Kota P...,amin kepala dinas didik kota palembang ahmad z...,"[amin, kepala, dinas, didik, kota, palembang, ..."
5,"""Hari ini seluruh TK hingga SMP negeri dan swa...",tk smp negeri swasta derajat libur besok edar ...,"[tk, smp, negeri, swasta, derajat, libur, beso..."
6,"Menurutnya, kalau kualitas udara Palembang mas...",kualitas udara palembang buruk giat ajar ajar ...,"[kualitas, udara, palembang, buruk, giat, ajar..."
7,"Akan tetapi, sebagaimana dipaparkan Kepala Din...",papar kepala dinas didik sumatera selatan wido...,"[papar, kepala, dinas, didik, sumatera, selata..."
8,"""Daerah yang tidak terdampak kabut asap tetap ...",daerah dampak kabut asap normal ajar daerah ka...,"[daerah, dampak, kabut, asap, normal, ajar, ka..."
9,"Hal itu belakangan dibenarkan Agus Wibowo, sel...",agus wibowo kepala pusat data informasi humas ...,"[agus, wibowo, kepala, pusat, data, informasi,..."


In [11]:
df_idf2

Unnamed: 0_level_0,df_term,idf_term
terms,Unnamed: 1_level_1,Unnamed: 2_level_1
agus,2,1.243038
ahmad,2,1.243038
ajar,5,0.845098
akibat,4,0.942008
akses,1,1.544068
aktivitas,1,1.544068
amelia,1,1.544068
amin,1,1.544068
anak,1,1.544068
andriawan,1,1.544068
