In [1]:
import pandas as pd
import random
import re
import math
import numpy as np
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

# import StemmerFactory class
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# create stemmer
# A single stemmer instance is built once at module level; get_clean_corpus
# reads it as a global, so this cell must run before that function is called.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [2]:
#this function does cleaning, tokenize, remove stopwords, and stemming
def get_clean_corpus(corpus, stopwords):
    """Split a raw document into sentences and pre-process each one.

    For every sentence the pipeline is: word-tokenize, drop tokens that look
    like URLs / dotted words, replace punctuation with spaces, stem with the
    module-level ``stemmer``, re-tokenize, then drop stop words and tokens
    made only of digits.

    Parameters
    ----------
    corpus : str
        The raw document text.
    stopwords : iterable of str
        Words to remove after stemming.

    Returns
    -------
    pandas.DataFrame
        One row per sentence with the columns ``dokumen`` (original
        sentence), ``clean_corpus`` (surviving tokens joined by a space) and
        ``terms`` (surviving tokens, de-duplicated, first-seen order).
    """
    # A set gives O(1) membership tests; the content is unchanged.
    stopword_set = set(stopwords)

    #segmentation
    sentences = sent_tokenize(corpus)
    corpus = pd.DataFrame(sentences, columns=['dokumen'])

    clean_corpus = []
    token = []
    for sentence in corpus['dokumen']:
        raw_terms = word_tokenize(sentence)

        #deleting url
        without_url = [
            raw_term for raw_term in raw_terms
            if not re.match(r"\w+(?:(\.(\w+)\.(\w+)))|\w+(?:(\.(\w+)))", str(raw_term))
        ]

        #deleting symbol
        without_symbol = [
            re.sub(r"[\-\+\=\:\;\"\\\@\[\]\,_!;.':#$%^&*()<>?/\|}{~:]", " ", str(raw_term))
            for raw_term in without_url
        ]

        #stemming
        stemmed_sentence = stemmer.stem(" ".join(without_symbol))

        # Delete stop words and numbers in one pass that builds a new list.
        # The previous version deleted from the list while enumerating it,
        # which skips elements and could delete an innocent neighbouring
        # token (or raise IndexError) when a word matched both conditions.
        tokens = [
            word for word in word_tokenize(stemmed_sentence)
            if word not in stopword_set and not word.isdigit()
        ]

        clean_corpus.append(" ".join(tokens))
        # dict.fromkeys de-duplicates while keeping first-seen order.
        token.append(list(dict.fromkeys(tokens)))

    corpus['clean_corpus'] = clean_corpus
    corpus['terms'] = token

    return corpus

In [3]:
#this function calculates term weighting
def get_term_weighting_score(cleaning_result):
    """Build the term-frequency table and the df/idf table for a document.

    Parameters
    ----------
    cleaning_result : pandas.DataFrame
        Must have a ``terms`` column holding one list of terms per sentence.
        When the lists are de-duplicated (as get_clean_corpus produces them)
        every frequency is 0 or 1.

    Returns
    -------
    terms_frequency : pandas.DataFrame
        Rows are the sorted vocabulary (index named ``terms``); columns are
        the sentence numbers as strings (``'1'`` .. ``'N'``); each cell is how
        often the term occurs in that sentence's term list.
    df_idf : pandas.DataFrame
        Same index; ``df_term`` is the number of sentences containing the
        term and ``idf_term`` is ``log10((N + 1) / df_term)``.
    """
    sentence_terms = list(cleaning_result['terms'])

    #getting all the terms (a set avoids the quadratic "not in list" scan)
    terms = sorted({term for sentence in sentence_terms for term in sentence})

    #getting frequency for every sentences
    # Build all columns first and create the frame once instead of inserting
    # columns one at a time into an empty DataFrame.
    frequency_by_sentence = {
        str(position + 1): [sentence.count(term) for term in terms]
        for position, sentence in enumerate(sentence_terms)
    }
    terms_frequency = pd.DataFrame(
        frequency_by_sentence, index=pd.Index(terms, name='terms')
    )

    #getting df for every terms
    # Count the sentences in which the term occurs at least once. Summing the
    # raw counts only equals the document frequency when each sentence's term
    # list is de-duplicated.
    document_frequency = (terms_frequency > 0).sum(axis=1)
    df_idf = pd.DataFrame({'df_term': document_frequency})

    #getting idf for every terms
    N = len(terms_frequency.columns)
    df_idf['idf_term'] = [
        math.log((N + 1) / term_df, 10) for term_df in document_frequency
    ]

    return terms_frequency, df_idf

In [4]:
class Sentence:
    """One sentence of the document, acting as a node of the ranking graph.

    Holds the sentence text and tokens, its BM25 scores towards other
    sentences (``list_bm25``, keyed by the other sentence's id) and its
    current / next PageRank scores.
    """
    # The initial PageRank score is drawn from the stdlib ``random`` module,
    # so that generator has to be seeded too; seeding only numpy left the
    # starting scores unseeded. The numpy seed is kept for anything else
    # that may rely on it.
    np.random.seed(0)
    random.seed(0)

    def __init__(self, id, full_sentence, clean_sentence, tokens):
        """Store the sentence data and draw a random initial PageRank score.

        Parameters
        ----------
        id : int
            Sentence number; also used (as a string) as the column label when
            looking up term frequencies.
        full_sentence : str
            The original sentence text.
        clean_sentence : str
            The pre-processed sentence; its whitespace-separated word count
            becomes ``sentence_len``.
        tokens : list of str
            Terms of the sentence, used as the BM25 query.
        """
        self.list_bm25 = {}
        self.pagerank_score = random.random()
        self.id = id
        self.full_sentence = full_sentence
        self.clean_sentence = clean_sentence
        self.tokens = tokens
        self.sentence_len = len(clean_sentence.split())
        self.pagerank_score_new = 0

    def calculate_bm25(self, raw_frequency, idf, doc, slen_ave):
        """Score ``doc`` against this sentence's tokens and store the result.

        Parameters
        ----------
        raw_frequency : pandas.DataFrame
            Term-frequency table indexed by term with one column per sentence
            id (as a string).
        idf : pandas.DataFrame
            Table indexed by term with an ``idf_term`` column.
        doc : Sentence
            The sentence being scored.
        slen_ave : float
            Average sentence length, used for length normalisation.

        The total is written to ``self.list_bm25[doc.id]``; nothing is
        returned.
        """
        k1 = 1.2
        b = 0.75
        # The length-normalisation factor does not depend on the query term.
        length_norm = k1 * ((1 - b) + (b * (doc.sentence_len / slen_ave)))
        total_bm25 = 0
        for query in self.tokens:
            tf = raw_frequency.at[str(query), str(doc.id)]
            idff = idf.at[str(query), 'idf_term']
            total_bm25 += idff * ((k1 + 1) * tf) / (length_norm + tf)
        self.list_bm25[doc.id] = total_bm25

    def calculate_new_pagerank(self, doc):
        """Compute the next PageRank score from the other sentences in ``doc``.

        Parameters
        ----------
        doc : list of Sentence
            All sentences taking part in the ranking. Entries whose id equals
            this sentence's id are skipped.

        The result is stored in ``self.pagerank_score_new``; the current
        ``pagerank_score`` is left untouched so all nodes can be updated from
        the same snapshot.
        """
        d = 0.85
        sum_InVi = 0
        for item in doc:
            # Compare ids by value: identity ("is not") is only reliable for
            # small cached ints or when both ids are the very same object.
            if self.id != item.id:
                total_Wjk = sum(item.list_bm25.values())
                # A neighbour with no outgoing weight contributes nothing and
                # would otherwise cause a division by zero.
                if total_Wjk == 0:
                    continue
                # NOTE(review): this takes the weight from self's table
                # (self -> item) while normalising by item's outgoing total;
                # confirm whether item.list_bm25[self.id] was intended.
                Wji = self.list_bm25[item.id]
                sum_InVi += Wji / total_Wjk * item.pagerank_score
        self.pagerank_score_new = (1 - d) + (d * sum_InVi)

In [5]:
class Graph:
    """Rank the sentences of a document and pick the top ones as a summary.

    Construction does all the work: it builds one ``Sentence`` per row,
    computes pairwise BM25 scores, sets aside sentences with no positive
    BM25 total (``outlier``), runs a fixed number of PageRank iterations and
    stores the selected sentences' text, in document order, in ``summarize``.
    """

    def __init__(self, result_doc, raw_frequency, idf, cr):
        """Build the graph and compute the summary.

        Parameters
        ----------
        result_doc : pandas.DataFrame
            One row per sentence with the columns ``dokumen``,
            ``clean_corpus`` and ``terms``.
        raw_frequency : pandas.DataFrame
            Term-frequency table (terms x sentence ids as strings).
        idf : pandas.DataFrame
            Table with ``df_term`` and ``idf_term`` columns; ``df_term`` is
            dropped before use.
        cr : float
            Compression rate; ``ceil(total sentences * cr)`` sentences are
            selected, capped at the number of ranked sentences.
        """
        self.raw_frequency = raw_frequency
        self.idf = idf.drop(columns=['df_term'])
        self.result_doc = result_doc
        self.total_doc = len(result_doc['dokumen'])
        self.slen_ave = 0
        self.summarize = []
        self.doc = []
        self.outlier = []
        self.compression_rate = cr
        self.doc_utuh = []

        #making object sentence
        # doc is the working list that gets ranked (and filtered);
        # doc_utuh keeps an untouched copy of every sentence.
        for index, item in self.result_doc.iterrows():
            self.doc.append(Sentence((index+1), item['dokumen'], item['clean_corpus'],
                                     item['terms']))
            self.doc_utuh.append(Sentence((index+1), item['dokumen'], item['clean_corpus'],
                                          item['terms']))

        #calculate len average (left at 0 for an empty document)
        if self.doc:
            self.slen_ave = sum(item.sentence_len for item in self.doc) / len(self.doc)

        #calculate bm25 for each object sentence
        for item in self.doc:
            for item2 in self.doc:
                # Compare ids by value, not by object identity.
                if item.id != item2.id:
                    item.calculate_bm25(raw_frequency=self.raw_frequency,
                                        idf=self.idf, doc=item2, slen_ave=self.slen_ave)

        ##CHECKING IF BM25 SCORE IS 0 (OUTLIER SENTENCE)
        # Partition into two new lists. Popping from self.doc while
        # enumerating it skipped the element after every removed one, so
        # consecutive outliers stayed in the graph.
        connected = []
        for item in self.doc:
            if sum(item.list_bm25.values()) <= 0:
                self.outlier.append(item)
            else:
                connected.append(item)
        self.doc = connected

        #calculate pagerank
        for _ in range(200):
            for item in self.doc:
                item.calculate_new_pagerank(self.doc)

            #update pagerank score only after every node has been computed
            for item in self.doc:
                item.pagerank_score = item.pagerank_score_new

        #getting the summarize
        sorted_doc = sorted(self.doc, key=lambda x: x.pagerank_score, reverse=True)
        # The target size is based on the full document, but outliers were
        # removed from the ranking, so cap it to what is actually available.
        summary_size = math.ceil(self.total_doc * self.compression_rate)
        summary_size = max(0, min(summary_size, len(sorted_doc)))
        top_pagerank = sorted_doc[:summary_size]

        # Restore document order for readability.
        sorted_sum = sorted(top_pagerank, key=lambda x: x.id)

        self.summarize = [item.full_sentence for item in sorted_sum]

In [6]:
# import libraries
from urllib.request import urlopen
from bs4 import BeautifulSoup

# specify the url
url_bbcnews = "https://www.bbc.com/indonesia/indonesia-50038237"

# Connect to the website and parse the html with beautiful soup.
# The response is closed as soon as it has been parsed. A failed request
# stops the cell with a clear error instead of printing a message and then
# continuing with an undefined (or stale) `page_news`.
try:
    with urlopen(url_bbcnews) as page_news:
        beautysoup = BeautifulSoup(page_news, 'html.parser')
except OSError as error:  # urllib's URLError / HTTPError derive from OSError
    raise RuntimeError("Error opening the URL") from error

# Take out the <div> that holds the article body
text_news = beautysoup.find('div', {"class": "story-body__inner"})
if text_news is None:
    raise ValueError("No <div class='story-body__inner'> found in the page")

# Every paragraph is prefixed with a single space, as before.
document = ''.join(' ' + paragraph.text for paragraph in text_news.findAll('p'))
print(document)

# Saving the scraped text
# with open('scraped_text.txt', 'w') as file:
#     file.write(article)

 Sejumlah sekolah di Kota Palembang, Sumatera Selatan, memutuskan memulangkan siswa-siswa mereka lantaran kabut asap semakin tebal menyelimuti kota tersebut. pada Senin (14/10) pagi. Kepala SMP Negeri 7 Palembang, Siti Zubaida, mengatakan keputusan pemulangan ditempuh sesuai dengan instruksi Dinas Pendidikan Kota Palembang. "Pagi ini kami memulangkan siswa karena melihat kabut asap yang tebal dan berdampak buruk terhadap siswa, oleh karenanya atas instruksi Kadiknas Kota Palembang melalui pesan WA Grup meminta siswa dipulangkan dan belajar di rumah masing-masing saja," jelas Siti kepada radio Elshinta. Hal ini diamini Kepala Dinas Pendidikan Kota Palembang, Ahmad Zulinto, yang menyampaikan surat edaran ke semua sekolah. "Hari ini seluruh TK hingga SMP negeri dan swasta sederajat diliburkan, untuk besok dan seterusnya akan diberikan edaran lebih lanjut," kata Ahmad Zulinto kepada kantor berita Antara. Menurutnya, kalau kualitas udara Palembang masih buruk dalam beberapa hari ke depan, k

In [9]:
# input_document = open("bbc contoh beneran.txt", "r")
# document2= document.readline()
# Read the stop-word list (one word per line); the context manager closes
# the file handle, which was previously left open.
with open("stopword_list_tala.txt", "r") as stopword:
    stopwords = stopword.read().split("\n")

cleaning_result = get_clean_corpus(corpus=document, stopwords=stopwords)
terms_frequency, df_idf  = get_term_weighting_score(cleaning_result=cleaning_result)

# Summarize the same document at several compression rates.
cr = [0.05, 0.10, 0.20, 0.30]
for crr in cr:
    print("cr : ", crr)
    percobaan = Graph(result_doc=cleaning_result, raw_frequency=terms_frequency,
                      idf=df_idf, cr=crr)
    for ringkasan in percobaan.summarize:
        print(ringkasan)

cr :  0.05
"Pagi ini kami memulangkan siswa karena melihat kabut asap yang tebal dan berdampak buruk terhadap siswa, oleh karenanya atas instruksi Kadiknas Kota Palembang melalui pesan WA Grup meminta siswa dipulangkan dan belajar di rumah masing-masing saja," jelas Siti kepada radio Elshinta.
"Melalui pesan digital, Kepala Dinas Pendidikan Kota Palembang menginstruksikan kegiatan belajar mengajar di tingkat paud, TK, SD dan SMP negeri dan swasta diliburkan hingga batas yang belum ditentukan," sebut Agus dalam siaran pers.
cr :  0.1
"Pagi ini kami memulangkan siswa karena melihat kabut asap yang tebal dan berdampak buruk terhadap siswa, oleh karenanya atas instruksi Kadiknas Kota Palembang melalui pesan WA Grup meminta siswa dipulangkan dan belajar di rumah masing-masing saja," jelas Siti kepada radio Elshinta.
"Daerah yang tidak terdampak kabut asap tetap normal tetap belajar, untuk daerah yang terkategori sedang tetap belajar namun jam masuk sekolah diundur dan kami himbau memakai ma

# Test documents

In [None]:
# input_document1 = open("dokumen uji/dok 1.txt", "r")
# document1= input_document1.readline()
# cleaning_result1 = get_clean_corpus(corpus=document1, stopwords=stopwords)
# terms_frequency1, df_idf1  = get_term_weighting_score(cleaning_result=cleaning_result1)
# # cr = [0.05, 0.10, 0.20, 0.30]

# hasil_dok1 = pd.DataFrame()
# hasil_dok1["dokumen"] =  cleaning_result1["dokumen"]
# for crr in cr:
#     tempppppp=[]
#     print("cr : ", crr)
#     percobaan1 = Graph(result_doc=cleaning_result1, raw_frequency=terms_frequency1, 
#                        idf=df_idf1, cr=crr)
    
#     for index, itemmm in enumerate(percobaan1.doc_utuh):
#         if itemmm.full_sentence in percobaan1.summarize:
#             tempppppp.append("yes")
#         else:
#             tempppppp.append("no")
#     hasil_dok1[str(crr)] = tempppppp

In [None]:
# hasil_dok1

In [None]:
# input_document2 = open("dokumen uji/dok 2.txt", "r")
# document2= input_document2.readline()
# cleaning_result2 = get_clean_corpus(corpus=document2, stopwords=stopwords)
# terms_frequency2, df_idf2  = get_term_weighting_score(cleaning_result=cleaning_result2)
# # cr = [0.05, 0.10, 0.20, 0.30]

# hasil_dok2 = pd.DataFrame()
# hasil_dok2["dokumen"] =  cleaning_result2["dokumen"]
# for crr in cr:
#     tempppppp = []
#     print("cr : ", crr)
#     percobaan2 = Graph(result_doc=cleaning_result2, raw_frequency=terms_frequency2, 
#                        idf=df_idf2, cr=crr)
    
#     for index, itemmm in enumerate(percobaan2.doc_utuh):
#         if itemmm.full_sentence in percobaan2.summarize:
#             tempppppp.append("yes")
#         else:
#             tempppppp.append("no")
#     hasil_dok2[str(crr)] = tempppppp

In [None]:
# hasil_dok2

In [None]:
# input_document3 = open("dokumen uji/dok 3.txt", "r")
# document3= input_document3.readline()
# cleaning_result3 = get_clean_corpus(corpus=document3, stopwords=stopwords)
# terms_frequency3, df_idf3  = get_term_weighting_score(cleaning_result=cleaning_result3)
# # cr = [0.05, 0.10, 0.20, 0.30]

# hasil_dok3 = pd.DataFrame()
# hasil_dok3["dokumen"] =  cleaning_result3["dokumen"]
# for crr in cr:
#     tempppppp = []
#     print("cr : ", crr)
#     percobaan3 = Graph(result_doc=cleaning_result3, raw_frequency=terms_frequency3, 
#                        idf=df_idf3, cr=crr)
    
#     for index, itemmm in enumerate(percobaan3.doc_utuh):
#         if itemmm.full_sentence in percobaan3.summarize:
#             tempppppp.append("yes")
#         else:
#             tempppppp.append("no")
#     hasil_dok3[str(crr)] = tempppppp

In [None]:
# hasil_dok3

In [None]:
# input_document4 = open("dokumen uji/dok 4.txt", "r")
# document4= input_document4.readline()
# cleaning_result4 = get_clean_corpus(corpus=document4, stopwords=stopwords)
# terms_frequency4, df_idf4  = get_term_weighting_score(cleaning_result=cleaning_result4)
# # cr = [0.05, 0.10, 0.20, 0.30]

# hasil_dok4 = pd.DataFrame()
# hasil_dok4["dokumen"] =  cleaning_result4["dokumen"]
# for crr in cr:
#     tempppppp = []
#     print("cr : ", crr)
#     percobaan4 = Graph(result_doc=cleaning_result4, raw_frequency=terms_frequency4, 
#                        idf=df_idf4, cr=crr)
    
#     for index, itemmm in enumerate(percobaan4.doc_utuh):
#         if itemmm.full_sentence in percobaan4.summarize:
#             tempppppp.append("yes")
#         else:
#             tempppppp.append("no")
#     hasil_dok4[str(crr)] = tempppppp

In [None]:
# hasil_dok4 

In [None]:
# input_document5 = open("dokumen uji/dok 5.txt", "r")
# document5= input_document5.readline()
# cleaning_result5 = get_clean_corpus(corpus=document5, stopwords=stopwords)
# terms_frequency5, df_idf5  = get_term_weighting_score(cleaning_result=cleaning_result5)
# # cr = [0.05, 0.10, 0.20, 0.30]
# hasil_dok5 = pd.DataFrame()
# hasil_dok5["dokumen"] =  cleaning_result5["dokumen"]
# for crr in cr:
#     tempppppp = []
#     print("cr : ", crr)
#     percobaan5 = Graph(result_doc=cleaning_result5, raw_frequency=terms_frequency5, 
#                        idf=df_idf5, cr=crr)
    
#     for index, itemmm in enumerate(percobaan5.doc_utuh):
#         if itemmm.full_sentence in percobaan5.summarize:
#             tempppppp.append("yes")
#         else:
#             tempppppp.append("no")
#     hasil_dok5[str(crr)] = tempppppp

In [None]:
# hasil_dok5

In [None]:
# input_document6 = open("dokumen uji/dok 6.txt", "r")
# document6= input_document6.readline()
# cleaning_result6 = get_clean_corpus(corpus=document6, stopwords=stopwords)
# terms_frequency6, df_idf6  = get_term_weighting_score(cleaning_result=cleaning_result6)
# # cr = [0.05, 0.10, 0.20, 0.30]

# hasil_dok6 = pd.DataFrame()
# hasil_dok6["dokumen"] =  cleaning_result6["dokumen"]
# for crr in cr:
#     tempppppp = []
#     print("cr : ", crr)
#     percobaan6 = Graph(result_doc=cleaning_result6, raw_frequency=terms_frequency6, 
#                        idf=df_idf6, cr=crr)
    
#     for index, itemmm in enumerate(percobaan6.doc_utuh):
#         if itemmm.full_sentence in percobaan6.summarize:
#             tempppppp.append("yes")
#         else:
#             tempppppp.append("no")
#     hasil_dok6[str(crr)] = tempppppp

In [None]:
# hasil_dok6

In [None]:
# input_document7 = open("dokumen uji/dok 7.txt", "r")
# document7= input_document7.readline()
# cleaning_result7 = get_clean_corpus(corpus=document7, stopwords=stopwords)
# terms_frequency7, df_idf7  = get_term_weighting_score(cleaning_result=cleaning_result7)
# # cr = [0.05, 0.10, 0.20, 0.30]

# hasil_dok7 = pd.DataFrame()
# hasil_dok7["dokumen"] =  cleaning_result7["dokumen"]
# for crr in cr:
#     tempppppp = []
#     print("cr : ", crr)
#     percobaan7 = Graph(result_doc=cleaning_result7, raw_frequency=terms_frequency7, idf=df_idf7, cr=crr)
    
#     for index, itemmm in enumerate(percobaan7.doc_utuh):
#         if itemmm.full_sentence in percobaan7.summarize:
#             tempppppp.append("yes")
#         else:
#             tempppppp.append("no")
#     hasil_dok7[str(crr)] = tempppppp

In [None]:
# hasil_dok7

In [None]:
# input_document8 = open("dokumen uji/dok 8.txt", "r")
# document8= input_document8.readline()
# cleaning_result8 = get_clean_corpus(corpus=document8, stopwords=stopwords)
# terms_frequency8, df_idf8  = get_term_weighting_score(cleaning_result=cleaning_result8)
# # cr = [0.05, 0.10, 0.20, 0.30]

# hasil_dok8 = pd.DataFrame()
# hasil_dok8["dokumen"] =  cleaning_result8["dokumen"]
# for crr in cr:
#     tempppppp = []
#     print("cr : ", crr)
#     percobaan8 = Graph(result_doc=cleaning_result8, raw_frequency=terms_frequency8, 
#                        idf=df_idf8, cr=crr)
    

#     for index, itemmm in enumerate(percobaan8.doc_utuh):
#         if itemmm.full_sentence in percobaan8.summarize:
#             tempppppp.append("yes")
#         else:
#             tempppppp.append("no")
#     hasil_dok8[str(crr)] = tempppppp

In [None]:
# hasil_dok8

In [None]:
# input_document9 = open("dokumen uji/dok 9.txt", "r")
# document9= input_document9.readline()
# cleaning_result9 = get_clean_corpus(corpus=document9, stopwords=stopwords)
# terms_frequency9, df_idf9  = get_term_weighting_score(cleaning_result=cleaning_result9)
# # cr = [0.05, 0.10, 0.20, 0.30]


# hasil_dok9 = pd.DataFrame()
# hasil_dok9["dokumen"] =  cleaning_result9["dokumen"]
# for crr in cr:
#     tempppppp = []
#     print("cr : ", crr)
#     percobaan9 = Graph(result_doc=cleaning_result9, 
#                         raw_frequency=terms_frequency9, idf=df_idf9, cr=crr)
    
# #     for ringkasan in percobaan10.summarize:
# #         print(ringkasan)
    
#     for index, itemmm in enumerate(percobaan9.doc_utuh):
#         if itemmm.full_sentence in percobaan9.summarize:
#             tempppppp.append("yes")
#         else:
#             tempppppp.append("no")
#     hasil_dok9[str(crr)] = tempppppp

In [None]:
# hasil_dok9

In [None]:
# input_document10 = open("dokumen uji/dok 10.txt", "r")
# document10= input_document10.readline()
# cleaning_result10 = get_clean_corpus(corpus=document10, stopwords=stopwords)
# terms_frequency10, df_idf10  = get_term_weighting_score(cleaning_result=cleaning_result10)
# # cr = [0.05, 0.10, 0.20, 0.30]




# hasil_dok10 = pd.DataFrame()
# hasil_dok10["dokumen"] =  cleaning_result10["dokumen"]
# for crr in cr:
#     tempppppp = []
#     print("cr : ", crr)
#     percobaan10 = Graph(result_doc=cleaning_result10, 
#                         raw_frequency=terms_frequency10, idf=df_idf10, cr=crr)
    
#     for ringkasan in percobaan10.summarize:
#         print(ringkasan)
    
#     for index, itemmm in enumerate(percobaan10.doc_utuh):
#         if itemmm.full_sentence in percobaan10.summarize:
#             tempppppp.append("yes")
#         else:
#             tempppppp.append("no")
#     hasil_dok10[str(crr)] = tempppppp
                


In [None]:
# hasil_dok10

In [None]:
# for index, itemmm in enumerate(percobaan10.doc_utuh):
# #     print(index+1)
#     print(itemmm.full_sentence)

In [None]:
# hasil_dok1.to_csv("hasil sistem/hasil_sistem_dok1.csv",index=False)
# hasil_dok2.to_csv("hasil sistem/hasil_sistem_dok2.csv",index=False)
# hasil_dok3.to_csv("hasil sistem/hasil_sistem_dok3.csv",index=False)
# hasil_dok4.to_csv("hasil sistem/hasil_sistem_dok4.csv",index=False)
# hasil_dok5.to_csv("hasil sistem/hasil_sistem_dok5.csv",index=False)
# hasil_dok6.to_csv("hasil sistem/hasil_sistem_dok6.csv",index=False)
# hasil_dok7.to_csv("hasil sistem/hasil_sistem_dok7.csv",index=False)
# hasil_dok8.to_csv("hasil sistem/hasil_sistem_dok8.csv",index=False)
# hasil_dok9.to_csv("hasil sistem/hasil_sistem_dok9.csv",index=False)
# hasil_dok10.to_csv("hasil sistem/hasil_sistem_dok10.csv",index=False)

In [None]:
# hasil_dok10