In [3]:
import time
start = time.time()

In [59]:
import pandas as pd
import re
import math
import numpy as np
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

# import StemmerFactory class
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()


In [25]:
#this function does cleaning, tokenize, remove stopwords, and stemming
def get_clean_corpus(corpus, stopwords):
    
    #segmentasi
    temp = sent_tokenize(corpus)
    corpus = pd.DataFrame(temp, columns=['dokumen'])
    
    clean_corpus = []
    token = []
    for index, sentence in enumerate(corpus['dokumen']):
        term = word_tokenize(corpus['dokumen'][index])
        
        #deleting url
        deleted_url = [temp for temp in term if not re.match(r"\w+(?:(\.(\w+)\.(\w+)))|\w+(?:(\.(\w+)))", str(temp))]
        
        #deleting symbol
        deleted_symbol = [re.sub(r"[\-\+\=\:\;\"\\\@\[\]\,_!;.':#$%^&*()<>?/\|}{~:]"," ",str(temp)) for temp in deleted_url ]
        
        #stemming
        stemmed_sentence = stemmer.stem(" ".join(deleted_symbol))
        
        tokens = word_tokenize(stemmed_sentence)
        
        for i in range(len(tokens)):
            for index, word in enumerate(tokens):
                #delete stopwprds
                if word in stopwords:
                    del tokens[index]
                    
                #delete number
                if word.isdigit():
                    del tokens[index]
        
        clean_corpus.append(" ".join(tokens))  
        token.append(list(dict.fromkeys(tokens)))
        
    corpus['clean_corpus'] = clean_corpus
    corpus['terms'] = token
    
    return corpus

In [48]:
input_document = open("bbc contoh beneran.txt", "r")
document= input_document.readline()

In [45]:
stopword = open("stopword_list_tala.txt", "r")
stopwords = stopword.read().split("\n")

In [49]:
cleaning_result = get_clean_corpus(corpus=document, stopwords=stopwords)

In [50]:
cleaning_result

Unnamed: 0,dokumen,clean_corpus,terms
0,"Sejumlah sekolah di Kota Palembang, Sumatera S...",sekolah kota palembang sumatera selatan putus ...,"[sekolah, kota, palembang, sumatera, selatan, ..."
1,"Kepala SMP Negeri 7 Palembang, Siti Zubaida, m...",kepala smp negeri palembang siti zubaida putus...,"[kepala, smp, negeri, palembang, siti, zubaida..."
2,"""Pagi ini kami memulangkan siswa karena meliha...",pagi pulang siswa lihat kabut asap tebal dampa...,"[pagi, pulang, siswa, lihat, kabut, asap, teba..."
3,Hal ini diamini Kepala Dinas Pendidikan Kota P...,amin kepala dinas didik kota palembang ahmad z...,"[amin, kepala, dinas, didik, kota, palembang, ..."
4,"""Hari ini seluruh TK hingga SMP negeri dan swa...",tk smp negeri swasta derajat libur besok edar ...,"[tk, smp, negeri, swasta, derajat, libur, beso..."
5,"Menurutnya, kalau kualitas udara Palembang mas...",kualitas udara palembang buruk giat ajar ajar ...,"[kualitas, udara, palembang, buruk, giat, ajar..."
6,"Akan tetapi, sebagaimana dipaparkan Kepala Din...",papar kepala dinas didik sumatera selatan wido...,"[papar, kepala, dinas, didik, sumatera, selata..."
7,"""Daerah yang tidak terdampak kabut asap tetap ...",daerah dampak kabut asap normal ajar daerah ka...,"[daerah, dampak, kabut, asap, normal, ajar, ka..."
8,"Hal itu belakangan dibenarkan Agus Wibowo, sel...",agus wibowo kepala pusat data informasi humas ...,"[agus, wibowo, kepala, pusat, data, informasi,..."
9,"""Melalui pesan digital, Kepala Dinas Pendidika...",pesan digital kepala dinas didik kota palemban...,"[pesan, digital, kepala, dinas, didik, kota, p..."


In [54]:
#this function calculates term weighting
def get_term_weighting_score(cleaning_result):
    
    #getting all the terms
    terms = []
    for index, sentence in enumerate(cleaning_result['terms']):
        terms += [temp for temp in sentence if temp not in terms]
    terms.sort()
        
    #getting frequency for every sentences
    terms_frequency = pd.DataFrame()
    for index, term in enumerate(cleaning_result['terms']):
        frequency_each_sentence = []
        for i, d in enumerate(terms):
            temp = term.count(d)
            frequency_each_sentence.append(temp)
        terms_frequency[str(index+1)] = frequency_each_sentence
        
    terms_frequency['terms'] = terms
    terms_frequency.set_index('terms', inplace= True)
    
    #getting df for every terms
    df_idf = pd.DataFrame(terms_frequency.sum(axis=1), columns=['df_term'])
    df_idf['terms'] = terms
    df_idf.set_index('terms', inplace= True)
    
    #getting idf for every terms
    N = len(terms_frequency.columns)
    terms_idf = []
    for i, d in df_idf.iterrows():
        idf_score = math.log((N+1)/((df_idf['df_term'][i]+1)), 10)+1
        terms_idf.append(idf_score)
    df_idf['idf_term'] = terms_idf
    
    return terms_frequency, df_idf   

In [55]:
terms_frequency, df_idf  = get_term_weighting_score(cleaning_result=cleaning_result)

In [57]:
# terms_frequency
# df_idf

In [62]:
class Sentence:
    np.random.seed(0)
    def __init__(self, id, full_sentence, clean_sentence, tokens, pagerank):
        self.list_bm25 = {}
#         self.pagerank_score = random.random()
        self.pagerank_score = pagerank
        self.id = id
        self.full_sentence = full_sentence
        self.clean_sentence = clean_sentence
        self.tokens = tokens
        self.sentence_len = len(clean_sentence.split())
        self.pagerank_score_new = 0
            
    def calculate_bm25(self, raw_frequency, idf, doc, slen_ave):
        k1 = 1.2
        b = 0.75
        total_bm25 = 0
        for query in self.tokens:
            tf = raw_frequency.at[str(query),str(doc.id)]
            idff = idf.at[str(query),'idf_term']
            temp = idff * ((k1+1) * tf) / (k1*( (1-b) + (b *(doc.sentence_len/slen_ave)) ) + tf)            
            total_bm25 += temp
        self.list_bm25[doc.id] = total_bm25
        
    def calculate_new_pagerank(self, doc):
        d=0.85
        sum_InVi = 0
        for item in doc:
            if self.id is not item.id:
                Wji = self.list_bm25[item.id]
                total_Wjk = sum(item.list_bm25.values())
                sum_InVi += Wji/total_Wjk*item.pagerank_score
        self.pagerank_score_new = (1-d)+(d*sum_InVi)
#         print(self.pagerank_score_new)
                

In [63]:
class Graph:    
    def __init__(self, result_doc, raw_frequency, idf):
        self.raw_frequency = raw_frequency
        self.idf = idf.drop(columns=['df_term'])
        self.result_doc = result_doc
        self.total_doc = len(result_doc['dokumen'])
        self.slen_ave = 0
        self.summarize = []
        
        
        #making object sentence
        list_pgrk = [0.400827866,0.863170087,0.389187762,0.924094751,0.157640608,0.714980958,0.216858534,0.237221536,0.076112858,0.841401681]

        doc = []
        for index, item in self.result_doc.iterrows():
            doc.append(Sentence((index+1), item['dokumen'], item['clean_corpus'], item['terms'], pagerank=list_pgrk[index])) 
        
        #calculate len average
        temp_len_doc = 0
        for item in doc:
            temp_len_doc += item.sentence_len
        self.slen_ave = temp_len_doc/len(doc)
        
        
        #calculate bm25 for each object sentence
        for item in doc:
            for item2 in doc:
                if item.id is not item2.id:
                    item.calculate_bm25(raw_frequency= self.raw_frequency, idf=self.idf, doc=item2, slen_ave=self.slen_ave)
                    
        #calculate pagerank
        for i in range(20):
            for item in doc:
                item.calculate_new_pagerank(doc)
            
            #update pagerank score
            for item in doc:
                item.pagerank_score = item.pagerank_score_new
        

        #getting the summarize        
        sorted_doc = sorted(doc, key=lambda x: x.pagerank_score, reverse=True)        
        top_pagerank = []
        for item in range(math.ceil(self.total_doc*0.25)):
            top_pagerank.append(sorted_doc[item])
        
        sorted_sum = sorted(top_pagerank, key=lambda x: x.id)
        
        temp_summarize = [item.full_sentence for item in sorted_sum]
        
        self.summarize = sorted_sum
#         self.summarize = temp_summarize
   

In [64]:
cobs = Graph(result_doc=cleaning_result, raw_frequency=terms_frequency, idf=df_idf)
for item in cobs.summarize:
    print(item.id)
    print(item.full_sentence)
    print(item.pagerank_score)

2
Kepala SMP Negeri 7 Palembang, Siti Zubaida, mengatakan keputusan pemulangan ditempuh sesuai dengan instruksi Dinas Pendidikan Kota Palembang.
1.2186868189941311
3
"Pagi ini kami memulangkan siswa karena melihat kabut asap yang tebal dan berdampak buruk terhadap siswa, oleh karenanya atas instruksi Kadiknas Kota Palembang melalui pesan WA Grup meminta siswa dipulangkan dan belajar di rumah masing-masing saja," jelas Siti kepada radio Elshinta.
1.2882431241496164
10
"Melalui pesan digital, Kepala Dinas Pendidikan Kota Palembang menginstruksikan kegiatan belajar mengajar di tingkat paud, TK, SD dan SMP negeri dan swasta diliburkan hingga batas yang belum ditentukan," sebut Agus dalam siaran pers.
1.742006231091659
