In [1]:
import pandas as pd
import numpy as np
np.random.seed(0)
import time
start = time.time()
import re
import nltk
import string
import random
import math

#scrapping
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests

# import StemmerFactory class
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

#term weighting tfidf
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer


In [2]:
#this function does tokenize, remove stopwords, and stemming
def get_clean_corpus(raw_corpus, stopwords):    
    clean_corpus = []
    token = []
    for index, item in enumerate(corpus['dokumen']):
        term = corpus['dokumen'][index].split(" ")
        
        #deleting url
        deleted_url = [temp for temp in term if not re.match(r"\w+(?:(\.(\w+)\.(\w+)))|\w+(?:(\.(\w+)))", str(temp))]
        
        #deleting symbol
        deleted_symbol = [re.sub(r"[\-\+\=\:\;\"\\\@\[\]\,_!;.':#$%^&*()<>?/\|}{~:]"," ",str(temp)) for temp in deleted_url ]
        
        #stemming
        stemmed_sentence = stemmer.stem(" ".join(deleted_symbol))
        
        tokens = stemmed_sentence.split(" ")
        
        for i in range(len(tokens)):
            for index, word in enumerate(tokens):
                #delete stopwprds
                if word in stopwords:
                    del tokens[index]
                    
                #delete number
                if word.isdigit():
                    del tokens[index]
        
        clean_corpus.append(" ".join(tokens))  
        token.append(list(dict.fromkeys(tokens)))
        
    raw_corpus['clean_corpus'] = clean_corpus
    raw_corpus['token'] = token
    
    return raw_corpus

In [3]:
#this function calculates term weighting
def get_term_weighting_score(corpus):
    #frequency
    vectorizer = CountVectorizer(min_df=0)
    freq_term_corpus = vectorizer.fit_transform(corpus["clean_corpus"]).toarray()
    
    #raw weigthing
    transformer = TfidfTransformer(norm=None, use_idf=True, smooth_idf=True,)
    tfidf = transformer.fit_transform(freq_term_corpus)       
    tokens = vectorizer.get_feature_names()
    raw_frequency = pd.DataFrame(freq_term_corpus.transpose())
    raw_frequency.columns = [str(item+1) for item in range(len(corpus['dokumen']))]
    raw_frequency['tokens'] = tokens
    raw_frequency.set_index('tokens', inplace=True)
    
    #idf
    idf = pd.DataFrame({'idf_score':transformer.idf_,
                       'tokens':tokens})
    idf.set_index('tokens', inplace=True)
    
    return raw_frequency, idf

In [4]:
class Sentence:
    np.random.seed(0)
    def __init__(self, id, full_sentence, clean_sentence, tokens):
        self.list_bm25 = {}
        self.pagerank_score = random.random()
#         self.pagerank_score = pagerank
        self.id = id
        self.full_sentence = full_sentence
        self.clean_sentence = clean_sentence
        self.tokens = tokens
        self.sentence_len = len(clean_sentence.split())
        self.pagerank_score_new = 0
            
    def calculate_bm25(self, raw_frequency, idf, doc, slen_ave):
        k1 = 1.2
        b = 0.75
        total_bm25 = 0
        for query in self.tokens:
            tf = raw_frequency.at[str(query),str(doc.id)]
            idff = idf.at[str(query),'idf_score']
            temp = idff * ((k1+1) * tf) / (k1*( (1-b) + (b *(doc.sentence_len/slen_ave)) ) + tf)            
            total_bm25 += temp
        self.list_bm25[doc.id] = total_bm25
        print(self.list_bm25)
        
    def calculate_new_pagerank(self, doc):
        d=0.85
        sum_InVi = 0
        for item in doc:
            if self.id is not item.id:
                Wji = self.list_bm25[item.id]
                total_Wjk = sum(item.list_bm25.values())
                sum_InVi += Wji/total_Wjk*item.pagerank_score
        self.pagerank_score_new = (1-d)+(d*sum_InVi)
        print(self.pagerank_score_new)
#         print(self.pagerank_score_new)
                

In [5]:
class Graph:    
    def __init__(self, result_doc, raw_frequency, idf):
        self.raw_frequency = raw_frequency
        self.idf = idf
        self.result_doc = result_doc
        self.total_doc = len(result_doc['dokumen'])
        self.slen_ave = 0
        self.summarize = []
        
        
        #making list of object sentence
        doc = []
        for index, item in self.result_doc.iterrows():
            doc.append(Sentence((index+1), item['dokumen'], item['clean_corpus'], item['token']))
        
        #calculate len average
        temp_len_doc = 0
        for item in doc:
            temp_len_doc += item.sentence_len
        self.slen_ave = temp_len_doc/len(doc)
        
        
        #calculate bm25 for each object sentence
        for item in doc:
            for item2 in doc:
                if item.id is not item2.id:
                    item.calculate_bm25(raw_frequency= self.raw_frequency, idf=self.idf, doc=item2, slen_ave=self.slen_ave)
                    
        #calculate pagerank
        for i in range(10):
            for item in doc:
                item.calculate_new_pagerank(doc)
            
            #update pagerank score
            for item in doc:
                item.pagerank_score = item.pagerank_score_new
        

        #getting the summarize        
        sorted_doc = sorted(doc, key=lambda x: x.pagerank_score, reverse=True)        
        top_pagerank = []
        for item in range(math.ceil(self.total_doc*0.25)):
            top_pagerank.append(sorted_doc[item])
        
        sorted_sum = sorted(top_pagerank, key=lambda x: x.id)
        
        temp_summarize = [item.full_sentence for item in sorted_sum]
        
        self.summarize = sorted_sum
#         self.summarize = temp_summarize
   

In [6]:
stopword = open("stopword_list_tala.txt", "r")
stopwords = stopword.read().split("\n")
corpus = pd.read_csv("coba2.csv")

In [7]:
result_doc = get_clean_corpus(corpus, stopwords)

In [8]:
len(result_doc['dokumen'])

30

In [9]:
raw_frequency, idf = get_term_weighting_score(result_doc)

In [10]:
cobs = Graph(result_doc=result_doc, raw_frequency=raw_frequency, idf=idf)
# cobs.summarize
for item in cobs.summarize:
    print(item.id)
    print(item.full_sentence)
    print(item.pagerank_score)

{2: 10.614537618700027}
{2: 10.614537618700027, 3: 18.479271585829686}
{2: 10.614537618700027, 3: 18.479271585829686, 4: 7.368664781655389}
{2: 10.614537618700027, 3: 18.479271585829686, 4: 7.368664781655389, 5: 0.0}
{2: 10.614537618700027, 3: 18.479271585829686, 4: 7.368664781655389, 5: 0.0, 6: 1.8210346353436744}
{2: 10.614537618700027, 3: 18.479271585829686, 4: 7.368664781655389, 5: 0.0, 6: 1.8210346353436744, 7: 9.709818338190662}
{2: 10.614537618700027, 3: 18.479271585829686, 4: 7.368664781655389, 5: 0.0, 6: 1.8210346353436744, 7: 9.709818338190662, 8: 7.393818671748557}
{2: 10.614537618700027, 3: 18.479271585829686, 4: 7.368664781655389, 5: 0.0, 6: 1.8210346353436744, 7: 9.709818338190662, 8: 7.393818671748557, 9: 0.0}
{2: 10.614537618700027, 3: 18.479271585829686, 4: 7.368664781655389, 5: 0.0, 6: 1.8210346353436744, 7: 9.709818338190662, 8: 7.393818671748557, 9: 0.0, 10: 3.0864617524253006}
{2: 10.614537618700027, 3: 18.479271585829686, 4: 7.368664781655389, 5: 0.0, 6: 1.8210346

{1: 0.0, 2: 5.705522327155271, 3: 0.0, 4: 10.96756733884321}
{1: 0.0, 2: 5.705522327155271, 3: 0.0, 4: 10.96756733884321, 6: 9.77701058172498}
{1: 0.0, 2: 5.705522327155271, 3: 0.0, 4: 10.96756733884321, 6: 9.77701058172498, 7: 0.0}
{1: 0.0, 2: 5.705522327155271, 3: 0.0, 4: 10.96756733884321, 6: 9.77701058172498, 7: 0.0, 8: 0.0}
{1: 0.0, 2: 5.705522327155271, 3: 0.0, 4: 10.96756733884321, 6: 9.77701058172498, 7: 0.0, 8: 0.0, 9: 0.0}
{1: 0.0, 2: 5.705522327155271, 3: 0.0, 4: 10.96756733884321, 6: 9.77701058172498, 7: 0.0, 8: 0.0, 9: 0.0, 10: 12.116516306665464}
{1: 0.0, 2: 5.705522327155271, 3: 0.0, 4: 10.96756733884321, 6: 9.77701058172498, 7: 0.0, 8: 0.0, 9: 0.0, 10: 12.116516306665464, 11: 0.0}
{1: 0.0, 2: 5.705522327155271, 3: 0.0, 4: 10.96756733884321, 6: 9.77701058172498, 7: 0.0, 8: 0.0, 9: 0.0, 10: 12.116516306665464, 11: 0.0, 12: 0.0}
{1: 0.0, 2: 5.705522327155271, 3: 0.0, 4: 10.96756733884321, 6: 9.77701058172498, 7: 0.0, 8: 0.0, 9: 0.0, 10: 12.116516306665464, 11: 0.0, 12: 0.0

{1: 0.0, 2: 2.1732583254317652, 3: 0.0, 4: 2.451682879579607, 5: 0.0, 6: 0.0, 7: 2.1732583254317652, 8: 0.0, 10: 4.411872399057884, 11: 0.0, 12: 0.0, 13: 0.0, 14: 0.0}
{1: 0.0, 2: 2.1732583254317652, 3: 0.0, 4: 2.451682879579607, 5: 0.0, 6: 0.0, 7: 2.1732583254317652, 8: 0.0, 10: 4.411872399057884, 11: 0.0, 12: 0.0, 13: 0.0, 14: 0.0, 15: 0.0}
{1: 0.0, 2: 2.1732583254317652, 3: 0.0, 4: 2.451682879579607, 5: 0.0, 6: 0.0, 7: 2.1732583254317652, 8: 0.0, 10: 4.411872399057884, 11: 0.0, 12: 0.0, 13: 0.0, 14: 0.0, 15: 0.0, 16: 4.125524911619031}
{1: 0.0, 2: 2.1732583254317652, 3: 0.0, 4: 2.451682879579607, 5: 0.0, 6: 0.0, 7: 2.1732583254317652, 8: 0.0, 10: 4.411872399057884, 11: 0.0, 12: 0.0, 13: 0.0, 14: 0.0, 15: 0.0, 16: 4.125524911619031, 17: 0.0}
{1: 0.0, 2: 2.1732583254317652, 3: 0.0, 4: 2.451682879579607, 5: 0.0, 6: 0.0, 7: 2.1732583254317652, 8: 0.0, 10: 4.411872399057884, 11: 0.0, 12: 0.0, 13: 0.0, 14: 0.0, 15: 0.0, 16: 4.125524911619031, 17: 0.0, 18: 0.0}
{1: 0.0, 2: 2.17325832543176

{1: 4.388889530741622, 2: 0.0, 3: 3.781342030966141, 4: 3.0959472667321073, 5: 0.0, 6: 0.0, 7: 1.8937438827972273, 8: 3.6124766407824156, 9: 0.0, 10: 0.0, 11: 5.443519347553072, 13: 6.059079553555984, 14: 0.0, 15: 9.473586115395587, 16: 1.4430692568430794, 17: 0.0, 18: 2.0700579684369695, 19: 0.0, 20: 0.0, 21: 0.0, 22: 0.0, 23: 0.0, 24: 5.612456154890927, 25: 0.0, 26: 2.744357040857168, 27: 0.0}
{1: 4.388889530741622, 2: 0.0, 3: 3.781342030966141, 4: 3.0959472667321073, 5: 0.0, 6: 0.0, 7: 1.8937438827972273, 8: 3.6124766407824156, 9: 0.0, 10: 0.0, 11: 5.443519347553072, 13: 6.059079553555984, 14: 0.0, 15: 9.473586115395587, 16: 1.4430692568430794, 17: 0.0, 18: 2.0700579684369695, 19: 0.0, 20: 0.0, 21: 0.0, 22: 0.0, 23: 0.0, 24: 5.612456154890927, 25: 0.0, 26: 2.744357040857168, 27: 0.0, 28: 4.438577458937358}
{1: 4.388889530741622, 2: 0.0, 3: 3.781342030966141, 4: 3.0959472667321073, 5: 0.0, 6: 0.0, 7: 1.8937438827972273, 8: 3.6124766407824156, 9: 0.0, 10: 0.0, 11: 5.443519347553072, 1

{1: 0.0}
{1: 0.0, 2: 0.0}
{1: 0.0, 2: 0.0, 3: 1.999523691545907}
{1: 0.0, 2: 0.0, 3: 1.999523691545907, 4: 0.0}
{1: 0.0, 2: 0.0, 3: 1.999523691545907, 4: 0.0, 5: 0.0}
{1: 0.0, 2: 0.0, 3: 1.999523691545907, 4: 0.0, 5: 0.0, 6: 0.0}
{1: 0.0, 2: 0.0, 3: 1.999523691545907, 4: 0.0, 5: 0.0, 6: 0.0, 7: 2.5672118058649347}
{1: 0.0, 2: 0.0, 3: 1.999523691545907, 4: 0.0, 5: 0.0, 6: 0.0, 7: 2.5672118058649347, 8: 1.999523691545907}
{1: 0.0, 2: 0.0, 3: 1.999523691545907, 4: 0.0, 5: 0.0, 6: 0.0, 7: 2.5672118058649347, 8: 1.999523691545907, 9: 0.0}
{1: 0.0, 2: 0.0, 3: 1.999523691545907, 4: 0.0, 5: 0.0, 6: 0.0, 7: 2.5672118058649347, 8: 1.999523691545907, 9: 0.0, 10: 0.0}
{1: 0.0, 2: 0.0, 3: 1.999523691545907, 4: 0.0, 5: 0.0, 6: 0.0, 7: 2.5672118058649347, 8: 1.999523691545907, 9: 0.0, 10: 0.0, 11: 0.0}
{1: 0.0, 2: 0.0, 3: 1.999523691545907, 4: 0.0, 5: 0.0, 6: 0.0, 7: 2.5672118058649347, 8: 1.999523691545907, 9: 0.0, 10: 0.0, 11: 0.0, 12: 0.0}
{1: 0.0, 2: 0.0, 3: 1.999523691545907, 4: 0.0, 5: 0.0, 6: 

{1: 3.3194852041143097, 2: 2.2394342780375163, 3: 2.7322544063348735, 4: 1.8210346353436744, 5: 0.0, 6: 5.476890414958078, 7: 1.8937438827972273, 8: 1.4749798792302726, 9: 0.0, 10: 3.7285350341595658, 11: 3.719155663472554, 12: 0.0, 13: 2.363452011207563, 14: 0.0, 15: 0.0, 16: 5.753215619296837, 17: 4.491336821801429}
{1: 3.3194852041143097, 2: 2.2394342780375163, 3: 2.7322544063348735, 4: 1.8210346353436744, 5: 0.0, 6: 5.476890414958078, 7: 1.8937438827972273, 8: 1.4749798792302726, 9: 0.0, 10: 3.7285350341595658, 11: 3.719155663472554, 12: 0.0, 13: 2.363452011207563, 14: 0.0, 15: 0.0, 16: 5.753215619296837, 17: 4.491336821801429, 18: 7.0714377211544575}
{1: 3.3194852041143097, 2: 2.2394342780375163, 3: 2.7322544063348735, 4: 1.8210346353436744, 5: 0.0, 6: 5.476890414958078, 7: 1.8937438827972273, 8: 1.4749798792302726, 9: 0.0, 10: 3.7285350341595658, 11: 3.719155663472554, 12: 0.0, 13: 2.363452011207563, 14: 0.0, 15: 0.0, 16: 5.753215619296837, 17: 4.491336821801429, 18: 7.0714377211

{1: 1.5274947239748435, 2: 2.2394342780375163, 3: 1.2572745271046006, 4: 1.8210346353436744, 5: 0.0, 6: 1.8210346353436744, 7: 0.0, 8: 2.5240675038615414, 9: 0.0, 10: 3.7285350341595658, 11: 1.711407132212617, 12: 2.8245492920510458, 13: 0.0, 14: 0.0}
{1: 1.5274947239748435, 2: 2.2394342780375163, 3: 1.2572745271046006, 4: 1.8210346353436744, 5: 0.0, 6: 1.8210346353436744, 7: 0.0, 8: 2.5240675038615414, 9: 0.0, 10: 3.7285350341595658, 11: 1.711407132212617, 12: 2.8245492920510458, 13: 0.0, 14: 0.0, 15: 2.9998661446611115}
{1: 1.5274947239748435, 2: 2.2394342780375163, 3: 1.2572745271046006, 4: 1.8210346353436744, 5: 0.0, 6: 1.8210346353436744, 7: 0.0, 8: 2.5240675038615414, 9: 0.0, 10: 3.7285350341595658, 11: 1.711407132212617, 12: 2.8245492920510458, 13: 0.0, 14: 0.0, 15: 2.9998661446611115, 16: 4.097150981888496}
{1: 1.5274947239748435, 2: 2.2394342780375163, 3: 1.2572745271046006, 4: 1.8210346353436744, 5: 0.0, 6: 1.8210346353436744, 7: 0.0, 8: 2.5240675038615414, 9: 0.0, 10: 3.7285

nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
1
Sejumlah sekolah di Kota Palembang, Sumatera Selatan, memutuskan memulangkan siswa-siswa mereka lantaran kabut asap semakin tebal menyelimuti kota tersebut. pada Senin (14/10) pagi.
nan
2
Kepala SMP Negeri 7 Palembang, Siti Zubaida, mengatakan keputusan pemulangan ditempuh sesuai dengan instruksi Dinas Pendidikan Kota Palembang.
nan
3
Pagi ini kami memulangkan siswa karena melihat kabut asap yang tebal dan berdampak buruk terhadap siswa, oleh karenanya atas instruksi Kadiknas Kota Palembang melalui pesan WA Grup 



In [11]:
elapsed_time_fl = (time.time() - start) 
print(elapsed_time_fl)

29.174566984176636


In [12]:
yap = stemmer.stem('www.ini_makam.co.id bukan juga 24/7 00.00 00,00ya!')
yap

'www ini makam co id bukan juga 24 7 00 00 00 00ya'

In [13]:

print(pd.__version__) 

0.23.0


In [14]:
for item in range(3):
    print(item)

0
1
2


In [15]:
math.ceil(1.2)

2