In [1]:
import pandas as pd
import numpy as np
np.random.seed(0)
import time
start = time.time()
import re
import nltk
import string
import random
import math

#scrapping
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests

# import StemmerFactory class
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

#term weighting tfidf
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer


In [2]:
#this function does tokenize, remove stopwords, and stemming
def get_clean_corpus(raw_corpus, stopwords):    
    clean_corpus = []
    token = []
    for index, item in enumerate(corpus['dokumen']):
        term = corpus['dokumen'][index].split(" ")
        
        #deleting url
        deleted_url = [temp for temp in term if not re.match(r"\w+(?:(\.(\w+)\.(\w+)))|\w+(?:(\.(\w+)))", str(temp))]
        
        #deleting symbol
        deleted_symbol = [re.sub(r"[\-\+\=\:\;\"\\\@\[\]\,_!;.':#$%^&*()<>?/\|}{~:]"," ",str(temp)) for temp in deleted_url ]
        
        #stemming
        stemmed_sentence = stemmer.stem(" ".join(deleted_symbol))
        
        tokens = stemmed_sentence.split(" ")
        
        for i in range(len(tokens)):
            for index, word in enumerate(tokens):
                #delete stopwprds
                if word in stopwords:
                    del tokens[index]
                    
                #delete number
                if word.isdigit():
                    del tokens[index]
        
        clean_corpus.append(" ".join(tokens))  
        token.append(list(dict.fromkeys(tokens)))
        
    raw_corpus['clean_corpus'] = clean_corpus
    raw_corpus['token'] = token
    
    return raw_corpus

In [3]:
#this function calculates term weighting
def get_term_weighting_score(corpus):
    #frequency
    vectorizer = CountVectorizer(min_df=0)
    freq_term_corpus = vectorizer.fit_transform(corpus["clean_corpus"]).toarray()
    
    
    transformer = TfidfTransformer(norm=None, use_idf=True, smooth_idf=True,)
    tfidf = transformer.fit_transform(freq_term_corpus)
    
    #raw weigthing
    tokens = vectorizer.get_feature_names()
    raw_frequency = pd.DataFrame(freq_term_corpus.transpose())
    raw_frequency.columns = [str(item+1) for item in range(len(corpus['dokumen']))]
    raw_frequency['tokens'] = tokens
    raw_frequency.set_index('tokens', inplace=True)
    
    #idf
    idf = pd.DataFrame({'idf_score':transformer.idf_,
                       'tokens':tokens})
    idf.set_index('tokens', inplace=True)
    
    return raw_frequency, idf

In [4]:
class Sentence:
    np.random.seed(0)
    def __init__(self, id, full_sentence, clean_sentence, tokens):
        self.list_bm25 = {}
        self.pagerank_score = random.random()
#         self.pagerank_score = pagerank
        self.id = id
        self.full_sentence = full_sentence
        self.clean_sentence = clean_sentence
        self.tokens = tokens
        self.sentence_len = len(clean_sentence.split())
        self.pagerank_score_new = 0
            
    def calculate_bm25(self, raw_frequency, idf, doc, slen_ave):
        k1 = 1.2
        b = 0.75
        total_bm25 = 0
        for query in self.tokens:
            tf = raw_frequency.at[str(query),str(doc.id)]
            idff = idf.at[str(query),'idf_score']
            temp = idff * ((k1+1) * tf) / (k1*( (1-b) + (b *(doc.sentence_len/slen_ave)) ) + tf)            
            total_bm25 += temp
        self.list_bm25[doc.id] = total_bm25
#         print("IDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD ",self.id)
#         print(self.list_bm25)
        
    def calculate_new_pagerank(self, doc):
        d=0.85
        sum_InVi = 0
        for item in doc:
            if self.id is not item.id:
                Wji = self.list_bm25[item.id]
                total_Wjk = sum(item.list_bm25.values())
                sum_InVi += Wji/total_Wjk*item.pagerank_score
        self.pagerank_score_new = (1-d)+(d*sum_InVi)
#         print("IDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD ",self.id)
#         print(self.pagerank_score_new)
#         print(self.pagerank_score_new)
                

In [5]:
class Graph:    
    def __init__(self, result_doc, raw_frequency, idf):
        self.raw_frequency = raw_frequency
        self.idf = idf
        self.result_doc = result_doc
        self.total_doc = len(result_doc['dokumen'])
        self.slen_ave = 0
        self.summarize = []
        
        
        #making list of object sentence
        doc = []
        for index, item in self.result_doc.iterrows():
            doc.append(Sentence((index+1), item['dokumen'], item['clean_corpus'], item['token']))
        
        #calculate len average
        temp_len_doc = 0
        for item in doc:
            temp_len_doc += item.sentence_len
        self.slen_ave = temp_len_doc/len(doc)
        
        
        #calculate bm25 for each object sentence
        for item in doc:
            for item2 in doc:
                if item.id is not item2.id:
                    item.calculate_bm25(raw_frequency= self.raw_frequency, idf=self.idf, doc=item2, slen_ave=self.slen_ave)
                    
        #calculate pagerank
        for i in range(100):
            for item in doc:
                item.calculate_new_pagerank(doc)
            
            #update pagerank score
            for item in doc:
                item.pagerank_score = item.pagerank_score_new
        

        #getting the summarize        
        sorted_doc = sorted(doc, key=lambda x: x.pagerank_score, reverse=True)        
        top_pagerank = []
        for item in range(math.ceil(self.total_doc*0.25)):
            top_pagerank.append(sorted_doc[item])
        
        sorted_sum = sorted(top_pagerank, key=lambda x: x.id)
        
        temp_summarize = [item.full_sentence for item in sorted_sum]
        
        self.summarize = sorted_sum
#         self.summarize = temp_summarize
   

In [6]:
stopword = open("stopword_list_tala.txt", "r")
stopwords = stopword.read().split("\n")
corpus = pd.read_csv("coba2.csv")

In [7]:
result_doc = get_clean_corpus(corpus, stopwords)

In [8]:
# result_doc

In [9]:
raw_frequency, idf = get_term_weighting_score(result_doc)

In [16]:
cobs = Graph(result_doc=result_doc, raw_frequency=raw_frequency, idf=idf)
# cobs.summarize
for item in cobs.summarize:
    print(item.id)
    print(item.full_sentence)
    print(item.pagerank_score)

1
Sejumlah sekolah di Kota Palembang, Sumatera Selatan, memutuskan memulangkan siswa-siswa mereka lantaran kabut asap semakin tebal menyelimuti kota tersebut. pada Senin (14/10) pagi.
1.5149792748244266
3
Pagi ini kami memulangkan siswa karena melihat kabut asap yang tebal dan berdampak buruk terhadap siswa, oleh karenanya atas instruksi Kadiknas Kota Palembang melalui pesan WA Grup meminta siswa dipulangkan dan belajar di rumah masing-masing saja, jelas Siti kepada radio?Elshinta.
1.7430588301054804
7
Akan tetapi, sebagaimana dipaparkan Kepala Dinas Pendidikan Sumatera Selatan, Widodo, kegiatan belajar mengajar di daerah yang tidak terdampak kabut asap tetap berlangsung.
1.4938766458706758
10
Melalui pesan digital, Kepala Dinas Pendidikan Kota Palembang menginstruksikan kegiatan belajar mengajar di tingkat paud, TK, SD dan SMP negeri dan swasta diliburkan hingga batas yang belum ditentukan, sebut Agus dalam siaran pers.
1.7431166059102274
11
Sejumlah warga Palembang, Sumatera Selatan,

In [11]:
elapsed_time_fl = (time.time() - start) 
print(elapsed_time_fl)

30.469436645507812


In [12]:
yap = stemmer.stem('www.ini_makam.co.id bukan juga 24/7 00.00 00,00ya!')
yap

'www ini makam co id bukan juga 24 7 00 00 00 00ya'

In [13]:

print(pd.__version__) 

0.23.0


In [14]:
for item in range(3):
    print(item)

0
1
2


In [15]:
math.ceil(1.2)

2