# pba-modul4-id-text-similarity-clustering-recommender

In [5]:
import os
import re
import sys
import string
import modSpellChecker_1 as sc
from contractions_1 import CONTRACTION_MAP
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import numpy as np
import pandas as pd
import operator

Deklarasi Fungsi untuk Normalisasi Teks

In [6]:
character = ['z','y','x','w','v','u','t','s','r','q','p','o','n','m','l','k','j','i','h','g','f','e','d',',',';',':','-','...','?','!', '(',')','[',']','{','}','<','>', '"','/','\'','#','-','@']

def repeatcharNormalize(text):
    for i in range(len(character)):
        charac_long = 5
        while charac_long >= 2:
            char = character[i] * charac_long
            text = text.replace(char, character[i])
            charac_long -= 1
    return text

def spellNormalize(text):
    spellCheck = []
    for i in text:
        if i not in character:
            j = sc.correction(i)
            spellCheck.append(j)
        else:
            spellCheck.append(i)
    return spellCheck

def tokenize_text(text):
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip() for token in tokens]
    return tokens

def expand_contractions(text, contraction_mapping):
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                      flags=re.IGNORECASE | re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match) if contraction_mapping.get(match) else contraction_mapping.get(match.lower())
        expanded_contraction = first_char + expanded_contraction[1:]
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

def stemmer_text(text):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    text = stemmer.stem(text)
    return text

In [8]:
def remove_special_characters(text):
    tokens = tokenize_text(text)
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    filtered_tokens = filter(None, [pattern.sub('',token) for token in tokens])
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

In [9]:
factory = StopWordRemoverFactory()
stopword_list = factory.get_stop_words()

In [10]:
def remove_stopwords(text):
    tokens = tokenize_text(text)
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

In [11]:
def normalize_corpus(corpus, tokenize=False):
    normalized_corpus = []
    for text in corpus:
        text = expand_contractions(text, CONTRACTION_MAP)
        text = stemmer_text(text)
        text = remove_special_characters(text)
        text = repeatcharNormalize(text)
        text = remove_stopwords(text)
        normalized_corpus.append(text)
        if tokenize:
            text = tokenize_text(text)
            text = spellNormalize(text)
            normalized_corpus.append(text)
    return normalized_corpus

Penyiapan Data Teks

In [12]:
dataset = pd.read_csv('dataartikel_1.csv')
dataset

Unnamed: 0,Judul,Artikel,Jenis
0,Rupiah masih rawan bergejolak,Nilai tukar rupiah terhadap dolar Amerika Seri...,ekonomi
1,SRC sasar ritel tradisional jadi pilar ekonomi...,Perkembangan teknologi telah menyebabkan perub...,ekonomi
2,Kiat memaksimalkan Facebook untuk bisnis,Media sosial kini bukan hanya menjadi alat unt...,ekonomi
3,Kiat menambah relasi bisnis,"Dalam menjalankan bisnis, menjalin relasi puny...",ekonomi
4,Prinsip dasar merintis bisnis,Merintis usaha memang bukan pekerjaan mudah. A...,ekonomi
...,...,...,...
1304,Genoa 1 - 1 Sampdoria - Liga Serie A 2018/2019,Genoa dan Sampdoria sama-sama gagal memetik po...,sport
1305,Villarreal 2 - 1 Real Betis - La Liga 2018/2019,Villarreal menjaga langkah untuk mendekatkan d...,sport
1306,"PSM kembali ke puncak klasemen, Persib resmi t...",Persija Jakarta harus rela kembali ke peringka...,sport
1307,Hasil imbang yang pahit bagi AC Milan,"""Hasil imbang yang pahit."" Begitu laman resmi ...",sport


In [13]:
feature = dataset.iloc[:,0]

In [14]:
feature[0:10]

0                        Rupiah masih rawan bergejolak
1    SRC sasar ritel tradisional jadi pilar ekonomi...
2             Kiat memaksimalkan Facebook untuk bisnis
3                          Kiat menambah relasi bisnis
4                        Prinsip dasar merintis bisnis
5                       10 Sektor seret pelemahan IHSG
6                     8 Sektor perkasa lambungkan IHSG
7    Untung rugi ratifikasi tujuh perjanjian dagang...
8    Uang muka KPR nol persen untuk PNS pada tahun ...
9    Cari untung lewat obligasi negara atau deposit...
Name: Judul, dtype: object

In [15]:
norm_corpus = normalize_corpus(feature)
len(norm_corpus)

1309

Ekstraksi Fitur dengan TFIDF

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
tfidf_matrix = tf.fit_transform(norm_corpus)
tfidf_matrix.shape

(1309, 1716)

1.4 Penghitungan Text Similarity

1.4.1 Metode Cosine Similarity

In [17]:
from sklearn.metrics.pairwise import cosine_similarity
doc_sim = cosine_similarity(tfidf_matrix)
doc_sim_df = pd.DataFrame(doc_sim)
doc_sim_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1299,1300,1301,1302,1303,1304,1305,1306,1307,1308
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.333162,0.138819,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.333162,1.0,0.241581,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.138819,0.241581,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
article_list = dataset['Judul'].values
article_list, article_list.shape

(array(['Rupiah masih rawan bergejolak',
        'SRC sasar ritel tradisional jadi pilar ekonomi nasional',
        'Kiat memaksimalkan Facebook untuk bisnis', ...,
        'PSM kembali ke puncak klasemen, Persib resmi tersisih',
        'Hasil imbang yang pahit bagi AC Milan',
        'Seruan untuk Edy Rahmayadi di balik kegagalan timnas Indonesia'],
       dtype=object),
 (1309,))

In [19]:
article_idx = np.where(article_list=='Rupiah masih rawan bergejolak')[0][0]
article_idx

0

Pencarian Dokumen Similarity

In [20]:
article_similarities = doc_sim_df.iloc[article_idx].values
article_similarities

array([1., 0., 0., ..., 0., 0., 0.])

In [21]:
similar_article_idxs = np.argsort(-article_similarities)[1:6]
similar_article_idxs

array([ 44, 766, 603, 760,  40])

In [22]:
similar_article = article_list[similar_article_idxs]
similar_article

array(['Menjaga kurs rupiah, merawat fundamen ekonomi',
       'Pelemahan rupiah belum berpengaruh ke inflasi',
       'Pemanfaatan satelit untuk daerah rawan bencana',
       'Faktor internal tahan rupiah jatuh lebih dalam',
       'Menghadapi gejolak ekonomi global'], dtype=object)

Deklarasi Fungsi untuk Pencarian Dokumen Similarity

In [23]:
def article_recommender(article_title, articles=article_list,doc_sims=doc_sim_df):
    article_idx = np.where(articles == article_title)[0][0]
    article_similarities = doc_sims.iloc[article_idx].values
    similar_article_idxs = np.argsort(-article_similarities)[1:6]
    similar_articles = articles[similar_article_idxs]
    return similar_articles

In [24]:
popular_articles = ['Rupiah masih rawan bergejolak',
'Kiat menambah relasi bisnis',
'Hasil imbang yang pahit bagi AC Milan',
'SRC sasar ritel tradisional jadi pilar ekonomi nasional',
'PSM kembali ke puncak klasemen, Persib resmi tersisih']

In [25]:
for article in popular_articles:
    print('Article:', article)
    print('Top 5 recommended Article:',article_recommender(article_title=article))
    print()

Article: Rupiah masih rawan bergejolak
Top 5 recommended Article: ['Menjaga kurs rupiah, merawat fundamen ekonomi'
 'Pelemahan rupiah belum berpengaruh ke inflasi'
 'Pemanfaatan satelit untuk daerah rawan bencana'
 'Faktor internal tahan rupiah jatuh lebih dalam'
 'Menghadapi gejolak ekonomi global']

Article: Kiat menambah relasi bisnis
Top 5 recommended Article: ['Kiat rebranding untuk bangkitkan bisnis lesu'
 "Kiat merintis bisnis dari ''Filosofi Kopi''" '7 kiat sukses magang'
 'Anggaran dipangkas, target pertumbuhan ditambah'
 'Kiat memaksimalkan Facebook untuk bisnis']

Article: Hasil imbang yang pahit bagi AC Milan
Top 5 recommended Article: ['AC Milan hanya butuh waktu'
 'Mahalnya hasil imbang Milan dan laju 7 tim mapan'
 'Kecewanya Madrid ditahan imbang Bilbao'
 'Inter Milan 1 - 0 Milan - Liga Serie A 2018/2019'
 'Kekalahan Milan, kekecewaan Gattuso']

Article: SRC sasar ritel tradisional jadi pilar ekonomi nasional
Top 5 recommended Article: ['Peran ekonomi kreatif terhadap pe

Metode BM25 Similarity

In [27]:
"""
Data:
-----
.. data:: PARAM_K1 - Free smoothing parameter for BM25.
.. data:: PARAM_B - Free smoothing parameter for BM25.
.. data:: EPSILON - Constant used for negative idf of document in corpus.
"""

import math
from six import iteritems
from six.moves import xrange

PARAM_K1 = 2.5
PARAM_B = 0.85
EPSILON = 0.2

class BM25(object):
    """Implementation of Best Matching 25 ranking function.
    Attributes
    ----------
    corpus_size : int
        Size of corpus (number of documents).
    avgdl : float
        Average length of document in `corpus`.
    corpus : list of list of str
        Corpus of documents.
    f : list of dicts of int
        Dictionary with terms frequencies for each document in `corpus`.
    df : dict
        Dictionary with terms frequencies for whole `corpus`.
    idf : dict
        Dictionary with inversed terms frequencies for while `corpus`.
    doc_len : list of int
        List of document lengths.
    """

    def __init__(self, corpus):
        """
        Parameters
        ----------
        corpus : list of list of str
            Given corpus.
        """
        self.corpus_size = len(corpus)
        self.avgdl = sum(float(len(x)) for x in corpus) / self.corpus_size
        self.corpus = corpus
        self.f = []
        self.df = {}
        self.idf = {}
        self.doc_len = []
        self.initialize()

    def initialize(self):
        for document in self.corpus:
            frequencies = {}
            self.doc_len.append(len(document))
            for word in document:
                if word not in frequencies:
                    frequencies[word] = 0
                frequencies[word] += 1
            self.f.append(frequencies)

            for word, freq in iteritems(frequencies):
                if word not in self.df:
                    self.df[word] = 0
                self.df[word] += 1

        for word, freq in iteritems(self.df):
            self.idf[word] = math.log(self.corpus_size - freq + 0.5) - math.log(freq)

    def get_score(self, document, index, average_idf):
        """Computes BM25 score of given `document` in relation to item of corpus
        Parameters
        ----------
        document : list of str
            Document to be scored.
        index : int
            Index of document in corpus selected to score with `document`.
        average_idf : float
            Average idf in corpus.
        Returns
        -------
        float
            BM25 score.
        """
        score = 0
        for word in document:
            if word not in self.f[index]:
                continue
            idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf
            score += (idf * self.f[index][word] * (PARAM_K1 + 1)) / (self.f[index][word] + 
                                                                     PARAM_K1 * (1 - PARAM_B + PARAM_B *self.doc_len[index] / self.avgdl))
        return score

    def get_scores(self, document, average_idf):
        scores = []
        for index in xrange(self.corpus_size):
            score = self.get_score(document, index, average_idf)
            scores.append(score)
        return scores


def get_bm25_weights(corpus):
    """Returns BM25 scores (weights) of documents in corpus.
    Each document has to be weighted with every document in given corpus.
    Parameters
    ----------
    corpus : list of list of str
        Corpus of documents.
    Returns
    -------
    list of list of float
        BM25 scores.
    Examples
    --------
    >>> from gensim.summarization.bm25 import get_bm25_weights
    >>> corpus = [
    ...     ["black", "cat", "white", "cat"],
    ...     ["cat", "outer", "space"],
    ...     ["wag", "dog"]
    ... ]
    >>> result = get_bm25_weights(corpus)
    """
    bm25 = BM25(corpus)
    average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)

    weights = []
    for doc in corpus:
        scores = bm25.get_scores(doc, average_idf)
        weights.append(scores)

    return weights

In [28]:
a = [nltk.word_tokenize(doc) for doc in norm_corpus]

In [29]:
norm_corpus_tokens = np.asarray(a, dtype="object")

In [30]:
norm_corpus_tokens[:3]

array([list(['rupiah', 'rawan', 'gejolak']),
       list(['src', 'sasar', 'ritel', 'tradisional', 'jadi', 'pilar', 'ekonomi', 'nasional']),
       list(['kiat', 'maksimal', 'facebok', 'bisnis'])], dtype=object)

Penghitungan BM25 Weights untuk Document

In [31]:
%%time
wts = get_bm25_weights(norm_corpus_tokens)

CPU times: user 845 ms, sys: 20 ms, total: 865 ms
Wall time: 865 ms


In [32]:
bm25_wts_df = pd.DataFrame(wts)
bm25_wts_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1299,1300,1301,1302,1303,1304,1305,1306,1307,1308
0,26.60508,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,40.727428,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,26.561296,11.31812,5.423869,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,11.31812,27.904699,5.423869,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,5.423869,5.423869,32.020368,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
for article in popular_articles:
    print('Article:', article)
    print('Top 5 recommended Articles:',article_recommender(article_title=article, doc_sims=bm25_wts_df))
    print()

Article: Rupiah masih rawan bergejolak
Top 5 recommended Articles: ['Menghadapi gejolak ekonomi global' 'Menghadapi gejolak ekonomi global'
 'Pemanfaatan satelit untuk daerah rawan bencana'
 'Pelemahan rupiah belum berpengaruh ke inflasi'
 'Menjaga kurs rupiah, merawat fundamen ekonomi']

Article: Kiat menambah relasi bisnis
Top 5 recommended Articles: ['Kiat memaksimalkan Facebook untuk bisnis'
 'Kiat memaksimalkan Facebook untuk bisnis'
 "Kiat merintis bisnis dari ''Filosofi Kopi''"
 'Kiat rebranding untuk bangkitkan bisnis lesu' 'Kiat atasi FOMO']

Article: Hasil imbang yang pahit bagi AC Milan
Top 5 recommended Articles: ['AC Milan hanya butuh waktu'
 'Mahalnya hasil imbang Milan dan laju 7 tim mapan'
 'Kecewanya Madrid ditahan imbang Bilbao'
 'Taktik tak berjalan, Indonesia imbang dengan Vietnam'
 'Kekalahan Milan, kekecewaan Gattuso']

Article: SRC sasar ritel tradisional jadi pilar ekonomi nasional
Top 5 recommended Articles: ['Peran ekonomi kreatif terhadap pertumbuhan ekonomi 