In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import jaccard_score
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
df = pd.read_csv("fatwa-mui.csv")
df.head()

Unnamed: 0,No,Judul Fatwa,Tema Fatwa,Nomor Fatwa,Tgl Ditetapkan
0,1,PANDUAN PENYELENGGARAAN IBADAH DI BULAN RAMADA...,Sosial Kemasyarakatan,KEPUTUSAN IJTIMA ULAMA KOMISI FATWA SE-INDONES...,16 December 2003
1,2,PENGGUNAAN MIKROBA DAN PRODUK MIKROBIAL DALAM ...,POM Iptek,01 Tahun 2010,19 January 2010
2,3,AIR DAUR ULANG,POM Iptek,02 Tahun 2010,27 January 2010
3,4,KIBLAT,Ibadah,03 Tahun 2010,1 February 2010
4,5,ARAH KIBLAT,Ibadah,05 Tahun 2010,1 July 2010


In [3]:
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import spacy
from nltk.corpus import stopwords

nlp = spacy.blank('id')
factory = StemmerFactory()
stemmer = factory.create_stemmer()
indonesia_s = stopwords.words('indonesian')

def preprocess(text):
    no_punctuation = re.sub(r'[^\w\s]','', text)
    no_number = re.sub(r'\d+', "", no_punctuation)
    no_single_letter = ' '.join( [w for w in no_number.split() if len(w)>1] )
    stem_output = stemmer.stem(no_single_letter)
    doc = nlp(stem_output)
    no_stop_words = [token.text for token in doc if not token.is_stop]
    no_stop_clean = " ".join(no_stop_words) 
    return " ".join(dict.fromkeys(no_stop_clean.split()))

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

def tokenize(text1, text2):
    v.fit([text1, text2])
    array1 = v.transform([text1]).toarray()
    array2 = v.transform([text2]).toarray()
    return array1, array2

In [5]:
def calc_dice_coeeficient(array1, array2):
    y_pred = np.array(array1).reshape(-1, 1)
    y_true = np.array(array2).reshape(-1, 1)
    jaccard_index = jaccard_score(y_true, y_pred)
    dice = 2*jaccard_index / (1 + jaccard_index)
    return dice

In [6]:
n = len(df)
empty_array = np.zeros((n, n))
print(len(empty_array))

380


In [7]:
for i in range(n):
    for j in range(len(empty_array[i])):
        if (i != j):
          if (empty_array[j][i] == 0):
            text1 = preprocess(df['Judul Fatwa'][i])
            text2 = preprocess(df['Judul Fatwa'][j])
            vector1, vector2 = tokenize(text1, text2)
            similarity = calc_dice_coeeficient(vector1,vector2)
            empty_array[i][j] = similarity
          else:
            continue
        else:
          empty_array[i][j] = 1
          
print(empty_array)

[[1.         0.         0.         ... 0.         0.         0.        ]
 [0.         1.         0.         ... 0.         0.         0.        ]
 [0.         0.         1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.22222222]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


In [8]:
for i in range(len(empty_array[i])):
    for j in range(len(empty_array[i])):
        if(empty_array[i][j] > 0.7 and empty_array[i][j] < 1):
            empty_array[i][j] = 1
            print(df['Judul Fatwa'][i], ' | ', df['Judul Fatwa'][j], ' | ', empty_array[i][j]);
        else:
            empty_array[i][j] = 0

CARA PENSUCIAN EKSTRAK RAGI (YEAST EKSTRACT) DARI SISA PENGOLAHAN BIR (BREWER YEAST)  |  CARA PENSUCIAN EKSTRAK RAGI (YEAST EXTRACT) DARI SISA PENGOLAHAH BIR (BREWER YEAST)  |  1.0
PENGGUNAAN PLASENTA HEWAN HALAL UNTUK BAHAN OBAT  |  PENGGUNAAN PLASENTA HEWAN HALAL UNTUK BAHAN KOSMETIKA OBAT LUAR  |  1.0
PELAKSANAAN SHALAT JUM'AT, DZIKIR DAN KEGIATAN KEAGAMAAN DI TEMPAT SELAIN MASJID  |  PELAKSAAN SHALAT JUM'AT, DZIKIR, DAN KEGIATAN KEAGAMAAN DI TEMPAT SELAIN MASJID  |  1.0
PRODUK MAKANAN DAN MINUMAN YANG MENGANDUNG ALKOHOL / ETANOL  |  PRODUK KOSMETIKA YANG MENGANDUNG ALKOHOL / ETANOL  |  1.0
HUKUM MENGONSUMSI DAGING KANGURU  |  HUKUM MENGONSUMSI DAGING BULUS  |  1.0
TRANPLANTASI ORGAN DAN ATAU JARINGAN TUBUH DARI PENDONOR HIDUP UNTUK ORANG LAIN  |  TRANSPLANTASI ORGAN DAN/ATAU JARINGAN TUBUH DARI PENDONOR HIDUP UNTUK ORANG LAIN  |  1.0
HUKUM MENGGUNAKAN VAKSIN COVID-19 PRODUK ASTRAZANECA  |  HUKUM PENGGUNAAN VAKSIN COVID-19 PRODUK PT.PFIZER  |  1.0
HUKUM STANDAR SERTIFIKASI HALAL PEN

In [9]:
labeling = np.genfromtxt("labeling_fatwa2.csv", delimiter=",")
predict = empty_array.flatten()
label = labeling.flatten()

In [10]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)

precision = precision_score(label, predict)
accuracy = accuracy_score(label, predict)
recall = recall_score(label, predict)
f1score = f1_score(label, predict)

print('Precision ', precision)
print('Accuracy', accuracy)
print('Recal', recall)
print('F1', f1score)

Precision  0.9473684210526315
Accuracy 0.9992590027700831
Recal 0.14516129032258066
F1 0.2517482517482518
