In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import jaccard_score
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
df = pd.read_csv("fatwa-mui.csv")
df.head()

Unnamed: 0,No,Judul Fatwa,Tema Fatwa,Nomor Fatwa,Tgl Ditetapkan
0,1,PANDUAN PENYELENGGARAAN IBADAH DI BULAN RAMADA...,Sosial Kemasyarakatan,KEPUTUSAN IJTIMA ULAMA KOMISI FATWA SE-INDONES...,16 December 2003
1,2,PENGGUNAAN MIKROBA DAN PRODUK MIKROBIAL DALAM ...,POM Iptek,01 Tahun 2010,19 January 2010
2,3,AIR DAUR ULANG,POM Iptek,02 Tahun 2010,27 January 2010
3,4,KIBLAT,Ibadah,03 Tahun 2010,1 February 2010
4,5,ARAH KIBLAT,Ibadah,05 Tahun 2010,1 July 2010


In [3]:
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import spacy
from nltk.corpus import stopwords

nlp = spacy.blank('id')
factory = StemmerFactory()
stemmer = factory.create_stemmer()
indonesia_s = stopwords.words('indonesian')

def preprocess(text):
    no_punctuation = re.sub(r'[^\w\s]','', text)
    no_number = re.sub(r'\d+', "", no_punctuation)
    no_single_letter = ' '.join( [w for w in no_number.split() if len(w)>1] )
    stem_output = stemmer.stem(no_single_letter)
    doc = nlp(stem_output)
    no_stop_words = [token.text for token in doc if not token.is_stop]
    no_stop_clean = " ".join(no_stop_words) 
    return " ".join(dict.fromkeys(no_stop_clean.split()))

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

def tokenize(text1, text2):
    v.fit([text1, text2])
    array1 = v.transform([text1]).toarray()
    array2 = v.transform([text2]).toarray()
    return array1, array2

In [5]:
def calc_jaccard_similarity(array1, array2):
    y_pred = np.array(array1).reshape(-1, 1)
    y_true = np.array(array2).reshape(-1, 1)
    jaccard_index = jaccard_score(y_true, y_pred)
    return jaccard_index

In [6]:
n = len(df)
empty_array = np.zeros((n, n))
print(len(empty_array))

380


In [8]:
for i in range(len(empty_array[i])):
    for j in range(len(empty_array[i])):
        if (i != j):
          if (empty_array[j][i] == 0):
            text1 = preprocess(df['Judul Fatwa'][i])
            text2 = preprocess(df['Judul Fatwa'][j])
            vector1, vector2 = tokenize(text1, text2)
            similarity = calc_jaccard_similarity(vector1,vector2)
            empty_array[i][j] = similarity
          else:
            continue
        else:
          empty_array[i][j] = 1
          
print(empty_array)

KeyboardInterrupt: 

In [None]:
for i in range(len(empty_array[i])):
    for j in range(len(empty_array[i])):
        if(empty_array[i][j] > 0.5 and empty_array[i][j] < 1):
            print(df['Judul Fatwa'][i], ' | ', df['Judul Fatwa'][j], ' | ', empty_array[i][j]);

In [None]:
print(df['Judul Fatwa'][7], ' | ', df['Judul Fatwa'][345], empty_array[7][345])
print(df['Judul Fatwa'][14], ' | ', df['Judul Fatwa'][240], empty_array[14][241])
print(df['Judul Fatwa'][23], ' | ', df['Judul Fatwa'][56], empty_array[23][56])
print(df['Judul Fatwa'][60], ' | ', df['Judul Fatwa'][306], empty_array[60][306])
print(df['Judul Fatwa'][80], ' | ', df['Judul Fatwa'][314], ' | ', empty_array[80][314])
print(df['Judul Fatwa'][84], ' | ', df['Judul Fatwa'][191], ' | ', empty_array[84][191])
print(df['Judul Fatwa'][92], ' | ', df['Judul Fatwa'][194], ' | ', empty_array[92][194])
print(df['Judul Fatwa'][96], ' | ', df['Judul Fatwa'][234], ' | ', empty_array[96][234])
print(df['Judul Fatwa'][154], ' | ', df['Judul Fatwa'][155], ' | ', empty_array[154][155])
print(df['Judul Fatwa'][203], ' | ', df['Judul Fatwa'][205], ' | ', empty_array[203][205])

AMIL ZAKAT  |  PENGELOLAAN ZAKAT 0.3333333333333333
BADAL THAWAF IFADHAH (PELAKSANAAN THAWAF IFADHAH OLEH ORANG LAIN)  |  TASWIYAH AL-MANHAJ (PENYAMAAN POLA PIKIR DALAM MASALAH-MASALAH KEAGAMAAN) 0.0
PENGGUNAAN PLASENTA HEWAN HALAL UNTUK BAHAN OBAT  |  PENGGUNAAN ALKOHOL / ETANOL UNTUK BAHAN OBAT 0.2857142857142857
TRANSPLANTASI ORGAN DAN ATAU / JARINGAN TUBUH UNTUK DIRI SENDIRI  |  TRANSPLANTASI ORGAN DAN/ATAU JARINGAN TUBUH DARI PENDONOR HIDUP UNTUK ORANG LAIN 0.5
PENDAFTARAN HAJI SAAT USIA DINI  |  PENDAFTARAN HAJI USIA DINI  |  1.0
PRODUK VAKSIN COVID-19 DARI SINOVAC LIFE SCIENCES CO. LTD, CHINA DAN PT. BIO FARMA (PERSERO)  |  PRODUK VAKSIN COVID-19 DARI PT. BIOTIS PHARMACEUTICALS INDONESIA  |  0.25
HUKUM PENGGUNAAN VAKSIN COVID-19 PRODUK SINOPHARM CNBG CHINA  |  HUKUM VAKSIN COVID-19 PRODUKSI CANSINO BIOLOGICS INC. CHINA  |  0.36363636363636365
HUKUM DAN PANDUAN PELAKSANAAN IBADAH KURBAN SAAT KONDISI WABAH PENYAKIT MULUT DAN KUKU  |  HUKUM DAN PANDUAN PELAKSANAAN IBADAH KURBAN SAA