In [31]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from numpy.linalg import norm

In [32]:
# Load the fatwa CSV into a DataFrame and preview its first rows.
df = pd.read_csv(filepath_or_buffer="fatwa-mui.csv")
df.head()

Unnamed: 0,No,Judul Fatwa,Tema Fatwa,Nomor Fatwa,Tgl Ditetapkan
0,1,PANDUAN PENYELENGGARAAN IBADAH DI BULAN RAMADA...,Sosial Kemasyarakatan,KEPUTUSAN IJTIMA ULAMA KOMISI FATWA SE-INDONES...,16 December 2003
1,2,PENGGUNAAN MIKROBA DAN PRODUK MIKROBIAL DALAM ...,POM Iptek,01 Tahun 2010,19 January 2010
2,3,AIR DAUR ULANG,POM Iptek,02 Tahun 2010,27 January 2010
3,4,KIBLAT,Ibadah,03 Tahun 2010,1 February 2010
4,5,ARAH KIBLAT,Ibadah,05 Tahun 2010,1 July 2010


In [33]:
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Build a Sastrawi stemmer once so it can be reused for every title.
# NOTE(review): the only reference to `stemmer` visible in this notebook is the
# commented-out stemming line inside preprocess(), so it currently looks unused
# -- confirm before removing.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def preprocess(text):
    r"""Strip punctuation from ``text``.

    Every character that is neither a word character (``\w``) nor whitespace
    (``\s``) is deleted. Nothing is inserted in its place, so tokens that were
    joined only by punctuation end up merged. Stemming is not applied: the
    call to the Sastrawi ``stemmer`` is currently disabled.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; letter case and whitespace are left untouched.
    """
    punctuation_pattern = r'[^\w\s]'
    return re.sub(punctuation_pattern, '', text)

In [34]:
# Store the punctuation-free titles in a 'Sentence' column, then print the raw
# and cleaned titles side by side for the first ten rows.
df['Sentence'] = df['Judul Fatwa'].map(preprocess)
print(df.loc[:, ['Judul Fatwa', 'Sentence']].head(10))

                                         Judul Fatwa  \
0  PANDUAN PENYELENGGARAAN IBADAH DI BULAN RAMADA...   
1  PENGGUNAAN MIKROBA DAN PRODUK MIKROBIAL DALAM ...   
2                                     AIR DAUR ULANG   
3                                             KIBLAT   
4                                        ARAH KIBLAT   
5  PENGGUNAAN VAKSIN MENINGITIS BAGI JEMAAH HAJI ...   
6                                         KOPI LUWAK   
7                                         AMIL ZAKAT   
8  PENSUCIAN ALAT PRODUKSI YANG TERKENA NAJIS MUT...   
9  CARA PENSUCIAN EKSTRAK RAGI (YEAST EKSTRACT) D...   

                                            Sentence  
0  PANDUAN PENYELENGGARAAN IBADAH DI BULAN RAMADA...  
1  PENGGUNAAN MIKROBA DAN PRODUK MIKROBIAL DALAM ...  
2                                     AIR DAUR ULANG  
3                                             KIBLAT  
4                                        ARAH KIBLAT  
5  PENGGUNAAN VAKSIN MENINGITIS BAGI JEMAAH HAJI ... 

In [35]:
# The tokenizer and the encoder are both loaded from the same pretrained
# checkpoint name.
MODEL_NAME = "indolem/indobert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

In [36]:
def get_embeddings(text, token_length):
    """Encode ``text`` into a single vector with the module-level tokenizer and model.

    The text is padded or truncated to exactly ``token_length`` tokens, passed
    through ``model`` as a batch of one, and the first row of the model's
    second output is returned.

    Args:
        text: The string to embed.
        token_length: Fixed sequence length used for both padding and truncation.

    Returns:
        A NumPy array holding the embedding of ``text``.
    """
    tokens = tokenizer(text, max_length=token_length, padding='max_length', truncation=True)
    # unsqueeze(0) adds a leading dimension of size one, i.e. a batch of one.
    input_ids = torch.tensor(tokens.input_ids).unsqueeze(0)
    attention_mask = torch.tensor(tokens.attention_mask).unsqueeze(0)
    # Inference only: disable gradient tracking instead of building an
    # autograd graph that is discarded immediately afterwards.
    with torch.no_grad():
        output = model(input_ids, attention_mask=attention_mask)
    # NOTE(review): index 1 is presumably the pooled sentence-level output of a
    # BERT-style encoder -- confirm against the loaded model's output type.
    return output[1][0].numpy()

In [37]:
def calc_euclidian_distance(array1, array2):
    """Return the Euclidean distance between two equally shaped arrays.

    The element-wise difference is squared, summed over every element, and the
    square root of that total is returned.

    Args:
        array1: First array.
        array2: Second array; must support ``array1 - array2``.

    Returns:
        The distance as a NumPy floating-point scalar.
    """
    difference = array1 - array2
    return np.sqrt(np.square(difference).sum())

In [38]:
# n          -- number of rows (titles) in the DataFrame
# max_length -- fixed token length passed to get_embeddings()
# empty_array -- n x n matrix of zeros that later receives the pairwise scores
n = df.shape[0]
max_length = 40
empty_array = np.zeros(shape=(n, n))

In [39]:
# Embed every title, keeping the vectors in row order.
# NOTE(review): this reads the raw 'Judul Fatwa' column rather than the cleaned
# 'Sentence' column -- confirm that is intended.
vectorize = list()
for i in range(n):
    title = df['Judul Fatwa'][i]
    vectorize.append(get_embeddings(title, max_length))

In [40]:
# Fill the strict upper triangle of `empty_array` with pairwise similarity
# scores, similarity = 1 / (1 + Euclidean distance): identical vectors score 1
# and the score shrinks towards 0 as the vectors move apart.
#
# Each unordered pair is scored once, at [i][j] with i < j. The diagonal is set
# to 0 and the lower triangle is not written.
# NOTE(review): if the label matrix this is later compared against is
# symmetric, the mirrored [j][i] entries can never be predicted -- confirm.
#
# The loop bounds are taken from the matrix itself, not from a loop variable
# left behind by an earlier cell, so the cell only depends on its real inputs.
size = empty_array.shape[0]
for i in range(size):
    empty_array[i][i] = 0
    for j in range(i + 1, size):
        distance = calc_euclidian_distance(vectorize[i], vectorize[j])
        empty_array[i][j] = 1 / (1 + distance)

print(empty_array)

[[0.         0.07890355 0.08502664 ... 0.08945784 0.08005498 0.06440628]
 [0.         0.         0.08083959 ... 0.08021694 0.09272786 0.06814759]
 [0.         0.         0.         ... 0.09088451 0.08473125 0.06453652]
 ...
 [0.         0.         0.         ... 0.         0.09257469 0.06909143]
 [0.         0.         0.         ... 0.         0.         0.06752536]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [44]:
# Binarise the similarity matrix in place: a score strictly between the
# threshold and 1 becomes 1 (pair predicted similar); anything else becomes 0.
# Every predicted-similar pair is printed together with its similarity score.
#
# NOTE(review): the similarity values in the saved output of the previous cell
# are roughly 0.06-0.09, well below this threshold, so it may select no pairs
# at all -- confirm the threshold.
# NOTE(review): a score of exactly 1 is mapped to 0 by the `< 1` bound --
# confirm that is wanted.
# NOTE: the matrix is overwritten, so re-running this cell without recomputing
# the similarities first turns every 1 back into 0.
SIMILARITY_THRESHOLD = 0.5

num_rows, num_cols = empty_array.shape
for i in range(num_rows):
    for j in range(num_cols):
        score = empty_array[i][j]
        if SIMILARITY_THRESHOLD < score < 1:
            # Report the actual score before it is replaced by the 0/1 label.
            print(df['Judul Fatwa'][i], ' | ', df['Judul Fatwa'][j], ' | ', score)
            empty_array[i][j] = 1
        else:
            empty_array[i][j] = 0

In [45]:
# Flatten the prediction matrix and the label matrix read from
# labeling_fatwa2.csv into 1-D arrays so they can be compared element by element.
predict = empty_array.flatten()
labeling = np.genfromtxt(fname="labeling_fatwa2.csv", delimiter=",")
label = labeling.flatten()

In [46]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)

# Score the flattened predictions against the flattened labels. The label array
# is passed first and the prediction array second in every call.
# NOTE(review): the saved output shows precision, recall and F1 of 0.0 together
# with a scikit-learn warning, which presumably means `predict` contains no
# positive entries -- check the thresholding step.
precision = precision_score(label, predict)
accuracy = accuracy_score(label, predict)
recall = recall_score(label, predict)
f1score = f1_score(label, predict)

print('Precision ', precision)
print('Accuracy', accuracy)
print('Recall', recall)
print('F1', f1score)

Precision  0.0
Accuracy 0.9991412742382272
Recal 0.0
F1 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
