In [16]:
import numpy as np
import pandas as pd

In [18]:
# Read the three raw Quran exports (Indonesian, Arabic, English) with pandas'
# default CSV settings; the paths are relative to the notebook's working dir.
raw_id_quran, raw_ar_quran, raw_en_quran = map(
    pd.read_csv,
    ("quran/Indonesian.csv", "quran/Arabic.csv", "quran/English.csv"),
)

In [20]:
# The Indonesian file was loaded as a single column literally named
# 'surah|ayah|text'; split every record into its three fields and persist the
# result as a regular CSV with one column per field.
id_surah = []
id_ayah = []
id_txt = []

for id_text in raw_id_quran['surah|ayah|text']:
    # Only the first two pipes are field delimiters. maxsplit=2 keeps any
    # further '|' characters inside the verse text instead of producing a
    # fourth piece and raising ValueError on unpacking.
    surah_temp, ayah_temp, txt_temp = id_text.split('|', 2)
    id_surah.append(surah_temp)
    id_ayah.append(ayah_temp)
    id_txt.append(txt_temp)

# All three columns hold strings exactly as they appeared in the raw record.
df = pd.DataFrame({'surah': id_surah, 'ayah': id_ayah, 'text': id_txt})

df.to_csv('quran/Indonesian_clean.csv', encoding='utf-8', index=False)

In [21]:
from app.lib.preprocess import IndoTextCleaner, StopWordsEliminator
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Build the three preprocessing stages once so they can be reused for queries.
# (Going by their names: a text cleaner, a stop-word remover and the Sastrawi
# stemmer -- their exact behaviour is defined outside this notebook.)
itc = IndoTextCleaner()
swe = StopWordsEliminator()
stemmer = StemmerFactory().create_stemmer()

# Overwrite the verse text with the output of the stages applied in order:
# itc.transform -> swe.transform -> stemmer.stem.
df['text'] = (
    df['text']
    .apply(itc.transform)
    .apply(swe.transform)
    .apply(stemmer.stem)
)

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Fit a TF-IDF model on the preprocessed verses; each row of `tfidf_matrix`
# corresponds to the row of `df` at the same position.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])

# Persist the fitted vectorizer and the verse matrix. The context managers
# make sure each file is flushed and closed as soon as it has been written,
# rather than leaving an anonymous handle for the garbage collector.
with open("app/pkl/tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

with open("app/pkl/tfidf_verse_matrix.pkl", "wb") as matrix_file:
    pickle.dump(tfidf_matrix, matrix_file)

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

# Example query: find the verse most similar to a single search term.
input_text = pd.Series(['luth'])

# The query must go through the same stages, in the same order, as the verses
# did before the vectorizer was fitted.
input_text = input_text.apply(itc.transform)
input_text = input_text.apply(swe.transform)
input_text = input_text.apply(stemmer.stem)

input_vector = tfidf_vectorizer.transform(input_text)

# One row per query (only one here), one column per verse.
res = cosine_similarity(input_vector, tfidf_matrix)

# Verse positions ordered from most to least similar to the query.
query_scores = res[0]
res_sorted = sorted(range(len(query_scores)),
                    key=lambda k: query_scores[k],
                    reverse=True)

# `res_sorted` holds row *positions* in `tfidf_matrix`, so look the verse up
# positionally; label-based `.loc` is only equivalent while `df` still has its
# default RangeIndex.
df.iloc[res_sorted[0], :]

surah                            26
ayah                            160
text     kaum luth dusta rasulrasul
Name: 3091, dtype: object