In [93]:
import numpy as np
import pandas as pd

In [94]:
# Load the three raw per-language dumps (Indonesian, Arabic, English), in that
# order. Each file is read with pandas' default comma separator; the Indonesian
# frame is later accessed through a single 'surah|ayah|text' column.
raw_id_quran, raw_ar_quran, raw_en_quran = map(
    pd.read_csv,
    ("quran/Indonesian.csv", "quran/Arabic.csv", "quran/English.csv"),
)

In [95]:
# Peek at one raw record (row label 14) to see the pipe-delimited layout.
raw_id_quran.loc[14, 'surah|ayah|text']

'2|8|Di antara manusia ada yang mengatakan: "Kami beriman kepada Allah dan Hari kemudian pada hal mereka itu sesungguhnya bukan orang-orang yang beriman.'

In [136]:
# Split every raw "surah|ayah|text" record into its three fields and collect
# them column-wise.
id_surah = []
id_ayah = []
id_txt = []

for id_text in raw_id_quran['surah|ayah|text']:
    # Cap the split at two separators: only the first two '|' delimit fields,
    # so a verse whose text itself contains '|' is kept whole instead of
    # breaking the 3-way unpacking. Records with fewer than three fields still
    # raise ValueError, as before.
    surah_temp, ayah_temp, txt_temp = id_text.split('|', 2)
    id_surah.append(surah_temp)
    id_ayah.append(ayah_temp)
    id_txt.append(txt_temp)

# NOTE: surah and ayah are kept as strings here (they come straight from
# str.split).
# NOTE: this rebinds `df` to the *raw* verse text. If this cell is re-run, the
# preprocessing cell must be re-run too before fitting TF-IDF on df['text'].
df = pd.DataFrame(
    {'surah': id_surah, 'ayah': id_ayah, 'text': id_txt},
    columns=['surah', 'ayah', 'text'],
)

# Persist the parsed table without the positional index.
df.to_csv('quran/Indonesian_clean.csv', encoding='utf-8', index=False)

In [98]:
from app.lib.preprocess import IndoTextCleaner, StopWordsEliminator
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Instantiate the three text-normalisation stages: the Sastrawi stemmer plus
# the project-local cleaner and stop-word eliminator from app.lib.preprocess.
stemmer = StemmerFactory().create_stemmer()
itc = IndoTextCleaner()
swe = StopWordsEliminator()

# Run every verse through clean -> stop-word removal -> stemming, in that
# order, and overwrite df['text'] with the result.
df['text'] = (
    df['text']
    .apply(itc.transform)
    .apply(swe.transform)
    .apply(stemmer.stem)
)

In [137]:
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF model on the current contents of df['text'] and keep the
# resulting verse-by-term matrix (one row per row of df).
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])

# Persist both artefacts for the app. The files are opened in `with` blocks so
# the handles are flushed and closed deterministically instead of being left
# to the garbage collector.
with open("app/pkl/tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

with open("app/pkl/tfidf_verse_matrix.pkl", "wb") as matrix_file:
    pickle.dump(tfidf_matrix, matrix_file)

In [100]:
# Spot-check one verse's text (row label 2).
df.loc[2, 'text']

'maha murah maha sayang'

In [101]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity of the first verse (row 0) against every verse.
raw_res = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)

# Row positions ordered from most to least similar. Negating the scores and
# using a stable sort keeps tied scores in ascending row order, matching a
# stable descending sort over the indices.
(-raw_res[0]).argsort(kind="stable").tolist()

[0,
 2,
 4219,
 3188,
 4901,
 5374,
 169,
 1513,
 5052,
 5074,
 3053,
 3071,
 6108,
 6093,
 5147,
 6106,
 598,
 2940,
 2999,
 3035,
 3090,
 3106,
 3122,
 3148,
 4455,
 5156,
 3709,
 2795,
 1261,
 381,
 4249,
 198,
 4102,
 2015,
 766,
 1850,
 742,
 2337,
 2345,
 5615,
 179,
 2340,
 3151,
 588,
 5148,
 1947,
 3508,
 2508,
 1887,
 2860,
 224,
 4619,
 1693,
 4616,
 5948,
 602,
 5482,
 4978,
 2352,
 1562,
 5834,
 644,
 702,
 4474,
 2924,
 323,
 2341,
 906,
 2810,
 2808,
 3556,
 2267,
 4419,
 4290,
 4596,
 2327,
 4134,
 1918,
 3381,
 1687,
 230,
 4511,
 5229,
 1545,
 3167,
 2334,
 4612,
 4369,
 2336,
 3267,
 3573,
 5922,
 4662,
 5708,
 421,
 4058,
 1228,
 3164,
 4274,
 250,
 4762,
 4810,
 4589,
 3413,
 3762,
 205,
 4704,
 5260,
 5754,
 639,
 3494,
 5177,
 801,
 2936,
 2524,
 1106,
 1104,
 2668,
 5126,
 5083,
 707,
 650,
 3477,
 2628,
 3169,
 1336,
 1229,
 5124,
 326,
 842,
 2826,
 2318,
 2010,
 5216,
 2342,
 4110,
 1204,
 2634,
 2653,
 3689,
 2019,
 4035,
 43,
 5269,
 4061,
 1428,
 3344,
 18

In [141]:
# Push the query through the same clean -> stop-word removal -> stemming
# stages that are applied to the verses.
input_text = (
    pd.Series(['luth'])
    .apply(itc.transform)
    .apply(swe.transform)
    .apply(stemmer.stem)
)

# Project the query into the fitted TF-IDF space and score it against every
# verse.
input_vector = tfidf_vectorizer.transform(input_text)
res = cosine_similarity(input_vector, tfidf_matrix)

# Row positions ranked most-similar first; the stable sort on negated scores
# keeps ties in ascending row order.
res_sorted = (-res[0]).argsort(kind="stable").tolist()

# Show the best-matching verse.
df.loc[res_sorted[0]]

surah                                22
ayah                                 43
text     dan kaum Ibrahim dan kaum Luth
Name: 2637, dtype: object

In [138]:
# Read back the cleaned table written to quran/Indonesian_clean.csv by the
# parsing cell, then display it as a sanity check.
id_quran_clean = pd.read_csv(filepath_or_buffer="quran/Indonesian_clean.csv")

id_quran_clean

Unnamed: 0,surah,ayah,text
0,1,1,Dengan menyebut nama Allah Yang Maha Pemurah l...
1,1,2,Segala puji bagi Allah Tuhan semesta alam.
2,1,3,Maha Pemurah lagi Maha Penyayang.
3,1,4,Yang menguasai di Hari Pembalasan.
4,1,5,Hanya Engkaulah yang kami sembah dan hanya kep...
5,1,6,Tunjukilah kami jalan yang lurus
6,1,7,(yaitu) Jalan orang-orang yang telah Engkau be...
7,2,1,Alif laam miim.
8,2,2,Kitab (Al Quran) ini tidak ada keraguan padany...
9,2,3,(yaitu) mereka yang beriman kepada yang ghaib ...
