In [12]:
import re
import string
import pandas as pd
import matplotlib as pyplot

from gensim.models import Word2Vec, Phrases
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import pickle
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import re
import gensim
import matplotlib.pyplot as plt
from gensim.test.utils import get_tmpfile


%matplotlib inline

In [3]:
# Read the Quran verse table; the English text is in its 'Verse Text' column.
quran_csv_path = '../data/en-sahih.csv'
quran_df = pd.read_csv(quran_csv_path)

In [6]:
# Preview the first rows of the Quran table to check that it loaded as expected.
quran_df.head()

Unnamed: 0,Surah Name,Surah Number,Verse Number,Verse Text
0,الفاتحة,1,1,"In the name of Allah, the Entirely Merciful, t..."
1,الفاتحة,1,2,"[All] praise is [due] to Allah, Lord of the wo..."
2,الفاتحة,1,3,"The Entirely Merciful, the Especially Merciful,"
3,الفاتحة,1,4,Sovereign of the Day of Recompense.
4,الفاتحة,1,5,It is You we worship and You we ask for help.


In [5]:
# Read the hadith table; the English text is in its 'text_en' column.
hadith_csv_path = '../data/all_hadiths_clean.csv'
hadith_df = pd.read_csv(hadith_csv_path)

In [7]:
# Preview the first rows of the hadith table to check that it loaded as expected.
hadith_df.head()

Unnamed: 0,id,hadith_id,source,chapter_no,hadith_no,chapter,chain_indx,text_ar,text_en
0,0,1,Sahih Bukhari,1,1,Revelation - كتاب بدء الوحى,"30418, 20005, 11062, 11213, 11042, 3",حدثنا الحميدي عبد الله بن الزبير، قال حدثنا سف...,Narrated 'Umar bin Al-Khattab: ...
1,1,2,Sahih Bukhari,1,2,Revelation - كتاب بدء الوحى,"30355, 20001, 11065, 10511, 53",حدثنا عبد الله بن يوسف، قال أخبرنا مالك، عن هش...,Narrated 'Aisha: ...
2,2,3,Sahih Bukhari,1,3,Revelation - كتاب بدء الوحى,"30399, 20023, 11207, 11013, 10511, 53",حدثنا يحيى بن بكير، قال حدثنا الليث، عن عقيل، ...,Narrated 'Aisha: (the m...
3,3,4,Sahih Bukhari,1,4,Revelation - كتاب بدء الوحى,"11013, 10567, 34",قال ابن شهاب وأخبرني أبو سلمة بن عبد الرحمن، أ...,Narrated Jabir bin 'Abdullah Al-Ansari while ...
4,4,5,Sahih Bukhari,1,5,Revelation - كتاب بدء الوحى,"20040, 20469, 11399, 11050, 17",حدثنا موسى بن إسماعيل، قال حدثنا أبو عوانة، قا...,Narrated Said bin Jubair: ...


In [8]:
# Build a single corpus: every Quran verse followed by every hadith that has
# English text (rows with a missing 'text_en' are dropped).
# pd.concat is used because Series.append was deprecated in pandas 1.4 and
# removed in pandas 2.0. As with append, the original row labels are kept, so
# labels repeat across the two sources.
documents = pd.concat([quran_df['Verse Text'], hadith_df['text_en'].dropna()])

In [9]:
# Number of texts in the combined Quran + hadith corpus.
len(documents)

39824

In [14]:
def stem_words(text):
    """Return `text` with every whitespace-separated word replaced by its stem.

    Stemming uses NLTK's English Snowball stemmer; the stems are re-joined
    with single spaces.
    """
    words = text.split()
    snowball = SnowballStemmer('english')
    return " ".join([snowball.stem(w) for w in words])

def make_lower_case(text):
    """Return a lower-cased copy of `text`."""
    lowered = text.lower()
    return lowered

def remove_stop_words(text):
    """Return `text` without the tokens found in NLTK's English stop-word list.

    Tokens are the whitespace-separated pieces of `text`, and a token is
    dropped only when it matches a stop word exactly. The surviving tokens
    are re-joined with single spaces.
    """
    tokens = text.split()
    stop_set = set(stopwords.words("english"))
    kept = [token for token in tokens if token not in stop_set]
    return " ".join(kept)

def remove_punctuation(text):
    """Return only the word-character runs of `text`, joined by single spaces.

    Anything the tokenizer's word pattern does not match (punctuation,
    whitespace) acts as a separator and is discarded.
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    return " ".join(word_tokenizer.tokenize(text))

In [15]:
# Text-cleaning steps, listed in the order they are applied to each document.
cleaning_funcs = [
    make_lower_case,
    remove_stop_words,
    remove_punctuation,
    stem_words,
]


In [19]:
# Pass every document through the cleaning steps, feeding each step's output
# into the next, and collect the fully cleaned strings.
cleaned_documents = []
for raw_doc in documents:
    cleaned = raw_doc
    for step in cleaning_funcs:
        cleaned = step(cleaned)
    cleaned_documents.append(cleaned)

# TF-IDF Model

In [21]:
# Fit a TF-IDF model on the cleaned corpus.
# Unigrams and bigrams are used, and terms that occur in fewer than 10
# documents are ignored.
# NOTE(review): the documents were already stop-word filtered and stemmed
# above; sklearn's 'english' stop list is applied on top of that here.
# Confirm this is intended.
tf = TfidfVectorizer(analyzer='word',
                     min_df=10,
                     ngram_range=(1, 2),
                     stop_words='english')

# Learn the vocabulary and IDF weights, and build the document-term matrix,
# in a single pass over the corpus.
tfidf_matrix = tf.fit_transform(cleaned_documents)

# Persist the fitted vectorizer; the context manager makes sure the file is
# flushed and closed.
with open("../models/tfidf_model.pkl", "wb") as model_file:
    pickle.dump(tf, model_file)

print(tfidf_matrix.shape)

(39824, 17854)


In [22]:
# Compress the TF-IDF matrix with truncated SVD.
from sklearn.decomposition import TruncatedSVD

# random_state is fixed so repeated runs produce the same components,
# in line with the seed=0 used for the Doc2Vec model in this notebook.
svd = TruncatedSVD(n_components=500, random_state=0)
latent_matrix = svd.fit_transform(tfidf_matrix)

# Persist the fitted SVD; the context manager makes sure the file is closed.
with open("../models/svd_model.pkl", "wb") as model_file:
    pickle.dump(svd, model_file)

print(latent_matrix.shape)

(39824, 500)


In [38]:
n = 50  # number of leading SVD components kept as the LSA embedding

# One row per document, one column per retained component.
svd_feature_matrix = pd.DataFrame(latent_matrix[:, :n])
print(svd_feature_matrix.shape)

# Persist the embeddings; the context manager makes sure the file is closed.
with open("../models/lsa_embeddings.pkl", "wb") as embeddings_file:
    pickle.dump(svd_feature_matrix, embeddings_file)

(39824, 50)


# Doc2Vec Model

In [23]:
# Tokenise each cleaned document on whitespace (Doc2Vec expects token lists).
split_docs = [cleaned_text.split() for cleaned_text in cleaned_documents]

In [25]:
# Wrap every token list in a TaggedDocument whose single tag is the
# document's position in the corpus.
formatted_documents = []
for doc_index, tokens in enumerate(split_docs):
    formatted_documents.append(TaggedDocument(tokens, [doc_index]))

# Doc2Vec hyper-parameters, gathered in one place so they are easy to scan.
doc2vec_params = dict(
    vector_size=50,
    min_count=5,
    epochs=200,
    seed=0,
    window=3,
    dm=1,
)
model = Doc2Vec(**doc2vec_params)

# The vocabulary must be built before training can start.
model.build_vocab(formatted_documents)

In [29]:
# Train on the tagged corpus, using the document count and epoch setting
# already stored on the model.
model.train(formatted_documents, total_examples=model.corpus_count, epochs=model.epochs)

In [30]:
# Save the trained model, then reload it from the same path so the rest of the
# notebook works with exactly what was written to disk.
# One path variable is used for both calls so they cannot drift apart.
doc2vec_path = "../models/doc2vec_model"
model.save(doc2vec_path)
model = Doc2Vec.load(doc2vec_path)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [31]:
# Smoke test: infer an embedding for a small, unseen list of tokens.
sample_tokens = ["this", "is", "a", "test"]
vector = model.infer_vector(doc_words=sample_tokens, epochs=50)
vector

array([ 0.395602  , -1.0176108 ,  0.04891639, -0.23413673, -1.1014264 ,
        0.25590667,  0.543723  ,  0.34218332, -1.2481732 ,  0.9248172 ,
       -0.12828165, -1.225692  , -0.03927625, -0.1434317 ,  0.36229712,
       -0.3615093 ,  0.19589567,  0.5047545 ,  0.43324736, -0.46560958,
       -0.14506127,  0.03595288, -0.2649576 , -1.3081894 ,  0.7348324 ,
        1.2057209 , -0.05394661, -0.44939655,  0.02879823, -0.8890898 ,
        1.5298823 , -1.2370139 , -0.0226839 , -1.7492684 ,  0.67260015,
        0.35627657,  0.8090528 ,  0.15275992,  0.31522354,  0.23707627,
        0.41513297,  0.03648197,  0.33456528,  0.20344214,  0.65324324,
        0.46326482,  0.8589235 ,  0.37457705,  0.70699465, -0.69354427],
      dtype=float32)

In [35]:
# Fetch the document vectors learned during training.
# gensim 4 exposes them as `model.dv.vectors`; gensim 3 exposed them as
# `model.docvecs.vectors_docs`. Supporting both keeps the cell working across
# versions.
if hasattr(model, "dv"):
    doc_vectors = model.dv.vectors
else:
    doc_vectors = model.docvecs.vectors_docs

doctovec_feature_matrix = pd.DataFrame(doc_vectors)
print(doctovec_feature_matrix.shape)

# Persist the embeddings; the context manager makes sure the file is closed.
with open("../models/doctovec_embeddings.pkl", "wb") as embeddings_file:
    pickle.dump(doctovec_feature_matrix, embeddings_file)


(39824, 50)
