# One Thousand and One Nights (Arabian nights), is a collection of Middle Eastern folk tales compiled in Arabic during the Islamic Golden Age.

It is still not clear where does exactly these stories are originated? Is it Persia, or Arabia?

The work was collected over many centuries by various authors, translators, and scholars across West, Central and South Asia, and North Africa. Some tales themselves trace their roots back to ancient and medieval Arabic, Persian, Indian, Greek, Jewish and Turkish folklore and literature.

To answer the long-lived question, I implemented two different Natural Language Processing (NLP) methods:

1. Topic modelling using LDA (Latent Dirichlet Allocation)
2. Word2vec neural language model

In both methods, Preprcessing, and cleaning of the data is needed. Data cleaning starts with tokenization, followed by lemmatization.

Tokenizing the text:

Words are features in text which carries information. Tokenization means to give each word its own identity. 

Lemmatization:

Grouping together the inflected forms of a word so they can be analysed as a single item, identified by the word's lemma, or dictionary form.

Removing stop words:

A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) which won't add any meaning to the text and should be ignored.

In [5]:
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 28 10:20:41 2020

@author: fsaff
"""

import spacy
import gensim
spacy.load('en')
from spacy.lang.en import English
parser = English()

# Tokenizing the text meaning to give each word its own identity
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

import nltk
nltk.download('wordnet')

# Present each word based on the root and meaning
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)
# adding stopwords which are thw words that would not add much meaning to overall meaning
nltk.download('stopwords')
from nltk.corpus import stopwords
en_stop = set(nltk.corpus.stopwords.words('english'))
HHH=list(en_stop)
newStopWords = ['would','could','shall','can','without','nothing','going','allow','ask','saying','hear','thousand','eveything']
stopwords=HHH+newStopWords

def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in stopwords]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\fsaff\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fsaff\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# LOADING THE TRANSLATION FILES:

#I collected two different translation of the Arabian nights in which one is loger with more than 1000000 tokens. Brief version, however, only contains ~200000 tokens.

Here, for the sake of time, only the brief version is considered.

In this section, all pre-processing functions are applied to the text and words in the sentences are arranged in a list of lits. The list of lists will then be used for a topic modelling.

In [8]:
import random
text_data = []
import re
#uploading the file and preprocessing/cleaning sentence by sentence
with open('Arabian-Nights1N.txt') as f:
    for line in f:
        f=str(f)
        re.sub("\w*/(?!NOUN)[A-Z]*","NIL",f)
        tokens = prepare_text_for_lda(line)
        if random.random() > .99:
            print(tokens)
            text_data.append(tokens)

['king', 'gazelle', 'looking', 'telling', 'bring', 'herdsman', 'sitting', 'something', 'please', 'reward', 'agree', 'master', 'daughter', 'young', 'teach', 'magic', 'woman', 'stay', 'yesterday', 'cover', 'tears', 'burst', 'laughter', 'father', 'cheap', 'bring', 'strange', 'strange', 'ask', 'laugh', 'crying', 'master', 'spell', 'mother', 'father', 'laugh', 'reason', 'father', 'kill', 'mother', 'astonish', 'found', 'morning']
['noble', 'patience', 'resolute']
[]
[]
[]
['hearing', 'voice']
['whoever', 'people', 'secret']
['birth', 'hero']
[]
['except', 'pierce', 'heart', 'blood']
['patience', 'nature', 'treacherous']
['ifrit', 'whore', 'lover', 'look', 'recognize', 'never', 'spite', 'punishment', 'confess', 'ask', 'insist', 'never', 'ifrit', 'sword', 'sword', 'stand', 'gesture', 'eyebrow', 'tears', 'cheek', 'understand', 'gesture', 'reply', 'forgiveness', 'inwardly', 'recite']
[]
['reason', 'hands', 'lift']
['continue', 'serve', 'companion', 'thirty', 'night', 'fortieth', 'youth', 'gladne

['agree', 'hasan', 'clothes', 'hasan', 'cauldron', 'boil', 'something', 'look', 'pitch', 'smear', 'cheek', 'look', 'black', 'slave', 'dress', 'clothes', 'servant', 'giving', 'kebab', 'black', 'things', 'need', 'market', 'vegetable', 'approach', 'politely', 'address', 'argot', 'black', 'greet', 'since', 'forty', 'black', 'slave', 'cooking', 'morning', 'another', 'evening', 'feeding', 'forty', 'prepare', 'dalila', 'daughter', 'zainab', 'kebab', 'drink', 'house', 'drunk', 'different', 'dish', 'kitchen', 'pantry', 'drunk', 'everything', 'conceal', 'sober', 'clothes', 'knife', 'vegetable', 'basket', 'market', 'vegetable', 'enter', 'kitchen', 'pantry', 'dalila', 'add', 'slave', 'dalila', 'zainab', 'upstairs', 'fetch', 'clothes', 'marry', 'zainab', 'bring', 'forty', 'carrier', 'pigeon']
['turn', 'others', 'ask', 'zuraiq', 'leader', 'gang', 'almost', 'mountain', 'grasp', 'star', 'steal', 'equal', 'thing', 'repent', 'open', 'bring', 'dinar', 'money', 'purse', 'thread', 'attach', 'brass', 'bell'

# LDA MODEL:

LDA will start with collecting random words 



In [11]:
from gensim import corpora
from gensim.corpora import Dictionary
words = corpora.Dictionary(text_data)
corpus = [words.doc2bow(doc) for doc in text_data]

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=words,
                                           num_topics=3, 
                                           random_state=2,
                                           update_every=1,
                                           passes=400,
                                           alpha='auto',
                                           per_word_topics=True)




[(0, '0.010*"night" + 0.008*"bring" + 0.007*"slave" + 0.006*"father" + 0.006*"ask" + 0.006*"house" + 0.006*"purse" + 0.006*"servant" + 0.005*"morning" + 0.005*"black" + 0.005*"master" + 0.005*"hundred" + 0.005*"break" + 0.004*"continue" + 0.004*"dawn"'), (1, '0.007*"sharkan" + 0.006*"dirham" + 0.006*"rider" + 0.005*"champion" + 0.005*"happen" + 0.005*"young" + 0.005*"night" + 0.005*"palace" + 0.005*"ja‘far" + 0.004*"horse" + 0.004*"attack" + 0.004*"sword" + 0.004*"fighting" + 0.004*"kingdom" + 0.004*"reply"'), (2, '0.009*"father" + 0.009*"tears" + 0.008*"ask" + 0.006*"bring" + 0.006*"truth" + 0.006*"prince" + 0.006*"reply" + 0.006*"disobedience" + 0.006*"hear" + 0.005*"ifrit" + 0.005*"brother" + 0.005*"return" + 0.005*"horse" + 0.004*"janshah" + 0.004*"palace"')]


# Results:

Disadvantages of the LDA model:

1. The meaning of each topic usually spills into other topics which will make topics hard to interpret
2. In this case, I could NOT find a topic directly related to "Persianess" or "Arabness" of each tale

Advantages of LDA model:
1. It takes advantage of statistical features of a text
2. It is usefull to be used for text summarization and topic modelling

In the next section we explore the Word2vec neural language model


In the era of blooming artificial intelligence (AI) sciences, I introduced an effective Natural language processing (NLP) method to examine the origin of Arabian nights. Furthermore, to explore which cultures have had a more substantial influence on the two major translated English versions of Arabian nights? The highly controversial yet still obscure origins of The Arabian Nights have been rooted in various countries and cultures, including ancient Egypt, India, pre-Islamic Iran, and Arab cultures of the Middle East. The translations used here are Muhsin Mahdi’s Critical Edition (translated by Hussein Haddaway (2011)) and Arabian Nights (2008) translated by Lyons. To find reliable answers to this question we had to go beyond conventional LDA approaches or statistical classifications such as TF/IDF. Earlier statistical methods ignore the effect the neighbors of a word have on its meaning and how those relationships affect the overall meaning of a statement. Using word2vec approach, however,    considers the meaning of neighborhood words around the specific words and showed it to be extremely effective. Latter is done by creating the small bag of words (BOG) of tokens from a “neighborhood” of few words, typically fewer than 10 tokens. The neural language model ensures that these neighborhoods of meaning don’t spill over into adjacent sentences which guarantees the independence and relevance of BOGs. 

In [None]:
import spacy 
import numpy as np
import pandas as pd

from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
sp = spacy.load('en_core_web_sm')
nlp = spacy.load('en')

file_content = open('Arabian Nights Text A-2020.txt', encoding="utf8").read()
file_content=file_content.lower()
file_content1=file_content.split()
f=[]
f=list(file_content1)
f2=pd.DataFrame(columns={"A"})
f2["A"]=f

f2['A'] = f2['A'].str.replace('\d+', '')
#Preprocessing of a text and filtering 
f2=(f2[f2['A'].apply(lambda x: len(x.split('-')) < 2)])
f2=(f2[f2['A'].apply(lambda x: len(x.split(',')) < 2)])
f2=(f2[f2['A'].apply(lambda x: len(x.split(':')) < 2)])
f2=(f2[f2['A'].apply(lambda x: len(x.split('"')) < 2)])
f2=(f2[f2['A'].apply(lambda x: len(x.split('?')) < 2)])
f2=(f2[f2['A'].apply(lambda x: len(x.split('.')) < 2)])
f2=(f2[f2['A'].apply(lambda x: len(x.split('*')) < 2)])
f2=(f2[f2['A'].apply(lambda x: len(x.split('.')) < 2)])
f2=(f2[f2['A'].apply(lambda x: len(x.split(';')) < 2)])
f2=(f2[f2['A'].apply(lambda x: len(x.split('%')) < 2)])
f2=(f2[f2['A'].apply(lambda x: len(x.split('‘')) < 2)])
f2=(f2[f2['A'].apply(lambda x: len(x.split('“')) < 2)])
f2=(f2[f2['A'].apply(lambda x: len(x.split('9')) < 2)])
f2=(f2[f2['A'].apply(lambda x: len(x.split('!')) < 2)])
f2=(f2[f2['A'].apply(lambda x: len(x.split('(')) < 2)])
f2=(f2[f2['A'].apply(lambda x: len(x.split(')')) < 2)])
f2=(f2[f2['A'].apply(lambda x: len(x.split('=')) < 2)])
lf2=list(f2["A"])

f3=str(f2)

parser = English()

document = nlp(f3)

lemmas = [token.lemma_ for token in document if not token.is_stop]

import spacy 
nlp = spacy.load("en", disable=['parser', 'tagger', 'ner'])
from nltk.corpus import stopwords
#updating stopwords
stops = stopwords.words("english")
nlp.Defaults.stop_words |= {"",",","'","@","$",":","-",".","AHMAGHHHHHHHHHHH","(",")"}

def normalize(comment, lowercase, remove_stopwords):
    if lowercase:
        comment = comment.lower()
    comment = nlp(comment)
    lemmatized = list()
    for word in comment:
        lemma = word.lemma_.strip()
        if lemma:
            if not remove_stopwords or (remove_stopwords and lemma not in stops):
                lemmatized.append(lemma)
    return " ".join(lemmatized)



f=pd.DataFrame()
f["A"]=f2["A"]
f['A']= f['A'].apply(normalize, lowercase=True, remove_stopwords=True)
f['A'].replace('', np.nan, inplace=True)
f.dropna(subset=['A'], inplace=True)


LLL=[]
LLL=list(f["A"])
writer = pd.ExcelWriter('TextA.xlsx', engine='xlsxwriter')
writer0 = pd.ExcelWriter('errorA.xlsx', engine='xlsxwriter')
writer2 = pd.ExcelWriter('BestA13.xlsx', engine='xlsxwriter')

NNNN=300
new_lst=list(range(0,NNNN))
import gensim 
from gensim.models import Word2Vec 
from gensim.models import LdaModel
import random
S1=[]
S2=[]
S3=[]
S4=[]
for iii in range (1,NNNN):

# Training the neural language network in a loop using Skip-gram method
    NN=random.randint(1,101)
    model3 = gensim.models.Word2Vec([LLL],min_count = 1, 
					 seed=NN,sg=0, size = 150, window = 5)
    model3.build_vocab([LLL], update=True)

# Finding the similarities between the-most-common-words and "Persian", "Arab", "Persins", and "Arabs".    
    S1.append(model3.similarity("persian","man")) 
    S2.append(model3.similarity("arab","man"))     
    S3.append(model3.similarity("persians","man")) 
    S4.append(model3.similarity("arabic","man"))  
   

    S1.append(model3.similarity("persian","king")) 
    S2.append(model3.similarity("arab","king"))
    S3.append(model3.similarity("persians","king")) 
    S4.append(model3.similarity("arabs","king"))
   
 

    S1.append(model3.similarity("persian","god")) 
    S2.append(model3.similarity("arab","god")) 
    

    S1.append(model3.similarity("persian","girl")) 
    S2.append(model3.similarity("arab","girl")) 
    


    S1.append(model3.similarity("persian","morning")) 
    S2.append(model3.similarity("arab","morning"))
    

    S1.append(model3.similarity("persian","night")) 
    S2.append(model3.similarity("arab","night"))
    


    S1.append(model3.similarity("persian","love")) 
    S2.append(model3.similarity("arab","love"))  
    

    S1.append(model3.similarity("persian","old")) 
    S2.append(model3.similarity("arab","old"))  
    


    S1.append(model3.similarity("persian","fortunate")) 
    S2.append(model3.similarity("arab","fortunate")) 
    

    S1.append(model3.similarity("persian","slave")) 
    S2.append(model3.similarity("arab","slave"))  
   
 

    S1.append(model3.similarity("persian","vizier")) 
    S2.append(model3.similarity("arab","vizier"))  
   

   
    S1.append(model3.similarity("persian","father")) 
    S2.append(model3.similarity("arab","father")) 
    


    S1.append(model3.similarity("persian","young")) 
    S2.append(model3.similarity("arab","young"))  
    


    S1.append(model3.similarity("persian","heart")) 
    S2.append(model3.similarity("arab","heart"))
    


    S1.append(model3.similarity("persian","kiss")) 
    S2.append(model3.similarity("arab","kiss"))  
   
    S3.append(model3.similarity("persians","god")) 
    S4.append(model3.similarity("arabs","god"))  


    S3.append(model3.similarity("persians","girl")) 
    S4.append(model3.similarity("arabs","girl"))  


    S3.append(model3.similarity("persians","morning")) 
    S4.append(model3.similarity("arabs","morning"))  


    S3.append(model3.similarity("persians","night")) 
    S4.append(model3.similarity("arabs","night"))  
 

    S3.append(model3.similarity("persians","love")) 
    S4.append(model3.similarity("arabs","love"))  


    S3.append(model3.similarity("persians","old")) 
    S4.append(model3.similarity("arabs","old"))  
 


    S3.append(model3.similarity("persians","fortunate")) 
    S4.append(model3.similarity("arabs","fortunate"))  


    S3.append(model3.similarity("persians","slave")) 
    S4.append(model3.similarity("arabs","slave"))  


    S3.append(model3.similarity("persians","vizier")) 
    S4.append(model3.similarity("arabs","vizier"))  
 
   
    S3.append(model3.similarity("persians","father")) 
    S4.append(model3.similarity("arabs","father"))  
 

    S3.append(model3.similarity("persians","young")) 
    S4.append(model3.similarity("arabs","young"))  


    S3.append(model3.similarity("persians","heart")) 
    S4.append(model3.similarity("arabs","heart"))  


    S3.append(model3.similarity("persians","kiss")) 
    S4.append(model3.similarity("arabs","kiss")) 
    
    S1.append(model3.similarity("persian","tell")) 
    S2.append(model3.similarity("arab","tell"))     
    S3.append(model3.similarity("persians","tell")) 
    S4.append(model3.similarity("arabic","tell"))  
   

    S1.append(model3.similarity("persian","shahrazad")) 
    S2.append(model3.similarity("arab","shahrazad"))
    S3.append(model3.similarity("persians","shahrazad")) 
    S4.append(model3.similarity("arabs","shahrazad"))


# Postprocessing and structuring the data for visualization and statistical analysis
NS1=np.array([S1,S3])
MNS1=np.array([])
MNS1=np.average(NS1, axis=0)
LL1n=MNS1.reshape(17,NNNN-1)
LL1=np.average(LL1n, axis=1)
SLL1=np.std(LL1n, axis=1)
DMNS1=pd.DataFrame(LL1)

NS2=np.array([S2,S4])
MNS2=np.array([])
MNS2=np.average(NS2, axis=0)
LL2n=MNS2.reshape(17,NNNN-1)
LL2=np.average(LL2n, axis=1)
SLL2=np.std(LL2n, axis=1)
DMNS2=pd.DataFrame(LL2)

MNS1=np.average(MNS1, axis=0)
SS=pd.DataFrame(S1)
SS1=pd.DataFrame(S2)
SS2=pd.DataFrame(S3)
II=1     
pd.DataFrame(LL1).to_excel(writer,index = False,  sheet_name= 'run%d' %(II)) 
pd.DataFrame(LL2).to_excel(writer,index = False,  sheet_name= 'run%d' %(II+1)) 
pd.DataFrame(SLL1).to_excel(writer,index = False,  sheet_name= 'run%d' %(II+2)) 
pd.DataFrame(SLL2).to_excel(writer,index = False,  sheet_name= 'run%d' %(II+3)) 




writer.save()
writer0.save()



import spacy
from collections import Counter
from collections import Counter
word_freq = Counter(LLL)
NN11=100
common_words = word_freq.most_common(NN11)

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
plt.clf()

labels = ['man', 'king', 'god', 'girl', 'morning']
persian_means = [LL1[0], LL1[1], LL1[2], LL1[3], LL1[4]]
Arab_means = [LL2[0], LL2[1], LL2[2], LL2[3], LL2[4]]
error1=[SLL1[0],SLL1[1],SLL1[2],SLL1[3],SLL1[4]]
        
error2=[SLL2[0],SLL2[1],SLL2[2],SLL2[3],SLL2[4]]
x = np.arange(len(labels))  
width = 0.35  
ind = np.arange(5)    
width = 0.35      


fig, ax = plt.subplots()
rects1 = ax.bar(x - width/2, persian_means, width, yerr=error1,label='Persianness')
rects2 = ax.bar(x + width/2, Arab_means, width,yerr=error2, label='Arabness')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Similarity')
#ax.set_title('Scores by group and gender')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()


#plt.legend((p1[0], p2[0]), ('Men', 'Women'))



fig.tight_layout()

plt.show()


plt.clf()

labels = ['night', 'love', 'old', 'fortunate', 'slave']
persian_means = [LL1[5], LL1[6], LL1[7], LL1[8], LL1[9]]
Arab_means = [LL2[5], LL2[6], LL2[7], LL2[8], LL2[9]]
error3=[SLL1[5],SLL1[6],SLL1[7],SLL1[8],SLL1[9]]
        
error4=[SLL2[5],SLL2[6],SLL2[7],SLL2[8],SLL2[9]]
x = np.arange(len(labels))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots()
rects1 = ax.bar(x - width/2, persian_means, width,yerr=error3, label='Persianness')
rects2 = ax.bar(x + width/2, Arab_means, width,yerr=error4, label='Arabness')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Similarity')
#ax.set_title('Scores by group and gender')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()







fig.tight_layout()

plt.show()

plt.clf()

labels = ['vizier', 'father', 'young', 'heart', 'kiss']
persian_means = [LL1[10], LL1[11], LL1[12], LL1[13], LL1[14]]
Arab_means = [LL2[10], LL2[11], LL2[12], LL2[13], LL2[14]]
error5=[SLL1[10], SLL1[11], SLL1[12], SLL1[13], SLL1[14]]
        
error6=[SLL2[10], SLL2[11], SLL2[12], SLL2[13], SLL2[14]]
x = np.arange(len(labels))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots()
rects1 = ax.bar(x - width/2, persian_means, width, yerr=error5,label='Persianness')
rects2 = ax.bar(x + width/2, Arab_means, width,yerr=error6, label='Arabness')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Similarity')
#ax.set_title('Scores by group and gender')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()







fig.tight_layout()

plt.show()
C=[]
for i in range(0,NN11-1):
    C.append(common_words[i][0])
C1=str(C)
from wordcloud import WordCloud
import matplotlib.pyplot as plt 
import pandas as pd 
  
wordcloud = WordCloud(width = 800, height = 800, 
                background_color ='white', 
                stopwords = stops, 
                min_font_size = 5).generate(C1) 
  
# plot the WordCloud image                        
plt.figure(figsize = (6, 6), facecolor = None) 
plt.imshow(wordcloud) 
plt.axis("off") 
plt.tight_layout(pad = 0) 
  
plt.show() 

from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
sp = spacy.load('en_core_web_sm')
nlp = spacy.load('en')

#Sentiment analysis of the text

import textwrap

FFF1=[]
content=str(LLL)
FFF1= textwrap.wrap(content,7700)

S1=[]
S2=[]
from textblob import TextBlob
for i in range(1,10):
    blob=TextBlob(FFF1[i])
    blob.tags 
    blob.noun_phrases
    S1.append(blob.sentiment.polarity)
    S2.append(blob.sentiment.subjectivity)




import matplotlib.pyplot as plt

for z in range(len(S1)):
   x_data=[z]
   y_data1=S1[z]
   plt.plot(x_data,y_data1, "r--o") 

plt.ylabel('Sentiment-polarity')
plt.xlabel('Docs')
plt.show() 
