# Preprocess Data

In [124]:
from lang_trans.arabic import buckwalter
import pandas as pd
import numpy as np

In [125]:
data_input = pd.ExcelFile('data testing arab.xlsx').parse(0)
X_arab = data_input.Arab.to_numpy()
gold_wazan = data_input.wazan.to_numpy()

In [126]:
X = []
for x in X_arab:
    X.append(buckwalter.transliterate(x))

# Pattern Extraction

In [127]:
data = pd.ExcelFile('List Rule_2.xlsx')
excel_files = []
for i in range(1,len(data.sheet_names)):
    file = data.parse(i)
    excel_files.append(file)

In [128]:
def make_dict(data):
    temp_dict = {}
    for i in range(len(data)):
        temp_dict[data[i]] = i+2
    temp_dict[''] = 1
    return temp_dict

In [129]:
def make_dictPrefix(data):
    temp_dict = {}
    for i in range(len(data)):
        temp_dict[data[i].split('|')[0]] = i+2
    temp_dict[''] = 1
    return temp_dict

In [130]:
vectorizer = []

In [131]:
prefixes = excel_files[len(excel_files)-1].Prefix.to_numpy()
prefixes = sorted(prefixes, key=len,reverse=True)
dict_prefixes = make_dictPrefix(prefixes)
vectorizer.extend(dict_prefixes.keys())
suffixes = excel_files[len(excel_files)-1].Suffix.to_numpy()[:35]
suffixes = sorted(suffixes, key=len,reverse=True)
vectorizer.extend(suffixes)
dict_suffixes = make_dict(suffixes)
dict_infixes = {}
infixes_1 = excel_files[len(excel_files)-1].Infix_1.to_numpy()[:6]
infixes_1 = sorted(infixes_1, key=len,reverse=True)
vectorizer.extend(infixes_1)
dict_infixes[0] = make_dict(infixes_1)
infixes_2 = excel_files[len(excel_files)-1].Infix_2.to_numpy()[:16]
infixes_2 = sorted(infixes_2, key=len,reverse=True)
vectorizer.extend(infixes_2)
dict_infixes[1] = make_dict(infixes_2)
infixes_3 = excel_files[len(excel_files)-1].Infix_3.to_numpy()[:11]
infixes_3 = sorted(infixes_3, key=len,reverse=True)
vectorizer.extend(infixes_3)
dict_infixes[2] = make_dict(infixes_3)
infixes_4 = excel_files[len(excel_files)-1].Infix_4.to_numpy()[:11]
infixes_4 = sorted(infixes_4, key=len,reverse=True)
vectorizer.extend(infixes_4)
dict_infixes[3] = make_dict(infixes_4)
infixes_5 = excel_files[len(excel_files)-1].Infix_5.to_numpy()[:2]
infixes_5 = sorted(infixes_5, key=len,reverse=True)
vectorizer.extend(infixes_5)
dict_infixes[4] = make_dict(infixes_5)
konsonan = excel_files[len(excel_files)-1].konsonan.to_numpy()[:25]
vectorizer.extend(konsonan)
dict_konsonan = make_dict(konsonan)
vectorizer = make_dict(vectorizer)

In [132]:
del excel_files[-1]

In [133]:
def prefixExtractor(prefixes,word):
    prefix = ''
    lemma = word
    for d in prefixes:
        preSplit = d.split('|')
        pre = preSplit[0]
        minLength = int(preSplit[1])
        lemmaTemp = word[len(pre):]
        if word.startswith(pre) and len(lemmaTemp) >= minLength:
            prefix = pre
            lemma = lemmaTemp
            break
    return prefix, lemma

In [134]:
def suffixExtractor(suffixes,word):
    suffix = ''
    lemma = word
    found = False
    for suff in suffixes:
        if word.endswith(suff):
            lemma = word[:len(word)-len(suff)]
            if (len(lemma) >= 6):
                suffix = suff
                found = True
            else:
                 lemma = word
        if found:
            break
    return lemma, suffix 

In [135]:
def infixExtractor(konsonan,word):
    root = []
    infix = word
    for c in word:
        if c in konsonan:
            root.append(c)
            infix = infix.replace(c,'_')
    return infix.split('_')[1:], root

In [136]:
def vectorizeInfix(dict_infixes,infix,root,dict_konsonan):
    length = len(infix*2)
    v = np.ones(10)
    j = 0
    for i in range(0,length,2):
        v[i] = dict_konsonan[root[j]]
        v[i+1] = dict_infixes[infix[j]]
        j+=1
    return v

In [137]:
def pattern_extraction(df):
    feature_input = []
    idx = 0
    i = 0
    X_data = []
    for w in df:
        lemma, suffix = suffixExtractor(suffixes,w)
        prefix , lemma2 = prefixExtractor(prefixes,lemma)
        infix, root = infixExtractor(konsonan,lemma2)
        if (len(root) < 3 and suffix != ''):
            root.append(suffix[0]) 
            suffix = ''
        pattern = {'prefix':prefix,'infix':infix,'suffix':suffix,'root':root}
        string_pattern = prefix+'|'+''.join(infix)+'|'+suffix
        X_data.append({'word':w,'pattern':pattern,'string_pattern':string_pattern})
    return X_data

In [138]:
X_data = pattern_extraction(X)

In [139]:
testing = pd.ExcelFile('data testing vector.xlsx').parse(0)
testing_X = testing.X.to_numpy()
for i in range(0,len(testing_X)):
    X_data[i]['vector'] = np.fromstring(testing_X[i],sep=' ')

In [140]:
X_data

[{'word': 'narokaEu',
  'pattern': {'prefix': 'na',
   'infix': ['o', 'a', 'u'],
   'suffix': '',
   'root': ['r', 'k', 'E']},
  'string_pattern': 'na|oau|',
  'vector': array([31., 10.,  7., 22., 14., 18., 12.,  1.,  1.,  1.,  1.,  1.])},
 {'word': '>iSodaEo',
  'pattern': {'prefix': '>i',
   'infix': ['o', 'a', 'o'],
   'suffix': '',
   'root': ['S', 'd', 'E']},
  'string_pattern': '>i|oao|',
  'vector': array([26., 14.,  7.,  8., 14., 18., 11.,  1.,  1.,  1.,  1.,  1.])},
 {'word': 'manaEotumaA',
  'pattern': {'prefix': '',
   'infix': ['a', 'a', 'o'],
   'suffix': 'tumaA',
   'root': ['m', 'n', 'E']},
  'string_pattern': '|aao|tumaA',
  'vector': array([ 1., 24.,  5., 25., 14., 18., 11.,  1.,  1.,  1.,  1.,  4.])},
 {'word': 'mat~iEiyo',
  'pattern': {'prefix': '',
   'infix': ['a', '~i', 'i'],
   'suffix': 'yo',
   'root': ['m', 't', 'E']},
  'string_pattern': '|a~ii|yo',
  'vector': array([ 1., 24.,  5.,  3., 12., 18., 10.,  1.,  1.,  1.,  1., 23.])},
 {'word': 'ma$okuworuwona',


# Verb Form Identification

In [141]:
from keras.models import load_model

In [142]:
# load model
model = load_model('model.h5')
# summarize model.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 12, 16)            2880      
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (None, 128)               18560     
_________________________________________________________________
dense_2 (Dense)              (None, 15)                1935      
Total params: 23,375
Trainable params: 23,375
Non-trainable params: 0
_________________________________________________________________


In [143]:
wazan = ['Iii',
 '(VII) tsulatsy',
 '(IV) tsulatsy',
 '(III) tsulatsy',
 '(X) tsulatsy',
 'Iuu',
 '(VI) tsulatsy',
 '(V) tsulatsy',
 '(IX) tsulatsy',
 'Iaa',
 'Iai',
 'Iia',
 "(I) ruba'iy",
 '(II) tsulatsy',
 'Iau']

In [144]:
for i in range(0,len(X_data)):
    X_data[i]['wazan'] = gold_wazan[i]
    yhat = model.predict(np.expand_dims(X_data[i]['vector'],0))
    yhat = np.argsort(yhat[0])[-1]
    X_data[i]['wazan_predict'] = wazan[yhat]

# Pronoun and Type of Word Identification

In [145]:
wazan_index = [5,11,8,7,14,4,10,9,13,2,1,3,18,6,0]

In [146]:
import string
def trim_space(strng):
    return strng.translate({ord(c): None for c in string.whitespace})

In [147]:
def extract_pronoun_tow(excel_file):
    excel_file_numpy = excel_file.to_numpy()
    type_of_words = list(excel_file.columns.values)
    dict_pattern = {}
    for i in range(0,excel_file_numpy.shape[0]):
        pron = excel_file_numpy[i][excel_file_numpy.shape[1]-1]
        for j in range(0,excel_file_numpy.shape[1]-1):
            tow = type_of_words[j]
            pattern =  excel_file_numpy[i][j]
            if (type(pattern) == str):
                pattern = trim_space(pattern)
                if ('/'  in pattern):
                    for p in pattern.split('/'):
                        dict_pattern[p] = [pron,tow]
                else:
                    dict_pattern[pattern] = [pron,tow]
    return dict_pattern

In [148]:
dict_wazan_pattern = {}
for i in range(0,len(wazan)):
    dict_wazan_pattern[wazan[i]] = extract_pronoun_tow(excel_files[wazan_index[i]])

In [149]:
dict_wazan_pattern

{'Iii': {'ma|ouwoN|': ['هُوَ', 'اِسِمْ مَفْعُوْلْ'],
  '|aAiN|': ['هُوَ', 'اِسِمْ فَاعِلْ'],
  'ya|oiu|': ['هُوَ', 'فِعِلْ مُضَارِعْ'],
  '|aia|': ['هُوَ', 'فِعِلْ مَاضِ'],
  'ma|ouwoa|Ani': ['هُمَا', 'اِسِمْ مَفْعُوْلْ'],
  'ma|ouwoa|yoni': ['هُمَا', 'اِسِمْ مَفْعُوْلْ'],
  '|aAia|Ani': ['هُمَا', 'اِسِمْ فَاعِلْ'],
  '|aAia|yoni': ['هُمَا', 'اِسِمْ فَاعِلْ'],
  'ya|oia|Ani': ['هُمَا', 'فِعِلْ مُضَارِعْ'],
  '|aia|A': ['هُمَا', 'فِعِلْ مَاضِ'],
  'ma|ouwou|wona': ['هُمْ', 'اِسِمْ مَفْعُوْلْ'],
  'ma|ouwoi|yona': ['هُمْ', 'اِسِمْ مَفْعُوْلْ'],
  '|aAiu|wona': ['هُمْ', 'اِسِمْ فَاعِلْ'],
  '|aAii|yona': ['هُمْ', 'اِسِمْ فَاعِلْ'],
  'ya|oiu|wona': ['هُمْ', 'فِعِلْ مُضَارِعْ'],
  '|aiu|woA': ['هُمْ', 'فِعِلْ مَاضِ'],
  'ma|ouwoa|pN': ['هِيَ', 'اِسِمْ مَفْعُوْلْ'],
  '|aAia|pN': ['هِيَ', 'اِسِمْ فَاعِلْ'],
  'ta|oiu|': ['أَنْتَ', 'فِعِلْ مُضَارِعْ'],
  '|aia|to': ['هِيَ', 'فِعِلْ مَاضِ'],
  'ma|ouwoa|taAni': ['هُمَا', 'اِسِمْ مَفْعُوْلْ'],
  'ma|ouwoa|tayoni': ['هُمَا', 'اِسِمْ مَفْعُوْلْ'

In [150]:
pattern_berhasil = []
pattern_gagal = []
list_tow = []
list_pron = []
for i in range(0,len(X_data)):
    pattern = X_data[i]['string_pattern']
    wazan = X_data[i]['wazan']
    wazan_predict =  X_data[i]['wazan_predict']
    word = X_data[i]['word']
    if 'w' in X_data[i]['pattern']['suffix'] and 'o' not in X_data[i]['pattern']['suffix']:
        if len(X_data[i]['pattern']['suffix']) == 1:
            suffix = 'woA'
        else:
            pos = X_data[i]['pattern']['suffix'].find('w')+1
            suffix = X_data[i]['pattern']['suffix'][:pos] + 'o' + X_data[i]['pattern']['suffix'][pos:]
        if(suffix != 'wona' and X_data[i]['pattern']['prefix'] != 'laAta' and X_data[i]['pattern']['prefix'] != '>u' and X_data[i]['pattern']['prefix'] != '' and X_data[i]['pattern']['prefix'] != '>i'):
            suffix = 'wona'
        pattern =  X_data[i]['pattern']['prefix']+'|'+''.join( X_data[i]['pattern']['infix'])+'|'+suffix
    if (pattern in dict_wazan_pattern[wazan_predict]):
        result = dict_wazan_pattern[wazan_predict][pattern]
        X_data[i]['tow'] = result[1]
        X_data[i]['pron'] = result[0]
        list_tow.append(result[1])
        list_pron.append(result[0])
        pattern_berhasil.append([word,pattern,wazan,wazan_predict])
    else:
        pattern_gagal.append([word,pattern,wazan,wazan_predict])

In [151]:
len(pattern_berhasil)

6133

In [152]:
len(pattern_gagal)

55

In [153]:
pattern_gagal

[['>a$oxaSu', '>a|oau|', 'Iaa', '(IV) tsulatsy'],
 ['>usojunaA', '|ou|', 'Iau', 'Iau'],
 ['>aTomivu', '>a|oiu|', 'Iai', 'Iia'],
 ['>uqonutaA', '|ou|', 'Iau', 'Iau'],
 ['laAtanoTaliquwoA', 'laAta|oaiu|woA', '(VII) tsulatsy', '(VII) tsulatsy'],
 ['laAtanoqalibaA', 'laAta|oaia|A', '(VII) tsulatsy', '(VII) tsulatsy'],
 ['>arojiEu', '>a|oiu|', 'Iai', '(IV) tsulatsy'],
 ['Aino$aqaqoti', 'Aino|aao|ti', '(IX) tsulatsy', '(IX) tsulatsy'],
 ['laAtu*oEinaA', '|aAuoi|naA', '(IV) tsulatsy', '(IV) tsulatsy'],
 ['>aboxalu', '>a|oau|', 'Iia', '(IV) tsulatsy'],
 ['laAtanotaZiruwoA', 'laAta|oaiu|woA', '(VII) tsulatsy', '(VII) tsulatsy'],
 ['>aEolinaA', '|oi|', '(IV) tsulatsy', '(IV) tsulatsy'],
 ['laAtarokinaA', '|aAaoi|naA', 'Iia', 'Iia'],
 ['taqonutu', '|aou|tu', 'Iau', 'Iau'],
 ['laAtudohinaA', '|aAuoi|naA', '(IV) tsulatsy', '(IV) tsulatsy'],
 ['laAtakotumaA', '|aAao|tumaA', 'Iau', 'Iau'],
 ['>asokana', '|oa|', '(IV) tsulatsy', '(IV) tsulatsy'],
 ['laAtanota$irona', 'laAta|oaio|na', '(VII) tsulatsy',

In [154]:
len(pattern_berhasil)/len(X_data)

0.9911118293471235

# MSD Identification

In [155]:
list_tow = list(set(list_tow))
list_pron = list(set(list_pron))

In [156]:
list_tow

['فِعِلْ مَاضِ',
 'اِسِمْ مَفْعُوْلْ',
 'فِعِلْ النَهْيِ',
 'اِسِمْ فَاعِلْ',
 'فِعِلْ الأَمرْ',
 'مَصْدَرْ',
 'فِعِلْ مُضَارِعْ']

In [157]:
list_pron

['هُمْ',
 'أَنْتُنَّ',
 'نَحْنُ',
 'هُمَا',
 'أَنْتِ',
 'هِيَ',
 'هُوَ',
 'هُنَّ',
 'أَنْتَ',
 'أَنَا',
 'أَنْتُمْ',
 'أَنْتُمَا']

In [158]:
dict_msd = {
    'اِسِمْ فَاعِلْ':{'POS':'N'},
    'اِسِمْ مَفْعُوْلْ':{'POS':'N'},
    'فِعِلْ مَاضِ':{'POS':'V','aspect':'PFV','tense':'PST'},
    'فِعِلْ مُضَارِعْ':{'POS':'V','aspect':'IPFV','tense':'PRS/FUT'},
    'فِعِلْ الأَمرْ':{'POS':'V','mood':'IMP'},
    'فِعِلْ النَهْيِ':{'POS':'V','mood':'IMP'},
    'مَصْدَرْ':{'POS':'N'},
    'أَنْتُنَّ':{'PER':'2','NUM':'PL','GEN':'FEM'},
    'هِيَ':{'PER':'3','NUM':'SG','GEN':'FEM'},
    'أَنَا':{'PER':'1','NUM':'SG'},
    'نَحْنُ':{'PER':'1','NUM':'PL'},
    'هُمْ':{'PER':'3','NUM':'PL','GEN':'MASC'},
    'هُنَّ':{'PER':'3','NUM':'PL','GEN':'FEM'},
    'أَنْتَ':{'PER':'2','NUM':'SG','GEN':'MASC'},
    'هُوَ':{'PER':'3','NUM':'SG','GEN':'MASC'},
    'أَنْتُمْ':{'PER':'2','NUM':'PL','GEN':'MASC'},
    'هُمَا':{'PER':'3','NUM':'DU'},
    'أَنْتُمَا':{'PER':'2','NUM':'DU'},
    'أَنْتِ':{'PER':'2','NUM':'SG','GEN':'FEM'}
}

In [159]:
dataGagal = []

for i in range(0,len(X_data)):
    verbForm = {'Verb Form':X_data[i]['wazan_predict']}
    if ('tow' in X_data[i] ):
        tow = X_data[i]['tow']
        pron = X_data[i]['pron']
        dict_tow = dict_msd[tow].copy() 
        dict_pron = dict_msd[pron].copy() 
        dict_tow.update(dict_pron)
        dict_tow.update(verbForm)
        if (dict_tow['POS'] == 'N'):
            del dict_tow['PER']
        X_data[i]['msd'] = dict_tow
    else:
        X_data[i]['msd_predict'] = verbForm
        dataGagal.append(X_data[i])
    if (X_data[i]['wazan_predict'] != X_data[i]['wazan']):
        dataGagal.append(X_data[i])

In [161]:
import json

dataGabung = []
for x in X_data:
    if 'tow' in x:
        dataGabung.append([buckwalter.untransliterate(x['word']),json.dumps(x['msd']),x['tow'],x['pron'],x['wazan'],x['wazan_predict']])
    else:
        dataGabung.append([buckwalter.untransliterate(x['word']),json.dumps(x['msd']),'','',x['wazan'],x['wazan_predict']])

        

{'word': 'narokaEu', 'pattern': {'prefix': 'na', 'infix': ['o', 'a', 'u'], 'suffix': '', 'root': ['r', 'k', 'E']}, 'string_pattern': 'na|oau|', 'vector': array([31., 10.,  7., 22., 14., 18., 12.,  1.,  1.,  1.,  1.,  1.]), 'wazan': 'Iaa', 'wazan_predict': 'Iaa', 'tow': 'فِعِلْ مُضَارِعْ', 'pron': 'نَحْنُ', 'msd': {'POS': 'V', 'aspect': 'IPFV', 'tense': 'PRS/FUT', 'PER': '1', 'NUM': 'PL', 'Verb Form': 'Iaa'}}
{'word': '>iSodaEo', 'pattern': {'prefix': '>i', 'infix': ['o', 'a', 'o'], 'suffix': '', 'root': ['S', 'd', 'E']}, 'string_pattern': '>i|oao|', 'vector': array([26., 14.,  7.,  8., 14., 18., 11.,  1.,  1.,  1.,  1.,  1.]), 'wazan': 'Iaa', 'wazan_predict': 'Iaa', 'tow': 'فِعِلْ الأَمرْ', 'pron': 'أَنْتَ', 'msd': {'POS': 'V', 'mood': 'IMP', 'PER': '2', 'NUM': 'SG', 'GEN': 'MASC', 'Verb Form': 'Iaa'}}
{'word': 'manaEotumaA', 'pattern': {'prefix': '', 'infix': ['a', 'a', 'o'], 'suffix': 'tumaA', 'root': ['m', 'n', 'E']}, 'string_pattern': '|aao|tumaA', 'vector': array([ 1., 24.,  5., 2

KeyError: 'msd'

In [None]:
import json

dataGabungGagal = []
for x in dataGagal:
    if 'tow' in x:
        dataGabungGagal.append([buckwalter.untransliterate(x['word']),json.dumps(x['msd']),x['tow'],x['pron'],x['wazan'],x['wazan_predict']])
    else:
        dataGabungGagal.append([buckwalter.untransliterate(x['word']),json.dumps(x['msd']),'','',x['wazan'],x['wazan_predict']])

        

In [None]:
df = pd.DataFrame(data=dataGabung,  columns=["Word","MSD","Type of Word","Pronoun","wazan","wazan predict"])
df.to_excel("Hasil.xlsx")

In [None]:
df = pd.DataFrame(data=dataGabungGagal,  columns=["Word","MSD","Type of Word","Pronoun","wazan","wazan predict"])
df.to_excel("Hasil Gagal.xlsx")