In [59]:
import csv
import pandas as pd
import json

# Whole-token replacements: a token equal to a key is swapped for the value
# (see word_level). Presumably raw spelling -> CODA spelling; confirm against
# the 'CODA' column of the data.
word_gloss = {
    'ل': 'لي',
    'ول': 'ولا',
    'الى': 'لي',
    'م': 'ما',
    'دا': 'ده',
    'كدا': 'كده',
    'ي': 'يا',
}

# Single-character replacements applied inside every word (see char_level).
char_gloss = {
    'ئ': 'ي',
}

def connect(sent):
    '''Attach stand-alone clitic tokens to the front of the word that follows.

    The sentence is split on whitespace. Every token that is exactly one of
    the clitics below is glued (without a space) onto the next non-clitic
    token. A run of several clitics is attached as a whole, so
    'ف ب تعرف' becomes 'فبتعرف' instead of the half-merged 'فب تعرف'.
    Clitics at the very end of the sentence have no following word and are
    left as separate tokens.

    Parameters
    ----------
    sent : str
        Whitespace-separated sentence.

    Returns
    -------
    str
        The sentence with tokens joined by single spaces.
    '''
    clitics = ('ح', 'ب', 'ف')

    merged = []
    pending = []  # clitics seen since the last full word
    for token in sent.split():
        if token in clitics:
            pending.append(token)
        else:
            merged.append(''.join(pending) + token)
            pending = []

    # Nothing follows these clitics, so there is no word to attach them to.
    merged.extend(pending)

    return ' '.join(merged)


def word_level(sent):
    '''Replace every token that is a key of word_gloss by its mapped value.

    Tokens are obtained by whitespace splitting; tokens without an entry are
    kept as they are. The result is re-joined with single spaces.
    '''
    return ' '.join(word_gloss.get(token, token) for token in sent.split())

def char_level(word):
    '''Apply the character replacements of char_gloss and drop a final hamza.

    Every character that is a key of char_gloss is replaced by its value.
    If the resulting word ends with 'ء', that last character is removed.

    Parameters
    ----------
    word : str
        A single token; may be empty.

    Returns
    -------
    str
        The normalized token (an empty string is returned unchanged).
    '''
    word = word.translate(str.maketrans(char_gloss))
    # endswith() is safe on an empty string, where word[-1] raised IndexError.
    if word.endswith('ء'):
        return word[:-1]
    return word

def codafy(sent):
    '''Run the full normalization: word_level, then char_level, then connect.

    word_level is applied to the whole sentence, char_level to each resulting
    token, and connect to the re-joined sentence.
    '''
    glossed_tokens = word_level(sent).split()
    normalized = ' '.join(map(char_level, glossed_tokens))
    return connect(normalized)



            




In [60]:
# Load the sentence pairs: 'sent' is the raw sentence, 'CODA' the reference.
frame = pd.read_csv('1600data.csv')
frame = frame.drop('Unnamed: 0', axis=1)  # same column drop, without in-place mutation

COLUMNS = ['sent', 'CODA', 'prep']


def _accuracy(good, total):
    '''Percentage of `good` out of `total`; NaN when `total` is 0.'''
    return 100 * good / total if total else float('nan')


# Each sentence goes into exactly one bucket, chosen by comparing token counts:
#   'same'  - raw and CODA have the same number of tokens
#   'merge' - raw has more tokens (tokens must be merged to reach CODA)
#   'split' - CODA has more tokens (tokens must be split to reach CODA)
bucket_rows = {'same': [], 'merge': [], 'split': []}
bucket_total = dict.fromkeys(bucket_rows, 0)  # sentences where raw != CODA
bucket_good = dict.fromkeys(bucket_rows, 0)   # ...of which codafy(raw) == CODA

for raw, coda in zip(frame['sent'].values, frame['CODA'].values):
    prep = codafy(raw)  # computed once per sentence and reused below
    n_raw = len(raw.split())
    n_coda = len(coda.split())

    if n_raw == n_coda:
        bucket = 'same'
    elif n_raw > n_coda:
        bucket = 'merge'
    else:
        bucket = 'split'

    bucket_rows[bucket].append([raw, coda, prep])

    # Only sentences that actually differ from the reference are scored;
    # the same nesting is now used for all three buckets.
    if raw != coda:
        bucket_total[bucket] += 1
        if coda == prep:
            bucket_good[bucket] += 1

# Build each frame in one go instead of appending row by row.
df_merge = pd.DataFrame(bucket_rows['merge'], columns=COLUMNS)  # sentences that need merging
df_split = pd.DataFrame(bucket_rows['split'], columns=COLUMNS)  # sentences that need a split
df_same = pd.DataFrame(bucket_rows['same'], columns=COLUMNS)    # sentences that don't need merge or split

total_same = bucket_total['same']  # for statistics purposes
total_merge = bucket_total['merge']
total_split = bucket_total['split']

codafy_good_same = bucket_good['same']
codafy_good_merge = bucket_good['merge']
codafy_good_split = bucket_good['split']

print("Sentences which:\n")
print("stayed same length:")
print(f"Total changes: {total_same}, codafy did {codafy_good_same}, accuracy : {_accuracy(codafy_good_same, total_same):.1f}\n")
print("need merge:")
print(f"Total changes: {total_merge}, codafy did {codafy_good_merge}, accuracy : {_accuracy(codafy_good_merge, total_merge)}\n")
print("need split:")
print(f"Total changes: {total_split}, codafy did {codafy_good_split}, accuracy : {_accuracy(codafy_good_split, total_split)}\n")

print(f"total sentences that need change = {total_split+total_merge+total_same}")


Sentences which:

stayed same length:
Total changes: 310, codafy did 94, accuracy : 30.3

need merge:
Total changes: 125, codafy did 85, accuracy : 68.0

need split:
Total changes: 65, codafy did 0, accuracy : 0.0

total sentences that need change = 500


In [61]:
# For sentences with no split/merge, list the individual tokens that the
# reference changed and count how often each raw token was changed.
changed_tokens = []  # rows of [raw token, CODA token, codafy token]
dict_counter = {}    # dict to keep count of each token that changed

for raw, coda in zip(df_same['sent'].values, df_same['CODA'].values):
    if raw == coda:
        continue

    # Tokenize once per sentence instead of once per token.
    raw_tokens = raw.split()
    coda_tokens = coda.split()
    prep_tokens = codafy(raw).split()

    # codafy may merge tokens (via connect); positions then no longer line
    # up with raw, so such sentences are skipped.
    if len(raw_tokens) != len(prep_tokens):
        continue

    for r, raw_tok in enumerate(raw_tokens):
        if raw_tok != coda_tokens[r]:
            dict_counter[raw_tok] = dict_counter.get(raw_tok, 0) + 1
            changed_tokens.append([raw_tok, coda_tokens[r], prep_tokens[r]])

# Build the frame in one go instead of appending row by row.
same_change = pd.DataFrame(changed_tokens, columns=['sent', 'CODA', 'prep'])

print(len(same_change))

same_change.to_csv('same_change.csv')

# Most frequently changed tokens first; ties keep first-seen order.
dict_counter = dict(sorted(dict_counter.items(), key=lambda item: item[1], reverse=True))

for token, times in dict_counter.items():
    print(f'{token} : {times}')

df_same.to_csv('same.csv')
df_merge.to_csv('merge.csv')
df_split.to_csv('split.csv')


print(len(df_split))

print("--")

print(len(df_merge) + len(df_split) + len(df_same))

383
دا : 85
تلاتة : 25
اتنين : 22
ول : 14
تانية : 10
تاني : 9
التاني : 8
اكتر : 8
لا : 7
تمنية : 7
لسة : 7
لي : 6
اسى : 5
وتلاتين : 5
خمستاشر : 4
حداشر : 4
كدا : 4
الى : 3
تلاتين : 3
كتير : 3
كتيرة : 3
برضه : 3
ماشى : 3
دائما : 3
مئة : 3
التالتة : 2
على : 2
وتمنين : 2
الي : 2
تمانية : 2
م : 2
تانيه : 2
شويه : 2
القي : 2
هندا : 2
اعزرني : 2
رأس : 2
في : 2
اتناشر : 2
كلو : 2
شكلو : 2
تلتمية : 2
المئة : 2
وتمانين : 2
نضيفة : 2
الغى : 2
ليلى : 2
سجائر : 2
ل : 1
تلاته : 1
انه : 1
اشيل : 1
دايرو : 1
تضربو : 1
الشنطه : 1
بفترض : 1
جاء : 1
تتلقى : 1
اتغدا : 1
التلاتاء : 1
انزل : 1
هادئ : 1
الحقيقه : 1
نكته : 1
صغيره : 1
بيره : 1
جبنه : 1
مضفره : 1
نضيفه : 1
بكره : 1
غلطه : 1
مجهزه : 1
سته : 1
جمب : 1
اجرى : 1
شرطه : 1
لى : 1
تقويه : 1
اشتري : 1
اديني : 1
برضة : 1
حادث : 1
اطول : 1
مع : 1
احسن : 1
اشرب : 1
بره : 1
اراجع : 1
بالكتير : 1
تحول : 1
ستاشر : 1
جرى : 1
الفكه : 1
اوبرت : 1
الشئ : 1
لوا : 1
تايلندية : 1
مشروباتو : 1
الجائزة : 1
بي : 1
ريحتو : 1
المعيشه : 1
فيهو : 1
اي : 1
موسيقي : 1
قدا

In [62]:
# Token-level agreement between raw and CODA over the same-length sentences:
# the share of CODA tokens that are already identical in the raw sentence.
count = 0  # CODA tokens equal to the raw token at the same position
total = 0  # all CODA tokens

for raw, coda in zip(df_same.sent, df_same.CODA):
    # Tokenize once per sentence instead of once per token.
    raw_tokens = raw.split()
    coda_tokens = coda.split()

    total += len(coda_tokens)
    # Positional lookup (not zip) so a raw sentence shorter than its CODA
    # still fails loudly instead of being silently truncated.
    count += sum(1 for x, coda_tok in enumerate(coda_tokens) if coda_tok == raw_tokens[x])

print(count)
print(total)

# NaN instead of ZeroDivisionError when df_same is empty.
print(count * 100 / total if total else float('nan'))

codafy('هل ب تعرف كاثي')
            
        

7521
7913
95.04612662706938


'هل بتعرف كاثي'