In [1]:
from docx import Document
import pandas as pd
import re
import nltk
from difflib import ndiff
import pickle

from my_functions import read_docx_tables as read_df, ha, listReader, delta

In [2]:
ha([['tr', 'trtr', 'trtrtr']])

['tr', 'trtr', 'trtrtr']

### Reading

In [3]:
df = read_df('All_irregular_verb_list.docx')[0]

In [4]:
df[:3]

Unnamed: 0,A,A.1,A.2,A.3
0,abide,abode; abided,abode; abided,пребывать; держаться
1,arise,arose,arisen,подняться; возникнуть
2,awake,awoke,awaked; awoke,будить; проснуться


In [5]:
df = df.rename(columns={'A': 'First', 'A.1': 'Second', 'A.2': 'Third', 'A.3': 'Translate'})

In [6]:
for i in range(len(df.columns)):
    df[df.columns[i]] = df[df.columns[i]].apply(lambda r: re.sub(r';.+', '', r))
    
df = df[df.First.apply(lambda x: len(x) > 1)]
df = df.reset_index(drop=True)

In [7]:
df[:3]

Unnamed: 0,First,Second,Third,Translate
0,abide,abode,abode,пребывать
1,arise,arose,arisen,подняться
2,awake,awoke,awaked,будить


In [8]:
df.to_csv('All_irregular_verb_list.csv', sep=';', encoding='utf-8')

In [8]:
#ord('t')

### Графема => Фонема

In [9]:
#nltk.download('cmudict')

In [10]:
arpabet = nltk.corpus.cmudict.dict() 
# for word in ('barbels', 'barbeque', 'barbequed', 'barbequeing', 'barbeques'): 
#     print(arpabet[word]) 

In [11]:
df = df[df.First.apply(lambda x: True if x in arpabet.keys() else False)]
df = df[df.Second.apply(lambda x: True if x in arpabet.keys() else False)]
df = df[df.Third.apply(lambda x: True if x in arpabet.keys() else False)]
df = df.reset_index(drop=True)

df = df.assign(TR_1 = pd.Series([arpabet[x][0] for x in df[df.columns[0]]]))
df = df.assign(TR_2 = pd.Series([arpabet[x][0] for x in df[df.columns[1]]]))
df = df.assign(TR_3 = pd.Series([arpabet[x][0] for x in df[df.columns[2]]]))

df = df[['First', 'Second', 'Third', 'TR_1', 'TR_2', 'TR_3', 'Translate']]

In [12]:
df[:3]

Unnamed: 0,First,Second,Third,TR_1,TR_2,TR_3,Translate
0,abide,abode,abode,"[AH0, B, AY1, D]","[AH0, B, OW1, D]","[AH0, B, OW1, D]",пребывать
1,arise,arose,arisen,"[ER0, AY1, Z]","[ER0, OW1, Z]","[ER0, IH1, Z, AH0, N]",подняться
2,be,was,been,"[B, IY1]","[W, AA1, Z]","[B, IH1, N]",быть


In [13]:
vocab = list(arpabet.values())

vocab = [el[0] for el in vocab]
vocab = list(set(ha(vocab)))

In [14]:
with open('vokab.pickle', 'wb') as f:
    pickle.dump(vocab, f)

In [15]:
vocab[0], len(vocab)

('IH1', 69)

In [16]:
dl = ndiff(['12', '23', '33'], ['12', '24', '56'])

In [17]:
for el in dl:
    print(el)

  12
- 23
- 33
+ 24
+ 56


In [16]:
delta(0, df)

(['AY1', 'D'], ['OW1', 'D'], ['OW1', 'D'])

In [17]:
delta(1, df)

(['AY1', 'Z'], ['OW1', 'Z'], ['IH1', 'Z', 'AH0', 'N'])

In [18]:
df = df.assign(DELTA = pd.Series([delta(i, df) for i in range(df.index.size)]))

In [19]:
df = df[['First', 'Second', 'Third', 'TR_1', 'TR_2', 'TR_3', 'DELTA', 'Translate']]

In [20]:
df[:3]

Unnamed: 0,First,Second,Third,TR_1,TR_2,TR_3,DELTA,Translate
0,abide,abode,abode,"[AH0, B, AY1, D]","[AH0, B, OW1, D]","[AH0, B, OW1, D]","([AY1, D], [OW1, D], [OW1, D])",пребывать
1,arise,arose,arisen,"[ER0, AY1, Z]","[ER0, OW1, Z]","[ER0, IH1, Z, AH0, N]","([AY1, Z], [OW1, Z], [IH1, Z, AH0, N])",подняться
2,be,was,been,"[B, IY1]","[W, AA1, Z]","[B, IH1, N]","([B, IY1], [W, AA1, Z], [B, IH1, N])",быть


In [39]:
df.to_csv('test_1.csv', encoding='utf-8', sep=';')

In [40]:
new_df = pd.read_csv('test_1.csv', encoding='utf-8', sep=';')

In [41]:
new_df = new_df[['First', 'Second', 'Third', 'TR_1', 'TR_2', 'TR_3', 'DELTA', 'Translate']]

In [42]:
new_df[:3]

Unnamed: 0,First,Second,Third,TR_1,TR_2,TR_3,DELTA,Translate
0,abide,abode,abode,"['AH0', 'B', 'AY1', 'D']","['AH0', 'B', 'OW1', 'D']","['AH0', 'B', 'OW1', 'D']","(['AY1', 'D'], ['OW1', 'D'], ['OW1', 'D'])",пребывать
1,arise,arose,arisen,"['ER0', 'AY1', 'Z']","['ER0', 'OW1', 'Z']","['ER0', 'IH1', 'Z', 'AH0', 'N']","(['AY1', 'Z'], ['OW1', 'Z'], ['IH1', 'Z', 'AH0...",подняться
2,be,was,been,"['B', 'IY1']","['W', 'AA1', 'Z']","['B', 'IH1', 'N']","(['B', 'IY1'], ['W', 'AA1', 'Z'], ['B', 'IH1',...",быть


In [43]:
t = new_df[new_df.columns[6]][0]

In [44]:
t

"(['AY1', 'D'], ['OW1', 'D'], ['OW1', 'D'])"

In [45]:
t = listReader(t)

In [46]:
t

[['AY1', 'D'], ['OW1', 'D'], ['OW1', 'D)']]