In [72]:
import os
deck = 'minimal'
infile = 'minimal/minimal.txt'
outdir = 'media_dir'
textdir = 'text_dir'
os.makedirs(outdir, exist_ok=True)
os.makedirs(textdir, exist_ok=True)
metafile = os.path.join(textdir, deck + os.path.basename(infile) + '_metadata.txt')
separator = '\n\n################################\n\n'
print('media data in: ', outdir, 'text data in: ', textdir)
print('')

minimal_pairs = []
minimal_pairs_special = []
with open(infile) as f:
    for line in f:
        line = line.strip()
        tokens = line.split(' ≠ ')
        if len(tokens) != 2:
            print('ooo', line)
            continue
        if '(' in line and ')' in line:
            minimal_pairs_special.append(tokens)
        else:
            minimal_pairs.append(tokens)     

media data in:  media_dir text data in:  text_dir



In [77]:
from google.cloud import texttospeech
import requests
import json
import glob
import cv2
import shutil
import os
import time 

base_url = 'https://od-api.oxforddictionaries.com/api/v1'
app_id = '88b408cd'
app_key = '8c3ae542c22acf7770e5f7cf67b84bf9'
language = 'en'
region='us'
client = texttospeech.TextToSpeechClient()
voice = texttospeech.types.VoiceSelectionParams(
    language_code='en-US', 
    ssml_gender=texttospeech.enums.SsmlVoiceGender.NEUTRAL, name='en-US-Wavenet-D')
audio_config = texttospeech.types.AudioConfig(
    audio_encoding=texttospeech.enums.AudioEncoding.MP3)

def get_pronounciation(word, word_id):
    synthesis_input = texttospeech.types.SynthesisInput(text=word)
    response = client.synthesize_speech(synthesis_input, voice, audio_config)
    outfile = os.path.join(outdir, word_id + '.mp3')
    with open(outfile, 'wb') as out:
        out.write(response.audio_content)
    return os.path.basename(outfile)

def get_headword(word):
    url = 'https://od-api.oxforddictionaries.com:443/api/v1/search/' + language + '?q=' + word.lower() + '&prefix=false' + '&regions=' + region
    r = requests.get(url, headers = {'app_id': app_id, 'app_key': app_key})
    if r.status_code != 200:
        print('[Get headword] Invalid status code: ', word, r.status_code)
        if r.status_code == 403:
            print(json.dumps(r.json(), indent=4))
            raise Exception('Too many requests.')
        return None
    for entry in r.json()['results']:
        if entry['region'] == 'us':
            return entry
    return None

def get_word_info(word_id):
    url = 'https://od-api.oxforddictionaries.com:443/api/v1/entries/' 
    url += language + '/' + word_id.lower() + '/regions=' + region 
    r = requests.get(url, headers = {'app_id': app_id, 'app_key': app_key})
    if r.status_code != 200:
        print('[Get word info], Invalid status code: ', word_id, r.status_code)
        if r.status_code == 403:
            print(json.dumps(r.json(), indent=4))
            raise Exception('Too many requests.')
        return None
    return r.json()

def get_card_meaning(word, card_id):
    res = {'card_id' : card_id, 'match_word' : word, 'word' : word, 'senses' : [], 'examples' : [],
           'pronounciation' : None, 'image' : None, 'headword' : None, 'ipa' : ''}
    entry = get_headword(word)
    if entry is None:
        return res
    headword = entry['id']
    res['headword'] = headword
    res['word'] = entry['word']
    info = get_word_info(headword)
    if info is None:
        return res
    entry = info['results'][0]['lexicalEntries'][0]
    senses = []
    if 'senses' in entry['entries'][0]: 
        senses = entry['entries'][0]['senses']
    if 'pronunciations' in entry.keys():
        pronounciations = entry['pronunciations']
        for p in pronounciations:
            if 'audioFile' in p.keys() and 'dialects' in p.keys() and 'American English' in p['dialects']:
                res['pronounciation'] = p['audioFile']
            if 'phoneticNotation' in p.keys() and 'IPA' in p['phoneticNotation']:
                if 'phoneticSpelling' in p.keys():
                    res['ipa'] = p['phoneticSpelling']
    def get_definitions(sense):
        if 'short_definitions' in sense:
            return sense['short_definitions'][0]
        if 'definitions' in sense:
            return sense['definitions'][0]
        return '--'
    res['senses'] = [get_definitions(sense) for sense in senses]
    return res

def add_card_pronounciation(card):
    if card['pronounciation'] is not None:
        return card
    card['pronounciation'] = get_pronounciation(card['match_word'], card['card_id'] + '_wpron')
    return card

def get_all_word_data(word):
    card = get_card_meaning(word, '_'.join(word.split(' ')))
    card = add_card_pronounciation(card)
    return card

outmetafile = open(metafile, 'w')

for pair in minimal_pairs:
    print(pair)
    
    while True:
        try:
            w1 = get_all_word_data(pair[0])
        except Exception as e:
            print(e)
            time.sleep(2)
            continue
        break
    
    while True:
        try:
            w2 = get_all_word_data(pair[1])
        except Exception as e:
            print(e)
            time.sleep(2)
            continue
        break    
    outmetafile.write(json.dumps([w1, w2], indent=4) + separator)
outmetafile.close()
print('DONE')    

['fleece', 'fleas']
[Get headword] Invalid status code:  fleece 403
Expecting value: line 1 column 1 (char 0)
[Get headword] Invalid status code:  fleece 403
Expecting value: line 1 column 1 (char 0)


KeyboardInterrupt: 

In [59]:
from ipywidgets import widgets
import os
import IPython.display as ipd

def add_card_pronounciation(card):
    if card['pronounciation'] is not None:
        return card
    card['pronounciation'] = get_pronounciation(card['match_word'], card['card_id'] + '_wpron')
    return card

metafile = os.path.join(textdir, deck + os.path.basename(infile) + '_metadata.txt')
trouble_file = os.path.join(textdir, deck + os.path.basename(infile) + '_trouble.txt')
update_file = os.path.join(textdir, deck + os.path.basename(infile) + '_update.txt')
deck_final_file = deck + '_final.txt'
separator = '\n\n################################\n\n'

for f in [update_file, trouble_file]:
    f = open(f, 'w')
    f.close()
    
old_ids = {}
if os.path.isfile(deck_final_file):
    with open(deck_final_file) as f:
        deck_data = [json.loads(s) for s in f.read().split(separator) if len(s) > 0]
    old_ids = set([card[0]['card_id'] for card in deck_data]) 
if os.path.isfile(trouble_file):
    with open(trouble_file) as f:
        deck_data = [json.loads(s) for s in f.read().split(separator) if len(s) > 0]
    old_ids.update([card[0]['card_id'] for card in deck_data])    
    
all_cards = []
with open(metafile) as f:
    data = f.read()
    card_strings = data.split(separator)
    
    for card_string in card_strings:
        if len(card_string) == 0:
            continue
        card = json.loads(card_string)
        if card[0]['card_id'] in old_ids:
            continue
        all_cards.append(card)
        
def update_sound(path):
    fname = os.path.join(outdir, path)
    if os.path.isfile(fname):
        d = ipd.Audio(fname)
        display(d)
    else:
        print('url?', fname)
        
def validate_card(i, cards):
    data = []
    for card in cards:
        for k, v in card.items():
            if v is None:
                card[k] = ''
        orig_word = widgets.Text()
        orig_word.value = card['match_word']
        display(orig_word)
        word = widgets.Text(disabled=True)
        word.value = card['word']
        display(word)
        ipa = widgets.Text()
        ipa.value = card['ipa']
        display(ipa)
        
        if card['word'] != card['match_word']:
            card['pronounciation'] = None
            card = add_card_pronounciation(card)
            
        sound = widgets.Text()
        sound.value = card['pronounciation']
        display(sound)    
        update_sound(sound.value)
        
        senses = []
        for text in card['senses'][:2]:
            sense = widgets.Textarea()
            sense.value = text
            clear = widgets.Button(
                value=False,
                description='clear',
                disabled=False,
                button_style='', #, 'warning', 'danger' or ''
                icon='check',
                layout=widgets.Layout(width='7%', height='25px')
            )
            def clear_output(field):
                def f(sender):
                    field.value = ''
                return f
            def clear_button(button):
                def f(sender):
                    button.value = False
                return f
            clear.on_click(clear_output(sense))
            sense.observe(clear_button(clear))
            sense_box = widgets.HBox([sense, clear])
            display(sense_box)
            senses.append(sense)
        data.append({'orig_word' : orig_word, 'senses' : senses, 'ipa' : ipa, 'sound' : sound})
        print('')
    
    good = widgets.Button(
            value=False,
            description='GOOD',
            disabled=False,
            button_style='success', #, 'warning', 'danger' or ''
            icon='check',
            layout=widgets.Layout(width='15%', height='50px')
        )
    skip = widgets.Button(
            value=False,
            description='SKIP',
            disabled=False,
            button_style='warning', #, 'warning', 'danger' or ''
            icon='check',
            layout=widgets.Layout(width='15%', height='50px')
        )
    exit = widgets.Button(
            value=False,
            description='EXIT',
            disabled=False,
            button_style='danger', #, 'warning', 'danger' or ''
            icon='check',
            layout=widgets.Layout(width='15%', height='50px')
        )
    def show_next(i):
        if i < len(all_cards):
            validate_card(i, all_cards[i])
        else:
            print('NO MORE CARDS')
            print('check files: ', deck_final_file, update_file, trouble_file)
            
    def get_updated_card_data():
        for j in range(2):
            data_dict = data[j]
            card = cards[j]
            card['match_word'] = data_dict['orig_word'].value
            card['senses'] = [sense.value for sense in data_dict['senses'] if len(sense.value) > 0]
            card['pronounciation'] = data_dict['sound'].value
            card['ipa'] = data_dict['ipa'].value
        return cards
    
    def skip_f(sender):
        card = get_updated_card_data()
        with open(trouble_file, 'a') as f:
            f.write(json.dumps(card, indent=4) + separator)
        ipd.clear_output()
        show_next(i+1)
        
    def good_f(sender):
        card = get_updated_card_data()
        with open(deck_final_file, 'a') as f:
            f.write(json.dumps(card, indent=4) + separator)
        with open(update_file, 'a') as f:
            f.write(json.dumps(card, indent=4) + separator)
        ipd.clear_output()
        show_next(i+1)
    
    def exit_f(sender):
        print('EXIT')
        print('check files: ', deck_final_file, update_file, trouble_file)
    good.on_click(good_f)
    skip.on_click(skip_f)
    exit.on_click(exit_f)
    nav_buttons = widgets.HBox([skip, good, exit])   
    display(nav_buttons)
validate_card(0, all_cards[0])

NO MORE CARDS
check files:  minimal_final.txt text_dir/minimalminimal.txt_update.txt text_dir/minimalminimal.txt_trouble.txt
