In [37]:
import os, copy, json, time, pickle
import pandas as pd
from openai import OpenAI



In [55]:

api_key=open('openai_api_key.txt', 'r').read()
client = OpenAI(api_key=api_key)

# Big dataset of chinese words to learn and chinese_chars,pinyin_pronunciation,english_meaning
ALL_WORDS = [[p.strip() for p in l.split(',')] for l in open('chinese_words.txt').readlines()]
ch2pinyin = dict((p[0], p[1]) for p in ALL_WORDS)
ch2english = dict((p[0], p[2]) for p in ALL_WORDS)


N_KNOWN = 30
N_LEARNING = 5
N_TARGET_SENTENCES = 5
BATCH_SIZE = 20   # ask for several at once
N_SUCCESS_TO_MEMORIZE = 2  # this many successes in a row --> word memorized

In [63]:
'''
TODO
The API call slowed down a LOT when I switched to the current prompt.
It was much faster previously when I just asked for sentences
and figured out the target words and pronunciation myself.
'''

PROMPT_TEMPLATE = """
Here are a list of "Known words" that you can use as much as you want,
and a list of "Target words" - you can only use one of them in a sentence.
That idea is that every sentence should have exactly one "Target" word
and have all the other words be "Known" words.
You may ONLY use words from the "Known words" list and the "Target words" list.
Do NOT use any other words.
You may repeat words.

Known words:
{known_words_s}

Target words:
{target_words_s}

Task:
Write {n} idiomatic, natural, everyday Chinese sentences.
Each sentence must contain exactly one word from the "Target words" list
and have the rest be from the "Known words" list.

Each line should have the target word, the sentence (with spaces), pronunciation, and an English translation. 
Separated by pipes.

Example lines:
喜欢|我喜欢你|Wǒ xǐ huan nǐ|I like you
朋友|你是我的朋友|Nǐ shì wǒ de péng you|You are my friend
"""
def get_lines(known_words: list[list[str]], target_words: list[list[str]], n, model="gpt-5-mini") -> list[tuple[str,str]]:
    known_words_s = ','.join(known_words)
    target_words_s = ','.join(target_words)
    prompt = PROMPT_TEMPLATE.format(**locals())
    response = client.responses.create(
        model = model,
        input=prompt
    )
    lines = response.output_text.strip().splitlines()
    lines = [l.split('|') for l in lines if l.count('|')==3 and l.split('|')[0] in target_words]
    return lines
#
#get_lines(known_words, target_words, 5, model="gpt-5-mini")

In [64]:
# Variable state

try:
    state = json.loads(open('state.json','r').read())
    print('Recovering saved state')
    assert False
except:
    print('Initializing empty state')
    state = {
        'known_words': [wb[0] for wb in ALL_WORDS[:N_KNOWN]],
        'target_words': [wb[0] for wb in ALL_WORDS[N_KNOWN+1:N_KNOWN+N_LEARNING]],
        'perf': {w[0]:[] for w in ALL_WORDS}
    }

Recovering saved state
Initializing empty state


In [None]:
while True:
    print('\n\n** NEW ROUND **')
    known_words, target_words, perf = state['known_words'], state['target_words'], state['perf']
    print('target_words', target_words)
    print('known_words', known_words)
    assert not set(known_words).intersection(target_words)
    sentences = get_lines(known_words, target_words, N_TARGET_SENTENCES)
    for s in sentences: print(s)
    for i, (tw, s, pron, trans) in enumerate(sentences):
        print(f"{i}. {s.replace(' ', '')}")
    time.sleep(1)
    errors = input("Which numbers did you miss?")
    # print('errors', errors)
    if errors=='end':
        # End session
        open('state.json','w').write(json.dumps(state))
        break
    errors = [int(x) for x in errors]
    print('errors:', errors)
    for i, (tw, s, pron, trans) in enumerate(sentences):
        if i in errors:
            print('Failed', tw)
            print(s,'\n',pron,'\n',trans)
            perf[tw].append('fail')
        else:
            # print('Suceeded', tw)
            perf[tw].append('success')
        if len(perf[tw])>N_SUCCESS_TO_MEMORIZE and set(perf[tw][-1*N_SUCCESS_TO_MEMORIZE:])==set(['success']):
            assert not set(known_words).intersection(target_words)
            if tw in target_words:
                print('tw', tw)
                print('target_words', target_words)
                known_words.append(tw)
                state['target_words'] = [w for w in target_words if w != tw]
                new_wb = [wb for wb in ALL_WORDS if wb[0] not in known_words][0]
                state['target_words'].append(new_wb[0])




** NEW ROUND **
target_words ['没', '吧', '把', '跟']
known_words ['的', '在', '有', '一', '个', '我', '不', '是', '这', '他', '了', '你', '们', '也', '说', '就', '人', '都', '和', '来', '上', '去', '看', '为', '到', '能', '这儿', '那', '好', '想']
