Updated to provide better examples for words that can be used as multiple different parts of speech.
faceleg · Apr 16, 2024 · 6f201d4 · 6f201d4
1 parent 3c3125a
commit 6f201d4
Show file tree

Hide file tree

Showing 6 changed files with 126 additions and 27 deletions.
diff --git a/src/main.ts b/src/main.ts
@@ -6,14 +6,40 @@ import { fetchExamples } from './openai/get-sentences-from-chatgpt';
 import sleep from './utils/sleep';
 require('axios-debug-log/enable');
 
-const MAX_NOTES_PROCESSED_AT_ONCE = 8;
-const MAX_NOTES_PROCESSED_AT_ONE_RUN = 200;
+const MAX_NOTES_PROCESSED_AT_ONCE = 5;
+const MAX_NOTES_PROCESSED_AT_ONE_RUN = 10;
 
 let notesProcessedCount = 0;
 
 void (async function () {
     dotenv.config({ path: '.env.local' });
-
+//    console.log(await fetchExamples('Mandarin', [
+//     { 
+//         noteId: 1, 
+//         text: '得'
+//     }, 
+//     { 
+//         noteId: 2, 
+//         text: '不'
+//     }, 
+//     { 
+//         noteId: 3, 
+//         text: '下' 
+//     },
+//     { 
+//         noteId: 4, 
+//         text: '就' 
+//     },
+//     {
+//         noteId: 5,
+//         text: '又'
+//     },
+//     {
+//         noteId: 6,
+//         text: '喂'
+//     }
+// ]));
+// return;
     const ankiDeck = process.env['ANKI_DECK'];
     if (ankiDeck === undefined) {
         throw new Error('ANKI_DECK is not configured in env!');
@@ -30,6 +56,9 @@ void (async function () {
     while (true) {
         const notes = await fetchNotesFromAnki(ankiDeck);
         console.log(`Found ${notes.length} notes eligible for fetching!`);
+        if (notes.length === 0) {
+            break;
+        }
 
         const notesForProcessing = notes.slice(0, MAX_NOTES_PROCESSED_AT_ONCE);
         console.log(`Processing batch of ${notesForProcessing.length} notes.`);
@@ -40,8 +69,8 @@ void (async function () {
             const noteId = vocabularyExample.id;
             const examples = vocabularyExample.exampleSentences;
 
-            console.log(`Updating Anki note ${noteId} with ${examples}`)
-            const fieldText = examples.map((example) => `<div class="char_example">${example}</div>`).join('')
+            console.log(`Updating Anki note ${noteId} with ${examples.join('\n')}`);
+            const fieldText = `${examples.join('')}`;
 
             const noteForAnki = {
                 id: +noteId,
@@ -58,6 +87,7 @@ void (async function () {
         notesProcessedCount += notesForProcessing.length;
         if (notesProcessedCount >= MAX_NOTES_PROCESSED_AT_ONE_RUN) {
             console.log(`Processed ${notesProcessedCount} notes, quiting.`);
+            console.dir(notes.map((note) => note.text).join(', '));
             break;
         }
     }

diff --git a/src/openai/generate-prompt.test.ts b/src/openai/generate-prompt.test.ts
@@ -1,8 +1,8 @@
 import { expect, test } from 'vitest';
-import { generatePrompt } from './generate-prompt';
+import { generateBasicVocabularyPromptSegment } from './generate-prompt';
 
 test('generate prompt', () => {
-    const prompt = generatePrompt([
+    const prompt = generateBasicVocabularyPromptSegment([
         {
             noteId: 1,
             text: 'der Foo',

diff --git a/src/openai/generate-prompt.ts b/src/openai/generate-prompt.ts
@@ -1,5 +1,29 @@
 import type { NoteForProcessing } from '../anki';
+import type { PartsOfSpeech } from './typechat-response-parts-of-speech-schema';
 
-export const generatePrompt = (vocabulary: NoteForProcessing[]): string => {
-    return vocabulary.map((note) => `${note.noteId}: ${note.text}`).join('\n');
+export const generateBasicVocabularyPromptSegment = (vocabulary: NoteForProcessing[]): string => {
+    return vocabulary
+            .map((note) => `${note.noteId}: ${note.text}`)
+            .join('\n');
 };
+
+const partsOfSpeechLookup = (noteId: number, partsOfSpeech: PartsOfSpeech[]): string => {
+    const partOfSpeech = partsOfSpeech.find(
+        (partsOfSpeech): boolean | undefined => noteId === partsOfSpeech.id
+    )
+
+    if (partOfSpeech === undefined) {
+        throw Error(`Part of speech not found for ${noteId}`)
+    } 
+
+    return partOfSpeech.partsOfSpeech    
+}
+
+export const generatePartsOfSpeechVocabularyPromptSegment = (
+    vocabulary: NoteForProcessing[], 
+    partsOfSpeech: PartsOfSpeech[]
+): string => {
+    return vocabulary
+        .map((note) => `${note.noteId}: ${note.text} (${partsOfSpeechLookup(note.noteId, partsOfSpeech)})`)
+        .join('\n')
+}
diff --git a/src/openai/get-sentences-from-chatgpt.ts b/src/openai/get-sentences-from-chatgpt.ts
@@ -1,41 +1,78 @@
 /* eslint-disable @typescript-eslint/no-unused-vars */
 import type { NoteForProcessing } from '../anki';
-import { generatePrompt } from './generate-prompt';
+import { generateBasicVocabularyPromptSegment, generatePartsOfSpeechVocabularyPromptSegment } from './generate-prompt';
 import { createJsonTranslator, createLanguageModel } from 'typechat';
 import { createTypeScriptJsonValidator } from "typechat/ts";
 import fs from 'fs';
 import path from 'path';
-import type { VocabularyExamples, AllVocabularyExamples } from './typechat-response-schema';
+import type { AllVocabularyExamples, VocabularyExamples } from './typechat-response-schema';
+import type { AllPartsOfSpeech } from './typechat-response-parts-of-speech-schema';
 
 export const fetchExamples = async (language: string, vocabulary: NoteForProcessing[]): Promise<VocabularyExamples[]> => {
     if (vocabulary.length === 0) {
         throw new Error('No vocabulary passed!');
     }
 
-    const model = createLanguageModel(process.env);
+    const partsOfSpeechModel = createLanguageModel(process.env);
     // @todo path is from `build/openai/`
-    const schema = fs.readFileSync(path.join(__dirname, '../../src/openai/typechat-response-schema.ts'), 'utf8');
+    const partsOfSpeechSchema = fs.readFileSync(path.join(__dirname, '../../src/openai/typechat-response-parts-of-speech-schema.ts'), 'utf8');
     // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment, @typescript-eslint/no-unsafe-call
-    const validator = createTypeScriptJsonValidator<AllVocabularyExamples>(schema, "AllVocabularyExamples");
+    const partsOfSpeechValidator = createTypeScriptJsonValidator<AllPartsOfSpeech>(partsOfSpeechSchema, "AllPartsOfSpeech");
     // eslint-disable-next-line @typescript-eslint/no-unsafe-argument
-    const translator = createJsonTranslator(model, validator);
+    const partsOfSpeechTranslator = createJsonTranslator(partsOfSpeechModel, partsOfSpeechValidator);
 
-    const vocabularyPrompt = generatePrompt(vocabulary);
-    console.log({ vocabularyPrompt});
+    const vocabularyPrompt = generateBasicVocabularyPromptSegment(vocabulary);
 
-    const prompt =
-        'You are a helpful vocabulary learning assistant who helps user generate example sentences in ' +
-        language +
-        ' for language learning. I will provide each word prefixed by ID and you will generate two example sentences for each input. The sentences should be complex enough for HSK4 or higher level.\n' +
-        vocabularyPrompt;
+    // Splitting
+    const partsOfSpeechPrompt = `You are a helpful vocabulary learning assistant who helps users generate example sentences in Mandarin for language learning. You understand that in Mandarin, words can serve different parts of speech depending on context, for example: nouns, pronouns, verbs, adjectives, adverbs, number words, measure words, prepositions, conjunctions, interjections, onomatopoeias, particles, function words, verbal measure words, auxiliary verbs.
 
-    const response = await translator.translate(prompt);
+I will provide each word prefixed by ID. For each word, please find the possible usages for the words. Return the parts of speech as a single comma separated string.
+    
+Here is the list of words prefixed by ID:
+    
+${vocabularyPrompt}
+
+For each word in the provided list, define all possible parts of speech based on context. If a word can have multiple parts of speech, list all possible parts of speech for that word explicitly.`;
+
+    console.log(partsOfSpeechPrompt)
+
+    const response = await partsOfSpeechTranslator.translate(partsOfSpeechPrompt);
     if (!response.success) {
         console.dir({ response });
         throw new Error('Error fetching data from chatGPT: ' + response.message);
+    } 
+    const partsOfSpeech = response.data.partsOfSpeech
+
+    console.dir(partsOfSpeech)
+
+    const exampleSentencesModel = createLanguageModel(process.env);
+    const exampleSentencesSchema = fs.readFileSync(path.join(__dirname, '../../src/openai/typechat-response-schema.ts'), 'utf8');
+    // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment, @typescript-eslint/no-unsafe-call
+    const exampleSentencesValidator = createTypeScriptJsonValidator<AllVocabularyExamples>(exampleSentencesSchema, "AllVocabularyExamples");
+    // eslint-disable-next-line @typescript-eslint/no-unsafe-argument
+    const exampleSentencesTranslator = createJsonTranslator(exampleSentencesModel, exampleSentencesValidator);
+
+    const exampleSentencesPrompt = `Now that you have defined the possible parts of speech for each word, please generate one example sentence per part of speech for each word. The sentences should be medium or longer length and complexity of HSK5 or higher. Each possible part of speech must have at least one sentence generated for it.
+
+Here is the list of words prefixed by ID:
+    
+${generatePartsOfSpeechVocabularyPromptSegment(vocabulary, partsOfSpeech)}
+    
+For each word in the provided list, generate example sentences for each defined part of speech. Wrap the ${language} sentences in <div class="char_example"></div>. `
+
+/**
+ * Also generate the English translation, wrapped in <div class="char_example_English"></div>. At the end of each English translation include the part of speech in brackets.
+
+Return the examples and translations as a flat list of strings, with the example and translation in pairs.
+ */
+
+    console.log(exampleSentencesPrompt)
+
+    const exampleSentencesResponse = await exampleSentencesTranslator.translate(exampleSentencesPrompt);
+    if (!exampleSentencesResponse.success) {
+        console.dir({ response });
+        throw new Error('Error fetching data from chatGPT: ' + exampleSentencesResponse.message);
     } else {
-        // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment
-        console.dir({ items: response.data.items });
-        return (response.data.items as VocabularyExamples[]);
+        return exampleSentencesResponse.data.items;
     }
 };
diff --git a/src/openai/typechat-response-parts-of-speech-schema.ts b/src/openai/typechat-response-parts-of-speech-schema.ts
@@ -0,0 +1,8 @@
+export interface AllPartsOfSpeech {
+    partsOfSpeech: PartsOfSpeech[];
+}
+
+export interface PartsOfSpeech {
+    id: number;
+    partsOfSpeech: string;
+}
diff --git a/src/openai/typechat-response-schema.ts b/src/openai/typechat-response-schema.ts
@@ -4,5 +4,5 @@ export interface AllVocabularyExamples {
 
 export interface VocabularyExamples {
     id: number;
-    exampleSentences: [string, string];
+    exampleSentences: string[];
 }