# 英文單字與片語提取器

這個 notebook 整合了以下功能：
1. 從圖片中提取文字 (OCR)
2. 使用 OpenAI GPT 分析文字並提取學習內容

In [8]:
# 安裝必要的套件
!pip install openai python-dotenv Pillow pytesseract spacy nltk



In [1]:
import os
import json
from typing import List, Dict, Optional
from PIL import Image
import pytesseract
import openai
from dotenv import load_dotenv
import nltk
import spacy

# Fetch the 'wordnet' and 'brown' corpora through NLTK's downloader.
# NOTE(review): no visible cell uses nltk or spacy afterwards — confirm these downloads are still needed.
nltk.download('wordnet')
nltk.download('brown')

# Load environment variables via python-dotenv (presumably from a local .env file).
load_dotenv()

# Create the shared OpenAI client; the API key is read from the OPENAI_API_KEY environment variable.
client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yaoCat\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\yaoCat\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


## OCR 功能

In [2]:
def extract_text_from_image(image_path: str) -> str:
    """Run OCR on an image file and return the recognised text.

    Args:
        image_path (str): Path of the image to read.

    Returns:
        str: The recognised text with surrounding whitespace removed, or an
        empty string if the image could not be opened or OCR failed.
    """
    try:
        # Use the image as a context manager so the underlying file handle is
        # released as soon as OCR finishes, even if pytesseract raises.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image)

        return text.strip()
    except Exception as e:
        # Deliberately best-effort: report the problem and fall back to "".
        print(f"⚠️ OCR 處理時發生錯誤: {str(e)}")
        return ""

## OpenAI GPT 功能

In [62]:
def extract_learning_phrases_with_gpt(text: str, prompt: Optional[str] = None) -> List[Dict[str, str]]:
    """Ask an OpenAI chat model for study-worthy words/phrases found in a text.

    Args:
        text (str): English source text. Only used to fill in the built-in
            prompt; it is not sent separately when ``prompt`` is supplied.
        prompt (str, optional): Complete user prompt to send. When omitted, a
            built-in prompt requesting three entries as JSON is used.

    Returns:
        List[Dict[str, str]]: The list stored under the ``"phrases"`` key of
        the model's JSON reply, or an empty list if the request failed, the
        reply was not valid JSON, or ``"phrases"`` was not a list.
    """
    try:
        # Fall back to the built-in prompt when the caller did not supply one.
        if prompt is None:
            prompt = f"""
            Extract 3 useful English words or phrases from the following text for language learners.
            For each entry, return:
            1. The word or phrase
            2. Its Chinese translation
            3. A short explanation in simple English
            4. An example sentence or usage

            Please return the result in the following JSON format:
            {{
            "phrases": [
            {{
                "phrase": "word or phrase",
                "translation": "Chinese translation",
                "explanation": "simple English explanation",
                "example": "example sentence"
            }},
            ...
            ]
            }}
            
            Please ensure the output is a valid JSON object, no extra commentary or explanation outside the JSON.
            Text:
            {text}
            """

        # Call the chat completions endpoint through the module-level client.
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "你是一個專業的英語教學助手，擅長從文本中提取重要的學習內容。"},
                {"role": "user", "content": prompt}
            ],
            temperature=0.7,
            max_tokens=500
        )

        # Guard against a missing message body, and trim whitespace *before*
        # looking for fences so that "```json ... ```\n" is still recognised.
        content = (response.choices[0].message.content or "").strip()

        # Remove an optional Markdown code fence; the "json" language tag is
        # optional so a bare ``` fence is handled as well.
        if content.startswith("```"):
            content = content[3:]
            if content[:4].lower() == "json":
                content = content[4:]
        if content.endswith("```"):
            content = content[:-3]

        result = json.loads(content.strip())

        # Only hand back a real list so callers can iterate safely.
        phrases = result.get("phrases", [])
        return phrases if isinstance(phrases, list) else []

    except Exception as e:
        # Best-effort boundary: report the failure and return "no phrases".
        print(f"⚠️ 提取學習內容時發生錯誤: {str(e)}")
        return []

In [63]:
def build_prompt(extracted_text: str) -> str:
    """Compose the user prompt that asks a model for three learnable items.

    The prompt lists the four fields wanted for every entry, shows the JSON
    shape the reply must follow, and ends with the source text.

    Args:
        extracted_text (str): Text the model should pick words/phrases from.

    Returns:
        str: The assembled prompt with leading and trailing whitespace
        removed (this also trims whitespace at the end of ``extracted_text``).
    """
    instruction_lines = [
        "Extract 3 useful English words or phrases from the following text for language learners.",
        "",
        "For each entry, return:",
        "1. The word or phrase",
        "2. Its Chinese translation",
        "3. A short explanation in simple English",
        "4. An example sentence or usage",
        "",
        "Please return the result in the following JSON format:",
    ]
    schema_lines = [
        "{",
        '  "phrases": [',
        "    {",
        '      "phrase": "word or phrase",',
        '      "translation": "Chinese translation",',
        '      "explanation": "simple English explanation",',
        '      "example": "example sentence"',
        "    },",
        "    ...",
        "  ]",
        "}",
    ]
    closing_lines = [
        "",
        "Please ensure the output is a valid JSON object, no extra commentary or explanation outside the JSON.",
        "",
        "Text:",
        f"{extracted_text}",
    ]

    assembled = "\n".join(instruction_lines + schema_lines + closing_lines)
    return assembled.strip()

In [64]:

def get_learning_content(text: str) -> Optional[Dict[str, List[Dict[str, str]]]]:
    """Extract learning items with GPT and split them into words and phrases.

    Args:
        text (str): English source text to analyse.

    Returns:
        Optional[Dict[str, List[Dict[str, str]]]]: A dict with the keys
        ``"vocabulary"`` (single-word entries) and ``"phrases"`` (multi-word
        entries), or ``None`` when no usable entry was returned or an error
        occurred.
    """
    try:
        # Build the shared prompt and ask GPT for candidate entries.
        prompt = build_prompt(text)
        phrases = extract_learning_phrases_with_gpt(text, prompt)

        if not phrases:
            return None

        processed_content = {
            "vocabulary": [],
            "phrases": []
        }

        for item in phrases:
            # Skip malformed entries instead of letting one bad item raise
            # and discard every other entry of the batch.
            phrase = item.get("phrase") if isinstance(item, dict) else None
            if not isinstance(phrase, str) or not phrase.strip():
                continue

            # Count whitespace-separated tokens so that stray leading or
            # trailing whitespace does not turn a single word into a "phrase".
            if len(phrase.split()) == 1:
                processed_content["vocabulary"].append(item)
            else:
                processed_content["phrases"].append(item)

        # Nothing usable survived the filtering: report "no content".
        if not processed_content["vocabulary"] and not processed_content["phrases"]:
            return None

        return processed_content

    except Exception as e:
        # Best-effort boundary: report the failure and signal "no content".
        print(f"⚠️ 處理學習內容時發生錯誤: {str(e)}")
        return None

## 使用示例

In [65]:
# Sample input for the demo. To run real OCR instead, point image_path at an
# existing file and enable the extract_text_from_image call below.
image_path = "20240607222058_1.jpg"  # replace with the path of your own image
# extracted_text = extract_text_from_image(image_path)
extracted_text = "Old Chronos warrants every bit of justice you dispensed and more, my niece! I was already angry with him after everything he'd done, and that was prior to my realizing that he'd unleashed terrifying Typhon on us all!"
# A single print with sep="\n" emits the same three lines plus separator.
print("提取的文字：", extracted_text, "\n---\n", sep="\n")

提取的文字：
Old Chronos warrants every bit of justice you dispensed and more, my niece! I was already angry with him after everything he'd done, and that was prior to my realizing that he'd unleashed terrifying Typhon on us all!

---



In [79]:
# Analyse the text with GPT, then print single words followed by phrases.
answer1 = get_learning_content(extracted_text)

if answer1:
    # (section heading, prefix of each entry's first line, key in answer1)
    for heading, first_line_prefix, key in (
        ("📚 單字：", "\n單字: ", "vocabulary"),
        ("\n📝 片語：", "\n片語: ", "phrases"),
    ):
        print(heading)
        for entry in answer1[key]:
            print(f"{first_line_prefix}{entry['phrase']}")
            print(f"翻譯: {entry['translation']}")
            print(f"解釋: {entry['explanation']}")
            print(f"例句: {entry['example']}")

📚 單字：

單字: warrants
翻譯: 保證
解釋: to require or deserve
例句: His hard work warrants a promotion.

單字: unleashed
翻譯: 釋放
解釋: to release or set free
例句: The company unleashed a new product into the market.

📝 片語：

片語: prior to
翻譯: 在...之前
解釋: before a particular time or event
例句: I had never traveled abroad prior to this trip.


In [105]:

# Print the current contents of answer1 again: single words first, then phrases.
if answer1:
    # (section heading, prefix of each entry's first line, key in answer1)
    for heading, first_line_prefix, key in (
        ("📚 單字：", "\n單字: ", "vocabulary"),
        ("\n📝 片語：", "\n片語: ", "phrases"),
    ):
        print(heading)
        for entry in answer1[key]:
            print(f"{first_line_prefix}{entry['phrase']}")
            print(f"翻譯: {entry['translation']}")
            print(f"解釋: {entry['explanation']}")
            print(f"例句: {entry['example']}")

📚 單字：

單字: warrants
翻譯: 保證
解釋: 表示應當或應該發生的事情
例句: The evidence warrants further investigation.

單字: dispensed
翻譯: 分發
解釋: 分發或分配某物
例句: The nurse dispensed medicine to the patients.

單字: unleashed
翻譯: 釋放
解釋: 釋放或引發某種力量或事物
例句: The storm unleashed its fury on the coastal town.

📝 片語：


In [67]:
# NOTE(review): `learning_content` is not assigned in any visible cell — presumably a leftover
# from an earlier session (its output has the same shape as `answer1`); confirm before re-running.
learning_content

{'vocabulary': [{'phrase': 'warrants',
   'translation': '應得',
   'explanation': 'to deserve or justify',
   'example': 'He warrants a promotion for his hard work.'},
  {'phrase': 'dispensed',
   'translation': '分發',
   'explanation': 'to distribute or give out',
   'example': 'The nurse dispensed medication to the patients.'},
  {'phrase': 'unleashed',
   'translation': '釋放',
   'explanation': 'to release or set free',
   'example': 'The storm unleashed its fury on the coast.'}],
 'phrases': []}

## Gemma


In [102]:
import requests
def extract_learning_phrases_with_gemma(text: str, prompt: Optional[str] = None) -> List[Dict[str, str]]:
    """Ask a local Gemma model (served by Ollama) for study-worthy words/phrases.

    Args:
        text (str): English source text. Only used to fill in the built-in
            prompt; it is not sent separately when ``prompt`` is supplied.
        prompt (str, optional): Complete prompt to send. When omitted, a
            built-in prompt requesting at most three entries as JSON is used.

    Returns:
        List[Dict[str, str]]: The list stored under the ``"phrases"`` key of
        the model's JSON reply, or an empty list if the request failed, the
        reply was not valid JSON, or ``"phrases"`` was not a list.
    """
    try:
        # Fall back to the built-in prompt when the caller did not supply one.
        if prompt is None:
            prompt = f"""
            Extract at most 3 useful English words or phrases from the following text for medium class language learners.
            For each entry, return:
            1. The word or phrase
            2. Its Chinese translation
            3. A short explanation in simple English
            4. An example sentence or usage

            Please return the result in the following JSON format:
            {{
            "phrases": [
            {{
                "phrase": "word or phrase",
                "translation": "Chinese translation",
                "explanation": "simple English explanation",
                "example": "example sentence"
            }},
            ...
            ]
            }}
            
            Please ensure the output is a valid JSON object, no extra commentary or explanation outside the JSON.
            Text:
            {text}
            """

        # Call the local Ollama generate endpoint. "stream": False asks for a
        # single JSON body; the timeout keeps an unresponsive server from
        # blocking the notebook forever.
        response = requests.post(
            "http://localhost:11434/api/generate",
            json={
                "model": "gemma:2b",
                "prompt": prompt,
                "stream": False
            },
            timeout=300
        )
        # Surface HTTP errors as an exception instead of failing later with
        # an opaque KeyError on the missing "response" field.
        response.raise_for_status()

        # The generated text lives under the "response" key of the body.
        content = (response.json()["response"] or "").strip()

        # Remove an optional Markdown code fence (with or without a "json"
        # tag), mirroring the clean-up done for the GPT reply.
        if content.startswith("```"):
            content = content[3:]
            if content[:4].lower() == "json":
                content = content[4:]
        if content.endswith("```"):
            content = content[:-3]

        result = json.loads(content.strip())

        # Only hand back a real list so callers can iterate safely.
        phrases = result.get("phrases", [])
        return phrases if isinstance(phrases, list) else []

    except Exception as e:
        # Best-effort boundary: report the failure and return "no phrases".
        print(f"⚠️ 提取學習內容時發生錯誤: {str(e)}")
        return []

In [103]:

def get_learning_content_gemma(text: str) -> Optional[Dict[str, List[Dict[str, str]]]]:
    """Extract learning items with Gemma and split them into words and phrases.

    Args:
        text (str): English source text to analyse.

    Returns:
        Optional[Dict[str, List[Dict[str, str]]]]: A dict with the keys
        ``"vocabulary"`` (single-word entries) and ``"phrases"`` (multi-word
        entries), or ``None`` when no usable entry was returned or an error
        occurred.
    """
    try:
        # Build the shared prompt and ask Gemma for candidate entries.
        prompt = build_prompt(text)
        phrases = extract_learning_phrases_with_gemma(text, prompt)

        if not phrases:
            return None

        processed_content = {
            "vocabulary": [],
            "phrases": []
        }

        for item in phrases:
            # Skip malformed entries instead of letting one bad item raise
            # and discard every other entry of the batch.
            phrase = item.get("phrase") if isinstance(item, dict) else None
            if not isinstance(phrase, str) or not phrase.strip():
                continue

            # Count whitespace-separated tokens so that stray leading or
            # trailing whitespace does not turn a single word into a "phrase".
            if len(phrase.split()) == 1:
                processed_content["vocabulary"].append(item)
            else:
                processed_content["phrases"].append(item)

        # Nothing usable survived the filtering: report "no content".
        if not processed_content["vocabulary"] and not processed_content["phrases"]:
            return None

        return processed_content

    except Exception as e:
        # Best-effort boundary: report the failure and signal "no content".
        print(f"⚠️ 處理學習內容時發生錯誤: {str(e)}")
        return None

In [84]:
# Manual, step-by-step version of the Ollama request: build the shared prompt
# and POST it to the local generate endpoint with streaming disabled.
# NOTE(review): no timeout is passed, so this call waits as long as the server takes.
prompt = build_prompt(extracted_text)
response = requests.post(
            "http://localhost:11434/api/generate",
            json={
                "model": "gemma:2b",
                "prompt": prompt,
                "stream": False
            }
        )

In [96]:
# Pull the generated text out of the response body and parse it as JSON.
raw = response.json()["response"]
# Bare expression: has no effect here because it is not the last line of the cell.
raw
content = raw.strip()
result = json.loads(content)
# Here answer2 is the parsed reply itself, not the {"vocabulary", "phrases"}
# structure produced by get_learning_content_gemma.
answer2 = result

In [104]:
# Analyse the text with Gemma, then print single words followed by phrases.
answer2 = get_learning_content_gemma(extracted_text)

if answer2:
    # (section heading, prefix of each entry's first line, key in answer2)
    for heading, first_line_prefix, key in (
        ("📚 單字：", "\n單字: ", "vocabulary"),
        ("\n📝 片語：", "\n片語: ", "phrases"),
    ):
        print(heading)
        for entry in answer2[key]:
            print(f"{first_line_prefix}{entry['phrase']}")
            print(f"翻譯: {entry['translation']}")
            print(f"解釋: {entry['explanation']}")
            print(f"例句: {entry['example']}")

📚 單字：

單字: warrants
翻譯: 保證
解釋: to require or deserve
例句: His hard work warrants a promotion.

單字: unleashed
翻譯: 釋放
解釋: to release or set free
例句: The company unleashed a new product into the market.

📝 片語：

片語: prior to
翻譯: 在...之前
解釋: before a particular time or event
例句: I had never traveled abroad prior to this trip.


In [106]:
# Prompt asking a judge model to compare the two extraction results.
# answer1 and answer2 are interpolated with their default string form, so the
# prompt contains whatever those variables hold when this cell runs.
evaluation_prompt = f"""
You are an expert English tutor helping intermediate to advanced learners select the most effective vocabulary explanations.

Below are two sets of vocabulary learning answers generated by different AI models. Each set includes:
- 3 English words or phrases
- Their Chinese translations
- Short English explanations
- Example sentences

Evaluate both answers based on the following criteria:
1. **Relevance** – Are the chosen words/phrases useful and non-trivial for intermediate to advanced learners?
2. **Accuracy** – Are the explanations and translations correct and clearly understandable?
3. **Clarity** – Are the example sentences natural, relevant, and illustrative?
4. **Depth** – Does the explanation provide insight into real usage, including nuances?

Please respond with:
- Which answer (1 or 2) is better 

Here are the answers:

Answer 1:
{answer1}

Answer 2:
{answer2}
"""

In [98]:
# Send the comparison prompt to gpt-3.5-turbo as a single user message
# (no system message), using the shared module-level client.
evaluation_response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "user", "content": evaluation_prompt}
    ],
    temperature=0.7,
    max_tokens=500
)

In [99]:
# Display the full response object returned by the evaluation call.
evaluation_response

ChatCompletion(id='chatcmpl-Bcw7RYd2NXdda8BkpIA3ImBsYg9A9', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='I would say that Answer 1 is better because:\n\n1. Relevance: The vocabulary words chosen (warrants, dispensed, unleashed) are more commonly used and relevant for intermediate to advanced learners compared to the phrases in Answer 2.\n2. Accuracy: The translations and explanations in Answer 1 are clear and accurate, making it easier for learners to understand the words in context.\n3. Clarity: The example sentences in Answer 1 are natural and illustrative, providing a clear picture of how the words are used in sentences.\n4. Depth: Answer 1 provides insight into real usage by explaining the nuances of each word, which is important for learners looking to expand their vocabulary effectively.', refusal=None, role='assistant', annotations=[], audio=None, function_call=None, tool_calls=None))], created=1748619573, model='gpt-3.5-tu

In [100]:
# Keep only the text of the first choice's message.
evaluation_res = evaluation_response.choices[0].message.content

In [101]:
# Display the judge model's verdict text.
evaluation_res

'I would say that Answer 1 is better because:\n\n1. Relevance: The vocabulary words chosen (warrants, dispensed, unleashed) are more commonly used and relevant for intermediate to advanced learners compared to the phrases in Answer 2.\n2. Accuracy: The translations and explanations in Answer 1 are clear and accurate, making it easier for learners to understand the words in context.\n3. Clarity: The example sentences in Answer 1 are natural and illustrative, providing a clear picture of how the words are used in sentences.\n4. Depth: Answer 1 provides insight into real usage by explaining the nuances of each word, which is important for learners looking to expand their vocabulary effectively.'