In [1]:
from eng_to_ipa import convert
from difflib import SequenceMatcher

def compare_ipa(transcript: str, reference_text: str) -> list:
    """
    So sánh IPA của transcript với bản tham chiếu.
    Trả về danh sách lỗi phát âm IPA.
    """
    transcript_words = transcript.split()
    reference_words = reference_text.split()
    
    ipa_errors = []

    # So sánh từng từ
    for ref_word, spoken_word in zip(reference_words, transcript_words):
        ref_ipa = convert(ref_word)  # Chuyển từ tham chiếu sang IPA
        spoken_ipa = convert(spoken_word)  # Chuyển từ transcript sang IPA

        # Tính mức độ tương đồng giữa IPA chuẩn và thực tế
        similarity = SequenceMatcher(None, ref_ipa, spoken_ipa).ratio()

        if similarity < 0.9:  # Ngưỡng sai phát âm (90% tương đồng)
            ipa_errors.append({
                "word": spoken_word,
                "reference_ipa": ref_ipa,
                "spoken_ipa": spoken_ipa,
                "similarity": round(similarity * 100, 2)  # Tính phần trăm tương đồng
            })
    
    return ipa_errors


In [2]:
def generate_pronunciation_tips(ipa_errors: list) -> list:
    """Sinh gợi ý cải thiện phát âm từ lỗi IPA."""
    tips = []
    for error in ipa_errors:
        tips.append({
            "word": error["word"],
            "tip": f"Hãy phát âm '{error['word']}' chính xác như: /{error['reference_ipa']}/"
        })
    return tips


In [7]:
transcript = "This is a test"
reference_text = "This is a test"
# Phân tích lỗi IPA
ipa_errors = compare_ipa(transcript, reference_text)

# Gợi ý cải thiện phát âm
pronunciation_tips = generate_pronunciation_tips(ipa_errors)

In [8]:
# Trả về kết quả
response = {
    "ipa_errors": ipa_errors,
    "pronunciation_tips": pronunciation_tips
}

response

{'ipa_errors': [], 'pronunciation_tips': []}

In [10]:
import language_tool_python

def calculate_grammar_score(transcript: str) -> float:
    """Tính điểm Grammar dựa trên số lỗi."""
    tool = language_tool_python.LanguageTool("en-US")
    matches = tool.check(transcript)
    total_words = len(transcript.split())
    grammar_errors = len(matches)
    
    # Tính điểm (100% nếu không có lỗi)
    grammar_score = max(0, 100 - (grammar_errors / total_words) * 100)
    return grammar_score


In [11]:
from difflib import SequenceMatcher

def calculate_vocabulary_score(transcript: str, reference_text: str) -> float:
    """Tính điểm Vocabulary dựa trên từ đúng và sai."""
    transcript_words = set(transcript.lower().split())
    reference_words = set(reference_text.lower().split())
    
    # Tính số từ đúng và sai
    correct_words = transcript_words.intersection(reference_words)
    vocab_score = (len(correct_words) / len(reference_words)) * 100 if reference_words else 0
    return vocab_score


In [12]:
def calculate_fluency_score(segments: list) -> float:
    """Tính điểm Fluency dựa trên số lần dừng và độ dài dừng."""
    pauses = [segments[i]['start'] - segments[i - 1]['end'] for i in range(1, len(segments))]
    long_pauses = [pause for pause in pauses if pause > 0.5]  # Dừng dài hơn 0.5s
    
    # Điểm fluency dựa trên số lần dừng (ít dừng = điểm cao)
    total_pauses = len(pauses)
    fluency_score = max(0, 100 - (len(long_pauses) / total_pauses) * 100 if total_pauses > 0 else 0)
    return fluency_score


In [13]:
def calculate_pronunciation_score(transcript: str, ipa_reference: list, ipa_transcript: list) -> float:
    """Tính điểm Pronunciation dựa trên sự khớp IPA."""
    correct_pronunciation = sum(1 for ref, spoken in zip(ipa_reference, ipa_transcript) if ref == spoken)
    total_pronunciation = len(ipa_reference)
    
    # Điểm phát âm dựa trên tỷ lệ khớp
    pronunciation_score = (correct_pronunciation / total_pronunciation) * 100 if total_pronunciation > 0 else 0
    return pronunciation_score


In [14]:
def calculate_overall_score(transcript: str, reference_text: str, segments: list, ipa_reference: list, ipa_transcript: list):
    """Tính điểm tổng hợp."""
    grammar_score = calculate_grammar_score(transcript)
    vocabulary_score = calculate_vocabulary_score(transcript, reference_text)
    fluency_score = calculate_fluency_score(segments)
    pronunciation_score = calculate_pronunciation_score(transcript, ipa_reference, ipa_transcript)
    
    # Tính điểm trung bình
    overall_score = (grammar_score + vocabulary_score + fluency_score + pronunciation_score) / 4
    
    return {
        "grammar": grammar_score,
        "vocabulary": vocabulary_score,
        "fluency": fluency_score,
        "pronunciation": pronunciation_score,
        "overall": overall_score
    }


In [16]:
import torch
import whisper
device = 'cuda' if torch.cuda.is_available() else 'cpu'
whisper.load_model('small').to(device)

  checkpoint = torch.load(fp, map_location=device)


Whisper(
  (encoder): AudioEncoder(
    (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
    (blocks): ModuleList(
      (0-11): 12 x ResidualAttentionBlock(
        (attn): MultiHeadAttention(
          (query): Linear(in_features=768, out_features=768, bias=True)
          (key): Linear(in_features=768, out_features=768, bias=False)
          (value): Linear(in_features=768, out_features=768, bias=True)
          (out): Linear(in_features=768, out_features=768, bias=True)
        )
        (attn_ln): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
        (mlp_ln): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-0

In [17]:
def transcribe_audio_whisper(audio_file):
    """Sử dụng Whisper để nhận diện giọng nói."""
    model = whisper.load_model("small")  # Tải mô hình Whisper Small
    result = model.transcribe(audio_file)
    transcript = result['text']
    segments = result['segments']  # Lấy thời gian từng đoạn
    print(f"Transcript: {transcript}")
    return transcript, segments

# Kiểm tra
audio_file = "./IELTS_PracticeAndEvaluation/test_data/how-are-you-doing-now-a-days.wav"
transcript, segments = transcribe_audio_whisper(audio_file)


  checkpoint = torch.load(fp, map_location=device)


Transcript:  How are you doing nowadays?


In [18]:
# Gọi Whisper để lấy transcript và segments
transcript, segments

(' How are you doing nowadays?',
 [{'id': 0,
   'seek': 0,
   'start': 0.0,
   'end': 2.0,
   'text': ' How are you doing nowadays?',
   'tokens': [50364, 1012, 366, 291, 884, 13434, 30, 50464],
   'temperature': 0.0,
   'avg_logprob': -0.37167814042833114,
   'compression_ratio': 0.7714285714285715,
   'no_speech_prob': 0.009825356304645538}])

In [25]:
transcript

' How are you doing nowadays?'

In [26]:
transcribe_audio_whisper(audio_file)[0]

  checkpoint = torch.load(fp, map_location=device)


Transcript:  How are you doing nowadays?


' How are you doing nowadays?'

In [20]:
def calculate_overall_score_with_ipa(transcript: str, reference_text: str, segments: list, ipa_errors: list):
    """Tính điểm và thêm lỗi phát âm IPA."""
    grammar_score = calculate_grammar_score(transcript)
    vocabulary_score = calculate_vocabulary_score(transcript, reference_text)
    fluency_score = calculate_fluency_score(segments)
    pronunciation_score = 100 - len(ipa_errors)  # Giảm điểm phát âm nếu có lỗi IPA
    
    # Tính điểm trung bình
    overall_score = (grammar_score + vocabulary_score + fluency_score + pronunciation_score) / 4

    return {
        "grammar": grammar_score,
        "vocabulary": vocabulary_score,
        "fluency": fluency_score,
        "pronunciation": pronunciation_score,
        "overall": overall_score,
        "ipa_errors": ipa_errors  # Thêm chi tiết lỗi IPA
    }


In [22]:
res = calculate_overall_score_with_ipa(transcript, "how are you going", segments, ipa_errors)
res

{'grammar': 100.0,
 'vocabulary': 75.0,
 'fluency': 0,
 'pronunciation': 100,
 'overall': 68.75,
 'ipa_errors': []}

In [32]:
import difflib
from eng_to_ipa import convert as text_to_ipa
import markdown
import re

def preprocess_text(text):
    """Remove punctuation and normalize text for comparison"""
    return re.sub(r'[^\w\s]', '', text).strip().lower()

def highlight_differences(reference, actual):
    """Highlight differences between reference and actual IPA transcriptions"""
    diff = difflib.SequenceMatcher(None, reference, actual)
    highlighted_reference = []
    highlighted_actual = []

    for tag, i1, i2, j1, j2 in diff.get_opcodes():
        ref_segment = reference[i1:i2]
        act_segment = actual[j1:j2]

        if tag == 'equal':
            highlighted_reference.append(ref_segment)
            highlighted_actual.append(act_segment)
        elif tag in ('replace', 'delete'):
            highlighted_reference.append(f"**{ref_segment}**")
        if tag in ('replace', 'insert'):
            highlighted_actual.append(f"**{act_segment}**")

    return "".join(highlighted_reference), "".join(highlighted_actual)

def evaluate_pronunciation(input_text, audio_path):
    """Evaluate pronunciation and return Markdown-formatted result"""
    # Convert input text to IPA
    expected_ipa = text_to_ipa(input_text)

    # Transcribe audio to text using Whisper
    transcribed_text = transcribe_audio_whisper(audio_path)[0]
    normalized_transcribed_text = preprocess_text(transcribed_text)

    # Convert Whisper transcription to IPA
    transcribed_ipa = text_to_ipa(transcribed_text)

    # Highlight differences in IPA
    highlighted_expected, highlighted_actual = highlight_differences(expected_ipa, transcribed_ipa)

    # Highlight differences in the original text
    highlighted_text, highlighted_transcription = highlight_differences(input_text, transcribed_text)

    # Create Markdown output
    markdown_output = f"""### Pronunciation Evaluation

#### Original Text:
{highlighted_text}

#### Transcribed Text:
{highlighted_transcription}

#### Expected IPA:
{highlighted_expected}

#### Transcribed IPA:
{highlighted_actual}
"""
    return markdown_output

In [33]:
input_text = "why hello here"  # Replace with the user's text
audio_path = "./IELTS_PracticeAndEvaluation/test_data/why-hello-there-103596.wav"  # Replace with the path to the user's audio file

result = evaluate_pronunciation(input_text, audio_path)
print(result)
# Save the result to a Markdown file
with open("pronunciation_evaluation.md", "w") as f:
    f.write(result)

print("Pronunciation evaluation saved to pronunciation_evaluation.md")

Transcript:  Why hello there.
### Pronunciation Evaluation

#### Original Text:
**w**hy hello here

#### Transcribed Text:
** W**hy hello **t**here**.**

#### Expected IPA:
waɪ hɛˈloʊ **hi**r

#### Transcribed IPA:
waɪ hɛˈloʊ **ðɛ**r**.**

Pronunciation evaluation saved to pronunciation_evaluation.md


Transcript:  Why hello there.
### Pronunciation Evaluation

#### Original Text:
**w**hy hello here

#### Transcribed Text:
** W**hy hello **t**here**.**

#### Expected IPA:
waɪ hɛˈloʊ **hi**r

#### Transcribed IPA:
waɪ hɛˈloʊ **ðɛ**r**.**

In [41]:
import difflib
from eng_to_ipa import convert as text_to_ipa
import markdown
import re

def preprocess_text(text):
    """Remove punctuation and normalize text for comparison"""
    return re.sub(r'[^\w\s]', '', text).strip().lower()

def highlight_differences(reference, actual):
    """Highlight differences between reference and actual transcriptions"""
    diff = difflib.SequenceMatcher(None, reference, actual)
    highlighted_reference = []
    highlighted_actual = []

    for tag, i1, i2, j1, j2 in diff.get_opcodes():
        ref_segment = reference[i1:i2]
        act_segment = actual[j1:j2]

        if tag == 'equal':
            highlighted_reference.append(ref_segment)
            highlighted_actual.append(act_segment)
        elif tag in ('replace', 'delete'):
            highlighted_reference.append(f" **{ref_segment}** ")
        if tag in ('replace', 'insert'):
            highlighted_actual.append(f" **{act_segment}** ")

    return "".join(highlighted_reference), "".join(highlighted_actual)

def highlight_mismatched_characters(real_text, matched_text):
    """Highlight mismatched characters or words in the text."""
    highlighted_text = []
    for real_char, matched_char in zip(real_text, matched_text):
        if real_char != matched_char:
            highlighted_text.append(f" **{real_char}** ")
        else:
            highlighted_text.append(real_char)
    # Append any remaining characters (if lengths differ)
    if len(real_text) > len(matched_text):
        highlighted_text.extend(f" **{char}** " for char in real_text[len(matched_text):])
    return "".join(highlighted_text)

def display_results(data):
    """Generate the result with mismatched characters highlighted."""
    real_transcripts = data["real_transcripts"]

    # Generate matched transcripts and IPA using Whisper and eng_to_ipa
    transcribed_text = transcribe_audio_whisper(data["audio_path"])[0]
    matched_transcripts = preprocess_text(transcribed_text)

    real_transcripts_ipa = text_to_ipa(real_transcripts)
    matched_transcripts_ipa = text_to_ipa(matched_transcripts)

    # Highlight mismatched characters in text and IPA
    highlighted_text = highlight_mismatched_characters(real_transcripts, matched_transcripts)
    highlighted_ipa = highlight_differences(real_transcripts_ipa, matched_transcripts_ipa)[1]

    # Format the result
    result = f"""
### Pronunciation Feedback

#### Text with Mistakes Highlighted:
{highlighted_text}

#### Correct IPA:
/ {real_transcripts_ipa} /

#### Transcribed IPA:
/ {highlighted_ipa} /
"""
    return result

# Example usage
if __name__ == "__main__":
    # Example input JSON
    json_input = {
        "real_transcripts": "why hello here",
        "audio_path": "./IELTS_PracticeAndEvaluation/test_data/why-hello-there-103596.wav"  # Replace with the path to the user's audio file
    }

    # Display pronunciation evaluation
    markdown_result = display_results(json_input)
    print(markdown_result)

    # Save to Markdown file
    with open("pronunciation_evaluation.md", "w") as f:
        f.write(markdown_result)

    print("Pronunciation evaluation saved to pronunciation_evaluation.md")


Transcript:  Why hello there.

### Pronunciation Feedback

#### Text with Mistakes Highlighted:
why hello  **h**  **e**  **r**  **e** 

#### Correct IPA:
/ waɪ hɛˈloʊ hir /

#### Transcribed IPA:
/ waɪ hɛˈloʊ  **ðɛ** r /

Pronunciation evaluation saved to pronunciation_evaluation.md


In [42]:
def transcribe_audio_whisper(audio_path):
    """Simulated function for transcribing audio using Whisper model (replace with actual implementation)."""
    # Implement actual transcription code here
    return ["why hello there"]  # Dummy transcription for now

def display_results(data):
    """Generate the result with mismatched characters highlighted."""
    real_transcripts = data["real_transcripts"]

    # Generate matched transcripts and IPA using Whisper and eng_to_ipa
    transcribed_text = transcribe_audio_whisper(data["audio_path"])[0]
    matched_transcripts = preprocess_text(transcribed_text)

    real_transcripts_ipa = text_to_ipa(real_transcripts)
    matched_transcripts_ipa = text_to_ipa(matched_transcripts)

    # Highlight mismatched characters in text and IPA
    highlighted_text = highlight_mismatched_characters(real_transcripts, matched_transcripts)
    highlighted_ipa = highlight_differences(real_transcripts_ipa, matched_transcripts_ipa)[1]

    # Return results as JSON
    result = {
        "pronunciation_feedback": {
            "text_with_mistakes_highlighted": highlighted_text,
            "correct_ipa": real_transcripts_ipa,
            "transcribed_ipa": highlighted_ipa
        }
    }
    return result

res = display_results(json_input)
res

{'pronunciation_feedback': {'text_with_mistakes_highlighted': 'why hello  **h**  **e**  **r**  **e** ',
  'correct_ipa': 'waɪ hɛˈloʊ hir',
  'transcribed_ipa': 'waɪ hɛˈloʊ  **ðɛ** r'}}

Transcript:  Why hello there.

### Pronunciation Feedback

#### Text with Mistakes Highlighted:
why hello  **h**  **e**  **r**  **e** 

#### Correct IPA:
/ waɪ hɛˈloʊ hir /

#### Transcribed IPA:
/ waɪ hɛˈloʊ  **ðɛ** r /