In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
import nltk
# Fetch the Punkt tokenizer data so that word_tokenize can be used below.
# NOTE(review): recent NLTK releases look for the 'punkt_tab' resource instead —
# confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

1. Cosine Similarity with Dynamic Time Warping (DTW)

In [None]:
# Cosine similarity of two texts over their bag-of-words counts
def similar_text(seq1, seq2):
    """Score how similar two texts are and hand back their token lists.

    Each text is split with ``word_tokenize``, re-joined with single spaces and
    turned into a term-count vector by ``CountVectorizer``. The score is the
    cosine similarity between those two count vectors.

    Args:
        seq1: First text.
        seq2: Second text.

    Returns:
        tuple: ``(score, seq1_tokens, seq2_tokens)``.
    """
    seq1_tokens, seq2_tokens = word_tokenize(seq1), word_tokenize(seq2)

    # One space-joined document per text, in input order
    documents = [' '.join(seq1_tokens), ' '.join(seq2_tokens)]
    counts = CountVectorizer().fit_transform(documents)

    # Entry [0, 1] of the pairwise matrix compares the first text with the second
    pairwise = cosine_similarity(counts)
    return pairwise[0, 1], seq1_tokens, seq2_tokens

# Dynamic time warping (DTW) between two numeric sequences
def calculate_dtw_distance(seq1, seq2):
    """Compute the DTW distance between two sequences of numbers.

    The local cost of pairing two elements is their absolute difference. Each
    cell of the accumulated-cost matrix adds that cost to the cheapest of its
    upper, left and upper-left neighbours.

    Args:
        seq1: First numeric sequence.
        seq2: Second numeric sequence.

    Returns:
        tuple: ``(distance, matrix)`` where ``matrix`` has shape
        ``(len(seq1) + 1, len(seq2) + 1)`` and ``distance`` is its
        bottom-right entry.
    """
    rows, cols = len(seq1), len(seq2)

    # Border cells are unreachable (inf); only the origin starts at zero cost
    acc = np.full((rows + 1, cols + 1), np.inf)
    acc[0, 0] = 0.0

    for r, left in enumerate(seq1, start=1):
        for c, right in enumerate(seq2, start=1):
            step = np.abs(left - right)
            cheapest_prev = min(acc[r - 1, c], acc[r, c - 1], acc[r - 1, c - 1])
            acc[r, c] = step + cheapest_prev

    return acc[rows, cols], acc

# Function to calculate a similarity grade (out of 5) from the DTW distance between texts
def calculate_similarity_grade(seq1_tokens, seq2_tokens, word_to_number):
    """Grade two token sequences from 0 to 5 using their DTW distance.

    Every token is replaced by its number from ``word_to_number`` and the two
    numeric sequences are compared with ``calculate_dtw_distance``. The grade
    is ``5 / (1 + distance)``: 5 for a zero distance, approaching 0 as the
    distance grows.

    Args:
        seq1_tokens: Tokens of the first text.
        seq2_tokens: Tokens of the second text.
        word_to_number: Mapping from token to its numeric code.

    Returns:
        float: Similarity grade in the range (0, 5].
    """
    # Both sequences use the same fallback code (0) for tokens missing from the
    # mapping, so an out-of-vocabulary token in either text cannot raise KeyError.
    vector1 = [word_to_number.get(word, 0) for word in seq1_tokens]
    vector2 = [word_to_number.get(word, 0) for word in seq2_tokens]

    # Only the distance is needed; the accumulated-cost matrix is ignored
    dtw_distance, _ = calculate_dtw_distance(vector1, vector2)

    # Map the unbounded distance onto a grade out of 5
    similarity_grade = 5 / (1 + dtw_distance)

    return similarity_grade


# Example texts and word-to-number mapping.
# seq2 is seq1 without its last two words (and with a leading space).
seq1 = "الولد ذهب إلى المدرسة ودرس الرياضيات"
seq2 = " الولد ذهب إلى المدرسة"

# Numeric code assigned to each word; these codes are what the DTW step compares
word_to_number = {
    "ذهب": 1,
    "الولد": 2,
    "إلى": 3,
    "المدرسة": 4,
    "ودرس": 5,
    "الرياضيات": 6,
}

# Compare similarity between texts (cosine score plus the token lists)
similarity_score, seq1_tokens, seq2_tokens = similar_text(seq1, seq2)

# Only texts whose cosine score clears the threshold get a DTW-based grade
if similarity_score > 0.5:  # Adjust threshold as needed
    print("Texts are similar!")

    # Calculate similarity grade between the similar texts using DTW
    similarity_grade = calculate_similarity_grade(seq1_tokens, seq2_tokens, word_to_number)
    print("Similarity grade (out of 5):", similarity_grade)
else:
    print("Texts are not similar.")


Texts are similar!
Similarity grade (out of 10): 1.25


In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize

# Cosine similarity of two texts over their bag-of-words counts
def similar_text(seq1, seq2):
    """Score how similar two texts are and hand back their token lists.

    Each text is split with ``word_tokenize``, re-joined with single spaces and
    turned into a term-count vector by ``CountVectorizer``. The score is the
    cosine similarity between those two count vectors.

    Args:
        seq1: First text.
        seq2: Second text.

    Returns:
        tuple: ``(score, seq1_tokens, seq2_tokens)``.
    """
    seq1_tokens, seq2_tokens = word_tokenize(seq1), word_tokenize(seq2)

    # One space-joined document per text, in input order
    documents = [' '.join(seq1_tokens), ' '.join(seq2_tokens)]
    counts = CountVectorizer().fit_transform(documents)

    # Entry [0, 1] of the pairwise matrix compares the first text with the second
    pairwise = cosine_similarity(counts)
    return pairwise[0, 1], seq1_tokens, seq2_tokens

# Dynamic time warping (DTW) between two numeric sequences
def calculate_dtw_distance(seq1, seq2):
    """Compute the DTW distance between two sequences of numbers.

    The local cost of pairing two elements is their absolute difference. Each
    cell of the accumulated-cost matrix adds that cost to the cheapest of its
    upper, left and upper-left neighbours.

    Args:
        seq1: First numeric sequence.
        seq2: Second numeric sequence.

    Returns:
        tuple: ``(distance, matrix)`` where ``matrix`` has shape
        ``(len(seq1) + 1, len(seq2) + 1)`` and ``distance`` is its
        bottom-right entry.
    """
    rows, cols = len(seq1), len(seq2)

    # Border cells are unreachable (inf); only the origin starts at zero cost
    acc = np.full((rows + 1, cols + 1), np.inf)
    acc[0, 0] = 0.0

    for r, left in enumerate(seq1, start=1):
        for c, right in enumerate(seq2, start=1):
            step = np.abs(left - right)
            cheapest_prev = min(acc[r - 1, c], acc[r, c - 1], acc[r - 1, c - 1])
            acc[r, c] = step + cheapest_prev

    return acc[rows, cols], acc

# Function to calculate a similarity grade (out of 5) from the DTW distance between texts
def calculate_similarity_grade(seq1_tokens, seq2_tokens, word_to_number):
    """Grade two token sequences from 0 to 5 using their DTW distance.

    Every token is replaced by its number from ``word_to_number`` and the two
    numeric sequences are compared with ``calculate_dtw_distance``. The grade
    is ``5 / (1 + distance)``: 5 for a zero distance, approaching 0 as the
    distance grows.

    Args:
        seq1_tokens: Tokens of the first text.
        seq2_tokens: Tokens of the second text.
        word_to_number: Mapping from token to its numeric code.

    Returns:
        float: Similarity grade in the range (0, 5].
    """
    # Both sequences use the same fallback code (0) for tokens missing from the
    # mapping, so an out-of-vocabulary token in either text cannot raise KeyError.
    vector1 = [word_to_number.get(word, 0) for word in seq1_tokens]
    vector2 = [word_to_number.get(word, 0) for word in seq2_tokens]

    # Only the distance is needed; the accumulated-cost matrix is ignored
    dtw_distance, _ = calculate_dtw_distance(vector1, vector2)

    # Map the unbounded distance onto a grade out of 5
    similarity_grade = 5 / (1 + dtw_distance)

    return similarity_grade

# Example texts and word-to-number mapping.
# Both texts are identical here, so the DTW distance is expected to be zero.
seq1 = "الولد ذهب إلى المدرسة ودرس الرياضيات"
seq2 = "الولد ذهب إلى المدرسة ودرس الرياضيات"

# Numeric code assigned to each word; these codes are what the DTW step compares
word_to_number = {
    "ذهب": 1,
    "الولد": 2,
    "إلى": 3,
    "المدرسة": 4,
    "ودرس": 5,
    "الرياضيات": 6,
}

# Compare similarity between texts (cosine score plus the token lists)
similarity_score, seq1_tokens, seq2_tokens = similar_text(seq1, seq2)

# Only texts whose cosine score clears the threshold get a DTW-based grade
if similarity_score > 0.5:  # Adjust threshold as needed
    print("Texts are similar!")

    # Calculate similarity grade between the similar texts using DTW
    similarity_grade = calculate_similarity_grade(seq1_tokens, seq2_tokens, word_to_number)
    print("Similarity grade (out of 5):", similarity_grade)
else:
    print("Texts are not similar.")


Texts are similar!
Similarity grade (out of 5): 5.0



2. CountVectorizer

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize

# Cosine similarity of two texts over their bag-of-words counts
def similar_text(seq1, seq2):
    """Score how similar two texts are and hand back their token lists.

    Each text is split with ``word_tokenize``, re-joined with single spaces and
    turned into a term-count vector by ``CountVectorizer``. The score is the
    cosine similarity between those two count vectors.

    Args:
        seq1: First text.
        seq2: Second text.

    Returns:
        tuple: ``(score, seq1_tokens, seq2_tokens)``.
    """
    seq1_tokens, seq2_tokens = word_tokenize(seq1), word_tokenize(seq2)

    # One space-joined document per text, in input order
    documents = [' '.join(seq1_tokens), ' '.join(seq2_tokens)]
    counts = CountVectorizer().fit_transform(documents)

    # Entry [0, 1] of the pairwise matrix compares the first text with the second
    pairwise = cosine_similarity(counts)
    return pairwise[0, 1], seq1_tokens, seq2_tokens

# Dynamic time warping (DTW) distance between two numeric sequences
def calculate_dtw_distance(seq1, seq2):
    """Compute the DTW distance between two sequences of numbers.

    The local cost of pairing two elements is their absolute difference. Each
    cell of the accumulated-cost matrix adds that cost to the cheapest of its
    upper, left and upper-left neighbours.

    Args:
        seq1: First numeric sequence.
        seq2: Second numeric sequence.

    Returns:
        The bottom-right entry of the accumulated-cost matrix (the distance).
    """
    rows, cols = len(seq1), len(seq2)

    # Border cells are unreachable (inf); only the origin starts at zero cost
    acc = np.full((rows + 1, cols + 1), np.inf)
    acc[0, 0] = 0.0

    for r, left in enumerate(seq1, start=1):
        for c, right in enumerate(seq2, start=1):
            step = abs(left - right)
            cheapest_prev = min(acc[r - 1, c], acc[r, c - 1], acc[r - 1, c - 1])
            acc[r, c] = step + cheapest_prev

    return acc[rows, cols]

# Similarity grade (out of 5) derived from the DTW distance
def calculate_similarity_grade(seq1_tokens, seq2_tokens):
    """Grade two numeric sequences from 0 to 5 using their DTW distance.

    The grade is ``5 / (1 + distance)``, where ``distance`` comes from
    ``calculate_dtw_distance``: 5 for a zero distance, smaller as it grows.

    Args:
        seq1_tokens: First numeric sequence.
        seq2_tokens: Second numeric sequence.

    Returns:
        The similarity grade.
    """
    return 5 / (1 + calculate_dtw_distance(seq1_tokens, seq2_tokens))

# Example texts
# NOTE(review): seq1 and seq2 are not used below; the grade is computed from the
# hard-coded integer sequences instead.
seq1 = "الولد ذهب إلى المدرسة ودرس الرياضيات"
seq2 = "ذهب الولد إلى المدرسة"

# Hand-written numeric sequences standing in for the tokenized texts.
# NOTE(review): seq1 has six words but seq1_tokens has four entries (and the
# reverse for seq2) — the two look swapped; confirm the intended encoding.
seq1_tokens = [1, 2, 3, 4]
seq2_tokens = [2, 1, 3, 4, 6, 7]

# Calculate similarity grade between texts
similarity_grade = calculate_similarity_grade(seq1_tokens, seq2_tokens)

# Output similarity grade
print("Similarity grade (out of 5):", similarity_grade)


Similarity grade (out of 10): 0.625


In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize

# Longest common subsequence (LCS) of two sequences
def longest_common_subsequence(seq1, seq2):
    """Return one longest common subsequence of ``seq1`` and ``seq2`` as a list.

    A table of LCS lengths for every pair of prefixes is filled first, then
    walked back from the bottom-right corner to recover the elements. When
    the elements differ and both neighbours hold the same length, the walk
    moves left (drops an element of ``seq2``).

    Args:
        seq1: First sequence.
        seq2: Second sequence.

    Returns:
        list: Elements of the common subsequence, in order.
    """
    n, m = len(seq1), len(seq2)
    lengths = [[0] * (m + 1) for _ in range(n + 1)]

    for r in range(1, n + 1):
        above, row = lengths[r - 1], lengths[r]
        for c in range(1, m + 1):
            if seq1[r - 1] == seq2[c - 1]:
                row[c] = above[c - 1] + 1
            else:
                row[c] = max(above[c], row[c - 1])

    # Walk back from the corner; matches are collected last-to-first
    collected = []
    r, c = n, m
    while r and c:
        if seq1[r - 1] == seq2[c - 1]:
            collected.append(seq1[r - 1])
            r, c = r - 1, c - 1
        elif lengths[r - 1][c] > lengths[r][c - 1]:
            r -= 1
        else:
            c -= 1

    collected.reverse()
    return collected

# Shared word subsequence of two texts
def similar_text(seq1, seq2):
    """Return the longest common subsequence of the two texts' word tokens.

    Both texts are split with ``word_tokenize`` and the token lists are passed
    to ``longest_common_subsequence``.

    Args:
        seq1: First text.
        seq2: Second text.

    Returns:
        list: Tokens shared by both texts, in order.
    """
    return longest_common_subsequence(word_tokenize(seq1), word_tokenize(seq2))

# Function to calculate similarity grade between texts
def calculate_similarity_grade(lcs_length, reference_length=None):
    """Turn an LCS length into a similarity grade out of 5.

    Args:
        lcs_length: Number of tokens in the longest common subsequence.
        reference_length: Token count of the longer text. When given, the
            grade is the covered fraction scaled to 5, so identical texts
            score 5.0 and texts with nothing in common score 0.0.

    Returns:
        float: The similarity grade.

    Note:
        With ``reference_length`` omitted the function keeps its historical
        formula ``5 / (1 + lcs_length)`` so existing one-argument callers see
        no change. That value shrinks as the LCS grows, so it does not behave
        like a similarity score; pass ``reference_length`` to get one.
    """
    if reference_length is None:
        return 5 / (1 + lcs_length)

    # Two empty texts have nothing to compare; avoid dividing by zero
    if reference_length <= 0:
        return 0.0

    return 5 * lcs_length / reference_length

# Example texts
seq1 = "ذهب الولد إلى المدرسة"
seq2 = "ذهب الولد إلى المدرسة"

# Find the similar part between texts
similar_part = similar_text(seq1, seq2)

# Check if there is a similar part
if similar_part:
    print("Similar part found:", ' '.join(similar_part))

    # Grade the shared part relative to the longer of the two texts, so that a
    # longer common subsequence yields a higher grade
    similarity_grade = calculate_similarity_grade(
        len(similar_part),
        max(len(word_tokenize(seq1)), len(word_tokenize(seq2))),
    )
    print("Similarity grade (out of 5):", similarity_grade)
else:
    print("No similar part found.")


Similar part found: ذهب الولد إلى المدرسة
Similarity grade (out of 10): 1.0



3. SequenceMatcher

In [None]:
from difflib import SequenceMatcher
from nltk.tokenize import word_tokenize

# Longest run of elements shared by two sequences
def find_most_similar_contiguous_part(seq1, seq2):
    """Return the longest contiguous block of ``seq1`` that also occurs in ``seq2``.

    The block is located with ``SequenceMatcher.find_longest_match`` over the
    full extent of both sequences.

    Args:
        seq1: First sequence (the returned slice is taken from it).
        seq2: Second sequence.

    Returns:
        The matching slice of ``seq1`` (empty when nothing matches).
    """
    longest = SequenceMatcher(None, seq1, seq2).find_longest_match(0, len(seq1), 0, len(seq2))
    start, length = longest.a, longest.size
    return seq1[start:start + length]

# Similarity grade (out of 10) from the length of the shared part
def calculate_similarity_grade(similar_part, seq1, seq2):
    """Grade the shared part of two token sequences on a 0-10 scale.

    The base grade is the length of ``similar_part`` divided by the length of
    the longer sequence, times 10. If the space-joined shared part equals the
    space-joined form of either sequence, the grade is forced to 10.

    Args:
        similar_part: Tokens common to both sequences.
        seq1: Tokens of the first text.
        seq2: Tokens of the second text.

    Returns:
        The grade: ``10`` when the override applies, otherwise a float.
    """
    longer_length = max(len(seq1), len(seq2))
    grade = (len(similar_part) / longer_length) * 10

    # A shared part that reproduces one whole text counts as a perfect match
    joined_part = ' '.join(similar_part)
    whole_texts = (' '.join(seq1), ' '.join(seq2))
    if joined_part in whole_texts:
        return 10

    return grade

# Example texts (seq2 differs from seq1 only by a trailing space)
seq1 = "ذهب الولد إلى المدرسة"
seq2 = "ذهب الولد إلى المدرسة "

# Tokenize the sequences
seq1_tokens = word_tokenize(seq1)
seq2_tokens = word_tokenize(seq2)

# Find the most similar contiguous part between texts
similar_part = find_most_similar_contiguous_part(seq1_tokens, seq2_tokens)

# Check if a similar part is found
if similar_part:
    print("Similar part found:", ' '.join(similar_part))

    # Calculate similarity grade for the similar part.
    # calculate_similarity_grade works on a 0-10 scale, so the label says "out of 10".
    similarity_grade = calculate_similarity_grade(similar_part, seq1_tokens, seq2_tokens)
    print("Similarity grade (out of 10):", similarity_grade)
else:
    print("No similar part found.")


Similar part found: ذهب الولد إلى المدرسة
Similarity grade (out of 10): 10


In [None]:
from difflib import SequenceMatcher
from nltk.tokenize import word_tokenize

# Longest run of elements shared by two sequences
def find_most_similar_contiguous_part(seq1, seq2):
    """Return the longest contiguous block of ``seq1`` that also occurs in ``seq2``.

    The block is located with ``SequenceMatcher.find_longest_match`` over the
    full extent of both sequences.

    Args:
        seq1: First sequence (the returned slice is taken from it).
        seq2: Second sequence.

    Returns:
        The matching slice of ``seq1`` (empty when nothing matches).
    """
    longest = SequenceMatcher(None, seq1, seq2).find_longest_match(0, len(seq1), 0, len(seq2))
    start, length = longest.a, longest.size
    return seq1[start:start + length]

# Similarity grade (out of 10) from the length of the shared part
def calculate_similarity_grade(similar_part, seq1, seq2):
    """Grade the shared part of two token sequences on a 0-10 scale.

    The base grade is the length of ``similar_part`` divided by the length of
    the longer sequence, times 10. If the space-joined shared part equals the
    space-joined form of either sequence, the grade is forced to 10.

    Args:
        similar_part: Tokens common to both sequences.
        seq1: Tokens of the first text.
        seq2: Tokens of the second text.

    Returns:
        The grade: ``10`` when the override applies, otherwise a float.
    """
    longer_length = max(len(seq1), len(seq2))
    grade = (len(similar_part) / longer_length) * 10

    # A shared part that reproduces one whole text counts as a perfect match
    joined_part = ' '.join(similar_part)
    whole_texts = (' '.join(seq1), ' '.join(seq2))
    if joined_part in whole_texts:
        return 10

    return grade

# Example texts (same words, with the first two swapped in seq2)
seq1 = "ذهب الولد إلى المدرسة "
seq2 = "الولد ذهب إلى المدرسة"

# Tokenize the sequences
seq1_tokens = word_tokenize(seq1)
seq2_tokens = word_tokenize(seq2)

# Find the most similar contiguous part between texts
similar_part = find_most_similar_contiguous_part(seq1_tokens, seq2_tokens)

# Check if a similar part is found
if similar_part:
    print("Similar part found:", ' '.join(similar_part))

    # Calculate similarity grade for the similar part.
    # calculate_similarity_grade works on a 0-10 scale, so the label says "out of 10".
    similarity_grade = calculate_similarity_grade(similar_part, seq1_tokens, seq2_tokens)
    print("Similarity grade (out of 10):", similarity_grade)
else:
    print("No similar part found.")

Similar part found: إلى المدرسة
Similarity grade (out of 10): 5.0


4. WER with Levenshtein Distances Enhanced by DTW

In [None]:
import numpy as np

# Dynamic time warping (DTW) over word sequences, compared by first character
def calculate_dtw_distance(s1, s2):
    """Compute a DTW distance between two sequences of words.

    The local cost of pairing two words is the absolute difference between
    the code points of their first characters. Each cell of the accumulated-
    cost matrix adds that cost to the cheapest of its upper, left and
    upper-left neighbours.

    Args:
        s1: First sequence of non-empty strings.
        s2: Second sequence of non-empty strings.

    Returns:
        tuple: ``(distance, matrix)`` where ``matrix`` has shape
        ``(len(s1) + 1, len(s2) + 1)`` and ``distance`` is its bottom-right
        entry.
    """
    rows, cols = len(s1), len(s2)

    # Border cells are unreachable (inf); only the origin starts at zero cost
    acc = np.full((rows + 1, cols + 1), np.inf)
    acc[0, 0] = 0.0

    for r, word_a in enumerate(s1, start=1):
        for c, word_b in enumerate(s2, start=1):
            step = abs(ord(word_a[0]) - ord(word_b[0]))
            cheapest_prev = min(acc[r - 1, c], acc[r, c - 1], acc[r - 1, c - 1])
            acc[r, c] = step + cheapest_prev

    return acc[rows, cols], acc

def levenshtein(s1, s2):
    """Edit distance between two token sequences plus an annotated copy of ``s2``.

    ``s1`` is treated as the hypothesis (e.g. transcript words) and ``s2`` as
    the reference. The sequences are never swapped, so the annotation always
    describes ``s2`` regardless of which one is longer.

    Args:
        s1: Hypothesis tokens (sequence of strings).
        s2: Reference tokens (sequence of strings).

    Returns:
        tuple: ``(distance, marked_reference)``. ``distance`` is the minimum
        number of insertions, deletions and substitutions turning one
        sequence into the other. ``marked_reference`` is a space-joined string
        holding every token of ``s2`` in order; tokens that were substituted
        in ``s1`` or are missing from it are wrapped in ``$...$``. Extra
        tokens that occur only in ``s1`` count towards the distance but do not
        appear in the string.
    """
    rows, cols = len(s1), len(s2)

    # table[i][j] is the edit distance between s1[:i] and s2[:j]
    table = [list(range(cols + 1))]
    for i, c1 in enumerate(s1, start=1):
        previous_row = table[-1]
        current_row = [i]
        for j, c2 in enumerate(s2, start=1):
            drop_from_s1 = previous_row[j] + 1
            drop_from_s2 = current_row[j - 1] + 1
            substitute = previous_row[j - 1] + (c1 != c2)
            current_row.append(min(drop_from_s1, drop_from_s2, substitute))
        table.append(current_row)

    # Trace the alignment back from the bottom-right corner, collecting the
    # reference tokens last-to-first
    marked_reference = []
    x, y = rows, cols
    while x > 0 and y > 0:
        if s1[x - 1] == s2[y - 1]:
            # Tokens agree: keep the reference token unmarked
            marked_reference.append(s2[y - 1])
            x, y = x - 1, y - 1
        elif table[x][y] == table[x - 1][y - 1] + 1:
            # Substitution: the reference token was replaced in s1
            marked_reference.append('$' + s2[y - 1] + '$')
            x, y = x - 1, y - 1
        elif table[x][y] == table[x][y - 1] + 1:
            # Reference token has no counterpart in s1
            marked_reference.append('$' + s2[y - 1] + '$')
            y -= 1
        else:
            # Extra token in s1: nothing in the reference to mark
            x -= 1

    # Reference tokens left over once s1 is exhausted are all missing from it
    while y > 0:
        marked_reference.append('$' + s2[y - 1] + '$')
        y -= 1

    marked_reference.reverse()
    return table[rows][cols], ' '.join(marked_reference)

def calculate_wer(transcript, reference):
    """Compute a word error rate for ``transcript`` against ``reference``.

    Both texts are split on whitespace and aligned with ``levenshtein``. The
    rate is the edit distance divided by the word count of the longer text,
    which keeps it within [0, 1].

    Args:
        transcript: Text to be scored.
        reference: Ground-truth text.

    Returns:
        tuple: ``(wer, marked_reference)`` where ``marked_reference`` is the
        annotated string produced by ``levenshtein``.
    """
    transcript_words = transcript.split()
    reference_words = reference.split()

    # Two empty texts are identical; return early instead of dividing by zero
    longest = max(len(transcript_words), len(reference_words))
    if longest == 0:
        return 0.0, ''

    # Only the Levenshtein alignment feeds the result, so no DTW matrix is
    # computed here.
    distance, marked_reference = levenshtein(transcript_words, reference_words)
    wer = distance / longest
    return wer, marked_reference

def grade_transcript(wer):
    """Map a word error rate to a grade between 0 and 5 in half-point steps.

    Each 0.1-wide band of ``wer`` below 1 lowers the grade by 0.5, starting
    from 5 for ``wer < 0.1``. A rate of 1 or more gives 0.

    Args:
        wer: Word error rate.

    Returns:
        The grade for the band that ``wer`` falls into.
    """
    # (exclusive upper bound, grade) pairs, checked from best to worst
    bands = (
        (0.1, 5), (0.2, 4.5), (0.3, 4), (0.4, 3.5), (0.5, 3),
        (0.6, 2.5), (0.7, 2), (0.8, 1.5), (0.9, 1), (1, 0.5),
    )
    for upper_bound, grade in bands:
        if wer < upper_bound:
            return grade
    return 0

# Example usage: score a transcript against its reference text.
# The transcript below alters some reference words and omits one.
reference = "كان ياما كان في قديم الزمان وسالف العصر والأوان وكان هناك ملك عظيم الشأن يحكم بلاد  "
transcript = "كان ياما كان في قديم زمان وسالف العصر والاون وكان هنالك ملك عظيم يحكم بلاد "

# Word error rate plus the annotated string, then the grade derived from the rate
wer, marked_reference = calculate_wer(transcript, reference)
grade = grade_transcript(wer)

print("WER:", wer)
print("Grade:", grade)
print("Marked Reference:", marked_reference)

WER: 0.25
Grade: 4
Marked Reference: كان ياما كان في قديم $زمان$ وسالف العصر $والاون$ وكان $هنالك$ ملك عظيم يحكم بلاد


In [None]:
import numpy as np

# Dynamic time warping (DTW) over word sequences, compared by first character
def calculate_dtw_distance(s1, s2):
    """Compute a DTW distance between two sequences of words.

    The local cost of pairing two words is the absolute difference between
    the code points of their first characters. Each cell of the accumulated-
    cost matrix adds that cost to the cheapest of its upper, left and
    upper-left neighbours.

    Args:
        s1: First sequence of non-empty strings.
        s2: Second sequence of non-empty strings.

    Returns:
        tuple: ``(distance, matrix)`` where ``matrix`` has shape
        ``(len(s1) + 1, len(s2) + 1)`` and ``distance`` is its bottom-right
        entry.
    """
    rows, cols = len(s1), len(s2)

    # Border cells are unreachable (inf); only the origin starts at zero cost
    acc = np.full((rows + 1, cols + 1), np.inf)
    acc[0, 0] = 0.0

    for r, word_a in enumerate(s1, start=1):
        for c, word_b in enumerate(s2, start=1):
            step = abs(ord(word_a[0]) - ord(word_b[0]))
            cheapest_prev = min(acc[r - 1, c], acc[r, c - 1], acc[r - 1, c - 1])
            acc[r, c] = step + cheapest_prev

    return acc[rows, cols], acc

def levenshtein(s1, s2):
    """Edit distance between two token sequences plus an annotated copy of ``s2``.

    ``s1`` is treated as the hypothesis (e.g. transcript words) and ``s2`` as
    the reference. The sequences are never swapped, so the annotation always
    describes ``s2`` regardless of which one is longer.

    Args:
        s1: Hypothesis tokens (sequence of strings).
        s2: Reference tokens (sequence of strings).

    Returns:
        tuple: ``(distance, marked_reference)``. ``distance`` is the minimum
        number of insertions, deletions and substitutions turning one
        sequence into the other. ``marked_reference`` is a space-joined string
        holding every token of ``s2`` in order; tokens that were substituted
        in ``s1`` or are missing from it are wrapped in ``$...$``. Extra
        tokens that occur only in ``s1`` count towards the distance but do not
        appear in the string.
    """
    rows, cols = len(s1), len(s2)

    # table[i][j] is the edit distance between s1[:i] and s2[:j]
    table = [list(range(cols + 1))]
    for i, c1 in enumerate(s1, start=1):
        previous_row = table[-1]
        current_row = [i]
        for j, c2 in enumerate(s2, start=1):
            drop_from_s1 = previous_row[j] + 1
            drop_from_s2 = current_row[j - 1] + 1
            substitute = previous_row[j - 1] + (c1 != c2)
            current_row.append(min(drop_from_s1, drop_from_s2, substitute))
        table.append(current_row)

    # Trace the alignment back from the bottom-right corner, collecting the
    # reference tokens last-to-first
    marked_reference = []
    x, y = rows, cols
    while x > 0 and y > 0:
        if s1[x - 1] == s2[y - 1]:
            # Tokens agree: keep the reference token unmarked
            marked_reference.append(s2[y - 1])
            x, y = x - 1, y - 1
        elif table[x][y] == table[x - 1][y - 1] + 1:
            # Substitution: the reference token was replaced in s1
            marked_reference.append('$' + s2[y - 1] + '$')
            x, y = x - 1, y - 1
        elif table[x][y] == table[x][y - 1] + 1:
            # Reference token has no counterpart in s1
            marked_reference.append('$' + s2[y - 1] + '$')
            y -= 1
        else:
            # Extra token in s1: nothing in the reference to mark
            x -= 1

    # Reference tokens left over once s1 is exhausted are all missing from it
    while y > 0:
        marked_reference.append('$' + s2[y - 1] + '$')
        y -= 1

    marked_reference.reverse()
    return table[rows][cols], ' '.join(marked_reference)

def calculate_wer(transcript, reference):
    """Compute a word error rate for ``transcript`` against ``reference``.

    Both texts are split on whitespace and aligned with ``levenshtein``. The
    rate is the edit distance divided by the word count of the longer text,
    which keeps it within [0, 1].

    Args:
        transcript: Text to be scored.
        reference: Ground-truth text.

    Returns:
        tuple: ``(wer, marked_reference)`` where ``marked_reference`` is the
        annotated string produced by ``levenshtein``.
    """
    transcript_words = transcript.split()
    reference_words = reference.split()

    # Two empty texts are identical; return early instead of dividing by zero
    longest = max(len(transcript_words), len(reference_words))
    if longest == 0:
        return 0.0, ''

    # Only the Levenshtein alignment feeds the result, so no DTW matrix is
    # computed here.
    distance, marked_reference = levenshtein(transcript_words, reference_words)
    wer = distance / longest
    return wer, marked_reference

def grade_transcript(wer):
    """Map a word error rate to a grade between 0 and 5 in half-point steps.

    Each 0.1-wide band of ``wer`` below 1 lowers the grade by 0.5, starting
    from 5 for ``wer < 0.1``. A rate of 1 or more gives 0.

    Args:
        wer: Word error rate.

    Returns:
        The grade for the band that ``wer`` falls into.
    """
    # (exclusive upper bound, grade) pairs, checked from best to worst
    bands = (
        (0.1, 5), (0.2, 4.5), (0.3, 4), (0.4, 3.5), (0.5, 3),
        (0.6, 2.5), (0.7, 2), (0.8, 1.5), (0.9, 1), (1, 0.5),
    )
    for upper_bound, grade in bands:
        if wer < upper_bound:
            return grade
    return 0

# Example usage: transcript and reference are identical here.
reference = "الثعلب البني السريع يقفز فوق الكلب الكسول"
transcript = "الثعلب البني السريع يقفز فوق الكلب الكسول"

# Word error rate plus the annotated string, then the grade derived from the rate
wer, marked_reference = calculate_wer(transcript, reference)
grade = grade_transcript(wer)

print("WER:", wer)
print("Grade:", grade)
print("Marked Reference:", marked_reference)

WER: 0.0
Grade: 5
Marked Reference: الثعلب البني السريع يقفز فوق الكلب الكسول


**Our grading system:**

In [None]:
import numpy as np

# Dynamic time warping (DTW) over word sequences, compared by first character
def calculate_dtw_distance(s1, s2):
    """Compute a DTW distance between two sequences of words.

    The local cost of pairing two words is the absolute difference between
    the code points of their first characters. Each cell of the accumulated-
    cost matrix adds that cost to the cheapest of its upper, left and
    upper-left neighbours.

    Args:
        s1: First sequence of non-empty strings.
        s2: Second sequence of non-empty strings.

    Returns:
        tuple: ``(distance, matrix)`` where ``matrix`` has shape
        ``(len(s1) + 1, len(s2) + 1)`` and ``distance`` is its bottom-right
        entry.
    """
    rows, cols = len(s1), len(s2)

    # Everything starts unreachable (inf) except the origin; interior cells
    # are overwritten by the recurrence below
    acc = np.full((rows + 1, cols + 1), np.inf)
    acc[0, 0] = 0.0

    for r, word_a in enumerate(s1, start=1):
        for c, word_b in enumerate(s2, start=1):
            step = abs(ord(word_a[0]) - ord(word_b[0]))
            cheapest_prev = min(acc[r - 1, c], acc[r, c - 1], acc[r - 1, c - 1])
            acc[r, c] = step + cheapest_prev

    return acc[rows, cols], acc

# Levenshtein distance with an annotated copy of the first sequence
def levenshtein(s1, s2):
    """Edit distance between two token sequences plus a marked-up view of ``s1``.

    Args:
        s1: Token sequence that gets annotated.
        s2: Token sequence that ``s1`` is compared against.

    Returns:
        tuple: ``(distance, marked)``. ``distance`` is the Levenshtein distance
        between the sequences. ``marked`` is a space-joined string of the
        tokens of ``s1`` emitted during the traceback; tokens that were not
        paired with an equal token of ``s2`` are wrapped in ``$...$``.
    """
    n, m = len(s1), len(s2)

    # First row and column hold the distance to an empty prefix
    dp = [list(range(m + 1))] + [[i] + [0] * m for i in range(1, n + 1)]

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            if s1[i - 1] == s2[j - 1]:
                dp[i][j] = dp[i - 1][j - 1]
            else:
                neighbours = (dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1])
                dp[i][j] = 1 + min(neighbours)

    # Walk back from the bottom-right corner, collecting tokens last-to-first
    marked = []
    i, j = n, m
    while i > 0 or j > 0:
        both_remaining = i > 0 and j > 0
        if both_remaining and s1[i - 1] == s2[j - 1]:
            # Equal tokens: emit unmarked and step diagonally
            marked.append(s1[i - 1])
            i -= 1
            j -= 1
        elif i > 0 and (j == 0 or dp[i][j] == dp[i - 1][j] + 1):
            # Token of s1 without a partner: emit it marked and move up
            marked.append('${}$'.format(s1[i - 1]))
            i -= 1
        else:
            # Otherwise skip one token of s2 without emitting anything
            j -= 1

    marked.reverse()
    return dp[n][m], ' '.join(marked)

# Function to calculate Word Error Rate (WER)
def calculate_wer(transcript, reference):
    """Compute a word error rate for ``transcript`` against ``reference``.

    Both texts are split on whitespace and aligned with ``levenshtein``, with
    the reference passed first so that the annotated string describes the
    reference words. The rate is the edit distance divided by the word count
    of the longer text, which keeps it within [0, 1].

    Args:
        transcript: Text to be scored.
        reference: Ground-truth text.

    Returns:
        tuple: ``(wer, marked_reference)`` where ``marked_reference`` is the
        annotated string produced by ``levenshtein``.
    """
    transcript_words = transcript.split()
    reference_words = reference.split()

    # Two empty texts are identical; return early instead of dividing by zero
    longest = max(len(transcript_words), len(reference_words))
    if longest == 0:
        return 0.0, ''

    # Only the Levenshtein alignment feeds the result, so no DTW matrix is
    # computed here.
    distance, marked_reference = levenshtein(reference_words, transcript_words)
    wer = distance / longest
    return wer, marked_reference

# Grade a transcript from its WER on a scale with a maximum of 5
def grade_transcript(wer):
    """Map a word error rate to a grade between 0 and 5 in half-point steps.

    Each 0.1-wide band of ``wer`` below 1 lowers the grade by 0.5, starting
    from 5 for ``wer < 0.1``. A rate of 1 or more gives 0.

    Args:
        wer: Word error rate.

    Returns:
        The grade for the band that ``wer`` falls into.
    """
    # (exclusive upper bound, grade) pairs, checked from best to worst
    bands = (
        (0.1, 5), (0.2, 4.5), (0.3, 4), (0.4, 3.5), (0.5, 3),
        (0.6, 2.5), (0.7, 2), (0.8, 1.5), (0.9, 1), (1, 0.5),
    )
    for upper_bound, grade in bands:
        if wer < upper_bound:
            return grade
    return 0

# Example usage: the transcript is the reference without its last two words.
reference = "ذهب الولد إلى المدرسة ودرس الرياضيات"
transcript = "ذهب الولد إلى المدرسة"

# Word error rate plus the annotated string, then the grade derived from the rate
wer, marked_reference = calculate_wer(transcript, reference)
grade = grade_transcript(wer)

print("WER:", wer)
print("Grade:", grade)
print("Marked Reference:", marked_reference)


WER: 0.3333333333333333
Grade: 3.5
Marked Reference: ذهب الولد إلى المدرسة $ودرس$ $الرياضيات$
