In [11]:
from difflib import SequenceMatcher

# Load the contents of the ground truth and predicted files
file_path_ground_truth = 'ICR TEST SECI text.txt'
file_path_predicted = 'SECI_Predicted.txt'

with open(file_path_ground_truth, 'r', encoding='utf-8') as f:
    ground_truth_lines = f.readlines()

with open(file_path_predicted, 'r', encoding='utf-8') as f:
    predicted_lines = f.readlines()

# Function to calculate CER and operations for a single line
def calculate_cer_line(ground_truth_line, predicted_line):
    matcher = SequenceMatcher(None, ground_truth_line, predicted_line)

    substitutions = 0
    deletions = 0
    insertions = 0

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'replace':
            substitutions += max(i2 - i1, j2 - j1)
        elif tag == 'delete':
            deletions += i2 - i1
        elif tag == 'insert':
            insertions += j2 - j1

    total_characters = len(ground_truth_line)
    cer = (substitutions + deletions + insertions) / total_characters if total_characters > 0 else 0

    return substitutions, deletions, insertions, cer

# Calculate CER line by line
line_results = []
overall_substitutions, overall_deletions, overall_insertions = 0, 0, 0
average_cer = 0

total_operations = 0

for i, (gt_line, pred_line) in enumerate(zip(ground_truth_lines, predicted_lines)):
    substitutions, deletions, insertions, cer = calculate_cer_line(gt_line.strip(), pred_line.strip())
    overall_substitutions += substitutions
    overall_deletions += deletions
    overall_insertions += insertions
    total_operations += substitutions + deletions + insertions
    average_cer += cer
    line_results.append((i + 1, substitutions, deletions, insertions, substitutions + deletions + insertions, cer))

# Calculate overall statistics
total_characters_ground_truth = sum(len(line.strip()) for line in ground_truth_lines)
overall_cer = (overall_substitutions + overall_deletions + overall_insertions) / total_characters_ground_truth
average_cer /= len(line_results)

# Output line-by-line results and overall statistics
print("Line-by-Line Results:")
print("Line | Substitutions | Deletions | Insertions | Operations | CER")
for line_num, subs, dels, ins, ops, cer in line_results:
    print(f"{line_num:4d} | {subs:13d} | {dels:9d} | {ins:10d} | {ops:10d} | {cer:.4f}")

print("\nOverall Results:")
print("Substitutions:", overall_substitutions)
print("Deletions:", overall_deletions)
print("Insertions:", overall_insertions)
print("Total Operations:", total_operations)
print("Total Characters in Ground Truth:", total_characters_ground_truth)
print("Overall Character Error Rate (CER):", overall_cer)
print("Average Character Error Rate (CER):", average_cer)


Line-by-Line Results:
Line | Substitutions | Deletions | Insertions | Operations | CER
   1 |             4 |         0 |          0 |          4 | 0.6667
   2 |             0 |         0 |        899 |        899 | 899.0000
   3 |             0 |        55 |          0 |         55 | 1.0000
   4 |            42 |         2 |          0 |         44 | 0.9565
   5 |          1230 |         0 |          0 |       1230 | 17.8261
   6 |             0 |        19 |          0 |         19 | 1.0000
   7 |            27 |         0 |          0 |         27 | 0.9643
   8 |           384 |         0 |          2 |        386 | 19.3000
   9 |             0 |        21 |          0 |         21 | 1.0000
  10 |            36 |         1 |          0 |         37 | 0.9250
  11 |          1355 |         0 |        189 |       1544 | 51.4667
  12 |             0 |        28 |          0 |         28 | 1.0000
  13 |             9 |         0 |          0 |          9 | 0.7500
  14 |          1276 |  

In [17]:
from difflib import SequenceMatcher

# Load the contents of the ground truth and predicted files
file_path_ground_truth = r'C:\Users\deepa\Desktop\Prog Lang\Python\Python_practice\Recorrected  ICR_test_ROAS.txt'
file_path_predicted = 'ROAS_Predicted.txt'

with open(file_path_ground_truth, 'r', encoding='utf-8') as f:
    ground_truth_lines = f.readlines()

with open(file_path_predicted, 'r', encoding='utf-8') as f:
    predicted_lines = f.readlines()

# Function to calculate CER and operations for a single line
def calculate_cer_line(ground_truth_line, predicted_line):
    matcher = SequenceMatcher(None, ground_truth_line, predicted_line)

    substitutions = 0
    deletions = 0
    insertions = 0

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'replace':
            substitutions += max(i2 - i1, j2 - j1)
        elif tag == 'delete':
            deletions += i2 - i1
        elif tag == 'insert':
            insertions += j2 - j1

    total_characters = len(ground_truth_line)
    cer = (substitutions + deletions + insertions) / total_characters if total_characters > 0 else 0

    return substitutions, deletions, insertions, cer

# Calculate CER line by line
line_results = []
overall_substitutions, overall_deletions, overall_insertions = 0, 0, 0
average_cer = 0

total_operations = 0

for i, (gt_line, pred_line) in enumerate(zip(ground_truth_lines, predicted_lines)):
    substitutions, deletions, insertions, cer = calculate_cer_line(gt_line.strip(), pred_line.strip())
    overall_substitutions += substitutions
    overall_deletions += deletions
    overall_insertions += insertions
    total_operations += substitutions + deletions + insertions
    average_cer += cer
    line_results.append((i + 1, substitutions, deletions, insertions, substitutions + deletions + insertions, cer))

# Calculate overall statistics
total_characters_ground_truth = sum(len(line.strip()) for line in ground_truth_lines)
overall_cer = (overall_substitutions + overall_deletions + overall_insertions) / total_characters_ground_truth
average_cer /= len(line_results)

# Output line-by-line results and overall statistics
print("Line-by-Line Results:")
print("Line | Substitutions | Deletions | Insertions | Operations | CER")
for line_num, subs, dels, ins, ops, cer in line_results:
    print(f"{line_num:4d} | {subs:13d} | {dels:9d} | {ins:10d} | {ops:10d} | {cer:.4f}")

print("\nOverall Results:")
print("Substitutions:", overall_substitutions)
print("Deletions:", overall_deletions)
print("Insertions:", overall_insertions)
print("Total Operations:", total_operations)
print("Total Characters in Ground Truth:", total_characters_ground_truth)
print("Overall Character Error Rate (CER):", overall_cer)
print("Average Character Error Rate (CER):", average_cer)


Line-by-Line Results:
Line | Substitutions | Deletions | Insertions | Operations | CER
   1 |             1 |         0 |          0 |          1 | 0.1667
   2 |             0 |         0 |          0 |          0 | 0.0000
   3 |             0 |         0 |        568 |        568 | 568.0000
   4 |             0 |        65 |          0 |         65 | 1.0000
   5 |            51 |         0 |          0 |         51 | 1.1087
   6 |             0 |        34 |          0 |         34 | 1.0000
   7 |            32 |         7 |          0 |         39 | 0.9512
   8 |             0 |        45 |          0 |         45 | 1.0000
   9 |           699 |         0 |          0 |        699 | 49.9286
  10 |             0 |        22 |          0 |         22 | 1.0000
  11 |            49 |        62 |          0 |        111 | 1.5205
  12 |             0 |        18 |          0 |         18 | 1.0000
  13 |             4 |        19 |          0 |         23 | 0.9200
  14 |             0 |    

In [19]:
from typing import List
import Levenshtein
 
def calculate_cer(ground_truth: str, predicted: str) -> float:
    """
    Calculate Character Error Rate (CER) between ground truth and predicted text.
    CER = (Substitutions + Insertions + Deletions) / Total Characters in Ground Truth
    Args:
        ground_truth (str): The correct text
        predicted (str): The predicted text
    Returns:
        float: Character Error Rate
    """
    # Calculate Levenshtein distance
    distance = Levenshtein.distance(ground_truth, predicted)
    # Calculate CER
    if len(ground_truth) == 0:
        return 0 if len(predicted) == 0 else 1
    cer = distance / len(ground_truth)
    return cer
 
def read_text_file(file_path: str) -> str:
    """
    Read text from a file and return as string.
    Args:
        file_path (str): Path to the text file
    Returns:
        str: Content of the text file
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read().strip()
    except Exception as e:
        print(f"Error reading file {file_path}: {str(e)}")
        return ""
 
def main():
    # File paths
    ground_truth_file =  r'C:\Users\deepa\Desktop\Prog Lang\Python\Python_practice\Recorrected  ICR_test_ROAS.txt'
    predicted_file = 'ROAS_Predicted.txt'
    # Read the files
    ground_truth = read_text_file(ground_truth_file)
    predicted = read_text_file(predicted_file)
    # Calculate CER
    cer = calculate_cer(ground_truth, predicted)
    # Print results
    print(f"Character Error Rate (CER): {cer:.4f}")
    print(f"Percentage: {cer * 100:.2f}%")
    # Additional statistics
    print(f"\nGround Truth length: {len(ground_truth)} characters")
    print(f"Predicted length: {len(predicted)} characters")
    print(f"Levenshtein distance: {Levenshtein.distance(ground_truth, predicted)}")
 
if __name__ == "__main__":
    main()

Character Error Rate (CER): 0.2992
Percentage: 29.92%

Ground Truth length: 7076 characters
Predicted length: 6216 characters
Levenshtein distance: 2117
