In [46]:
import pandas as pd
import os

# Install required packages to calculate word error rate, sentence error rate, levenshtein distance
!pip install jiwer
!pip install python-Levenshtein

from jiwer import wer
from Levenshtein import distance



In [54]:
# Function to process csv
def process_files(path):
    """Load one results CSV and score each transcript against its reference sentence.

    The file must contain 'sentence' (ground truth) and 'transcript'
    (hypothesis) columns. Both are lower-cased and stripped of punctuation
    before scoring, so the returned frame holds the normalised text.

    Parameters
    ----------
    path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with three added columns:
        'wer'         - value returned by jiwer's ``wer`` for the row,
        'levenshtein' - value returned by Levenshtein's ``distance`` for the row,
        'ser'         - 1 if the row's 'wer' is greater than 0, else 0.
    """
    # na_filter=False keeps empty cells as '' instead of NaN, so the .str
    # operations and the scoring functions below always receive strings.
    df = pd.read_csv(path, na_filter=False)

    # Convert sentences to lowercase and remove punctuation.
    # The pattern is a raw string and regex=True is passed explicitly: since
    # pandas 2.0 Series.str.replace treats the pattern as a literal by default,
    # which would silently leave all punctuation in place.
    punctuation = r'[^\w\s]'
    for column in ('sentence', 'transcript'):
        df[column] = df[column].str.lower().str.replace(punctuation, '', regex=True)

    print("Processing "+path)

    wer_list = []
    leven_list = []
    # Iterate over the two text columns directly; the other columns are not
    # needed for scoring.
    for ground_truth, hypothesis in zip(df['sentence'], df['transcript']):
        wer_list.append(wer(ground_truth, hypothesis))
        leven_list.append(distance(ground_truth, hypothesis))

    df['wer'] = wer_list
    df['levenshtein'] = leven_list
    # A sentence counts as an error as soon as its word error rate is non-zero.
    df['ser'] = [1 if error > 0 else 0 for error in wer_list]

    return df

In [55]:
# Specify directory to read csv from and directory to write results to
input_dir = "input/"
output_dir = "output/"
all_frames = []

# Process csv and store dataframe in list.
# - sorted(): os.listdir returns entries in arbitrary order; sorting makes the
#   row order of the combined output reproducible between runs.
# - only regular *.csv files are processed, so stray entries such as
#   .ipynb_checkpoints or .DS_Store do not crash the run.
for filename in sorted(os.listdir(input_dir)):
    path = os.path.join(input_dir, filename)
    if not (os.path.isfile(path) and filename.lower().endswith('.csv')):
        continue
    df = process_files(path)
    all_frames.append(df)

# Concatenate all dataframes
total_df = pd.concat(all_frames, axis=0, ignore_index=True)

# Export to csv (create the output directory first so a fresh checkout works)
os.makedirs(output_dir, exist_ok=True)
# Shortened export: only the text pair and the computed metrics
total_df.to_csv(os.path.join(output_dir, 'total_shortened.csv'),encoding='utf-8',columns=['sentence','transcript','wer','levenshtein','ser'],index=False)
# Full export: every column of the combined frame
total_df.to_csv(os.path.join(output_dir, 'total.csv'),encoding='utf-8',index=False)

Processing input/aws_asian_results.csv
Processing input/aws_non_asian_results.csv
Processing input/google_asian_results.csv
Processing input/google_non_asian_results.csv


In [51]:
# Preview the first rows of the combined frame, including the computed
# 'wer', 'levenshtein' and 'ser' columns.
total_df.head()

Unnamed: 0,provider,sentence,up_votes,down_votes,age,gender,accent,locale,segment,sentence_len,is_asian,us_eu,jobName,transcript,wer,levenshtein,ser,filename,confidence
0,Amazon,at first they were considered stegosaurus exem...,2,0,twenties,female,hongkong,en,,7,True,False,common_voice_en_22630065.mp3,at first they were considered stegosaurus exem...,0.0,0,0,,
1,Amazon,the game contains many references to the bible,2,1,thirties,male,hongkong,en,,8,True,False,common_voice_en_20544026.mp3,the game contains many references to the bible,0.0,0,0,,
2,Amazon,dont be a fool,2,0,fourties,male,hongkong,en,,4,True,False,common_voice_en_17787387.mp3,dont be a fool,0.0,0,0,,
3,Amazon,prospect hills science offerings include physics,2,0,twenties,male,hongkong,en,,6,True,False,common_voice_en_22969285.mp3,prospect hills science offerings include physics,0.0,0,0,,
4,Amazon,they have all four properties described above,2,0,twenties,male,hongkong,en,,7,True,False,common_voice_en_22423410.mp3,they have all four properties described it about,0.285714,5,1,,
