## Analysis for EGOCOM.

### This file uses the word_error_rate_analysis package to compute the accuracy of our global transcription methods on EGOCOM.

In [1]:
from __future__ import print_function, absolute_import, division, unicode_literals, with_statement # Python 2 compatibility

import numpy as np
import re
import pandas as pd

# The following package is used to compute the word error rate (wer).
# wer uses the Wagner-Fischer algorithm to compute the Levenshtein distance at both the sentence and word level.
# This package requires "pip install jiwer"
from jiwer import wer 

# This package converts things like "42" to "forty-two"
from num2words import num2words

# For parallel processing
import multiprocessing as mp
from multiprocessing import Pool
max_threads = mp.cpu_count()

In [2]:
from egocom.word_error_rate_analysis import *

In [3]:
# Directory prefix for the per-method transcription CSVs. File names are
# appended with `+` below, so the trailing slash is required.
csv_loc = '/media/seagate1tb/egocom-transcription-csv/'
# Location of the ground-truth transcription CSV.
ground_truth_csv = '/datasets/cgn/EGOCOM/ground_truth_transcriptions.csv'

In [21]:
# Ground truth, then one frame per transcription method. Method 1 has a
# separate file for each of the three speakers.
df_gt = pd.read_csv(ground_truth_csv)
df_m1s = [
    pd.read_csv(csv_loc + "method_1_speaker_{}.csv".format(speaker))
    for speaker in (1, 2, 3)
]
df_m2 = pd.read_csv(csv_loc + "method_2_combined_with_speaker_recognition.csv")
df_m3 = pd.read_csv(csv_loc + "method_3_ICA.csv")

# Fix column names (key --> conversation_id, speaker --> speaker_id).
# Columns are assigned positionally, so every method file must have exactly
# these five columns in this order.
transcript_columns = ["conversation_id", "startTime", "speaker_id", "endTime", "word"]
for df_method in df_m1s + [df_m2, df_m3]:
    df_method.columns = list(transcript_columns)

In [22]:
# Preprocess the ground-truth transcripts (timed). The call logs the row count
# after each normalization stage, as shown in the output below.
%time gt = create_processed_transcripts(df_gt)

Original length | 359536
After splitting words with spaces into seperate rows | 487746
After replacing empty strings with spaces | 487746
After removing duplicate rows containing only spaces | 359535
After 1900s. --> [1900, s, .] and they've --> [they, ', ve] | 359535
After 1100 --> one thousand, one hundred | 359535
After twenty-two --> twenty two | 359937
After removing spaces | 231728
After removing capitalization | 231728
After removing filler words | 227575
CPU times: user 11.9 s, sys: 189 ms, total: 12.1 s
Wall time: 6.6 s


In [23]:
# Apply the same preprocessing to each of the three method-1 (per-speaker) frames.
%time m1s = [create_processed_transcripts(df_m1) for df_m1 in df_m1s]

Original length | 74515
After splitting words with spaces into seperate rows | 74515
After replacing empty strings with spaces | 74515
After removing duplicate rows containing only spaces | 74515
After 1900s. --> [1900, s, .] and they've --> [they, ', ve] | 82629
After 1100 --> one thousand, one hundred | 82629
After twenty-two --> twenty two | 82884
After removing spaces | 82884
After removing capitalization | 82884
After removing filler words | 82881
Original length | 47412
After splitting words with spaces into seperate rows | 47412
After replacing empty strings with spaces | 47412
After removing duplicate rows containing only spaces | 47412
After 1900s. --> [1900, s, .] and they've --> [they, ', ve] | 51945
After 1100 --> one thousand, one hundred | 51945
After twenty-two --> twenty two | 52125
After removing spaces | 52125
After removing capitalization | 52125
After removing filler words | 52123
Original length | 38604
After splitting words with spaces into seperate rows | 38604
A

In [24]:
# Apply the same preprocessing to the method-2 (combined) transcripts.
%time m2 = create_processed_transcripts(df_m2)

Original length | 115557
After splitting words with spaces into seperate rows | 115557
After replacing empty strings with spaces | 115557
After removing duplicate rows containing only spaces | 115557
After 1900s. --> [1900, s, .] and they've --> [they, ', ve] | 127880
After 1100 --> one thousand, one hundred | 127880
After twenty-two --> twenty two | 128273
After removing spaces | 128273
After removing capitalization | 128273
After removing filler words | 128267
CPU times: user 2.21 s, sys: 23.9 ms, total: 2.23 s
Wall time: 2.23 s


In [25]:
# Apply the same preprocessing to the method-3 (ICA) transcripts.
%time m3 = create_processed_transcripts(df_m3)

Original length | 119321
After splitting words with spaces into seperate rows | 119321
After replacing empty strings with spaces | 119321
After removing duplicate rows containing only spaces | 119321
After 1900s. --> [1900, s, .] and they've --> [they, ', ve] | 132101
After 1100 --> one thousand, one hundred | 132101
After twenty-two --> twenty two | 132553
After removing spaces | 132553
After removing capitalization | 132553
After removing filler words | 132551
CPU times: user 2.26 s, sys: 16.1 ms, total: 2.28 s
Wall time: 2.27 s


In [26]:
# Every method should cover the same set of transcribed videos as the ground
# truth. The last method-1 source (m1_3) is excluded from the check because it
# has fewer videos: it doesn't exist when only two people wore glasses.
expected_ids = sorted(gt.keys())
assert expected_ids == sorted(m2.keys())
assert all(sorted(m1.keys()) == expected_ids for m1 in m1s[:-1])

In [27]:
print('Method 1')
%time m1_errors = [compute_wer_for_all_videos(m1, gt) for m1 in m1s]
# Average error across the 2-3 microphones in each conversation
m1_error = {k:np.mean([m1e[k] for m1e in m1_errors if k in m1e]) for k in m1_errors[0].keys()}

Method 1

Transcription accuracy for each video
-------------------------------------
day_1__con_2__part5 | 0.487
day_1__con_3__part4 | 0.288
day_1__con_1__part5 | 0.392
day_1__con_5__part5 | 0.714
day_1__con_1__part4 | 0.332
day_2__con_1__part1 | 0.451
day_1__con_4__part3 | 0.339
day_1__con_2__part1 | 0.425
day_1__con_3__part1 | 0.36
day_1__con_2__part2 | 0.385
day_1__con_1__part3 | 0.468
day_1__con_5__part1 | 0.412
day_1__con_2__part3 | 0.435
day_1__con_4__part4 | 0.364
day_1__con_1__part1 | 0.362
day_1__con_1__part2 | 0.433
day_1__con_2__part4 | 0.562
day_1__con_4__part2 | 0.365
day_1__con_3__part2 | 0.383
day_2__con_1__part5 | 0.335
day_1__con_4__part1 | 0.487
day_1__con_3__part3 | 0.392
day_2__con_2__part4 | 0.414
day_1__con_5__part4 | 0.393
day_2__con_1__part2 | 0.534
day_1__con_5__part3 | 0.379
day_1__con_5__part2 | 0.391
day_2__con_2__part1 | 0.533
day_2__con_1__part4 | 0.256
day_2__con_1__part3 | 0.326
day_2__con_2__part3 | 0.436
day_2__con_2__part2 | 0.513
day_2__con_3 | 0.43

In [28]:
# Score the method-2 (combined) transcripts against the ground truth, timing the call.
print('Method 2')
%time m2_error = compute_wer_for_all_videos(m2, gt)

Method 2

Transcription accuracy for each video
-------------------------------------
day_1__con_2__part5 | 0.461
day_1__con_3__part4 | 0.486
day_1__con_1__part5 | 0.683
day_1__con_5__part5 | 0.667
day_1__con_4__part3 | 0.445
day_2__con_1__part1 | 0.614
day_1__con_1__part3 | 0.6
day_1__con_2__part1 | 0.592
day_1__con_1__part4 | 0.596
day_1__con_5__part1 | 0.496
day_1__con_2__part2 | 0.559
day_1__con_2__part3 | 0.582
day_1__con_3__part1 | 0.512
day_1__con_4__part4 | 0.525
day_1__con_5__part4 | 0.416
day_1__con_5__part2 | 0.459
day_1__con_4__part2 | 0.588
day_1__con_1__part1 | 0.577
day_1__con_4__part1 | 0.594
day_1__con_1__part2 | 0.633
day_1__con_3__part3 | 0.49
day_1__con_5__part3 | 0.467
day_1__con_2__part4 | 0.664
day_2__con_2__part4 | 0.471
day_2__con_1__part2 | 0.625
day_1__con_3__part2 | 0.527
day_2__con_1__part5 | 0.534
day_2__con_2__part1 | 0.597
day_2__con_2__part3 | 0.469
day_2__con_1__part4 | 0.403
day_2__con_1__part3 | 0.522
day_2__con_2__part2 | 0.55
day_2__con_3 | 0.513
d

In [29]:
# Score the method-3 (ICA) transcripts against the ground truth, timing the call.
print('Method 3')
%time m3_error = compute_wer_for_all_videos(m3, gt)

Method 3

Transcription accuracy for each video
-------------------------------------
day_1__con_2__part5 | 0.513
day_1__con_3__part4 | 0.486
day_1__con_1__part5 | 0.661
day_1__con_5__part5 | 0.667
day_2__con_1__part1 | 0.642
day_1__con_2__part1 | 0.621
day_1__con_5__part1 | 0.507
day_1__con_1__part3 | 0.63
day_1__con_4__part3 | 0.484
day_1__con_1__part4 | 0.596
day_1__con_5__part2 | 0.475
day_1__con_3__part1 | 0.531
day_1__con_4__part1 | 0.581
day_1__con_2__part3 | 0.573
day_1__con_5__part4 | 0.396
day_1__con_1__part1 | 0.589
day_1__con_2__part4 | 0.659
day_1__con_2__part2 | 0.564
day_1__con_4__part2 | 0.578
day_1__con_3__part2 | 0.58
day_1__con_3__part3 | 0.509
day_1__con_4__part4 | 0.552
day_2__con_2__part4 | 0.453
day_1__con_1__part2 | 0.623
day_2__con_1__part2 | 0.608
day_2__con_1__part5 | 0.567
day_1__con_5__part3 | 0.472
day_2__con_2__part1 | 0.601
day_2__con_2__part3 | 0.44
day_2__con_1__part3 | 0.557
day_2__con_1__part4 | 0.407
day_2__con_2__part2 | 0.539
day_3__con_2 | 0.177


In [220]:
# Per-video metadata. `gender` is an alias of `speaker_gender`, used as a
# shorter grouping key in the result tables below.
video_info = pd.read_csv("/datasets/cgn/EGOCOM/video_info.csv")
video_info = video_info.assign(gender = video_info['speaker_gender'])

# Speaker identification accuracy

In [230]:
def _speaker_id_sequence(group):
    """Join one conversation's speaker ids, in row order, into a space-separated string."""
    return " ".join([str(z) for z in group['speaker_id']])


def _ids_with_rounded_start(df):
    """Return speaker_id / startTime / conversation_id rows without NaNs,
    with startTime rounded to the nearest whole number and cast to int."""
    ids = df[['speaker_id', 'startTime', 'conversation_id']].dropna().copy()
    ids['startTime'] = ids['startTime'].round().astype(int)
    return ids


# Get speaker id data: one speaker-id string per conversation for each method-1
# source and for method 3.
# The equivalent strings for the ground truth and method 2 used to be computed
# here too, but gt_ids / m2_ids were overwritten just below before being read,
# so that dead work has been dropped.
m1_ids = [df_m1.groupby('conversation_id').apply(_speaker_id_sequence) for df_m1 in df_m1s]
m3_ids = df_m3.groupby('conversation_id').apply(_speaker_id_sequence)

# Create dataframes to join when start time is the same and conversation_id is the same.
gt_ids = _ids_with_rounded_start(df_gt)
m2_ids = _ids_with_rounded_start(df_m2)

# Compute accuracy scores.
# NOTE(review): start times are rounded to whole units, so several words can
# share one (conversation_id, startTime) key on both sides and the merge pairs
# each of them with each other (the merged row count exceeds both inputs).
# Confirm this weighting is intended.
results_id = pd.merge(gt_ids, m2_ids, on=['conversation_id', 'startTime'])
# True where the predicted speaker equals the ground-truth speaker.
speaker_match = results_id['speaker_id_x'] == results_id['speaker_id_y']
# The mean of a boolean Series is the fraction of matches (vectorized).
id_acc = speaker_match.groupby(results_id['conversation_id']).mean()
id_df = id_acc.to_frame('speaker_id_acc')
data = video_info.set_index('conversation_id').join(id_df)
print('Overall speaker id accuracy:', "{:.2%}".format(speaker_match.mean()))

# Compute final dataframes of results
results_background = data.groupby(['native_speaker', 'background_music', 'background_fan'])['speaker_id_acc'].mean().to_frame()
results_demographics = data.groupby(['gender', 'native_speaker', 'speaker_is_host'])['speaker_id_acc'].mean().to_frame()

Overall speaker id accuracy: 76.76%


# Utils for producing latex tables

In [366]:
# Opening boilerplate for a full-width LaTeX table. It is a str.format template
# with `{caption}` and `{label}` placeholders (filled in by make_latex); every
# literal LaTeX brace is therefore doubled. The final \resizebox{...}{!}{ group
# is left open here and closed by the first character of the footer.
latex_table_header = '''

\\begin{{table*}}[t]

\\setlength\\tabcolsep{{2pt}} % Makes table columns tighter
\\caption{{{caption}}}
\\vskip -0.1in
\\label{{{label}}}
\\begin{{center}}
\\begin{{small}}
\\begin{{sc}}
\\resizebox{{1.0\\textwidth}}{{!}}{{ %Completely zooms in or zooms out (shrinks) entire table!

'''

# Closing boilerplate matching the header. Braces are doubled in the same way
# as in the header so that they come out as single literal braces once
# make_latex has processed the text.
latex_table_footer = '''}}

\\end{{sc}}
\\end{{small}}
\\end{{center}}
\\vskip -0.1in
\\end{{table*}}


'''

In [376]:
def make_latex(table, what = 'influencers'):
    """Render a results table as a complete LaTeX ``table*`` environment.

    The tabular body comes from ``DataFrame.to_latex``; the generated header
    row is discarded and replaced by a hand-written two-line bold header, an
    ``\\hline`` is inserted after every data row, and booleans are shown as a
    check mark (True) or an empty cell (False).

    Parameters
    ----------
    table : pandas.DataFrame
        Results table. Its index is reset so that index levels become leading
        columns. NOTE(review): the column format and both headers are
        hard-coded for three key columns followed by four value columns --
        confirm callers always pass that shape.
    what : str
        Inserted into the caption and the label. ``'influencers'`` selects the
        native/music/fan header; any other value selects the
        gender/native/host header.

    Returns
    -------
    str
        The LaTeX source of the table.

    Raises
    ------
    ValueError
        If the ``to_latex`` output does not contain exactly one ``\\toprule``
        and one ``\\midrule``.
    """
    table = table.reset_index()
    # Kept from the original; note the header row built from these names is
    # discarded below in favour of the hand-written header.
    table.columns = [z.replace("_", " ").replace('accuracy', 'acc') for z in list(table.columns)]
    caption = "Global transcription accuracy of baseline vs. our method across {}.".format(what)
    label = "table:global_transcription_{}".format(what)
    tabular = table.to_latex(index = False, column_format = 'ccc|rrrr')
    tex_header = latex_table_header.format(
        caption = caption,
        label = label,
    )
    # The footer uses the same doubled-brace escaping as the header. Unescape
    # it with str.format rather than a document-wide "{{" / "}}" replacement,
    # which would also rewrite doubled braces inside the table content.
    tex_footer = latex_table_footer.format()

    top, rest = tabular.split('\\toprule')
    _generated_header, rows = rest.split('\\midrule')
    if what == 'influencers':
        mid = '\n      \\textbf{native} & \\textbf{music} &  \\textbf{fan} &  \\textbf{word} &  \\textbf{baseline} &  \\textbf{egocom} &  \\textbf{speaker id} \\\\\n \\textbf{speaker} &  \\textbf{noise} &  \\textbf{noise}  &    \\textbf{count}  & \\textbf{accuracy}   &  \\textbf{accuracy}  &  \\textbf{accuracy}   \\\\\n'
    else:
        mid = '\n   &    \\textbf{native}  &  \\textbf{speaker}  &  \\textbf{word}  &  \\textbf{baseline} &  \\textbf{egocom} &  \\textbf{speaker id} \\\\\n\\textbf{gender} & \\textbf{speaker} & \\textbf{is host} &    \\textbf{count}  & \\textbf{accuracy}   &  \\textbf{accuracy}  &  \\textbf{accuracy}        \\\\\n'

    # Rule after every data row.
    rows = rows.replace('\\\\', '\\\\\n\\hline\n')
    # Boolean cells: apply only to the data rows so that a caption or label
    # containing the words "True"/"False" is left intact.
    rows = rows.replace('True', '\\checkmark')
    rows = rows.replace('False', '')
    # No \hline directly above \bottomrule.
    rows = rows.replace('\\hline\n\n\\bottomrule', '\n\\bottomrule')

    return tex_header + top + '\\toprule' + mid + '\\midrule' + rows + tex_footer

# Global transcription accuracy

In [371]:
# One-column frames of per-conversation error, indexed by conversation id.
baseline_df = pd.DataFrame(pd.Series(m1_error), columns = ['baseline_error'])
egocom_df = pd.DataFrame(pd.Series(m2_error), columns = ['egocom_error'])
# Combine errors with video data
data = (
    video_info.set_index('conversation_id')
    .join(baseline_df)
    .join(egocom_df)
)
# Convert each error rate into an (integer-truncated) number of wrong words.
for method in ('baseline', 'egocom'):
    data[method + '_error_count'] = (data['word_count'] * data[method + '_error']).astype(int)

# Word-weighted accuracy over all videos.
total_words = data['word_count'].sum()
print("Baseline Acc:", "{:.2%}".format(1 - data['baseline_error_count'].sum() / total_words))
print("EgoCom Acc:", "{:.2%}".format(1 - data['egocom_error_count'].sum() / total_words))
n_speaker_matches = sum(results_id['speaker_id_x'] == results_id['speaker_id_y'])
print('EgoCom speaker id accuracy:', "{:.2%}".format(n_speaker_matches / len(results_id)), 'N = ', len(results_id))

Baseline Acc: 30.67%
EgoCom Acc: 54.78%
EgoCom speaker id accuracy: 76.76% N =  534499


In [372]:
# Word and error counts summed per demographic group.
# The columns are selected with a list (double brackets): indexing a GroupBy
# with a bare tuple of keys is deprecated and raises in recent pandas.
results = data.groupby(['gender', 'native_speaker', 'speaker_is_host'])[['word_count', 'baseline_error_count', 'egocom_error_count']].sum()
results['baseline_acc'] = 1 - results['baseline_error_count'] / results['word_count']
results['egocom_acc'] = 1 - results['egocom_error_count'] / results['word_count']
# These are unweighted means over the groups, not word-weighted accuracies.
print("Baseline Acc:", np.mean(results['baseline_acc']))
print("EgoCom Acc:", np.mean(results['egocom_acc']))
# Add the per-group speaker-id accuracy and drop the raw error counts.
demo_table = results.join(results_demographics).round(2).drop(['egocom_error_count', 'baseline_error_count'], axis = 1)
demo_table

Baseline Acc: 0.3045673805331425
EgoCom Acc: 0.5448953612514134


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,word_count,baseline_acc,egocom_acc,speaker_id_acc
gender,native_speaker,speaker_is_host,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
female,False,False,1055,0.31,0.54,0.75
female,True,False,31666,0.29,0.55,0.76
male,False,False,21174,0.3,0.54,0.77
male,True,False,23344,0.31,0.55,0.76
male,True,True,81826,0.31,0.55,0.77


In [373]:
# Word and error counts summed per recording condition.
# The columns are selected with a list (double brackets): indexing a GroupBy
# with a bare tuple of keys is deprecated and raises in recent pandas.
results = data.groupby(['native_speaker', 'background_music', 'background_fan'])[['word_count', 'baseline_error_count', 'egocom_error_count']].sum()
results['baseline_accuracy'] = 1 - results['baseline_error_count'] / results['word_count']
results['egocom_accuracy'] = 1 - results['egocom_error_count'] / results['word_count']
# These are unweighted means over the groups, not word-weighted accuracies.
print("Baseline Acc:",np.mean(results['baseline_accuracy']))
print("EgoCom Acc:", np.mean(results['egocom_accuracy']))
# Add the per-group speaker-id accuracy and drop the raw error counts.
inf_table = results.join(results_background).round(2).drop(['egocom_error_count', 'baseline_error_count'], axis = 1)
inf_table

Baseline Acc: 0.28618166251035765
EgoCom Acc: 0.5295406838504789


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,word_count,baseline_accuracy,egocom_accuracy,speaker_id_acc
native_speaker,background_music,background_fan,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
False,False,False,17577,0.31,0.55,0.77
False,False,True,2467,0.25,0.51,0.76
False,True,False,2185,0.27,0.51,0.79
True,False,False,96448,0.32,0.56,0.77
True,False,True,11701,0.28,0.53,0.73
True,True,False,28687,0.29,0.53,0.76


In [375]:
# Emit the LaTeX source for both result tables. The second argument goes into
# the caption and label and selects the header layout.
print(make_latex(demo_table, 'demographics'))
print(make_latex(inf_table, 'influencers'))



\begin{table*}[t]

\setlength\tabcolsep{2pt} % Makes table columns tighter
\caption{Global transcription accuracy of baseline vs. our method across demographics.}
\vskip -0.1in
\label{table:global_transcription_demographics}
\begin{center}
\begin{small}
\begin{sc}
\resizebox{1.0\textwidth}{!}{ %Completely zooms in or zooms out (shrinks) entire table!

\begin{tabular}{lll|rrrr}
\toprule
      \textbf{gender}  &    \textbf{native}  &  \textbf{speaker}  &  \textbf{word}  &  \textbf{baseline} &  \textbf{egocom} &  \textbf{speaker id} \\
\textbf{gender} & \textbf{speaker} & \textbf{is host} &    \textbf{count}  & \textbf{accuracy}   &  \textbf{accuracy}  &  \textbf{accuracy}        \\
\midrule
 female &            &             &        1055 &          0.31 &        0.54 &            0.75 \\
\hline

 female &            \checkmark &             &       31666 &          0.29 &        0.55 &            0.76 \\
\hline

   male &            &             &       21174 &          0.30 &       

In [16]:
# Aggregate each method's per-conversation error into a single figure, passing
# the word count of every transcript alongside the errors.
# For method 1 the word count of a conversation is summed over every
# microphone source that contains it.
m1_total_error = compute_duration_total_weighted_error(
    error_dict = m1_error,
    transcript_len_dict = {
        conv_id: sum(len(mic[conv_id].split()) for mic in m1s if conv_id in mic)
        for conv_id in m1s[0].keys()
    },
)
m2_total_error = compute_duration_total_weighted_error(
    error_dict = m2_error,
    transcript_len_dict = {conv_id: len(text.split()) for conv_id, text in m2.items()},
)
m3_total_error = compute_duration_total_weighted_error(
    error_dict = m3_error,
    transcript_len_dict = {conv_id: len(text.split()) for conv_id, text in m3.items()},
)

In [17]:
# Report each method's aggregated error, formatted by error_as_percent_acc.
print('\nMethod 1 (Single audio transcription, accuracy is avg score across all 3 sources) \n\tAverage Accuracy:', error_as_percent_acc(m1_total_error))
print('\nMethod 2 (Combined transcriptions using source with max confidence for each word)\n\tAverage Accuracy:', error_as_percent_acc(m2_total_error))
print('\nMethod 3 (Combined transcriptions using ICA (directly in time domain) then max-conf-word across sources)\n\tAverage Accuracy:', error_as_percent_acc(m3_total_error))


Method 1 (Single audio transcription, accuracy is avg score across all 3 sources) 
	Average Accuracy: 31.19%

Method 2 (Combined transcriptions using source with max confidence for each word)
	Average Accuracy: 55.04%

Method 3 (Combined transcriptions using ICA (directly in time domain) then max-conf-word across sources)
	Average Accuracy: 52.37%
