# This is the code used to execute our global transcription methods. It relies heavily on the egocom/transcription.py library.

In [1]:
from __future__ import print_function, absolute_import, division, unicode_literals, with_statement # Python 2 compatibility

import os
import pickle
import pandas as pd
from datetime import datetime, timedelta
from itertools import groupby
import numpy as np

In [2]:
from egocom.transcription import *

In [3]:
# Root directory that contains the EgoCom data folders used by the cells below.
# An earlier location was '/mnt/surreal/datasets/EGOCOM_IntermediateRawBackups/';
# it used to be assigned here and immediately overwritten, so only the path
# below was ever in effect.
base_path = '/media/seagate1tb/'

In [4]:
# Input/output folders, all rooted at base_path.
pickle_loc, subtitle_loc, data_loc, csv_loc = (
    base_path + folder
    for folder in (
        "egocom-pickles/",
        "egocom-subtitles/",
        "egocom-audio-only/",
        "egocom-transcription-csv/",
    )
)

# Group the per-person "all" transcription pickles by conversation key
# (e.g. 'day_1__con_1__part1'), keeping the file names in sorted order.
fn_dict = {}
for fn in sorted(
    v for v in os.listdir(pickle_loc) if "person" in v and 'all' in v
):
    # The key is cut out of the file name by fixed character offsets, so it
    # relies on the fixed-width naming of the pickle files.
    if 'part' in fn:
        key = fn[9:23] + fn[32:37]
    else:
        key = fn[9:21]
    fn_dict.setdefault(key, []).append(fn)
    

# Method 1 - each single source is used as the sole information source for the global transcription

In [6]:
# Set to True to write one CSV per speaker into csv_loc.
write_out_csv = False

# One DataFrame per speaker (1, 2, 3), each built only from that speaker's pickles.
dfs = []
for i in (1, 2, 3):
    speaker_tag = "person_" + str(i)
    word_dicts = []
    for fn in sorted(
        v for v in os.listdir(pickle_loc) if speaker_tag in v and 'all' in v
    ):
        # Conversation key cut out of the file name by fixed character offsets.
        if 'part' in fn:
            key = fn[9:23] + fn[32:37]
        else:
            key = fn[9:21]
        # NOTE: pickle can execute arbitrary code on load; only read trusted files.
        with open(pickle_loc + fn, 'rb') as f:
            transcript_data = pickle.load(f)
        # Tag every word with its conversation key and speaker, then collect it.
        for d in transcript_data:
            for z in d['words']:
                z.update({'key': key, 'speaker': i})
                word_dicts.append(z)
    df = pd.DataFrame(word_dicts)
    # Times are strings; drop the last character (presumably a unit suffix
    # such as 's' -- TODO confirm) and convert to float.
    df['endTime'] = [float(t[:-1]) for t in df['endTime']]
    df['startTime'] = [float(t[:-1]) for t in df['startTime']]
    dfs.append(df[["key", "startTime", "speaker", "endTime", "word"]])

if write_out_csv:
    for i, df in enumerate(dfs):
        out_name = "method_1_speaker_{}.csv".format(i+1)
        df.to_csv(csv_loc + out_name, index=False)

# Method 2 - combining transcriptions by taking the word with max confidence within a window of time (0.1 seconds)

In [7]:
# Switches: emit subtitles per conversation and/or one combined CSV.
write_out_subtitles = False
write_out_csv = True

dfs = []
for key in sorted(fn_dict):
    print(key, "\n", fn_dict[key])
    files = [pickle_loc + z for z in fn_dict[key]]
    word_dicts = []
    # NOTE(review): the speaker id is the file's position in fn_dict[key] (1-based),
    # so it relies on the listing order matching the person order -- confirm.
    for s, fn in enumerate(files):
        # NOTE: pickle can execute arbitrary code on load; only read trusted files.
        with open(fn, 'rb') as f:
            transcript_data = pickle.load(f)
        # Copy each result's confidence onto its words and tag them with the speaker.
        for d in transcript_data:
            for z in d['words']:
                z.update({'confidence': d['confidence'], 'speaker': s + 1})
                word_dicts.append(z)
    df = pd.DataFrame(word_dicts)
    # Times are strings; drop the last character (presumably a unit suffix
    # such as 's' -- TODO confirm) and convert to float.
    df["startTime"] = [float(z[:-1]) for z in df["startTime"]]
    df["endTime"] = [float(z[:-1]) for z in df["endTime"]]
    # When 2+ words share exactly the same startTime or endTime, keep only those
    # with the highest confidence. The string 'max' selects the built-in groupby
    # reduction; passing the Python builtin `max` is deprecated in recent pandas.
    idx = df.groupby('startTime')['confidence'].transform('max') == df['confidence']
    idx2 = df.groupby('endTime')['confidence'].transform('max') == df['confidence']
    maxdf = df[idx & idx2].sort_values(by='startTime')
    # find_which_duplicates_to_remove (not defined in this file; presumably from
    # egocom.transcription) yields one flag per row; rows flagged truthy are dropped.
    # .copy() makes the result an independent frame so the column assignment
    # below does not trigger SettingWithCopyWarning.
    maxdf = maxdf[[not z for z in find_which_duplicates_to_remove(maxdf)]].copy()
    maxdf["key"] = key
    df = maxdf[["key", "startTime", "speaker", "endTime", "word"]]
    if write_out_subtitles:
        words, starts, ends, speakers = (
            list(df[field]) for field in ['word', 'startTime', 'endTime', 'speaker']
        )
        # wfn is left as None here; the author's alternative target was
        # subtitle_loc + key + "_combined.srt".
        write_subtitles(words, starts, ends, speakers=speakers, wfn=None)
    if write_out_csv:
        dfs.append(df)
if write_out_csv:
    df = pd.concat(dfs)
    df.to_csv(csv_loc + "method_2_combined_with_speaker_recognition.csv", index=False)

day_1__con_1__part1 
 ['vid_001__day_1__con_1__person_1_part1_all-transcription-data.p', 'vid_006__day_1__con_1__person_2_part1_all-transcription-data.p', 'vid_011__day_1__con_1__person_3_part1_all-transcription-data.p']
day_1__con_1__part2 
 ['vid_002__day_1__con_1__person_1_part2_all-transcription-data.p', 'vid_007__day_1__con_1__person_2_part2_all-transcription-data.p', 'vid_012__day_1__con_1__person_3_part2_all-transcription-data.p']
day_1__con_1__part3 
 ['vid_003__day_1__con_1__person_1_part3_all-transcription-data.p', 'vid_008__day_1__con_1__person_2_part3_all-transcription-data.p', 'vid_013__day_1__con_1__person_3_part3_all-transcription-data.p']
day_1__con_1__part4 
 ['vid_004__day_1__con_1__person_1_part4_all-transcription-data.p', 'vid_009__day_1__con_1__person_2_part4_all-transcription-data.p', 'vid_014__day_1__con_1__person_3_part4_all-transcription-data.p']
day_1__con_1__part5 
 ['vid_005__day_1__con_1__person_1_part5_all-transcription-data.p', 'vid_010__day_1__con_1__per

# Method 3 - same as method 2, but using ICA to extract 3 sources even if there are only 2 speakers.

In [8]:

# NOTE(review): data_loc is reassigned here but not read anywhere in this cell;
# the ICA pickles are still listed from pickle_loc -- confirm this is intended.
data_loc = "/home/cgn/Downloads/egocom-ICA/"

# Group the ICA source pickles by conversation key (e.g. 'day_1__con_1__part1').
fn_dict = {}
for fn in sorted(v for v in os.listdir(pickle_loc) if "source" in v and 'ica' in v):
    # The key is the fixed-width prefix of the file name.
    key = fn[:19] if 'part' in fn else fn[:12]
    fn_dict.setdefault(key, []).append(fn)

# Switches: emit subtitles per conversation and/or one combined CSV.
write_out_subtitles = False
write_out_csv = True

dfs = []
for key in sorted(fn_dict):
    print(key, "\n", fn_dict[key])
    files = [pickle_loc + z for z in fn_dict[key]]
    word_dicts = []
    # The "speaker" id is the 1-based position of the source file in sorted order.
    for s, fn in enumerate(sorted(files)):
        # NOTE: pickle can execute arbitrary code on load; only read trusted files.
        with open(fn, 'rb') as f:
            transcript_data = pickle.load(f)
        # Copy each result's confidence onto its words and tag them with the source.
        for d in transcript_data:
            for z in d['words']:
                z.update({'confidence': d['confidence'], 'speaker': s + 1})
                word_dicts.append(z)
    df = pd.DataFrame(word_dicts)
    # Times are strings; drop the last character (presumably a unit suffix
    # such as 's' -- TODO confirm) and convert to float.
    df["startTime"] = [float(z[:-1]) for z in df["startTime"]]
    df["endTime"] = [float(z[:-1]) for z in df["endTime"]]
    # When 2+ words share exactly the same startTime or endTime, keep only those
    # with the highest confidence. The string 'max' selects the built-in groupby
    # reduction; passing the Python builtin `max` is deprecated in recent pandas.
    idx = df.groupby('startTime')['confidence'].transform('max') == df['confidence']
    idx2 = df.groupby('endTime')['confidence'].transform('max') == df['confidence']
    maxdf = df[idx & idx2].sort_values(by='startTime')
    # find_which_duplicates_to_remove (not defined in this file; presumably from
    # egocom.transcription) yields one flag per row; rows flagged truthy are dropped.
    # .copy() makes the result an independent frame so the column assignment
    # below does not trigger SettingWithCopyWarning.
    maxdf = maxdf[[not z for z in find_which_duplicates_to_remove(maxdf)]].copy()
    maxdf["key"] = key
    df = maxdf[["key", "startTime", "speaker", "endTime", "word"]]
    if write_out_subtitles:
        words, starts, ends, speakers = (
            list(df[field]) for field in ['word', 'startTime', 'endTime', 'speaker']
        )
        # wfn is left as None here; the author's alternative target was
        # subtitle_loc + key + "_combined.srt".
        write_subtitles(words, starts, ends, speakers=speakers, wfn=None)
    if write_out_csv:
        dfs.append(df)
if write_out_csv:
    df = pd.concat(dfs)
    df.to_csv(csv_loc + "method_3_ICA.csv", index=False)

day_1__con_1__part1 
 ['day_1__con_1__part1_source_1_all-transcription-data-ica.p', 'day_1__con_1__part1_source_2_all-transcription-data-ica.p', 'day_1__con_1__part1_source_3_all-transcription-data-ica.p']
day_1__con_1__part2 
 ['day_1__con_1__part2_source_1_all-transcription-data-ica.p', 'day_1__con_1__part2_source_2_all-transcription-data-ica.p', 'day_1__con_1__part2_source_3_all-transcription-data-ica.p']
day_1__con_1__part3 
 ['day_1__con_1__part3_source_1_all-transcription-data-ica.p', 'day_1__con_1__part3_source_2_all-transcription-data-ica.p', 'day_1__con_1__part3_source_3_all-transcription-data-ica.p']
day_1__con_1__part4 
 ['day_1__con_1__part4_source_1_all-transcription-data-ica.p', 'day_1__con_1__part4_source_2_all-transcription-data-ica.p', 'day_1__con_1__part4_source_3_all-transcription-data-ica.p']
day_1__con_1__part5 
 ['day_1__con_1__part5_source_1_all-transcription-data-ica.p', 'day_1__con_1__part5_source_2_all-transcription-data-ica.p', 'day_1__con_1__part5_source_3_a