# This script extracts the HUMAN GROUND TRUTH TRANSCRIPTIONS from rev.com and creates EGOCOM/ground_truth_transcriptions.csv

In [1]:
from __future__ import print_function, absolute_import, division, unicode_literals, with_statement # Python 2 compatibility

import json
import os

import numpy as np
import pandas as pd

In [2]:
def convert_time_str2seconds(x):
    """Convert a colon-separated time string into a number of seconds.

    Fields are read from right to left, each one worth 60 times the field to
    its right (seconds, minutes, hours, ...). A comma is accepted as the
    decimal separator, e.g. "00:01:30,5" -> 90.5.

    Any value whose type is not exactly ``str`` is returned unchanged.
    """
    if type(x) is not str:
        return x
    fields = x.replace(",", ".").split(":")
    return sum(float(field) * 60 ** power for power, field in enumerate(reversed(fields)))

In [3]:
# Directory containing the raw rev.com transcript files read by the loading cell.
# NOTE(review): absolute, machine-specific path -- adjust it for your environment.
# json_dir = "/Users/cgn/Dropbox (Facebook)/EGOCOM/rev_raw_json_transcripts"
json_dir = "/datasets/cgn/EGOCOM/rev_raw_json_transcripts"

In [4]:
# Load transcript JSON data into a dict keyed by file name without its extension.
json_results = {}
for fn in sorted(fn_ for fn_ in os.listdir(json_dir) if not fn_.startswith(".")):  # skip hidden files
    with open(os.path.join(json_dir, fn), 'r') as f:
        # Parse with the json module instead of eval(): eval() would execute
        # arbitrary code found in the file and cannot read the JSON literals
        # true / false / null. json.load also accepts multi-line documents.
        json_results[os.path.splitext(fn)[0]] = json.load(f)

In [5]:
# Build one DataFrame per conversation (one row per transcript element), then
# stack them into a single DataFrame holding all transcriptions.

dfs = []
for video_name in sorted(json_results):
    monologues = json_results[video_name]['monologues']
    # Tag every element (in place) with the speaker of the monologue it belongs to.
    for monologue in monologues:
        for element in monologue['elements']:
            element["speaker_id"] = monologue["speaker"]
    elements = [element for monologue in monologues for element in monologue['elements']]
    conv_df = pd.DataFrame(elements).reset_index(drop=True)
    conv_df["conversation_id"] = [video_name] * len(conv_df)
    # Timestamps are converted to seconds; non-string values pass through unchanged.
    conv_df['startTime'] = conv_df['timestamp'].apply(convert_time_str2seconds)
    conv_df['endTime'] = conv_df['end_timestamp'].apply(convert_time_str2seconds)
    conv_df['word'] = conv_df['value']
    dfs.append(conv_df[["conversation_id", "startTime", "speaker_id", "endTime", "word"]])
df = pd.concat(dfs)

# Remove any speakers included in ground truth beyond the number of actual speakers.
# e.g. Sometimes the ground truth (rev.com) includes a speaker for when everyone laughs at once.
extra_speakers = [
    ("day_2__con_3", 2),
    ("day_3__con_6", 4),
    ("day_4__con_2", 4),
    ("day_4__con_4", 2),
    ("day_6__con_6", 4),
]
for conversation, speaker in extra_speakers:
    is_extra = (df["conversation_id"] == conversation) & (df["speaker_id"] == speaker)
    df = df[~is_extra]

In [6]:
# Peek at the first conversation only: its id and the speaker ids it contains
# (taken from the per-conversation frames, i.e. before any filtering/remapping).
for first_df in dfs[:1]:
    conversation_id = first_df["conversation_id"][1]
    print(conversation_id, ",", first_df.speaker_id.unique())

day_1__con_1__part1 , [1 2 3]


In [7]:
# Sanity check: no conversation may contain more than three distinct speaker ids.
for conversation, group in df.groupby('conversation_id'):
    n_speakers = len(np.unique(group['speaker_id']))
    assert n_speakers <= 3

In [8]:
# Hand-built lookup used to relabel speakers: for each conversation_id, maps the
# speaker id found in the rev.com transcript (key) to the speaker id that is
# written to the output (value).
# NOTE(review): a few conversations use rev.com id 4 in place of 2 or 3 -- this
# presumably reflects how rev.com numbered the speakers there; confirm when editing.
idmap = {
    'day_1__con_1__part1': {1: 1, 2: 3, 3: 2},
    'day_1__con_1__part2': {1: 2, 2: 3, 3: 1},
    'day_1__con_1__part3': {1: 2, 2: 3, 3: 1},
    'day_1__con_1__part4': {1: 1, 2: 2, 3: 3},
    'day_1__con_1__part5': {1: 3, 2: 1, 3: 2},
    'day_1__con_2__part1': {1: 1, 2: 2, 3: 3},
    'day_1__con_2__part2': {1: 1, 2: 3, 3: 2},
    'day_1__con_2__part3': {1: 3, 2: 1, 3: 2},
    'day_1__con_2__part4': {1: 1, 2: 2, 3: 3},
    'day_1__con_2__part5': {1: 2, 2: 1, 3: 3},
    'day_1__con_3__part1': {1: 1, 2: 3, 3: 2},
    'day_1__con_3__part2': {1: 1, 2: 3, 3: 2},
    'day_1__con_3__part3': {1: 1, 2: 2, 3: 3},
    'day_1__con_3__part4': {1: 1, 2: 2, 3: 3},
    'day_1__con_4__part1': {1: 1, 2: 3, 3: 2},
    'day_1__con_4__part2': {1: 3, 2: 1, 3: 2},
    'day_1__con_4__part3': {1: 3, 2: 1, 3: 2},
    'day_1__con_4__part4': {1: 3, 2: 2, 3: 1},
    'day_1__con_5__part1': {1: 1, 2: 3, 3: 2},
    'day_1__con_5__part2': {1: 1, 2: 3, 3: 2},
    'day_1__con_5__part3': {1: 2, 2: 1, 3: 3},
    'day_1__con_5__part4': {1: 1, 2: 3, 3: 2},
    'day_1__con_5__part5': {1: 2, 2: 1, 3: 3},
    'day_2__con_1__part1': {1: 1, 2: 2, 3: 3},
    'day_2__con_1__part2': {1: 3, 2: 1, 3: 2},
    'day_2__con_1__part3': {1: 2, 2: 1, 3: 3},
    'day_2__con_1__part4': {1: 2, 2: 1, 3: 3},
    'day_2__con_1__part5': {1: 1, 2: 3, 3: 2},
    'day_2__con_2__part1': {1: 1, 2: 2, 3: 3},
    'day_2__con_2__part2': {1: 2, 2: 1, 3: 3},
    'day_2__con_2__part3': {1: 2, 2: 1, 3: 3},
    'day_2__con_2__part4': {1: 1, 2: 2, 3: 3},
    'day_2__con_3': {1: 1, 4: 2, 3: 3},
    'day_2__con_4': {1: 1, 2: 3, 3: 2},
    'day_2__con_5': {1: 1, 2: 3, 3: 2},
    'day_2__con_6': {1: 1, 2: 3, 3: 2},
    'day_2__con_7': {1: 1, 2: 3, 3: 2},
    'day_3__con_1': {1: 1, 2: 3, 3: 2},
    'day_3__con_2': {1: 1, 2: 2, 3: 3},
    'day_3__con_3': {1: 3, 2: 1, 3: 2},
    'day_3__con_4': {1: 1, 2: 2, 4: 3},
    'day_3__con_5': {1: 1, 2: 3, 3: 2},
    'day_3__con_6': {1: 1, 2: 2, 3: 3},
    'day_4__con_1': {1: 1, 2: 2, 3: 3},
    'day_4__con_2': {1: 1, 2: 3, 3: 2},
    'day_4__con_3': {1: 1, 2: 2, 3: 3},
    'day_4__con_4': {1: 1, 4: 2, 3: 3},
    'day_4__con_5': {1: 1, 2: 2, 3: 3},
    'day_4__con_6': {1: 1, 2: 2, 3: 3},
    'day_5__con_1': {1: 3, 2: 2, 3: 1},
    'day_5__con_2': {1: 1, 2: 2, 3: 3},
    'day_5__con_3': {1: 2, 2: 1, 4: 3},
    'day_5__con_4': {1: 1, 2: 3, 3: 2},
    'day_5__con_5': {1: 1, 2: 3, 3: 2},
    'day_5__con_6': {1: 1, 2: 2, 3: 3},
    'day_5__con_7': {1: 1, 2: 3, 3: 2},
    'day_5__con_8': {1: 2, 2: 3, 3: 1},
    'day_6__con_1': {1: 1, 2: 3, 3: 2},
    'day_6__con_2': {1: 1, 2: 2, 3: 3},
    'day_6__con_3': {1: 1, 2: 3, 3: 2},
    'day_6__con_4': {1: 1, 2: 2, 3: 3},
    'day_6__con_5': {1: 1, 2: 3, 3: 2},
    'day_6__con_6': {1: 1, 2: 2, 3: 3},
}

# Map the speaker ids from rev.com to the correct speakers, one conversation
# at a time, then stitch the conversations back together.
subdfs = []
for key, subdf in df.groupby('conversation_id'):
    print(key, end = " | ")
    # Work on an explicit copy: assigning a column directly on the group frame
    # handed out by groupby triggers pandas' SettingWithCopyWarning.
    subdf = subdf.copy()
    # Missing speaker ids stay NaN; an id absent from idmap[key] raises KeyError.
    subdf["speaker_id"] = subdf["speaker_id"].apply(lambda x: np.nan if np.isnan(x) else idmap[key][int(x)])
    subdfs.append(subdf)
df = pd.concat(subdfs)

day_1__con_1__part1 | day_1__con_1__part2 | day_1__con_1__part3 | day_1__con_1__part4 | day_1__con_1__part5 | day_1__con_2__part1 | day_1__con_2__part2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


 | day_1__con_2__part3 | day_1__con_2__part4 | day_1__con_2__part5 | day_1__con_3__part1 | day_1__con_3__part2 | day_1__con_3__part3 | day_1__con_3__part4 | day_1__con_4__part1 | day_1__con_4__part2 | day_1__con_4__part3 | day_1__con_4__part4 | day_1__con_5__part1 | day_1__con_5__part2 | day_1__con_5__part3 | day_1__con_5__part4 | day_1__con_5__part5 | day_2__con_1__part1 | day_2__con_1__part2 | day_2__con_1__part3 | day_2__con_1__part4 | day_2__con_1__part5 | day_2__con_2__part1 | day_2__con_2__part2 | day_2__con_2__part3 | day_2__con_2__part4 | day_2__con_3 | day_2__con_4 | day_2__con_5 | day_2__con_6 | day_2__con_7 | day_3__con_1 | day_3__con_2 | day_3__con_3 | day_3__con_4 | day_3__con_5 | day_3__con_6 | day_4__con_1 | day_4__con_2 | day_4__con_3 | day_4__con_4 | day_4__con_5 | day_4__con_6 | day_5__con_1 | day_5__con_2 | day_5__con_3 | day_5__con_4 | day_5__con_5 | day_5__con_6 | day_5__con_7 | day_5__con_8 | day_6__con_1 | day_6__con_2 | day_6__con_3 | day_6__con_4 | day_6__con_5

In [9]:
# Show the first 150 characters of each speaker's text to help create the idmap
# (only the first conversation in idmap is displayed).
for key in list(idmap)[:1]:
    print(key)
    speaker_groups = df[df['conversation_id'] == key].groupby("speaker_id")
    for s, sdf in speaker_groups:
        preview = " ".join(sdf["word"])
        print(s, preview[:150])
    print(idmap[key])
    print()

day_1__con_1__part1
1 Okay.   So,   I   have   some   topics   in   my   hand   and   I'll   start   with,   "Name   three   things   that   we   all   have   in   common  
2 Well,   none   of   us   hate   the   color   blue. (laughs) The   office   is   always   so   cold,   though.   Like- ...   I   go   outside   and   
3 Hmm. Mm-hmm   (affirmative). Curtis,   why   didn't   you   wear   pants   today?   Then   we   could   all   be   wearing   pants. Hmm. (laughs) No? 
{1: 1, 2: 3, 3: 2}



In [10]:
# Number of transcription rows attributed to each speaker id.
df['speaker_id'].value_counts()

1    142354
3     68108
2     66915
Name: speaker_id, dtype: int64

In [12]:
# Fix the multiple tokens on one line, then write the result to
# ground_truth_transcriptions.csv inside csv_loc.
# csv_loc = "/Users/cgn/Dropbox (Facebook)/EGOCOM/"
csv_loc = "/datasets/cgn/EGOCOM/"
from egocom.word_error_rate_analysis import process_transcript_data
print('Total transcriptions before post-processing:', len(df))
# Every optional flag of process_transcript_data is passed as False here.
df = process_transcript_data(
    df,
    remove_capitalization=False,
    remove_filler_words=False,
    replace_numbers_with_words=False,
    remove_spaces=False,
)
print('Total transcriptions after post-processing:', len(df))
# os.path.join yields a valid path whether or not csv_loc ends with a separator.
df.to_csv(os.path.join(csv_loc, 'ground_truth_transcriptions.csv'), index = False)

Total transcriptions before post-processing: 277377
Original length | 277377
After splitting words with spaces into seperate rows | 415913
After replacing empty strings with spaces | 415913
After removing duplicate rows containing only spaces | 309640
After 1900s. --> [1900, s, .] and they've --> [they, ', ve] | 359538
Total transcriptions after post-processing: 359538
