# This script is used to extract the HUMAN GROUND TRUTH TRANSCRIPTIONS from rev.com servers via HTTP FETCH requests and create the EGOCOM/ground_truth_transcriptions.csv

In [1]:
from __future__ import print_function, absolute_import, division, unicode_literals, with_statement # Python 2 compatibility

import requests
import pandas as pd
import numpy as np

In [2]:
def convert_time_str2seconds(x):
    if type(x) is str:
        return sum(float(x) * 60 ** i for i,x in enumerate(reversed(x.replace(",", ".").split(":"))))
    return x

In [3]:
do_not_include = "TC0160723531" # This was a test to make sure the system works.

In [4]:
# First, we fetch all the order_ids. Each video transcribed has its own order_id.

# curl -X GET   https://www.rev.com/api/v1/orders/TC0160723531 -H 'Authorization: Rev zo2F5eieeq9sWOYRdTHPACuINO4:SdkRAOAPZPHStVkrSO1EcIe67NM='
headers = {'Authorization': 'Rev zo2F5eieeq9sWOYRdTHPACuINO4:SdkRAOAPZPHStVkrSO1EcIe67NM='}
response = requests.get('https://www.rev.com/api/v1/orders/', headers=headers, params = {'pageSize':100})
d = response.json()
order_ids = [z['order_number'] for z in d['orders'] if z['order_number'] != do_not_include]
print('Number of orders:', len(order_ids))

Number of orders: 63


In [5]:
json_results = {}
for order_id in order_ids:
    # Get transcript id.

    # curl -X GET   https://www.rev.com/api/v1/orders/TC0160723531 -H 'Authorization: Rev zo2F5eieeq9sWOYRdTHPACuINO4:SdkRAOAPZPHStVkrSO1EcIe67NM='
    headers = {'Authorization': 'Rev zo2F5eieeq9sWOYRdTHPACuINO4:SdkRAOAPZPHStVkrSO1EcIe67NM='}
    response = requests.get('https://www.rev.com/api/v1/orders/{oid}'.format(oid = order_id), headers=headers)
    d = response.json()
        
    video_name = d['attachments'][0]['name']
    
    print(order_id, video_name)

TC0695641954 day_6__con_6.mp4
TC0680724528 day_6__con_5.mp4
TC0814260226 day_6__con_3.mp4
TC0660505040 day_6__con_2.mp4
TC0656945670 day_6__con_4.mp4
TC0913219738 day_6__con_1.mp4
TC1041326357 day_5__con_8.mp4
TC0686510667 day_5__con_7.mp4
TC1056511204 day_5__con_6.mp4
TC0923591085 day_5__con_5.mp4
TC0708183445 day_5__con_4.mp4
TC0584165371 day_5__con_2.mp4
TC0972180451 day_5__con_3.mp4
TC0996234767 day_5__con_1.mp4
TC0751188622 day_4__con_5.mp4
TC0546171684 day_4__con_6.mp4
TC0950680326 day_4__con_4.mp4
TC0891972543 day_4__con_3.mp4
TC0784141698 day_4__con_2.mp4
TC0910303778 day_4__con_1.mp4
TC0701092819 day_3__con_5.mp4
TC0852210878 day_3__con_6.mp4
TC0698778753 day_3__con_4.mp4
TC0877667951 day_3__con_3.mp4
TC0663679227 day_3__con_2.mp4
TC0838697727 day_3__con_1.mp4
TC0703949877 day_2__con_7.mp4
TC0894594926 day_2__con_6.mp4
TC0655362525 day_2__con_4.mp4
TC0863189618 day_2__con_3.mp4
TC0819718827 day_2__con_5.mp4
TC0775336185 day_1__con_5__part3.mp4
TC1045995822 day_1__con_5__part4.

In [6]:
%%time

# Next, we fetch the raw JSON containing the transcript information for each video.

json_results = {}
for order_id in order_ids:
    # Get transcript id.

    # curl -X GET   https://www.rev.com/api/v1/orders/TC0160723531 -H 'Authorization: Rev zo2F5eieeq9sWOYRdTHPACuINO4:SdkRAOAPZPHStVkrSO1EcIe67NM='
    headers = {'Authorization': 'Rev zo2F5eieeq9sWOYRdTHPACuINO4:SdkRAOAPZPHStVkrSO1EcIe67NM='}
    response = requests.get('https://www.rev.com/api/v1/orders/{oid}'.format(oid = order_id), headers=headers)
    d = response.json()
    
    if len(d['attachments']) <= 1 or 'id' not in d['attachments'][1]:
        print('Order', order_id, 'is not ready yet.')
        continue
    
    transcript_id = d['attachments'][1]['id']
    video_name = d['attachments'][0]['name']
    
    # Get transcript data.

    # curl -X GET   https://www.rev.com/api/v1/attachments/S3KUCTCGggEAAAAABQAAAA/content -H 'Authorization: Rev zo2F5eieeq9sWOYRdTHPACuINO4:SdkRAOAPZPHStVkrSO1EcIe67NM='
    headers = {'Authorization': 'Rev zo2F5eieeq9sWOYRdTHPACuINO4:SdkRAOAPZPHStVkrSO1EcIe67NM='}
    response = requests.get('https://www.rev.com/api/v1/attachments/{tid}/content'.format(tid=transcript_id), headers=headers)
    json_results[video_name] = response.json()

CPU times: user 2.7 s, sys: 320 ms, total: 3.02 s
Wall time: 45.7 s


In [9]:
# Finally, we transform the all JSON data into a Pandas DataFrame organizing all transcriptions.

dfs = []
for video_name in sorted(json_results.keys()):  
    d = json_results[video_name]
    [t.update({"speaker":sentence["speaker"]}) for sentence in d['monologues'] for t in sentence['elements']]
    lod = [z for sent in d['monologues'] for z in sent['elements']]
    df = pd.DataFrame(lod)
    df.reset_index(drop=True, inplace=True)
    df["key"] = [video_name[:-4]] * len(df)
    df['startTime'] = df['timestamp'].apply(lambda x: convert_time_str2seconds(x)) 
    df['endTime'] = df['end_timestamp'].apply(lambda x: convert_time_str2seconds(x)) 
    df['word'] = df['value']
    df = df[["key", "startTime", "speaker", "endTime", "word"]]
    dfs.append(df)
df = pd.concat(dfs)

# Remove any speakers included in ground truth beyond the number of actual speakers.
# e.g. Sometimes the ground truth (rev.com) includes a speaker for when everyone laughs at once.
df = df[~((df["key"] == "day_2__con_3") & (df["speaker"] == 2))]
df = df[~((df["key"] == "day_3__con_6") & (df["speaker"] == 4))]
df = df[~((df["key"] == "day_4__con_2") & (df["speaker"] == 4))]
df = df[~((df["key"] == "day_4__con_4") & (df["speaker"] == 2))]
df = df[~((df["key"] == "day_6__con_6") & (df["speaker"] == 4))]

In [10]:
for key, sdf in df.groupby('key'):
    assert(len(np.unique(sdf['speaker'])) <= 3)

In [17]:
for key in idmap:
    print(key)
    for s, sdf in df[df['key'] == key].groupby("speaker"):
        print(s, " ".join(sdf["word"])[:150])
    print()
    break

day_1__con_1__part1
1 Okay .  So ,  I   have   some   topics   in   my   hand   and   I'll   start   with , " Name   three   things   that   we   all   have   in   common  
2 Hmm . Mm - hmm  (affirmative). Curtis ,  why   didn't   you   wear   pants   today ?  Then   we   could   all   be   wearing   pants . Hmm . ( laughs 
3 Well ,  none   of   us   hate   the   color   blue . (laughs) The   office   is   always   so   cold ,  though .  Like - ...  I   go   outside   and  



In [11]:
idmap = {
    'day_1__con_1__part1': {1: 1, 2: 3, 3: 2},
    'day_1__con_1__part2': {1: 2, 2: 3, 3: 1},
    'day_1__con_1__part3': {1: 2, 2: 3, 3: 1},
    'day_1__con_1__part4': {1: 1, 2: 2, 3: 3},
    'day_1__con_1__part5': {1: 3, 2: 1, 3: 2},
    'day_1__con_2__part1': {1: 1, 2: 2, 3: 3},
    'day_1__con_2__part2': {1: 1, 2: 3, 3: 2},
    'day_1__con_2__part3': {1: 3, 2: 1, 3: 2},
    'day_1__con_2__part4': {1: 1, 2: 2, 3: 3},
    'day_1__con_2__part5': {1: 2, 2: 1, 3: 3},
    'day_1__con_3__part1': {1: 1, 2: 3, 3: 2},
    'day_1__con_3__part2': {1: 1, 2: 3, 3: 2},
    'day_1__con_3__part3': {1: 1, 2: 2, 3: 3},
    'day_1__con_3__part4': {1: 1, 2: 2, 3: 3},
    'day_1__con_4__part1': {1: 1, 2: 3, 3: 2},
    'day_1__con_4__part2': {1: 3, 2: 1, 3: 2},
    'day_1__con_4__part3': {1: 3, 2: 1, 3: 2},
    'day_1__con_4__part4': {1: 3, 2: 2, 3: 1},
    'day_1__con_5__part1': {1: 1, 2: 3, 3: 2},
    'day_1__con_5__part2': {1: 1, 2: 3, 3: 2},
    'day_1__con_5__part3': {1: 2, 2: 1, 3: 3},
    'day_1__con_5__part4': {1: 1, 2: 3, 3: 2},
    'day_1__con_5__part5': {1: 2, 2: 1, 3: 3},
    'day_2__con_1__part1': {1: 1, 2: 2, 3: 3},
    'day_2__con_1__part2': {1: 3, 2: 1, 3: 2},
    'day_2__con_1__part3': {1: 2, 2: 1, 3: 3},
    'day_2__con_1__part4': {1: 2, 2: 1, 3: 3},
    'day_2__con_1__part5': {1: 1, 2: 3, 3: 2},
    'day_2__con_2__part1': {1: 1, 2: 2, 3: 3},
    'day_2__con_2__part2': {1: 2, 2: 1, 3: 3},
    'day_2__con_2__part3': {1: 2, 2: 1, 3: 3},
    'day_2__con_2__part4': {1: 1, 2: 2, 3: 3},
    'day_2__con_3': {1: 1, 4: 2, 3: 3},
    'day_2__con_4': {1: 1, 2: 3, 3: 2},
    'day_2__con_5': {1: 1, 2: 3, 3: 2},
    'day_2__con_6': {1: 1, 2: 3, 3: 2},
    'day_2__con_7': {1: 1, 2: 3, 3: 2},
    'day_3__con_1': {1: 1, 2: 3, 3: 2},
    'day_3__con_2': {1: 1, 2: 2, 3: 3},
    'day_3__con_3': {1: 3, 2: 1, 3: 2},
    'day_3__con_4': {1: 1, 2: 2, 4: 3},
    'day_3__con_5': {1: 1, 2: 3, 3: 2},
    'day_3__con_6': {1: 1, 2: 2, 3: 3},
    'day_4__con_1': {1: 1, 2: 2, 3: 3},
    'day_4__con_2': {1: 1, 2: 3, 3: 2},
    'day_4__con_3': {1: 1, 2: 2, 3: 3},
    'day_4__con_4': {1: 1, 4: 2, 3: 3},
    'day_4__con_5': {1: 1, 2: 3, 3: 2},
    'day_4__con_6': {1: 1, 2: 2, 3: 3},
    'day_5__con_1': {1: 3, 2: 2, 3: 1},
    'day_5__con_2': {1: 1, 2: 2, 3: 3},
    'day_5__con_3': {1: 2, 2: 1, 4: 3},
    'day_5__con_4': {1: 1, 2: 3, 3: 2},
    'day_5__con_5': {1: 1, 2: 3, 3: 2},
    'day_5__con_6': {1: 1, 2: 2, 3: 3},
    'day_5__con_7': {1: 1, 2: 3, 3: 2},
    'day_5__con_8': {1: 2, 2: 3, 3: 1},
    'day_6__con_1': {1: 1, 2: 3, 3: 2},
    'day_6__con_2': {1: 1, 2: 2, 3: 3},
    'day_6__con_3': {1: 1, 2: 3, 3: 2},
    'day_6__con_4': {1: 1, 2: 2, 3: 3},
    'day_6__con_5': {1: 1, 2: 3, 3: 2},
    'day_6__con_6': {1: 1, 2: 2, 3: 3},
}

# Map the speaker ids from rev.com to the corect speakers
subdfs = []
for key, subdf in df.groupby('key'):
    print(key, end = " | ")
    subdf["speaker"] = subdf["speaker"].apply(lambda x: np.nan if np.isnan(x) else idmap[key][int(x)])
    subdfs.append(subdf)
df = pd.concat(subdfs)

day_1__con_1__part1 | day_1__con_1__part2 | day_1__con_1__part3 | day_1__con_1__part4 | 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


day_1__con_1__part5 | day_1__con_2__part1 | day_1__con_2__part2 | day_1__con_2__part3 | day_1__con_2__part4 | day_1__con_2__part5 | day_1__con_3__part1 | day_1__con_3__part2 | day_1__con_3__part3 | day_1__con_3__part4 | day_1__con_4__part1 | day_1__con_4__part2 | day_1__con_4__part3 | day_1__con_4__part4 | day_1__con_5__part1 | day_1__con_5__part2 | day_1__con_5__part3 | day_1__con_5__part4 | day_1__con_5__part5 | day_2__con_1__part1 | day_2__con_1__part2 | day_2__con_1__part3 | day_2__con_1__part4 | day_2__con_1__part5 | day_2__con_2__part1 | day_2__con_2__part2 | day_2__con_2__part3 | day_2__con_2__part4 | day_2__con_3 | day_2__con_4 | day_2__con_5 | day_2__con_6 | day_2__con_7 | day_3__con_1 | day_3__con_2 | day_3__con_3 | day_3__con_4 | day_3__con_5 | day_3__con_6 | day_4__con_1 | day_4__con_2 | day_4__con_3 | day_4__con_4 | day_4__con_5 | day_4__con_6 | day_5__con_1 | day_5__con_2 | day_5__con_3 | day_5__con_4 | day_5__con_5 | day_5__con_6 | day_5__con_7 | day_5__con_8 | day_6__co

In [23]:
df[df['key'] == 'day_6__con_6'].head()

Unnamed: 0,key,startTime,speaker,endTime,word
0,day_6__con_6,0.0,1,0.28,Alright
1,day_6__con_6,,1,,","
2,day_6__con_6,0.44,1,0.58,here
3,day_6__con_6,,1,,
4,day_6__con_6,0.58,1,0.6,we


In [53]:
df['speaker'].value_counts()

1    139030
3     64384
2     64107
Name: speaker, dtype: int64

In [None]:
# Fix the multiple tokens on one line
# from word_error_rate_analysis import process_transcript_data
# df = process_transcript_data(df, remove_actions=False, remove_capitalization=False, remove_filler_words=False, replace_numbers_with_words=False, remove_spaces=False)
# df.to_csv("/home/cgn/Downloads/egocom-transcription-csv/" + 'ground_truth_transcriptions.csv', index = False)

In [2]:
csv_loc = "/Users/cgn/Dropbox (Facebook)/EGOCOM/"
# df.to_csv(csv_loc + 'ground_truth_transcriptions.csv', index = False)
print('Total transcriptions:', len(df))

Total transcriptions: 267521


In [54]:
csv_loc = "/Users/cgn/Dropbox (Facebook)/EGOCOM/"
df.to_csv(csv_loc + 'ground_truth_transcriptions.csv', index = False)
print('Total transcriptions:', len(df))

Total transcriptions: 267521
