In [1]:
# this workboook illustrates how to extract data from the JSON file returned by videointelligence
# the example transcript was generated from the industry archives file:
# YOUTH SMOKING PREVENTION 7 SPOTS
# https://archive.org/details/tobacco_wbr62a00

In [2]:
import pandas as pd
import glob
import json
import numpy as np
from pandasql import sqldf

In [3]:
pysqldf = lambda q: sqldf(q, globals())

In [4]:
files = glob.glob('sample_json/*.json', recursive=True)

In [5]:
#len(files)

In [6]:
#files[0]

In [7]:
#f = open(files[0])
#data = json.load(f) 

In [8]:
#data.keys()

In [9]:
#data['annotation_results'][0].keys()

In [10]:
#data['annotation_results'][1].keys()

In [11]:
#data['annotation_results'][0]['input_uri']

In [12]:
#data['annotation_results'][0]['segment_label_annotations'][0]['entity']['description']

In [13]:
#data['annotation_results'][0]['shot_label_annotations'][0]['entity']['description']

In [14]:
#data['annotation_results'][0]['text_annotations'][0]['text']

In [15]:
#data['annotation_results'][1]['speech_transcriptions'][2]['alternatives'][0]['transcript']

In [16]:
#data['annotation_results'][1]['speech_transcriptions'][2]['alternatives'][0]['confidence']

In [17]:
label_annotations = []
text_annotations = []
logo_annotations = []
speech_annotations = []

for file in files:
    
    f = open(file)
    data = json.load(f) 
    
    for i in range(len(data['annotation_results'])):
    
        ar = data['annotation_results'][i]

        uri = ar['input_uri']

        if 'shot_label_annotations' in ar:

            slas = []
            for sla in ar['shot_label_annotations']:
                label = sla['entity']['description']
                slas.append(label)

            label_annotations.append((uri, ','.join(slas)))

        if 'text_annotations' in ar:
            tas = []
            for ta in ar['text_annotations']:
                text = ta['text']
                tas.append(text)

            text_annotations.append((uri, ' '.join(tas)))

        if 'logo_recognition_annotations' in ar.keys():                         
            lras = []
            for lra in ar['logo_recognition_annotations']:                  
                logo = lra['entity']['description']
                lras.append(logo)

            logo_annotations.append((uri, ','.join(lras)))


        #['speech_transcriptions'][2]['alternatives'][0]['transcript']
        if 'speech_transcriptions' in ar:

            sts = []
            scs = []

            for st in ar['speech_transcriptions']:

                alt_conf = st['alternatives'][0]

                if 'transcript' in alt_conf:
                    speech = alt_conf['transcript']
                    speech_confidence = alt_conf['confidence']
                    #print(speech_confidence)
                    sts.append(speech)
                    if speech_confidence == []:
                        speech_confidence = 0
                        
                    scs.append(float(speech_confidence))

            speech_annotations.append((uri, ''.join(sts), np.mean(scs)))
        
    #print(file)

In [18]:
df_label_annotations = pd.DataFrame(label_annotations, columns=['uri', 'label'])
df_text_annotations = pd.DataFrame(text_annotations, columns=['uri', 'text'])
df_logo_annotations = pd.DataFrame(logo_annotations, columns=['uri', 'logo'])
df_speech_annotations = pd.DataFrame(speech_annotations, columns=['uri', 'transcript', 'confidence'])

In [19]:
df_annotations = pysqldf("""
SELECT 
    sa.uri,
    sa.transcript,
    sa.confidence,
    ta.text,
    la.label,
    lga.logo
FROM
    df_speech_annotations sa
LEFT JOIN
    df_label_annotations la
ON
    sa.uri = la.uri
LEFT JOIN
    df_text_annotations ta
ON
    sa.uri = ta.uri
LEFT JOIN
    df_logo_annotations lga
ON
    sa.uri = lga.uri
""")

In [20]:
df_annotations.insert(0, 'file_name', df_annotations['uri'].str.split('/').str[-1][:-4])

In [21]:
df_annotations.insert(0, 'identifier', df_annotations['uri'].str.split('/').str[-2])

In [22]:
df_annotations

Unnamed: 0,identifier,file_name,uri,transcript,confidence,text,label,logo
0,input-idl-video-files,,/input-idl-video-files/tobacco_demo.mp4,I think smoking makes you look cool. No way. W...,0.825757,P MErtis USA SUOLON GENERAL'S WARNING: Smoking...,"black hair,finger,motor vehicle,sports,car,can...","Think Mutual Bank,Isuzu Philippines,New York J..."


In [23]:
df_annotations['text_content'] = df_annotations['transcript']

In [24]:
df_annotations.to_csv('sample_json/sample_json.csv', index=False)

In [26]:
df_annotations['transcript'][0]

"I think smoking makes you look cool. No way. What are you looking at?Hey Kristi. Thanks smoking makes you look cool. Hey.People think that kids are like jumping now always telling us what to do or not to do, but we know what's going on. We listened like smoking. We all know the reasons not to butt out here comes up. That's when we make the real decision for us. It's not cool to smoke. That's not even the hardest decision I've ever made. So how about a little credit for a change?My parents they always think I'm not listening. I hear him. Wait a minute, you know every day you have to deal with stuff on your own. So you want to know why I've decided not to smoke which reason would you like? I don't think the smoke the prove myself. My coolness is not on trial here. I don't smoke because sometimes it's what you don't do that makes you who you are.These are the kids that I hang out with and they're all in different ways a lot of like to because you know, we all deal with the same stuff sam

In [27]:
df_annotations['confidence'][0]

0.825757140909091

In [32]:
print(df_annotations['text'][0])



In [34]:
df_annotations['label'][0]

'black hair,finger,motor vehicle,sports,car,cannon,forehead,people,film noir,mouth,nose,head,chin,lip,human,interaction,eyebrow,barefoot,television program,soldier,smoking,tree,conversation,street,black and white,black,text,hair,emotion,song,pedestrian,television advertisement,smile,monochrome,monochrome photography,hand,leg,display device,facial expression,neck,foot,military,ear,extreme sport,happiness,nail,public space,eye,cheek,animal,vehicle'

In [35]:
df_annotations['logo'][0]

'Think Mutual Bank,Isuzu Philippines,New York Jets,New York Giants'