In [None]:
from google.cloud import storage
import json
import numpy as np
import pandas as pd

In [None]:
# Name of the GCS bucket whose blobs are listed and read by the cells below.
bucket_name = "output-idl-json-files"

In [None]:
def list_blobs(bucket_name):
    """Return a list with the name of every blob in the given GCS bucket.

    Args:
        bucket_name: Name of the bucket to enumerate.

    Returns:
        A list holding ``blob.name`` for each blob the client yields.
    """
    gcs = storage.Client()
    return [entry.name for entry in gcs.list_blobs(bucket_name)]

In [None]:
# Collect the names of all blobs in the bucket; the processing loop further
# down reads each one in turn.
json_files = list_blobs(bucket_name)

In [None]:
# Display the blob names that were just listed.
json_files

In [None]:
def read_blob(bucket_name, blob_name):
    """Read a JSON blob from GCS using file-like IO and return its parsed content.

    Args:
        bucket_name: Name of the GCS bucket, e.g. "your-bucket-name".
        blob_name: Name of the object inside the bucket,
            e.g. "storage-object-name".

    Returns:
        Whatever ``json.load`` produces for the blob's contents.

    Raises:
        json.JSONDecodeError: If the blob's contents are not valid JSON.
    """
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(blob_name)

    # JSON is UTF-8 by specification; pin the encoding so decoding does not
    # depend on the locale of the machine running the kernel.
    with blob.open("r", encoding="utf-8") as f:
        return json.load(f)

In [None]:
# Load one sample annotation file so the following cells can explore its
# structure. Uses the shared bucket_name instead of repeating the literal.
res = read_blob(bucket_name, "archive.org/download/tobacco_kon79e00/VTS_01_1_512kb.mp4.json")

In [None]:
# Show the top-level keys of the second annotation result (index 1).
res['annotation_results'][1].keys()

In [None]:
# Peek at the first text annotation (if any) of the second annotation result:
# print the whole entry, then just its 'text' field.
text_annotations = res['annotation_results'][1]['text_annotations']
if len(text_annotations) > 0:
    sla = text_annotations[0]
    print(sla)
    print(sla['text'])

In [None]:
# Look at one shot label annotation (third entry) of the second annotation result.
res['annotation_results'][1]['shot_label_annotations'][2]

In [None]:
#for now, we'll leave these out because they're so numerous, but they could really matter
#len(res['annotation_results'][0]['text_annotations']) 
# (this is almost 3,0000! mainly because of date/time stamps in each segment)

In [None]:
# Entity description of the first logo recognition annotation in the second
# annotation result.
res['annotation_results'][1]['logo_recognition_annotations'][0]['entity']['description']

In [None]:
# Show the top-level keys of the first annotation result (index 0).
res['annotation_results'][0].keys()

In [None]:
# Keys available on the first alternative of the first speech transcription.
res['annotation_results'][0]['speech_transcriptions'][0]['alternatives'][0].keys()

In [None]:
# Transcript text of the first alternative of the first speech transcription.
res['annotation_results'][0]['speech_transcriptions'][0]['alternatives'][0]['transcript']

In [None]:
#res

In [None]:
#json_file['annotation_results'][0]['speech_transcriptions'][0]['alternatives'][0]['transcript']
#json_file['annotation_results'][0]['speech_transcriptions'][0]['alternatives'][0]['confidence']

In [None]:
# Accumulators for the whole run. Each receives [uri, value] rows, at most one
# row per source JSON file, so the later merges on 'uri' stay one-to-one.
transcripts = []
embedded_texts = []
confidences = []
labels = []
uris = []  # not used by the cells visible here; kept in case other code reads it
logos = []

for json_file in json_files:

    print("processing", json_file)
    res = read_blob(bucket_name, json_file)

    # Per-file buffers, filled across ALL annotation results of this file.
    file_transcript = []
    embedded_text = []
    file_confidence = []
    label = []
    logo = []

    # Remember which annotation kinds occurred anywhere in this file. Rows are
    # emitted after the annotation loop: appending inside it produced several
    # (cumulative) rows per uri whenever a kind occurred in more than one
    # annotation result, which the outer merges then multiplied.
    has_text = False
    has_labels = False
    has_logos = False
    has_speech = False

    if 'annotation_results' in res:

        for annotation in res['annotation_results']:

            if 'text_annotations' in annotation:
                has_text = True
                for ta in annotation['text_annotations']:
                    embedded_text.append(ta['text'])

            if 'segment_label_annotations' in annotation:
                has_labels = True
                for sla in annotation['segment_label_annotations']:
                    label.append(sla['entity']['description'])

            if 'logo_recognition_annotations' in annotation:
                has_logos = True
                for lra in annotation['logo_recognition_annotations']:
                    logo.append(lra['entity']['description'])

            if "speech_transcriptions" in annotation:
                has_speech = True
                # The number of alternatives for each transcription is limited by
                # SpeechTranscriptionConfig.max_alternatives.
                # Each alternative is a different possible transcription
                # and has its own confidence score.
                for speech_transcription in annotation['speech_transcriptions']:
                    for alternative in speech_transcription['alternatives']:
                        # Only keep alternatives that carry both fields so the
                        # transcript and confidence lists stay aligned.
                        if 'transcript' in alternative and 'confidence' in alternative:
                            file_transcript.append(alternative['transcript'])
                            file_confidence.append(float(alternative['confidence']))

    if has_text:
        embedded_texts.append([json_file, ','.join(embedded_text)])

    if has_labels:
        labels.append([json_file, ','.join(label)])

    if has_logos:
        logos.append([json_file, ','.join(logo)])

    if has_speech:
        # No usable alternative at all -> report a confidence of 0 rather than
        # taking the mean of an empty list.
        if file_confidence:
            mean_confidence = np.mean(file_confidence)
        else:
            mean_confidence = 0

        confidences.append([json_file, mean_confidence])
        transcripts.append([json_file, ''.join(file_transcript)])

In [None]:
# Number of [uri, text] rows collected for embedded (on-screen) text.
len(embedded_texts)

In [None]:
#embedded_texts[0]

In [None]:
# Turn each list of [uri, value] rows into a two-column table keyed by 'uri'.
df_transcripts = pd.DataFrame(data=transcripts, columns=["uri", "transcript"])
df_confidences = pd.DataFrame(data=confidences, columns=["uri", "confidence"])
df_labels = pd.DataFrame(data=labels, columns=["uri", "labels"])
df_logos = pd.DataFrame(data=logos, columns=["uri", "logos"])

In [None]:
# Two-column table of the embedded text collected per file, keyed by 'uri'.
df_embedded_texts = pd.DataFrame(data=embedded_texts, columns=["uri", "embedded_texts"])

In [None]:
# Not every file has every kind of annotation (e.g. labels or logos), so the
# per-kind tables are combined with outer joins on 'uri' to keep every file.
df_transcripts_annotations = (
    df_transcripts
    .merge(df_confidences, on='uri', how='outer')
    .merge(df_labels, on='uri', how='outer')
    .merge(df_logos, on='uri', how='outer')
    .merge(df_embedded_texts, on='uri', how='outer')
)

In [None]:
# Preview the first row of the merged table.
df_transcripts_annotations.head(1)

In [None]:
#df_transcripts_annotations.iloc[9]

In [None]:
#df_transcripts_annotations.iloc[9]['uri']

In [None]:
#df_transcripts_annotations.iloc[9]['logos']

In [None]:
#df_transcripts_annotations.iloc[9]['labels']

In [None]:
#df_transcripts_annotations.iloc[9]['transcript']

In [None]:
# Serialize the merged table to CSV and upload it to the 'pandas_output' bucket.
client = storage.Client()
bucket = client.get_bucket('pandas_output')

output_blob = bucket.blob('transcripts-confidences.csv')
output_blob.upload_from_string(
    df_transcripts_annotations.to_csv(),
    content_type='text/csv',
)