In [153]:
import google.cloud.vision as gcv
from google.cloud import videointelligence_v1p1beta1 as videointelligence
from google.oauth2 import service_account
from google.protobuf.json_format import MessageToDict
import json
import webcolors
import langcodes
from collections import defaultdict

In [86]:
web_detection_params = gcv.types.WebDetectionParams(include_geo_results=True)
image_context = gcv.types.ImageContext(web_detection_params=web_detection_params)

In [91]:
# credentials must be loaded as below, otherwise there will be an error
credentials = service_account.Credentials.from_service_account_file('credentials/ArnottsAU-8ed53827c907.json')

In [4]:
client = gcv.ImageAnnotatorClient(credentials=credentials)

In [7]:
# read image file as binary
img = gcv.types.Image(content=open('pictures/picture_582887468579563.jpg', 'rb').read())

In [5]:
f = open('pictures/picture_856190784534911.jpg', 'rb').read()

r = MessageToDict(client.annotate_image({'image': 
                       {'content': f}, 'image_context': image_context}), 
                  preserving_proto_field_name = True)

In [6]:
face_feats = 'joy sorrow anger surprise under_exposed blurred headwear'.split()

In [7]:
def count(what):
    _ = r.get(f'{what}_annotations', None)
    print(f'found {len(_) if _ else 0} {what}(s)')

### Face

In [8]:
count('face')

found 1 face(s)


In [9]:
for e in face_feats:
    print(f'{e}: {r["face_annotations"][0].get(e + "_likelihood", None)}')

joy: LIKELY
sorrow: VERY_UNLIKELY
anger: VERY_UNLIKELY
surprise: VERY_UNLIKELY
under_exposed: VERY_UNLIKELY
blurred: VERY_UNLIKELY
headwear: UNLIKELY


### Logos

In [10]:
count('logo')

found 0 logo(s)


In [11]:
r['logo_annotations'][0]['description']

KeyError: 'logo_annotations'

### Labels

In [12]:
count('label')

found 4 label(s)


In [13]:
for l in r['label_annotations']:
    print(f'{l["description"]}, score: {l["score"]}')

shoulder, score: 0.7711181044578552
textile, score: 0.7127899527549744
product, score: 0.6004652380943298
fun, score: 0.5477446913719177


### Text in Picture

In [14]:
count('text')

found 4 text(s)


In [15]:
for i, t in enumerate(r['text_annotations'], 1):
    print(f'#{i} -- language: {langcodes.Language.make(language=t["locale"]).language_name() if "locale" in t  else "?"}, text: {t["description"]}')

#1 -- language: German, text: Timla
im Fall

#2 -- language: ?, text: Timla
#3 -- language: ?, text: im
#4 -- language: ?, text: Fall


### Restricted Themes

In [16]:
r['safe_search_annotation']

{'adult': 'UNLIKELY',
 'spoof': 'VERY_UNLIKELY',
 'medical': 'VERY_UNLIKELY',
 'violence': 'VERY_UNLIKELY',
 'racy': 'POSSIBLE'}

### Colors
higher "scores" means higher confidence that the color in question is prominent in the central focus of the image

In [34]:
def get_closest_color(color):
    
    distance_to_color = []
    
    for k, v in webcolors.css3_hex_to_names.items():
        
        # going through somthing like this: {#f0f8ff: aliceblue, #faebd7: antiquewhite}
        
        r,g,b = webcolors.hex_to_rgb(k)  # this converts #f0f8ff to integer RGB values
        
        distance_to_color.append((v, (r - color[0])**2 + (g - color[1])**2 + (b - color[2])**2))
        
    return min(distance_to_color, key=lambda x: x[1])[0] 

In [40]:
for c in r['image_properties_annotation']['dominant_colors']['colors']:
    print(c['color'], c['pixel_fraction'], c['score'])
    rgb_ = tuple([int(c) for c in (c['color']['red'], c['color']['green'], c['color']['blue'])])
    print(rgb_)
    suggested_color  = get_closest_color(rgb_)
    print('closest color: ', suggested_color)

{'red': 194.0, 'green': 185.0, 'blue': 164.0} 0.15782222151756287 0.19699552655220032
(194, 185, 164)
closest color:  silver
{'red': 137.0, 'green': 122.0, 'blue': 158.0} 0.02791111171245575 0.06410349905490875
(137, 122, 158)
closest color:  lightslategrey
{'red': 100.0, 'green': 83.0, 'blue': 61.0} 0.06573333591222763 0.05252223461866379
(100, 83, 61)
closest color:  darkolivegreen
{'red': 26.0, 'green': 24.0, 'blue': 28.0} 0.0533333346247673 0.043435897678136826
(26, 24, 28)
closest color:  black
{'red': 199.0, 'green': 195.0, 'blue': 186.0} 0.1671111136674881 0.135808527469635
(199, 195, 186)
closest color:  silver
{'red': 173.0, 'green': 163.0, 'blue': 143.0} 0.07844444364309311 0.10183226317167282
(173, 163, 143)
closest color:  rosybrown
{'red': 160.0, 'green': 155.0, 'blue': 148.0} 0.08195555210113525 0.07075007259845734
(160, 155, 148)
closest color:  darkgrey
{'red': 125.0, 'green': 121.0, 'blue': 115.0} 0.07155555486679077 0.0484808050096035
(125, 121, 115)
closest color:  g

In [35]:
get_closest_color((0,121,0))

'green'

In [26]:
r['full_text_annotation']['pages'][0]['property']['detected_languages']

[{'language_code': 'ceb', 'confidence': 1.0}]

### Full Text Annotations

In [199]:
print('pages: ', len(r['full_text_annotation']['pages']))

pages:  1


In [200]:
for p in range(len(r['full_text_annotation']['pages'])):
    print(r['full_text_annotation']['pages'][p]['property']['detected_languages'])

[{'language_code': 'en', 'confidence': 0.7699999809265137}]


In [201]:
r['full_text_annotation']['text']

'TimTam\nSuper Scrummy\nChocolate Tim Tam\nMilkshake\nREARNOTTS\nTimTam\nORIGINAL\n'

### Web Detection
This one gives us 
* web_entities
* visually_similar_images
* best_guess_labels
Note: there's an overall relevancy score for the entity, not normalized and not comparable across different image queries.

In [202]:
web_ents = len(r['web_detection']['web_entities'])
print(f'web entities found: {web_ents}')

web entities found: 10


In [203]:
for e in r['web_detection']['web_entities']:
    print(f'entity: {e["description"] if "description" in e else "?"}, score: {e["score"]}')

entity: Sundae, score: 1.578178882598877
entity: Tim Tam, score: 0.8896999955177307
entity: Liqueur, score: 0.8364270329475403
entity: ?, score: 0.6556000113487244
entity: Arnott's Biscuits, score: 0.6057000160217285
entity: Chocolate, score: 0.5485801696777344
entity: Flavor by Bob Holmes, Jonathan Yen (narrator) (9781515966647), score: 0.5430999994277954
entity: Biscuit, score: 0.5418000221252441
entity: Spain, score: 0.5078999996185303
entity: United Arab Emirates, score: 0.5071499943733215


In [204]:
sim_imgs = len(r['web_detection']['visually_similar_images'])
print(f'visually_similar_images found: {sim_imgs}')

visually_similar_images found: 10


In [205]:
r['web_detection']['visually_similar_images']

[{'url': 'https://lookaside.fbsbx.com/lookaside/crawler/media/?media_id=1916596088657356'},
 {'url': 'https://s314.siliconimg.com/kb/content_images/2017/12/13/1496558/1513170699_709.jpg'},
 {'url': 'http://www.uhainiu.com/content/images/thumbs/000/0000678_timtam-187g.jpeg'},
 {'url': 'https://cbu01.alicdn.com/img/ibank/2017/452/396/4547693254_937679173.jpg'},
 {'url': 'http://www.totallytarget.com/wp-content/uploads/2016/02/tim-tam-1.jpg'},
 {'url': 'https://wx1.sinaimg.cn/orj360/006YBhA8gy1fm1dmmcd4aj31he0u0aep.jpg'},
 {'url': 'https://www.campbellsoupcompany.com/wp-content/uploads/sites/31/2013/11/Tim-Tam-Chocolicious.jpg'},
 {'url': 'https://s1.bukalapak.com/img/139947173/large/Biskuit_Tim_Tam_Chocolate_100g_x_3_pcs.jpg'},
 {'url': 'https://media.apnarm.net.au/media/images/2014/02/16/tim_tam_50_years-lre5y1mnzhl8vk6nnh2_fct621x468_ct677x380.JPG'},
 {'url': 'https://www.c-store.com.au/wp-content/uploads/2015/07/Tim-Tam.jpg'}]

In [206]:
r['web_detection']['best_guess_labels']

[{'label': 'tim tam', 'language_code': 'en'}]

### Localized Objects

In [207]:
for o in r['localized_object_annotations']:
    print(f'{o["name"]}, score: {o["score"]}')

Drink, score: 0.5362362265586853


In [69]:
def annotate():
    """
    using Google Vision API, annotate a photo
    """
    f = open('pictures/picture_582887468579563.jpg', 'rb').read()

    r = MessageToDict(client.annotate_image({'image': 
                             {'content': f}, 'image_context': image_context}), 
                                    preserving_proto_field_name = True)
    annots = defaultdict(lambda: defaultdict())

    """
    faces

    create a dictionary like this: {'faces': {'count': 2, 'face_1': {joy: very_unlikely, sorrow: very_unlikely, 
                                                            anger: very_unlikely}}}
    """

    def _count(what):

        try:
            return int(bool(r.get(f'{what}_annotations', None)))
        except:
            print(f'no {what}s')


    annots['faces']['count'] = _count('face')

    for i, face in enumerate(range(annots['faces']['count'])):
        
        this_face_ = []
        for feature in r['face_annotations'][i]:
            if '_likelihood' in feature:
                if r['face_annotations'][i][feature].lower() in 'likely very_likely'.split():
                    this_face_.append(feature.replace('_likelihood',''))

        if this_face_:
            annots['faces']['face_' + str(i + 1)]

    """
    logos

    """

    annots['logos']['count'] = _count('logo')

    if annots['logos']['count']:
        annots['logos']['descriptions'] = [r['logo_annotations'][i]['description'].lower() 
            for i, logo in enumerate(range(annots['logos']['count']))]


    """
    labels

    these are various labels the API decided to produce, could be anything
    """

    annots['labels']['count'] = _count('label')
    if annots['labels']['count']:
        annots['labels'] = {l['description']: round(l['score'], 3) for l in r['label_annotations']}

    """
    themes

    """
    detected_themes = [theme for theme, likelihood in r['safe_search_annotation'].items() 
                                                    if likelihood.lower() in 'likely very_likely'.split()] 

    if detected_themes:
        annots['restricted_themes'] = detected_themes

    """
    colors

    """

    def _get_closest_color(color):
    
        distance_to_color = []
    
        for k, v in webcolors.css3_hex_to_names.items():
    
            # going through something like this: {#f0f8ff: aliceblue, #faebd7: antiquewhite}
            r,g,b = webcolors.hex_to_rgb(k)  # this converts #f0f8ff to integer RGB values
        
            distance_to_color.append((v, (r - color[0])**2 + (g - color[1])**2 + (b - color[2])**2))
    
        return min(distance_to_color, key=lambda x: x[1])[0]

    clrs = defaultdict(lambda: defaultdict())

    for c in r['image_properties_annotation']['dominant_colors']['colors']:
        # create an RGB tuple and get the closest color from CCS3 palette
        color = _get_closest_color(tuple([int(c) for c in (c['color']['red'], c['color']['green'], c['color']['blue'])]))
        clrs[color]['pixel_fraction'] = round(c['pixel_fraction'], 3)
        clrs[color]['score'] = round(c['score'], 3)

    annots['colors'] = clrs

    """
    full text annotation

    """

    # note that detected languages look like [{'language_code': 'ceb', 'confidence': 1.0}]
    try:
        annots['languages'] = sorted([(l['language_code'], l['confidence']) 
            for p in r['full_text_annotation']['pages'] for l in p['property']['detected_languages']], 
            key=lambda x: x[1], reversed=True)
    except:
        pass

    """
    web detection

    """
    try:
        annots['web_entities'] = {e['description'].lower(): round(e['score'], 3) 
                    for e in r['web_detection']['web_entities'] if 'description' in e}
    except:
        pass

    """
    localized objects

    """
    try:
        annots['objects'] = {o['name'].lower(): round(o['score'], 3) for o in r['localized_object_annotations']}
    except:
        pass

    return annots

In [70]:
res = annotate()

In [71]:
res

defaultdict(<function __main__.annotate.<locals>.<lambda>()>,
            {'faces': defaultdict(None, {'count': 1}),
             'logos': defaultdict(None, {'count': 0}),
             'labels': {'blue': 0.968,
              'pink': 0.957,
              'red': 0.955,
              'shoulder': 0.86,
              'fun': 0.831,
              'girl': 0.791,
              'standing': 0.78,
              'arm': 0.752,
              'human body': 0.741,
              'mouth': 0.675},
             'colors': defaultdict(<function __main__.annotate.<locals>.<lambda>()>,
                         {'lightsteelblue': defaultdict(None,
                                      {'pixel_fraction': 0.122,
                                       'score': 0.161}),
                          'slategrey': defaultdict(None,
                                      {'pixel_fraction': 0.13,
                                       'score': 0.107}),
                          'indianred': defaultdict(None,
               

### Video

In [154]:
client_v = videointelligence.VideoIntelligenceServiceClient(credentials=credentials)

# this API version supports 'EXPLICIT_CONTENT_DETECTION', 'FEATURE_UNSPECIFIED', 
# 'LABEL_DETECTION', 'SHOT_CHANGE_DETECTION', 'SPEECH_TRANSCRIPTION'

In [182]:
# speech to text

In [160]:
sp_config = videointelligence.types.SpeechTranscriptionConfig(language_code='en-US')
vd_context = videointelligence.types.VideoContext(speech_transcription_config=sp_config)

operation = client_v.annotate_video(input_uri='gs://timtamslam_videos/video_679808005510204.mp4', 
                                    features=[videointelligence.enums.Feature.SPEECH_TRANSCRIPTION], video_context=vd_context)
res = operation.result()

In [None]:
video_text = [_.transcript for _ in res.annotation_results[0].speech_transcriptions[0].alternatives]

In [183]:
# labels

In [184]:
operation = client_v.annotate_video(input_uri='gs://timtamslam_videos/video_679808005510204.mp4', 
                                    features=[videointelligence.enums.Feature.LABEL_DETECTION])
res = operation.result()

In [185]:
for ann in res.annotation_results[0].segment_label_annotations:
    desc = ann.entity.description
    cents = ' - '.join([_.description for _ in ann.category_entities])
    print(f'ent: {desc}, cat: {cents}')

ent: food, cat: 
ent: conversation, cat: communication
ent: eating, cat: person
ent: social group, cat: 
ent: interaction, cat: person


In [186]:
for ann in res.annotation_results[0].shot_label_annotations:
    desc = ann.entity.description
    cents = ' - '.join([_.description for _ in ann.category_entities])
    print(f'ent: {desc}, cat: {cents}')

ent: interaction, cat: person
ent: social group, cat: 
ent: restaurant, cat: business
ent: community, cat: organization
ent: conversation, cat: communication
ent: eating, cat: person
ent: student, cat: person
ent: learning, cat: person
ent: fun, cat: 
ent: food, cat: 
ent: party, cat: event
ent: drink, cat: 
ent: people, cat: person
