vision/annotations.go

// Copyright 2016 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vision

import (
	"image"

	pb "google.golang.org/genproto/googleapis/cloud/vision/v1"
	"google.golang.org/grpc"
	"google.golang.org/grpc/codes"
)

// Annotations contains all the annotations performed by the API on a single image.
// A nil field indicates either that the corresponding feature was not requested,
// or that annotation failed for that feature.
type Annotations struct {
	// Faces holds the results of face detection.
	Faces []*FaceAnnotation
	// Landmarks holds the results of landmark detection.
	Landmarks []*EntityAnnotation
	// Logos holds the results of logo detection.
	Logos []*EntityAnnotation
	// Labels holds the results of label detection.
	Labels []*EntityAnnotation
	// Texts holds the results of text detection.
	Texts []*EntityAnnotation
	// SafeSearch holds the results of safe-search detection.
	SafeSearch *SafeSearchAnnotation
	// ImageProps contains properties of the annotated image.
	ImageProps *ImageProps

	// If non-nil, then one or more of the attempted annotations failed.
	// Non-nil annotations are guaranteed to be correct, even if Error is
	// non-nil.
	Error error
}

func annotationsFromProto(res *pb.AnnotateImageResponse) *Annotations {
	as := &Annotations{}
	for _, a := range res.FaceAnnotations {
		as.Faces = append(as.Faces, faceAnnotationFromProto(a))
	}
	for _, a := range res.LandmarkAnnotations {
		as.Landmarks = append(as.Landmarks, entityAnnotationFromProto(a))
	}
	for _, a := range res.LogoAnnotations {
		as.Logos = append(as.Logos, entityAnnotationFromProto(a))
	}
	for _, a := range res.LabelAnnotations {
		as.Labels = append(as.Labels, entityAnnotationFromProto(a))
	}
	for _, a := range res.TextAnnotations {
		as.Texts = append(as.Texts, entityAnnotationFromProto(a))
	}
	as.SafeSearch = safeSearchAnnotationFromProto(res.SafeSearchAnnotation)
	as.ImageProps = imagePropertiesFromProto(res.ImagePropertiesAnnotation)
	if res.Error != nil {
		// res.Error is a google.rpc.Status. Convert to a Go error. Use a gRPC
		// error because it preserves the code as a separate field.
		// TODO(jba): preserve the details field.
		as.Error = grpc.Errorf(codes.Code(res.Error.Code), "%s", res.Error.Message)
	}
	return as
}

// A FaceAnnotation describes the results of face detection on an image.
type FaceAnnotation struct {
	// BoundingPoly is the bounding polygon around the face. The coordinates of
	// the bounding box are in the original image's scale, as returned in
	// ImageParams. The bounding box is computed to "frame" the face in
	// accordance with human expectations. It is based on the landmarker
	// results. Note that one or more x and/or y coordinates may not be
	// generated in the BoundingPoly (the polygon will be unbounded) if only a
	// partial face appears in the image to be annotated.
	BoundingPoly []image.Point

	// FDBoundingPoly is tighter than BoundingPoly, and
	// encloses only the skin part of the face. Typically, it is used to
	// eliminate the face from any image analysis that detects the "amount of
	// skin" visible in an image. It is not based on the landmarker results, only
	// on the initial face detection, hence the fd (face detection) prefix.
	FDBoundingPoly []image.Point

	// Landmarks are detected face landmarks.
	Face FaceLandmarks

	// RollAngle indicates the amount of clockwise/anti-clockwise rotation of
	// the face relative to the image vertical, about the axis perpendicular to
	// the face. Range [-180,180].
	RollAngle float32

	// PanAngle is the yaw angle: the leftward/rightward angle that the face is
	// pointing, relative to the vertical plane perpendicular to the image. Range
	// [-180,180].
	PanAngle float32

	// TiltAngle is the pitch angle: the upwards/downwards angle that the face is
	// pointing relative to the image's horizontal plane. Range [-180,180].
	TiltAngle float32

	// DetectionConfidence is the detection confidence. The range is [0, 1].
	DetectionConfidence float32

	// LandmarkingConfidence is the face landmarking confidence. The range is [0, 1].
	LandmarkingConfidence float32

	// Likelihoods expresses the likelihood of various aspects of the face.
	Likelihoods *FaceLikelihoods
}

func faceAnnotationFromProto(pfa *pb.FaceAnnotation) *FaceAnnotation {
	fa := &FaceAnnotation{
		BoundingPoly:          boundingPolyFromProto(pfa.BoundingPoly),
		FDBoundingPoly:        boundingPolyFromProto(pfa.FdBoundingPoly),
		RollAngle:             pfa.RollAngle,
		PanAngle:              pfa.PanAngle,
		TiltAngle:             pfa.TiltAngle,
		DetectionConfidence:   pfa.DetectionConfidence,
		LandmarkingConfidence: pfa.LandmarkingConfidence,
		Likelihoods: &FaceLikelihoods{
			Joy:          Likelihood(pfa.JoyLikelihood),
			Sorrow:       Likelihood(pfa.SorrowLikelihood),
			Anger:        Likelihood(pfa.AngerLikelihood),
			Surprise:     Likelihood(pfa.SurpriseLikelihood),
			UnderExposed: Likelihood(pfa.UnderExposedLikelihood),
			Blurred:      Likelihood(pfa.BlurredLikelihood),
			Headwear:     Likelihood(pfa.HeadwearLikelihood),
		},
	}
	populateFaceLandmarks(pfa.Landmarks, &fa.Face)
	return fa
}

// An EntityAnnotation describes the results of a landmark, label, logo or text
// detection on an image.
type EntityAnnotation struct {
	// ID is an opaque entity ID. Some IDs might be available in Knowledge Graph(KG).
	// For more details on KG please see:
	// https://developers.google.com/knowledge-graph/
	ID string

	// Locale is the language code for the locale in which the entity textual
	// description (next field) is expressed.
	Locale string

	// Description is the entity textual description, expressed in the language of Locale.
	Description string

	// Score is the overall score of the result. Range [0, 1].
	Score float32

	// Confidence is the accuracy of the entity detection in an image.
	// For example, for an image containing the Eiffel Tower, this field represents
	// the confidence that there is a tower in the query image. Range [0, 1].
	Confidence float32

	// Topicality is the relevancy of the ICA (Image Content Annotation) label to the
	// image. For example, the relevancy of 'tower' to an image containing
	// 'Eiffel Tower' is likely higher than an image containing a distant towering
	// building, though the confidence that there is a tower may be the same.
	// Range [0, 1].
	Topicality float32

	// BoundingPoly is the image region to which this entity belongs. Not filled currently
	// for label detection. For text detection, BoundingPolys
	// are produced for the entire text detected in an image region, followed by
	// BoundingPolys for each word within the detected text.
	BoundingPoly []image.Point

	// Locations contains the location information for the detected entity.
	// Multiple LatLng structs can be present since one location may indicate the
	// location of the scene in the query image, and another the location of the
	// place where the query image was taken. Location information is usually
	// present for landmarks.
	Locations []LatLng

	// Properties are additional optional Property fields.
	// For example a different kind of score or string that qualifies the entity.
	Properties []Property
}

func entityAnnotationFromProto(e *pb.EntityAnnotation) *EntityAnnotation {
	var locs []LatLng
	for _, li := range e.Locations {
		locs = append(locs, latLngFromProto(li.LatLng))
	}
	var props []Property
	for _, p := range e.Properties {
		props = append(props, propertyFromProto(p))
	}
	return &EntityAnnotation{
		ID:           e.Mid,
		Locale:       e.Locale,
		Description:  e.Description,
		Score:        e.Score,
		Confidence:   e.Confidence,
		Topicality:   e.Topicality,
		BoundingPoly: boundingPolyFromProto(e.BoundingPoly),
		Locations:    locs,
		Properties:   props,
	}
}

// SafeSearchAnnotation describes the results of a SafeSearch detection on an image.
type SafeSearchAnnotation struct {
	// Adult is the likelihood that the image contains adult content.
	Adult Likelihood

	// Spoof is the likelihood that an obvious modification was made to the
	// image's canonical version to make it appear funny or offensive.
	Spoof Likelihood

	// Medical is the likelihood that this is a medical image.
	Medical Likelihood

	// Violence is the likelihood that this image represents violence.
	Violence Likelihood
}

func safeSearchAnnotationFromProto(s *pb.SafeSearchAnnotation) *SafeSearchAnnotation {
	if s == nil {
		return nil
	}
	return &SafeSearchAnnotation{
		Adult:    Likelihood(s.Adult),
		Spoof:    Likelihood(s.Spoof),
		Medical:  Likelihood(s.Medical),
		Violence: Likelihood(s.Violence),
	}
}

// ImageProps describes properties of the image itself, like the dominant colors.
type ImageProps struct {
	// DominantColors describes the dominant colors of the image.
	DominantColors []*ColorInfo
}

func imagePropertiesFromProto(ip *pb.ImageProperties) *ImageProps {
	if ip == nil || ip.DominantColors == nil {
		return nil
	}
	var cinfos []*ColorInfo
	for _, ci := range ip.DominantColors.Colors {
		cinfos = append(cinfos, colorInfoFromProto(ci))
	}
	return &ImageProps{DominantColors: cinfos}
}