Baseline tagging code for multiple narratives

In [1]:
import localized_narratives
import os
import glob
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import pickle as pkl
from scipy.spatial import ConvexHull
from matplotlib.path import Path
from PIL import Image
import itertools
import string

In [2]:
"""https://github.com/google/localized-narratives/blob/master/demo.py"""

# Annotation files are downloaded into this folder and later read back from it.
local_dir = os.getcwd()

# A single DataLoader takes care of both fetching and parsing the annotations.
data_loader = localized_narratives.DataLoader(local_dir)

# Name of the annotation split used for both the download and the load below.
dataset_name = 'ade20k_val'

# Fetch the annotation files (the loader first checks whether they are
# already present locally).
data_loader.download_annotations(dataset_name)

# Number of narratives to process - change this to use more or fewer.
num_narratives = 5
loc_narrs = data_loader.load_annotations(dataset_name, num_narratives)

Already downloaded: ade20k_validation_localized_narratives.jsonl


In [3]:
def clean_text(text):
    """Return ``text`` lower-cased with every ASCII punctuation character removed.

    Whitespace is left untouched, so word boundaries are preserved.
    """
    # Translation table that maps each character in string.punctuation to None.
    drop_punctuation = str.maketrans('', '', string.punctuation)
    without_punctuation = text.translate(drop_punctuation)
    return without_punctuation.lower()

In [4]:
def hull_to_name(hull):
    """Return the object name that covers most of the area inside ``hull``.

    Relies on notebook-level state set by the calling cell: ``image_width``,
    ``image_height``, ``ObjectClassMasks`` (per-pixel class indices) and
    ``index_ade20k`` (provides the ``'objectnames'`` lookup list).

    Args:
        hull: object exposing ``points`` (x, y pairs) and ``vertices`` (indices
            into ``points``), e.g. a ``scipy.spatial.ConvexHull``. The points
            are multiplied by the image size, so they are presumably
            normalised image coordinates.

    Returns:
        The entry of ``index_ade20k['objectnames']`` for the most frequent
        non-zero class index under the hull, or entry 0 when the hull covers
        no labelled pixel.
    """
    # Scale the hull's points to pixel coordinates and build a polygon from
    # its boundary vertices.
    hull_points = np.asarray(hull.points)
    pixel_points = np.column_stack((hull_points[:, 0] * image_width,
                                    hull_points[:, 1] * image_height))
    outline = Path([(pixel_points[vtx, 0], pixel_points[vtx, 1])
                    for vtx in hull.vertices])

    # One (x, y) coordinate per image pixel, in row-major order.
    cols, rows = np.meshgrid(np.arange(image_width), np.arange(image_height))
    pixel_coords = np.column_stack((cols.ravel(), rows.ravel()))

    # Boolean image-shaped mask: True where the pixel lies inside the polygon.
    inside_hull = outline.contains_points(pixel_coords).reshape(image_height, image_width)

    # Class indices under the mask; zeros (outside the hull or unlabelled) are dropped.
    masked_labels = inside_hull * ObjectClassMasks
    present_labels = masked_labels[masked_labels != 0]

    values, counts = np.unique(present_labels, return_counts=True)
    # Edge case: the trace for some utterances lies entirely outside the
    # image, leaving nothing to count - use entry 0 of the name table.
    if counts.size == 0:
        return index_ade20k['objectnames'][0]

    # The most frequent class index wins; the name table is offset by one.
    winner = values[np.argmax(counts)]
    return index_ade20k['objectnames'][winner - 1]

In [5]:
# for printing with colors
class bcolors:
    """ANSI escape sequences for styling text written with ``print``.

    Prefix a string with one of the style codes and append ``ENDC`` to reset,
    e.g. ``bcolors.OKGREEN + text + bcolors.ENDC``.
    """
    # Foreground colours (ANSI "bright" colour codes 91-96).
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    # Reset: clears every colour/attribute set so far.
    ENDC = '\033[0m'
    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

In [6]:
# Load the ADE20K index; its 'objectnames' list is the class-index -> name lookup table.
# NOTE(review): pickle.load can execute arbitrary code - only open an index
# file that comes from a trusted copy of the dataset.
index_path = '/Volumes/gordonssd/ADE20K_2021_17_01/index_ade20k.pkl'
with open(index_path, 'rb') as index_file:
    index_ade20k = pkl.load(index_file)

# Entry 0 of the name table is the placeholder "-"; give it a readable name.
index_ade20k['objectnames'][0] = "other"

In [7]:
# Build two parallel corpora with one entry per narrative:
#   raw_word_documents   - the cleaned caption text
#   raw_labels_documents - one object tag per caption word, space separated
# Each timed utterance is tagged with the object class that dominates the
# convex hull of the mouse-trace points recorded while it was spoken.
raw_word_documents = []
raw_labels_documents = []

# Tag used when no earlier tag exists to fall back on. This is the same table
# entry hull_to_name returns for a trace that lies entirely outside the image.
fallback_tag = clean_text(index_ade20k['objectnames'][0].split()[0])

for loc_narr in loc_narrs:
    label_tokens = []
    raw_word_documents.append(clean_text(loc_narr.caption))
    image_id = loc_narr.image_id
    path_to_image = f'/Volumes/gordonssd/ADE20K_2021_17_01/images/ADE/validation/*/*/{image_id}.jpg'
    path_to_image_seg = f'/Volumes/gordonssd/ADE20K_2021_17_01/images/ADE/validation/*/*/{image_id}_seg.png'
    # Not used in this cell; kept so the name stays available to other cells.
    path_to_image_json = f'/Volumes/gordonssd/ADE20K_2021_17_01/images/ADE/validation/*/*/{image_id}.json'

    # Resolve the glob patterns to concrete files (the last match wins, as
    # before). Fail early with a clear message instead of handing a wildcard
    # pattern to the image readers.
    image_matches = glob.glob(path_to_image)
    if not image_matches:
        raise FileNotFoundError(f'No image found for {image_id}: {path_to_image}')
    path_to_image = image_matches[-1]
    seg_matches = glob.glob(path_to_image_seg)
    if not seg_matches:
        raise FileNotFoundError(f'No segmentation found for {image_id}: {path_to_image_seg}')
    path_to_image_seg = seg_matches[-1]

    # Get image and image segmentation
    image = mpimg.imread(path_to_image)
    # Not used in this cell; kept so the name stays available to other cells.
    image_seg = mpimg.imread(path_to_image_seg)

    # Image height, width - read as globals by hull_to_name.
    image_height = image.shape[0]
    image_width = image.shape[1]

    # Get object labels: the per-pixel class index is packed into the R and G
    # channels of the segmentation PNG. ObjectClassMasks is read as a global
    # by hull_to_name.
    with Image.open(path_to_image_seg) as seg_file:
        seg = np.array(seg_file)
    R = seg[:, :, 0]
    G = seg[:, :, 1]
    B = seg[:, :, 2]
    ObjectClassMasks = (R/10).astype(np.int32)*256+(G.astype(np.int32))

    # Concatenate all trace segments
    traces = list(itertools.chain.from_iterable(loc_narr.traces))

    # Align traces to words.
    pending_words = ''    # zero-duration utterances waiting to join the next phrase
    previous_tag = None   # last tag produced for THIS narrative
    for word in loc_narr.timed_caption:
        phrase = pending_words + word['utterance']
        start_time = word['start_time']
        end_time = word['end_time']
        # An utterance with no duration has no trace window of its own, so it
        # is merged into the next phrase. Accumulate rather than overwrite, so
        # several zero-duration utterances in a row are all kept.
        if start_time == end_time:
            pending_words = phrase + ' '
            continue
        pending_words = ''
        w = clean_text(phrase)

        # Trace points whose timestamp falls inside the utterance's time window
        points = np.array([[coord['x'], coord['y']]
                           for coord in traces
                           if start_time <= coord['t'] <= end_time])
        try:
            hull = ConvexHull(points)
        except Exception:
            # Degenerate input (no points, too few points, or a perfectly
            # horizontal/vertical trace) has no 2-D hull. Reuse the previous
            # tag of this narrative; never a hull left over from another
            # image, and never an undefined one.
            print("!!!Cannot make ConvexHull for", w, "!!!")
            tag = previous_tag if previous_tag is not None else fallback_tag
        else:
            # Most frequent label within the hull. One tag may hold several
            # names (e.g. person;individual;someone...) - keep only the first.
            tag = clean_text(hull_to_name(hull).split()[0])
        previous_tag = tag

        # Print word trace alignment
        print(w+'/'+bcolors.OKGREEN + tag + bcolors.ENDC, end =" ")
        # One label per word of the phrase keeps labels aligned with words
        label_tokens.extend([tag] * len(w.split()))

    # Zero-duration utterances at the very end of the caption have no later
    # phrase to join; tag them like the preceding phrase so the label document
    # stays as long as the word document.
    if pending_words:
        w = clean_text(pending_words)
        tag = previous_tag if previous_tag is not None else fallback_tag
        print(w+'/'+bcolors.OKGREEN + tag + bcolors.ENDC, end =" ")
        label_tokens.extend([tag] * len(w.split()))

    raw_labels_doc = ' '.join(label_tokens).strip()
    raw_labels_documents.append(raw_labels_doc)
    print('\n')

in this/[92mwall[0m image/[92mbed[0m on/[92mbed[0m the/[92mbed[0m left/[92mbed[0m side/[92mbed[0m i/[92mbed[0m can/[92mbed[0m see/[92mbed[0m a/[92mbed[0m bed/[92mbed[0m and/[92mwindowpane[0m a window/[92mwindowpane[0m on/[92mwindowpane[0m right/[92mdesk[0m side/[92mwall[0m i/[92mdesk[0m can/[92mdesk[0m see/[92mdesk[0m some/[92mdesk[0m object/[92mdesk[0m on/[92mdesk[0m the table/[92mdesk[0m and/[92mwall[0m a/[92mdoor[0m door/[92mwall[0m at/[92mcurtain[0m the/[92mceiling[0m top/[92mlamp[0m i/[92mlamp[0m can/[92mlamp[0m see/[92mlamp[0m the/[92mlamp[0m light/[92mlamp[0m 

in/[92mwall[0m this/[92mcounter[0m image/[92mcounter[0m there/[92mcounter[0m is/[92mcounter[0m a table/[92mcounter[0m on/[92mcounter[0m the table/[92mcounter[0m there/[92mfood[0m are/[92mcounter[0m food/[92mcounter[0m recipes/[92mcounter[0m and/[92mcounter[0m display/[92mprice[0m boards/[92mperson[0m there/[92mperson[0m are

In [8]:
# Display the label documents: one space-separated string of tags per narrative.
raw_labels_documents

['wall wall bed bed bed bed bed bed bed bed bed bed windowpane windowpane windowpane windowpane desk wall desk desk desk desk desk desk desk desk wall door wall curtain ceiling lamp lamp lamp lamp lamp lamp',
 'wall counter counter counter counter counter counter counter counter counter food counter counter counter counter price person person wall person person wall person person wall person wall other other wall wall wall wall wall wall wall wall',
 'bathtub bathtub wall bathtub bathtub wall door door door door door door door door door door wall windowpane wall wall wall wall wall wall wall windowpane windowpane windowpane wall rod rod windowpane wall wall wall shower shower wall wall wall bottle wall wall wall wall wall wall bathtub wall wall wall wall wall wall door wall wall wall wall wall sink wall wall wall wall wall wall wall wall sink sink sink sink toilet toilet toilet toilet toilet floor floor floor floor floor floor floor floor floor floor',
 'wardrobe wardrobe wardrobe ward

In [9]:
# Display the word documents: one cleaned caption string per narrative.
raw_word_documents

['in this image on the left side i can see a bed and a window on right side i can see some object on the table and a door at the top i can see the light',
 'in this image there is a table on the table there are food recipes and display boards there are two persons standing there is a glass there is a wall with the curtain at the back side',
 'this image is taken indoors on the right side of the image there is a door in the background we can see the wall there is a window there is a metal rod we can see the shower there two bottles on the shelves there is a bathtub we can see the handles there is a tap on the left side of the image there is a wall we can see the sink there is a toilet seat at the bottom of the image there is the floor',
 'in this picture i can see a bed having some pillows blanket on the bed a bag is placed behind there is a designed wall and some objects are placed on the table',
 'in this image i can see water and i can also see something looking like trees mountains 

Edge case: horizontal/vertical traces   
ADE_val_00000078  
This image is taken indoors. On the right side of the image there is a door. In the background we can see the wall. There is a window. There is <b>a*</b> metal rod. We can see the shower. There two bottles on the shelves. There is a bathtub. We can see the handles. There is a tap. On the left side of the image there is a wall. We can see the sink. There is a toilet seat. At the bottom of the image there is the floor.
```
[[0.4303 0.1629]
 [0.381  0.1629]
 [0.3131 0.1629]
 [0.2468 0.1629]
 [0.2111 0.1629]
 [0.2111 0.1629]]
```
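All trace points for this word lie on one horizontal line (every y is 0.1629), so they enclose no area and `ConvexHull` cannot be built. For now, just copy the previous word's result.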