# Table of Contents
* [Stage 4 - load data](#Stage-4---load-data)
* [process data](#process-data)
* [hide](#hide)


In [1]:
%%capture
from __future__ import division
import numpy as np
import pandas as pd
import scipy.stats as st
import itertools
import math
from collections import Counter, defaultdict
%load_ext autoreload
%autoreload 2

import matplotlib as mpl
mpl.use("Agg")
import matplotlib.pylab as plt
#%matplotlib notebook
%matplotlib inline
%load_ext base16_mplrc
%base16_mplrc light solarized
plt.rcParams['figure.figsize'] = (20.0, 10.0)

import re
import pickle
import boto
from copy import deepcopy
import json
import os
import random
import jinja2
from tqdm import tqdm
from IPython.core.display import HTML

import PIL.Image as Image
import requests

from boto.mturk.qualification import PercentAssignmentsApprovedRequirement, Qualifications, Requirement

from keysTkingdom import mturk_ai2
from keysTkingdom import aws_tokes
from keysTkingdom import mturk_aristo

from amt_utils.mturk import pickle_this, unpickle_this

import os
import jinja2
import argparse
from jinja2 import Environment, FileSystemLoader

j2env = jinja2.Environment()

from amt_utils.bboxes import cluster_from_nms
from amt_utils.bboxes import draw_animation_seq
from amt_utils.bboxes import cluster_from_annos
from amt_utils.bboxes import create_subtask_data
from amt_utils.bboxes import draw_image_and_labels
from amt_utils.bboxes import select_labels
from amt_utils.bboxes import cluster_and_label
import warnings
warnings.filterwarnings('ignore')

# Stage 4 - load data 

In [2]:
test_res = unpickle_this('./sample_s4a_results.pkl')

In [3]:
first_test_batch = unpickle_this('test_batch_1.pkl')
first_prod_batch = unpickle_this('first_prod_batch.pkl')

In [436]:
dataset = unpickle_this('./pickled_data/dataset_full8500_0p4p5.pkl')

In [13]:
current_batch = {vid.gid(): vid.description() for vid in dataset}

# process data

## df manip

In [6]:
def create_result(assmt):
    result = {}
    raw_result = json.loads(assmt.answers[0][0].fields[0])
    result['image_id'] = raw_result['image_url']
    result['object_words_raw'] = sorted([w.replace('None', 'None_0_0') for w in raw_result['description']], key=lambda x: ''.join(x.split('_')[1:]))
    obj_words = [word.split('_')[0] for word in result['object_words_raw']]
    obj_word_location = [word.split('_')[1:] for word in result['object_words_raw']]
    result['object_words'] = obj_words
    result['object_locs'] = obj_word_location
    result['asgmt_id'] = assmt.AssignmentId
    result['hit_id'] = assmt.HITId
    result['worker_id'] = assmt.WorkerId
    return result

In [7]:
assignments = [item for sublist in first_prod_batch.values() for item in sublist]
assignment_results = [create_result(ar) for ar in assignments]

In [8]:
st4_df = pd.DataFrame(assignment_results)
st4_df['obj_set'] = st4_df['object_words_raw'].apply(lambda x: set(x))

In [9]:
grouped_by_image = st4_df.groupby('image_id')

In [10]:
agged_on_mode = grouped_by_image.agg(lambda x: st.mode(x))
agged_on_mode['mode_count'] = agged_on_mode['obj_set'].apply(lambda x: x[1][0])
cons_df = agged_on_mode[agged_on_mode['mode_count'].isin([2, 3])]
agged_noncon_df = agged_on_mode[agged_on_mode['mode_count'].isin([1])]
noncon_vids = set(agged_noncon_df.index.tolist())

In [14]:
cons_df['image_id'] = cons_df.index
cons_df['description'] = cons_df['image_id'].apply(lambda x: current_batch[x])

In [15]:
cons_df['con_objects'] = cons_df['obj_set'].apply(lambda x: sorted(list(x[0][0]), key=lambda x: (x.split('_')[1], x.split('_')[2] )))

In [16]:
noncon_df = st4_df[st4_df['image_id'].isin(noncon_vids)]

In [17]:
noncon_df['description'] = noncon_df['image_id'].apply(lambda x: current_batch[x])
noncon_df['description'] = noncon_df['image_id'].apply(lambda x: current_batch[x])

In [18]:
cons_df.shape[0] / agged_on_mode.shape[0]

0.9209797951840576

In [19]:
obj_arr = cons_df['con_objects'].apply(lambda x: np.array([ob.split('_')[0] for ob in x])).values

In [20]:
obj_series = pd.Series(np.hstack(obj_arr))

In [21]:
obj_series.shape

(10349,)

In [262]:
noncon_df[noncon_df['image_id'] == 's_01_e_01_shot_010120_010194']

Unnamed: 0,asgmt_id,hit_id,image_id,object_locs,object_words,object_words_raw,worker_id,obj_set,description
11179,3KJYX6QCMABOZE4YSMN41FMCBJMJVM,3G4VVJO6P0G3IUFQYT9F7I2MEH0KP8,s_01_e_01_shot_010120_010194,"[[1, 11]]",[hand],[hand_1_11],A1DTK3VD0CAM5V,{hand_1_11},Fred is in a room speaking to the sky. Fred th...
11180,3L4D84MIL0SJHD9ENX60MEV334DHJW,3G4VVJO6P0G3IUFQYT9F7I2MEH0KP8,s_01_e_01_shot_010120_010194,"[[0, 8], [1, 12], [1, 7]]","[sky, expression, ground]","[sky_0_8, expression_1_12, ground_1_7]",A22VGT2F28LTWC,"{sky_0_8, expression_1_12, ground_1_7}",Fred is in a room speaking to the sky. Fred th...
11181,3SB4CE2TJWV52A6PYVB9W1GE183AX5,3G4VVJO6P0G3IUFQYT9F7I2MEH0KP8,s_01_e_01_shot_010120_010194,"[[0, 8], [1, 11], [1, 7]]","[sky, hand, ground]","[sky_0_8, hand_1_11, ground_1_7]",A1LWMYF4M8DWTV,"{sky_0_8, hand_1_11, ground_1_7}",Fred is in a room speaking to the sky. Fred th...


In [257]:
def overlap(row):
    try:
        return list(row[0].intersection(row[1]).union(row[1].intersection(row[2])))
    except TypeError:
        return {}

In [258]:
grouped_by_image = noncon_df.groupby('image_id')
noncon_grouped = grouped_by_image['obj_set'].apply(lambda x: pd.Series(x.values)).unstack()

In [259]:
noncon_subsets_choices = noncon_grouped.head(46).apply(lambda x: overlap(x), axis=1)

In [207]:
noncon_lookup = noncon_subsets_choices.to_dict()

In [209]:
noncon_lookup
noncon_lookup_exists = {k: v for k, v in noncon_lookup.items()}

In [247]:
# print(obj_series.value_counts()[:20])

In [246]:
# obj_series.value_counts()[20:80]

In [81]:
val_counts = obj_series.value_counts()

In [83]:
# _ = val_counts[val_counts < 100].hist(bins=20)

## build objects

In [92]:
vid_objs = cons_df['con_objects'].to_dict()

In [123]:
def distill_objects(obj_idx_list):
    combined_objects = []
    object_coords = []
    for word_position in obj_idx_list:
        split_components = word_position.split('_')
        word, sent_n, word_n = split_components[0], int(split_components[1]), int(split_components[2])
        object_coords.append((word, sent_n, word_n))
    object_coords = sorted(object_coords, key= lambda x: (x[1], x[2]))
    combined_objects.append(object_coords[0])
    for idx in range(1, len(object_coords)):
        this_word, this_sent, this_wn = object_coords[idx]
        last_word, last_sent, last_wn = combined_objects[-1]
        if this_sent == last_sent and last_wn + 1 == this_wn:            
            combined_objects[-1] = (' '.join([last_word, this_word]), this_sent, this_wn)
        else:
            combined_objects.append((this_word, this_sent, this_wn))
    return [w[1:] for w in combined_objects], [w[0] for w in combined_objects]

In [124]:
test_obs = ['newspapers_0_11', 'street_0_17', 'newspaper_1_6', 'cart_1_7']
_, object_phrases = distill_objects(test_obs)

In [264]:
noncon_lookup = noncon_subsets_choices.to_dict()

In [125]:
video_object_lookup = cons_df['con_objects'].to_dict()

In [126]:
cons_df['combined_objects'] = cons_df['con_objects'].apply(distill_objects)

In [265]:
combined_lookup = {**video_object_lookup, **noncon_lookup}

In [416]:
setting_lookup = {vid.gid(): vid.setting() for vid in dataset}

In [421]:
combined_lookup_no_settings_raw = {k: [v for v in vals if v.split('_') != setting_lookup[k]] for k, vals in combined_lookup.items()} 

In [423]:
combined_lookup_no_settings = {k: v for k, v in combined_lookup_no_settings_raw.items() if v}

In [424]:
len(combined_lookup_no_settings)

6695

In [409]:
vids_with_parts = {k: vals for k, vals in combined_lookup.items() if set([v.split('_')[0] for v in vals]).intersection(set(['hands', 'eyes', 'arms', 'fingers']))} 

In [412]:
vids_with_parts_ids = set(vids_with_parts.keys())

In [415]:
pickle_this(vids_with_parts_ids, 'ids_to_target.pkl')

In [283]:
all_objs = [obj.split('_')[0].lower() for obl in combined_lookup.values() for obj in obl]

In [284]:
all_obj_ser = pd.Series(all_objs)

In [291]:
plural_objs = [ob for ob in all_objs if ob[-1] == 's' and ob[-2] != 's']

In [402]:
pd.Series(plural_objs).value_counts()[:100]

hands         164
eyes          119
arms          104
glasses        63
clothes        37
hips           29
sunglasses     25
hats           24
shoulders      22
chairs         20
fingers        17
cards          14
horns          13
flowers        13
legs           11
groceries      11
dishes         11
rocks           8
heads           8
bushes          7
clubs           7
couches         7
gloves          6
papers          6
fists           5
goggles         5
stars           5
curtains        5
balls           5
specs           4
             ... 
skates          2
cactus          2
wheels          2
candles         2
toys            2
towels          2
trees           2
helmets         2
pictures        2
noses           2
ropes           2
poles           2
muscles         2
circus          2
beds            2
bandages        2
skis            2
scarves         2
rockets         2
backs           2
covers          2
doors           2
mugs            2
bubbles         2
sacks     

In [297]:
videos_w_objs[0].vid_data['objects']

{'descriptors': ['None'], 'spans': [(0, 0)]}

In [298]:
len(videos_w_objs)

6655

In [441]:
object_data = {}
for vid in dataset:
    try:
        spans, object_decriptors =  distill_objects(combined_lookup_no_settings[vid.gid()])
        object_data[vid.gid()] = {
            'spans': spans,
            'descriptors': object_decriptors 
        }
    except KeyError:
        pass

In [438]:
# selected_vid = [vid for vid in dataset if vid.gid() == 's_01_e_01_shot_002640_002714'][0]

# selected_vid.vid_data['description']

In [439]:
videos_w_objs = [vid for vid in dataset if vid.gid() in video_object_lookup if vid.vid_data['objects']['descriptors'][0] != 'None']

In [391]:
def rejoin_formatted_desc(description, replacement_span):
    tokenized_description = [sent.split() for sent in sent_tokenize(td1)]
    replace_word = tokenized_description[replacement_span[0]][replacement_span[1]]
    tokenized_description[replacement_span[0]][replacement_span[1]]  = '<target>' + replace_word + '</target>'
    joined_desc = ' '.join([' '.join([w for w in sent]) for sent in tokenized_description])
    return joined_desc

In [397]:
# for vid in tqdm(vids_w_objs[100:]):
#     try:
#         keyframes = vid.display_keyframes()
#         three_frame_filename = vid.gid() + '_task4b.png'
#         keyframes.save('./subtask_frames/' + three_frame_filename)
#     except:
#         print(vid.gid())

In [443]:
pickle_this(object_data, 'obj_data.pkl')

## review

In [434]:
random_video = random.choice(videos_w_objs)

print(random_video.vid_data['objects']['descriptors'])
print(random_video.vid_data['setting'])
print([c.char_data['characterName'] for c in random_video.vid_data['characters']])
print(random_video.vid_data['description'])

# random_video.display_gif()

['couch', 'head']
living room
['fred', 'barney']
Fred and Barney are having a conversation on the couch in the living room. Fred turns his head with a disdained look.


filter setting

# hide

In [12]:
# no_con_df[['image_id', 'description', 'object_words']].to_csv('stage4a_test_1_no_con.csv', index=False)

In [267]:
def clean_response(resp_str):
    resp_str = resp_str.lower()
    resp_str = resp_str.replace('\n', ' ')
    resp_str = resp_str.strip()
    return resp_str

In [268]:
def select_string(resp_row):
    resp_mode = st.stats.mode(resp_row)
    if resp_mode.count[0] > 1:
        return resp_mode.mode[0]
    else:
        sorted_resp = sorted(resp_row.values.tolist(), key=lambda x: len(x))
        superset_resp = [resp for resp in sorted_resp[1:] if sorted_resp[0] in resp]
        if superset_resp:
            response_freqs = {k: setting_freq_lookup[k] for k in superset_resp + [sorted_resp[0]]} 
        else:
            response_freqs = {k: setting_freq_lookup[k] for k in sorted_resp} 
        return sorted(response_freqs.items(), key=lambda x: x[1], reverse=True)[0][0]
    

In [269]:
consensus_settings = settings_per_turker.iloc[:,1:].apply(select_string, axis=1)

In [270]:
consensus_settings.head()

0           room
1    living room
2        doorway
3           room
4        outside
dtype: object

In [271]:
cleaned_settings_per_turker = pd.concat([settings_per_turker['image_id'], settings_per_turker.iloc[:,1:].applymap(clean_response)], axis=1)

In [272]:
cleaned_settings_per_turker['consensus'] = consensus_settings

In [273]:
gif_uris = grouped_by_image['gif_uri'].agg(np.max)

In [274]:
cleaned_settings_per_turker = pd.merge(pd.DataFrame(gif_uris).reset_index(), cleaned_settings_per_turker)

In [275]:
cleaned_settings_per_turker.shape

(10893, 6)

In [276]:
cleaned_settings_per_turker.sample(200).to_csv('stage_3a_prod_2.csv')

In [277]:
consensus_settings.apply(lambda x: bin_settings_lookup[x]).value_counts()

bin: 100-inf    10089
bin: 21-100       522
bin: 11-20        140
bin: 1-5           72
bin: 6-10          70
dtype: int64

In [280]:
prod_1_setting = cleaned_settings_per_turker[['image_id', 'consensus']]

In [281]:
prod_1_setting.index = prod_1_setting['image_id']

In [283]:
pickle_this(prod_1_setting['consensus'].to_dict(), 'stage3_prod1_2_settings.pkl')