# Table of Contents
* [Stage 4 - load data](#Stage-4---load-data)
* [process data](#process-data)
* [region experiment](#region-experiment)
* [hide](#hide)


In [1]:
%%capture
from __future__ import division
import numpy as np
import pandas as pd
import scipy.stats as st
import itertools
import math
from collections import Counter, defaultdict
%load_ext autoreload
%autoreload 2

import matplotlib as mpl
mpl.use("Agg")
import matplotlib.pylab as plt
#%matplotlib notebook
%matplotlib inline
%load_ext base16_mplrc
%base16_mplrc light solarized
plt.rcParams['figure.figsize'] = (20.0, 10.0)

import re
import pickle
import boto
from copy import deepcopy
import json
import os
import random
import jinja2
from tqdm import tqdm
from IPython.core.display import HTML

import PIL.Image as Image
import requests

from boto.mturk.qualification import PercentAssignmentsApprovedRequirement, Qualifications, Requirement

from keysTkingdom import mturk_ai2
from keysTkingdom import aws_tokes
from keysTkingdom import mturk_aristo

from amt_utils.mturk import pickle_this, unpickle_this

import os
import jinja2
import argparse
from jinja2 import Environment, FileSystemLoader

j2env = jinja2.Environment()

from amt_utils.bboxes import cluster_from_nms
from amt_utils.bboxes import draw_animation_seq
from amt_utils.bboxes import cluster_from_annos
from amt_utils.bboxes import create_subtask_data
from amt_utils.bboxes import draw_image_and_labels
from amt_utils.bboxes import select_labels
from amt_utils.bboxes import cluster_and_label
import warnings
warnings.filterwarnings('ignore')

import ai2.vision.utils as ai2vu

ModuleNotFoundError: No module named 'ai2'

# Stage 4 - load data 

In [2]:
# Load the object pickled at ./sample_s4a_results.pkl with the project's unpickle_this helper.
# NOTE(review): presumably a sample of stage-4a results; unpickling is only safe for trusted files — confirm origin.
test_res = unpickle_this('./sample_s4a_results.pkl')

In [172]:
# Load two pickled batches with unpickle_this.
# NOTE(review): a later cell iterates first_prod_batch.values() and flattens the
# sublists, so it is presumably a dict whose values are lists of assignments — confirm.
# first_test_batch is not referenced again in the visible cells.
first_test_batch = unpickle_this('test_batch_1.pkl')
first_prod_batch = unpickle_this('first_prod_batch.pkl')

In [174]:
# Load the pickled dataset stored at ./pickled_data/flinstones_v0p3.pkl.
# NOTE(review): the next cell iterates it and calls .gid() / .description() on each
# element — presumably a collection of video objects; confirm.
dataset = unpickle_this('./pickled_data/flinstones_v0p3.pkl')

In [175]:
# Map each dataset entry's gid() to its description(); later cells use this
# dict to look descriptions up by image_id.
current_batch = {vid.gid(): vid.description() for vid in dataset}

# process data

In [176]:
def create_result(assmt):
    """Flatten one assignment into a plain dict of annotation fields.

    The JSON payload is read from ``assmt.answers[0][0].fields[0]``. Every
    entry of its ``'description'`` list has ``'None'`` replaced by
    ``'None_0_0'`` and the entries are then ordered by the concatenation of
    the ``'_'``-separated parts that follow the first part (compared as
    strings).

    Args:
        assmt: object exposing ``answers``, ``AssignmentId``, ``HITId`` and
            ``WorkerId``.

    Returns:
        dict with keys ``image_id``, ``object_words_raw``, ``object_words``,
        ``object_locs``, ``asgmt_id``, ``hit_id`` and ``worker_id``.
    """
    payload = json.loads(assmt.answers[0][0].fields[0])
    image_id = payload['image_url']

    # Give bare 'None' answers the same word_<a>_<b> shape as the other tokens.
    tokens = [entry.replace('None', 'None_0_0') for entry in payload['description']]
    tokens.sort(key=lambda token: ''.join(token.split('_')[1:]))

    # Split once; part 0 is the word, the remaining parts are its location fields.
    token_parts = [token.split('_') for token in tokens]

    return {
        'image_id': image_id,
        'object_words_raw': tokens,
        'object_words': [parts[0] for parts in token_parts],
        'object_locs': [parts[1:] for parts in token_parts],
        'asgmt_id': assmt.AssignmentId,
        'hit_id': assmt.HITId,
        'worker_id': assmt.WorkerId,
    }

In [177]:
# Flatten the per-key lists of assignments into one list, then convert each
# assignment into a result dict.
assignments = list(itertools.chain.from_iterable(first_prod_batch.values()))
assignment_results = list(map(create_result, assignments))

In [178]:
# One row per assignment; 'obj_set' holds the raw object tokens as a set so
# that responses can be compared regardless of token order.
st4_df = pd.DataFrame(assignment_results)
st4_df['obj_set'] = st4_df['object_words_raw'].apply(set)

In [179]:
# Group the per-assignment rows by image id.
# NOTE(review): the name grouped_by_image is rebound in a later cell to a
# groupby over st3_df, so its meaning depends on which cell ran last.
grouped_by_image = st4_df.groupby('image_id')

In [193]:
# For every image, aggregate each column with scipy's mode across that image's assignments.
# NOTE(review): st.mode is applied to object columns here (e.g. the Python sets in
# 'obj_set'); recent SciPy releases only accept numeric input — confirm the SciPy version in use.
agged_on_mode = grouped_by_image.agg(lambda x: st.mode(x))
# Each aggregated cell holds a mode result; x[1][0] is read as the count of the modal value.
agged_on_mode['mode_count'] = agged_on_mode['obj_set'].apply(lambda x: x[1][0])
# Rows whose modal object set was given by 2 or 3 assignments are kept as consensus.
# NOTE(review): the [2, 3] / [1] lists presumably assume three assignments per image — confirm.
cons_df = agged_on_mode[agged_on_mode['mode_count'].isin([2, 3])]
# Rows whose modal count is 1, i.e. no object set was given more than once.
agged_noncon_df = agged_on_mode[agged_on_mode['mode_count'].isin([1])]
# Image ids (the groupby index) of the rows without consensus.
noncon_vids = set(agged_noncon_df.index.tolist())

In [189]:
# Both aggregated frames are indexed by image id (the groupby key). Re-expose
# that key as a column and attach each image's description from current_batch.
# New frames are built with .assign instead of writing into the filtered
# slices of agged_on_mode.
cons_df = cons_df.assign(
    image_id=cons_df.index,
    description=[current_batch[image_id] for image_id in cons_df.index],
)
# The aggregated non-consensus frame is `agged_noncon_df`. `noncon_df` is only
# created in a later cell, and it is a per-assignment frame whose index is not
# the image id, so it must not be touched here.
agged_noncon_df = agged_noncon_df.assign(
    image_id=agged_noncon_df.index,
    description=[current_batch[image_id] for image_id in agged_noncon_df.index],
)

In [190]:
# Unpack the modal object set of each consensus row and order its tokens by
# the two '_'-separated fields that follow the word (compared as strings).
cons_df['con_objects'] = cons_df['obj_set'].apply(
    lambda mode_result: sorted(
        mode_result[0][0],
        key=lambda token: (token.split('_')[1], token.split('_')[2]),
    )
)

In [199]:
# Keep every individual assignment row that belongs to an image without consensus.
noncon_df = st4_df[st4_df['image_id'].isin(noncon_vids)]

# Attach the description by building a new frame with .assign rather than
# assigning a column into the boolean-filtered slice of st4_df (chained
# assignment; the resulting warning is hidden by the global warnings filter).
# A missing image id still raises KeyError, as before.
noncon_df = noncon_df.assign(
    description=noncon_df['image_id'].apply(lambda x: current_batch[x])
)

In [205]:
cons_df.shape[0] / agged_on_mode.shape[0]

0.9209797951840576

In [None]:
cons_df

In [171]:
# Export the non-consensus assignments for manual review. The frame that
# carries these three columns is `noncon_df`; `no_con_df` is not defined by
# any cell in this notebook and raised NameError on a fresh kernel.
noncon_df[['image_id', 'description', 'object_words']].to_csv('stage4a_test_1_no_con.csv', index=False)

In [267]:
def clean_response(resp_str):
    """Normalise a free-text response.

    Lower-cases the text, turns every newline into a single space, then
    strips leading and trailing whitespace.

    Args:
        resp_str: the response string.

    Returns:
        The normalised string.
    """
    return resp_str.lower().replace('\n', ' ').strip()

In [268]:
def select_string(resp_row):
    """Pick one consensus response from a row of free-text answers.

    If some response occurs more than once, the most frequent response is
    returned (the smallest string wins a tie, matching the tie-break of the
    ``scipy.stats.mode`` call this replaces). Counting is done with
    ``collections.Counter`` because ``scipy.stats.mode`` no longer accepts
    string data and no longer returns arrays that can be indexed with ``[0]``.

    Otherwise every response is distinct. The candidates are then the
    shortest response plus every response that contains it as a substring,
    or all responses when nothing contains the shortest one. The candidate
    with the highest value in the notebook-level ``setting_freq_lookup``
    mapping is returned.

    Args:
        resp_row: iterable of response strings (a pandas Series row or a list).

    Returns:
        The selected response string.

    Raises:
        KeyError: if a candidate is missing from ``setting_freq_lookup``.
    """
    responses = list(resp_row)
    tally = Counter(responses)
    top_count = max(tally.values())
    if top_count > 1:
        return min(resp for resp, seen in tally.items() if seen == top_count)

    # No repeated answer: fall back to global frequencies.
    by_length = sorted(responses, key=len)
    shortest = by_length[0]
    supersets = [resp for resp in by_length[1:] if shortest in resp]
    candidates = supersets + [shortest] if supersets else by_length
    # max() keeps the first candidate on equal frequency, the same order the
    # stable reverse sort of the frequency dict produced.
    return max(candidates, key=lambda resp: setting_freq_lookup[resp])
    

In [269]:
# Apply select_string row-wise to every column after the first (the response columns).
# NOTE(review): settings_per_turker is only built in a later cell, and the
# responses are passed here as-is while the cleaned copy is created in a
# following cell — confirm whether responses should be cleaned before voting.
consensus_settings = settings_per_turker.iloc[:,1:].apply(select_string, axis=1)

In [270]:
consensus_settings.head()

0           room
1    living room
2        doorway
3           room
4        outside
dtype: object

In [271]:
# Rebuild the frame with the image_id column untouched and clean_response
# applied to every cell of the remaining (response) columns.
# NOTE(review): clean_response calls str methods, so a missing (NaN) response would raise — confirm none occur.
cleaned_settings_per_turker = pd.concat([settings_per_turker['image_id'], settings_per_turker.iloc[:,1:].applymap(clean_response)], axis=1)

In [272]:
# Add the per-image consensus as a new column.
# NOTE(review): relies on consensus_settings and this frame sharing the same row index — confirm.
cleaned_settings_per_turker['consensus'] = consensus_settings

In [273]:
# One gif URI per image: the maximum 'gif_uri' value within each group.
# NOTE(review): needs grouped_by_image to be a groupby whose frame has a
# 'gif_uri' column — confirm which cell bound it last.
gif_uris = grouped_by_image['gif_uri'].max()

In [274]:
# Join the per-image gif URIs onto the cleaned responses. No join key is
# given, so pandas merges on the columns the two frames share.
# NOTE(review): presumably that is only 'image_id' — confirm.
cleaned_settings_per_turker = pd.merge(pd.DataFrame(gif_uris).reset_index(), cleaned_settings_per_turker)

In [275]:
cleaned_settings_per_turker.shape

(10893, 6)

In [276]:
# Write 200 randomly chosen rows to stage_3a_prod_2.csv.
# NOTE(review): no random_state is passed, so each run exports a different sample.
cleaned_settings_per_turker.sample(200).to_csv('stage_3a_prod_2.csv')

In [277]:
# Translate each consensus setting through bin_settings_lookup and count how
# many images fall under each resulting label.
# NOTE(review): bin_settings_lookup is not defined in the visible cells — confirm where it comes from.
consensus_settings.apply(lambda x: bin_settings_lookup[x]).value_counts()

bin: 100-inf    10089
bin: 21-100       522
bin: 11-20        140
bin: 1-5           72
bin: 6-10          70
dtype: int64

In [280]:
# Keep only the image id and its consensus setting.
prod_1_setting = cleaned_settings_per_turker[['image_id', 'consensus']]

In [281]:
# Index the frame by image id so the consensus column can be exported as an
# image_id -> setting mapping.
prod_1_setting.index = prod_1_setting['image_id']

In [283]:
# Persist the consensus column as a dict keyed by the frame's index, using the
# project's pickle_this helper.
pickle_this(prod_1_setting['consensus'].to_dict(), 'stage3_prod1_2_settings.pkl')

# region experiment

In [27]:
# turk_data = {**us_only_results, **in_only_results, **nzaucaus_only_results}

In [194]:
def lookup_region(hitid):
    """Return the region label of a HIT id.

    The id is checked against the notebook-level collections
    ``nzaucaus_hit_ids``, ``us_hit_ids`` and ``in_hit_ids``, in that order.

    Args:
        hitid: the HIT id to look up.

    Returns:
        ``'nzaucaus'``, ``'us only'`` or ``'in only'`` for the first
        collection containing the id; ``None`` when none contains it.
    """
    region = None
    if hitid in nzaucaus_hit_ids:
        region = 'nzaucaus'
    elif hitid in us_hit_ids:
        region = 'us only'
    elif hitid in in_hit_ids:
        region = 'in only'
    return region

In [195]:
# st3_df['region'] = st3_df['hit_id'].apply(lookup_region)

In [374]:
st3_df.head(1)

Unnamed: 0,asgmt_id,gif_uri,hit_id,image_id,setting_description,worker_id
0,3907X2AHF15BOY9UZP8DW6BDJFLP2S,https://s3-us-west-2.amazonaws.com/ai2-vision-...,3K8CQCU3KF1UA9XU7SFQR4RM1M0NWT,s_06_e_23_shot_018951_019025,room,A3HHDPKL3O3O7Y


In [379]:
# Group the stage-3 rows by image.
# NOTE(review): this rebinds grouped_by_image, which an earlier cell assigned from st4_df.
grouped_by_image = st3_df.groupby('image_id')

# Pivot so that each image becomes one row with its setting descriptions
# spread across positional columns (one per response within the group).
settings_per_turker = grouped_by_image['setting_description'].apply(lambda x: pd.Series(x.values)).unstack()
settings_per_turker = settings_per_turker.reset_index()
# Exactly four labels are assigned, so the frame must have exactly three
# response columns at this point. The label text has two spaces after the
# turker number and a trailing space.
settings_per_turker.columns = ['image_id'] + ['turker{}  setting: '.format(i) for i in range(1, 4)]

In [393]:
settings_per_turker

Unnamed: 0,image_id,turker1 setting:,turker2 setting:,turker3 setting:
0,s_01_e_03_shot_036331_036405,quarry,quarry,Quarry
1,s_01_e_04_shot_020768_020842,room,Living room,Quarry
2,s_01_e_04_shot_036045_036119,Doorway,doorway,Entry way
3,s_01_e_05_shot_006248_006322,inside house,Dining room,table inside\n
4,s_01_e_06_shot_033979_034053,Outside,outside,outside
5,s_01_e_07_shot_027719_027793,sidewalk,street,roadway
6,s_01_e_08_shot_035035_035109,living room,Living room,living room
7,s_01_e_10_shot_002849_002923,living room,room,inside house
8,s_01_e_10_shot_028169_028243,room,green room,indoors
9,s_01_e_11_shot_000814_000888,living room,living room,living room


In [355]:
# settings_per_region = grouped_by_image['setting_description'].apply(lambda x: pd.Series(x.values)).unstack()
# actions_per_region = grouped_by_image['character_description'].apply(lambda x: pd.Series(x.values)).unstack()
# region_s = grouped_by_image['region'].apply(lambda x: pd.Series(x.values)).unstack()

# region_cols = region_s.iloc[0][[0, 1, 2]].values.tolist()

# settings_per_region.columns = ['setting: ' + reg for reg in region_cols]
# actions_per_region.columns = ['actions: ' + reg for reg in region_cols]
# settings_per_region = settings_per_region.reset_index()
# actions_per_region = actions_per_region.reset_index()

In [388]:
# One gif URI per image: the maximum 'gif_uri' value within each group.
gif_uris = grouped_by_image['gif_uri'].max()

In [394]:
settings_per_turker

Unnamed: 0,image_id,turker1 setting:,turker2 setting:,turker3 setting:
0,s_01_e_03_shot_036331_036405,quarry,quarry,Quarry
1,s_01_e_04_shot_020768_020842,room,Living room,Quarry
2,s_01_e_04_shot_036045_036119,Doorway,doorway,Entry way
3,s_01_e_05_shot_006248_006322,inside house,Dining room,table inside\n
4,s_01_e_06_shot_033979_034053,Outside,outside,outside
5,s_01_e_07_shot_027719_027793,sidewalk,street,roadway
6,s_01_e_08_shot_035035_035109,living room,Living room,living room
7,s_01_e_10_shot_002849_002923,living room,room,inside house
8,s_01_e_10_shot_028169_028243,room,green room,indoors
9,s_01_e_11_shot_000814_000888,living room,living room,living room


In [397]:
# Join the per-image gif URIs onto the raw per-turker settings. No join key is
# given, so pandas merges on the columns the two frames share.
# NOTE(review): presumably that is only 'image_id' — confirm.
beta_reponses_df = pd.merge(pd.DataFrame(gif_uris).reset_index(), settings_per_turker)

In [398]:
beta_reponses_df

Unnamed: 0,image_id,gif_uri,turker1 setting:,turker2 setting:,turker3 setting:
0,s_01_e_03_shot_036331_036405,https://s3-us-west-2.amazonaws.com/ai2-vision-...,quarry,quarry,Quarry
1,s_01_e_04_shot_020768_020842,https://s3-us-west-2.amazonaws.com/ai2-vision-...,room,Living room,Quarry
2,s_01_e_04_shot_036045_036119,https://s3-us-west-2.amazonaws.com/ai2-vision-...,Doorway,doorway,Entry way
3,s_01_e_05_shot_006248_006322,https://s3-us-west-2.amazonaws.com/ai2-vision-...,inside house,Dining room,table inside\n
4,s_01_e_06_shot_033979_034053,https://s3-us-west-2.amazonaws.com/ai2-vision-...,Outside,outside,outside
5,s_01_e_07_shot_027719_027793,https://s3-us-west-2.amazonaws.com/ai2-vision-...,sidewalk,street,roadway
6,s_01_e_08_shot_035035_035109,https://s3-us-west-2.amazonaws.com/ai2-vision-...,living room,Living room,living room
7,s_01_e_10_shot_002849_002923,https://s3-us-west-2.amazonaws.com/ai2-vision-...,living room,room,inside house
8,s_01_e_10_shot_028169_028243,https://s3-us-west-2.amazonaws.com/ai2-vision-...,room,green room,indoors
9,s_01_e_11_shot_000814_000888,https://s3-us-west-2.amazonaws.com/ai2-vision-...,living room,living room,living room


In [399]:
# per_region_df['char_name'] = per_region_df['image_id'].apply(lambda x: ds_by_frame[x]['characters'][0]['characterName'])

In [400]:
# pickle_this(set(per_region_df['image_id'].tolist()), 'random_sample_ids.pkl')

In [403]:
# Write the merged beta responses to stage_3a_beta_1.csv (the row index is included).
beta_reponses_df.to_csv('stage_3a_beta_1.csv')

In [402]:
# annotations_by_frame = defaultdict(list)
# for anno in sorted(assignment_results, key=lambda x: x['stillID']):
#     animation_id = anno['stillID'].rsplit('_', 1)[0]
#     annotations_by_frame[animation_id].append(anno)

# hide

In [35]:
# s3_base_path = 'https://s3-us-west-2.amazonaws.com/ai2-vision-animation-gan/annotation_data/still_frames/'

In [28]:
# pickle_this(setting_lookup.to_dict(), 'stage_3_beta_gold_settings.pkl')

In [19]:
# import PIL.Image as Image
# import requests

# image_n = 10


# s3_base_path = 'https://s3-us-west-2.amazonaws.com/ai2-vision-animation-gan/annotation_data/still_frames/'

# image_url = s3_base_path + 's_01_e_04_shot_012010_012084_70.png'
# Image.open(requests.get(image_url, stream=True).raw)