# Table of Contents
* [build and  update](#build-and--update)
	* [load](#load)
	* [stage1](#stage1)
	* [stage3](#stage3)
	* [inspect](#inspect)
* [parsing](#parsing)
* [mkdocs/ s3 dev](#mkdocs/-s3-dev)
* [clean and spellcheck](#clean-and-spellcheck)
* [hide](#hide)
	* [old build and update ds](#old-build-and-update-ds)


In [1]:
%%capture
from __future__ import division
import numpy as np
import pandas as pd
import scipy.stats as st
import itertools
import math
from collections import Counter, defaultdict
%load_ext autoreload
%autoreload 2

import copy
import os
import json
from tqdm import tqdm

from IPython.display import Image
import random

# build and  update

## load

In [2]:
from amt_utils import flintstones_pipeline
from amt_utils.mturk import unpickle_this, pickle_this

In [3]:
selection_metadata_dir = 'Flintstone_Shots_GIF_Selection'

with open(os.path.join(selection_metadata_dir, 'beta_gif_names.json')) as f:
    beta_videos = json.load(f)
        
with open(os.path.join(selection_metadata_dir, 'production_gif_names.json')) as f:
    production_videos = json.load(f)

In [4]:
processed_stage_1_a = unpickle_this('./stage_1/processed_stage1_boxes.pkl')

processed_stage_1_b = unpickle_this('./stage_1/stage_1b_prod_all_boxes_8_29.pkl')

stage_3a_settings = unpickle_this('stage_3/stage3_prod1_2_settings.pkl')

stage_3b_descriptions =  unpickle_this('stage_3/stage3_prod1_2_descriptions.pkl')

stage_3b_parses = unpickle_this('stage_3/stage3_prod1_2_descriptions_parses.pkl')

In [5]:
# vid_bins = unpickle_this('vid_bin_assignments.pkl')

# vid_bin_lookup = {}
# for k, vals in vid_bins.items():
#     for v in vals:
#         vid_bin_lookup[v] = k

In [6]:
with open('ds_production.json') as f:
    prod_batch_1 = json.load(f)[:20000]

In [56]:
prod_batch_1 = [vid['globalID'] for vid in prod_batch_1]

TypeError: string indices must be integers

## stage1

In [8]:
# prod_dataset = flintstones_pipeline.FlintstonesDataset([vid_id])
prod_dataset = flintstones_pipeline.FlintstonesDataset(prod_batch_1)

In [9]:
prod_dataset.update_s1a(processed_stage_1_a)

In [10]:
# sorted_vids = prod_dataset.sorted_by_episode()
# prod_dataset

In [11]:
prod_dataset = flintstones_pipeline.FlintstonesDataset(prod_batch_1)
prod_dataset.update_s1a(processed_stage_1_a)
prod_dataset.update_s1b(processed_stage_1_b)

## stage3

In [12]:
prod_dataset.update_s3a(stage_3a_settings)

In [13]:
prod_dataset.update_s3b(stage_3b_descriptions)

In [14]:
prod_dataset

{
    "go count": "8504",
    "reasons for removal": {
        "characters not present in all frames": "2340",
        "missing stage1a annotation": "1",
        "missing stage1b annotation": "896",
        "missing stage3a annotation": "4025",
        "missing stage3b annotation": "2389",
        "no consensus characters in stage1a": "1845",
        "total removed": "11496"
    },
    "stage statuses": {
        "stage_0": "1846",
        "stage_1a": "3236",
        "stage_1b": "4025",
        "stage_3a": "2389",
        "stage_3b": "8504"
    },
    "video count": "20000"
}

In [15]:
go_vids = prod_dataset.filter_videos({'go': True})

In [16]:
pickle_this(go_vids, 'v0p2_to_parse.pkl')

In [17]:
# dataset_to_json(go_vids, '0p2_to_parse')

## inspect

In [None]:
prod_dataset = flintstones_pipeline.FlintstonesDataset(prod_batch_1)
prod_dataset.update_s1a(processed_stage_1_a)
prod_dataset.update_s1b(processed_stage_1_b)

In [None]:
prod_dataset.update_s3a(stage_3a_settings)

In [None]:
vid_id = 's_02_e_29_shot_014705_014779'
inspect_vid = prod_dataset.get_video(vid_id)
inspect_vid = prod_dataset.filter_videos({'go': True})[50]

In [None]:
# inspect_vid

In [None]:
# inspect_vid.display_gif()

In [None]:
dfs = inspect_vid.display_bounding_boxes()
# dfs

In [None]:
three_frames  = inspect_vid.display_keyframes()

In [None]:
# three_frames

In [None]:
local_path = '/Users/schwenk/wrk/animation_gan/ai2-vision-animation-gan/documentation/images/'

In [None]:
img_base_path = 'https://s3-us-west-2.amazonaws.com/ai2-vision-animation-gan/documentation/images/'

In [None]:
three_frames.save(local_path + inspect_vid.gid() + '_keyframes.png')
dfs.save(local_path + inspect_vid.gid() + '_bboxes.png')

In [None]:
inspect_vid.characters_present()

In [None]:
char_names_by_id = [char.name() for char in sorted(inspect_vid.data()['characters'], key=lambda x: x.char_id())]

In [None]:
char_names_by_id

# parsing 

In [17]:
from phrase_cues.parsing import parse_video
import spacy
from nltk.parse.stanford import StanfordParser
from nltk.tokenize import sent_tokenize
from nltk.tree import ParentedTree

In [18]:
from nltk.parse.corenlp import CoreNLPServer
from nltk.parse.corenlp import CoreNLPParser

In [19]:
def dataset_to_json(dataset, version, out_dir='dataset'):
    to_json = copy.deepcopy(dataset)
    for vid in to_json:
        vid.vid_data['characters'] = [char.data() for char in vid.vid_data['characters']]

    ds_json = [vid.vid_data for vid in to_json]
    out_file = os.path.join(out_dir, 'dataset_v{}.json'.format(version))
    with open(out_file, 'w') as f:
        json.dump(ds_json, f, sort_keys=True, indent=4)

In [20]:
test_vids = go_vids[:100]

In [21]:
nlp = spacy.load('en')
core_nlp_base = '/Users/schwenk/wrk/animation_gan/phrase_cues/deps/stanford_core_nlp/stanford-corenlp-full-2017-06-09/'

# parser = StanfordParser(path_to_jar=core_nlp_base + 'stanford-corenlp-3.8.0.jar',
#                         path_to_models_jar=core_nlp_base +'stanford-corenlp-3.8.0-models.jar')

const_parse_path = '/Users/schwenk/wrk/animation_gan/build_dataset/dataset'
const_parse_dir = 'const_parses'

In [22]:
from time import sleep

In [36]:
core_parser = CoreNLPParser(url='http://localhost:9000')    

In [39]:
# with CoreNLPServer(path_to_jar=core_nlp_base + 'stanford-corenlp-3.8.0.jar', path_to_models_jar=core_nlp_base +'stanford-corenlp-3.8.0-models.jar') as server:
for vid in tqdm(go_vids):
    try:
        parse_video(vid, nlp, core_parser)
    except:
        print(vid.gid())

100%|██████████| 8504/8504 [06:22<00:00, 22.24it/s]


In [45]:
coref_res = unpickle_this('coref_results_full.pkl')

In [101]:
vid = test_vids[0]

In [47]:
for vid in go_vids:
    vid.vid_data['parse']['coref'] = coref_res[vid.gid()] 

In [49]:
dataset_to_json(go_vids, '0p3')

# mkdocs/ s3 dev

In [54]:
s3_doc_base_uri = 'https://s3-us-west-2.amazonaws.com/ai2-vision-animation-gan/documentation/images/'

video_mkd_template = """## Video ID {}

![animation]({})

![animation_frames]({})

![bounding_boxes]({})

### Setting:
{}

### Characters:
{}

### Description:
{}
"""

def paginate_image_list(img_list, page_size):
    num_sort = sorted(img_list, key=lambda x: x.replace('.png', '').zfill(4))
    for i in range(0, len(num_sort), page_size):
        yield num_sort[i:i + page_size]
        
def write_mkd_doc(doc, fp):
    with open(fp, 'w') as f:
        f.write(doc)
        
def format_characters(id_name_pairs):
    char_base = ''
    for char_id, char_name in id_name_pairs:
        char_base += char_id + ': ' + char_name + '\n\n'
    return char_base

In [None]:
video = inspect_vid

entry_args = [
    video.gid(),
    video.display_gif(True),
    s3_doc_base_uri + inspect_vid.gid() + '_keyframes.png',
    s3_doc_base_uri + inspect_vid.gid() + '_bboxes.png',
    video.setting(),
    format_characters(video.characters_present()),
    video.description()
]
video_entry = video_mkd_template.format(*entry_args)

write_mkd_doc(video_entry, './documentation/docs/datapoint.md')

# clean and spellcheck

In [55]:
import enchant
from nltk.corpus import stopwords
import string
from nltk.tokenize import word_tokenize
from nltk.tokenize import wordpunct_tokenize

from fuzzywuzzy import fuzz
import difflib
import diff_match_patch

dmp = diff_match_patch.diff_match_patch()

edict = enchant.Dict("en_US")
anglo_edict = enchant.Dict("en_UK")
cached_sw = stopwords.words("english") + list(string.punctuation)

#         edict.add(word)

In [None]:
other_chars_names = ['gazoo', 'lodabricks', 'slaghoople', 'poobaah']

In [None]:
other_words = ['bandana', 'tv', 'bandana', 'bowtie', 'sabretooth', 'creepella', 'polkadot', 
               'turban', 'monical', 'unibrow', 'accordion', 'boutineer', 'handkerchief', 'xray', 
               'onesie', 'midcentury', 'cafe', 'squatty', 'earings']

In [None]:
words_to_remove = ['whine', 'turbine', 'accordant', 'according', 'turbid', 'voile', 'acous', 'google', 'leper', 'deres', 'powerless', 'powerfulness', 'fervent', 'weaning', 'grail']
_ = [edict.remove_from_session(word) for word in words_to_remove]

In [None]:
_ = [[edict.add(char_word.lower()) for char_word in char.split()] for char in main_characters + other_chars_names + other_words]

In [None]:
manual_corrections = {  'lieing': 'lying',
                        'infront': 'in font',
                        'ladie': 'lady',
                        'servent': 'servant',
                        'wiht': 'with',
                        'preformer': 'performer',
                        'hinging': 'hanging',
                        'bule': 'blue',
                        'yount': 'young',
                      'od': 'old',
                      'dres': 'dress',
                      'handshacking': 'hand shaking',
                      'cru': 'crew',
                      'hankerchief': 'handkerchief',
                      'cowbow': 'cowboy',
                      'helmit': 'helmet',
                      'wearning': 'wearing',
                      'broen': 'wearing'
                     }

In [None]:
def check_mispelled(word):
    return word and word.isalpha() and not (edict.check(word) or anglo_edict.check(word) or edict.check(word[0].upper() + word[1:]))

def check_word_rules(word):
    split_len = 2 < min([len(w) for w in word.split()])
    not_proper = word[0].islower()
    return not_proper and split_len

def correct_spelling_error(misspelled_word):
    if misspelled_word in manual_corrections:
        return manual_corrections[misspelled_word]
    suggested_spellings = edict.suggest(misspelled_word)
    match_ratios = [fuzz.token_sort_ratio(misspelled_word, word) for word in suggested_spellings]
    words_sorted_by_ratio = sorted(zip(suggested_spellings, match_ratios), key=lambda x: x[1], reverse=True)
    words_sorted_by_ratio = [wordscore for wordscore in words_sorted_by_ratio if check_word_rules(wordscore[0])]
    check_compounds = [word[0] for word in words_sorted_by_ratio if word[0].replace(' ', '') == misspelled_word]
    if check_compounds:
        return check_compounds[0]
    if words_sorted_by_ratio[0][1] > 80:
        return words_sorted_by_ratio[0][0]

    for word, score in words_sorted_by_ratio:
        if score >= 75 and word[0] == misspelled_word[0]:
            return word
        elif score >= 75:                    
            return word
    return None

def apply_spelling_fix(orig_text):
    orig_text_tokens = wordpunct_tokenize(orig_text)
    processed_tokens = []
    for token in orig_text_tokens:
        norm_token = token.lower()
        if len(norm_token) < 4:
            processed_tokens.append(token)
            continue
        if check_mispelled(norm_token):
            suggested_replacements = edict.suggest(token)
            replacement_text = correct_spelling_error(norm_token, suggested_replacements)
            if replacement_text:
                if norm_token[0].isupper():
                    replacement_text = upper(replacement_text[0]) + replaced_text[1:]
                processed_tokens.append(replacement_text)
            else:
                processed_tokens.append(token)
        else:
            processed_tokens.append(token)
    return ' '.join(processed_tokens)

def diff_corrected_text(orig_text, corrected_text):
    diff = dmp.diff_main(orig_text, corrected_text)
    return HTML(dmp.diff_prettyHtml(diff))

def specify_lesson_q_path(lesson):
    pass

def apply_spelling_and_grammar_to_ds(ck12_ds):
    return

In [None]:
def remove_empty_fields(video):
    video.pop()

In [None]:
dataset_v0p1 = copy.deepcopy(all_reasonably_sized)

In [None]:
words_changed = []
for video in dataset_v0p1:
    for char in video['characters']:
        char_name_words = wordpunct_tokenize(char['characterName'])
        misspellings = [check_mispelled(word) for word in char_name_words]
        if sum(misspellings):
            for idx, is_mispelled in enumerate(misspellings):
                if is_mispelled:
                    suggested_replacement = correct_spelling_error(char_name_words[idx])
                    if suggested_replacement:
                        words_changed.append([char_name_words[idx], suggested_replacement])
                        char_name_words[idx] = suggested_replacement
#                     print(char_name_words, suggested_replacement)
#             print(char['characterName'], ' '.join(char_name_words).replace(' , ', ', ').replace(' . ', '.'))
            char['characterName'] = ' '.join(char_name_words).replace(' , ', ', ').replace(' . ', '.')
#         print(list(zip(char_name_words, misspellings)))

In [None]:
len(words_changed)

In [None]:
for video in dataset_v0p1:
    setting = video['setting']
    setting_words = setting.split()
    misspellings = [check_mispelled(word) for word  in setting_words]
    if sum(misspellings):
        for idx, is_mispelled in enumerate(misspellings):
            if is_mispelled:
                suggested_replacement = correct_spelling_error(setting_words[idx])
                if suggested_replacement:
                    words_changed.append([setting_words[idx], suggested_replacement])
                    setting_words[idx] = suggested_replacement
        video['setting'] = ' '.join(setting_words)

In [None]:
pickle_this(dataset_v0p1, 'dataset_v0p1.pkl')

with open('dataset_v0p1.json', 'w') as f:
    json.dump(dataset_v0p1, f, indent=4, sort_keys=True)

In [None]:
df2 = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd'])

In [None]:
len(dataset_v0p1)

In [None]:
len(dataset_v0p1)

In [None]:
dataset_v0p1[100]

In [None]:
for video in dataset_v0p1:
    if not video['setting']:
        print(video['globalID'])
    if not video['characters']:
        print(video['globalID'])

# hide 

In [52]:
# ds_production = make_ds_skeleton(production_videos)
# ds_complete_stage1_v1 = [vid for vid in ds_production if vid['globalID'] in processed_stage_1_a and vid['globalID'] in processed_stage_1_b]

# stg1a = set(processed_stage_1_a.keys())
# stg1b = set(processed_stage_1_b.keys())

# len(stg1a.difference(stg1b))

# single_clip = [vid for vid in ds_production if vid['globalID'] == 's_01_e_02_shot_014615_014689'][0]

# pass_vids = [vid for bin_n, vid in vid_bins.items() if bin_n in pass_bins]
# pass_vid_ids = set([item for sublist in pass_vids for item in sublist])

# ds_complete_stage1_v1 = [vid for vid in ds_complete_stage1_v1 if vid['globalID'] in pass_vid_ids]

# len(ds_complete_stage1_v1)

# ds_complete_stage1_v1_all_clean = [vid for vid in ds_complete_stage1_v1 if vid['globalID'] not in shot_change]

# len(ds_complete_stage1_v1_all_clean) / 12819

# # weird_vid = [vid for vid in ds_complete_stage1_v1 if vid['globalID'] == 's_06_e_24_shot_005808_005882']

# single_char_clips = [clip for clip in ds_complete_stage1_v1_all_clean if len(clip['characters']) == 1]

# multi_char_clips = [clip for clip in ds_complete_stage1_v1_all_clean if len(clip['characters']) > 1 and len(clip['characters']) < 4]

# all_reasonably_sized = [clip for clip in ds_complete_stage1_v1_all_clean if len(clip['characters']) > 0 and len(clip['characters']) < 4]

# len(all_reasonably_sized)

# # multi_char_sample = random.sample(multi_char_clips, 100)

# len(multi_char_sample)

# # pickle_this(multi_char_clips, 'stage_1_multi_char_sample.pkl')

# # pickle_this(all_reasonably_sized, 'stage_1_less_than_4chars.pkl')

# # pickle_this(single_char_clips, 'stage_1_single_char_updated.pkl')

# # single_clip = three_char_clips[1]
# # single_clip 

## old build and update ds

In [None]:
# # rem_reason = 'no characters annotated in stage1a'

# filt_vids = prod_dataset.filter_videos({'reason': rem_reason})
# len(filt_vids)

# rand_vid = random.choice(filt_vids)
# rand_vid.display_gif()

In [53]:
# for vid in go_vids:
#     vid.vid_data['parse'] = stage_3b_parses[vid.gid()] 

# for vid in go_vids:
#     vid.vid_data['characters'] = [char.data() for char in vid.vid_data['characters']]

# ds_json = [vid.vid_data for vid in go_vids]

# len(ds_json)

# tc = ds_json[0]['characters'][0]

# with open('dataset_v0p2.json', 'w') as f:
#     json.dump(ds_json, f, sort_keys=True, indent=4)

# demo_vid = prod_dataset.get_video('s_05_e_09_shot_032756_032830')

# demo_vid.display_keyframes()

# demo_vid.display_gif()