In [37]:
import ast
import json
from tqdm import tqdm
from pathlib import Path
import sys
import csv
import os
from collections import defaultdict, Counter
import numpy as np
import base64

# Working directory that holds {train,val,test}.json and the label/feature TSVs.
# Set PERSONALITY_CAPTIONS_DIR to run the notebook on another machine without editing it;
# the default is the original author's location.
DATA_DIR = os.environ.get(
    'PERSONALITY_CAPTIONS_DIR',
    '/home/eugene/Documents/workspace/Oscar/datasets/personality_captions',
)
os.chdir(DATA_DIR)
#os.chdir('/home/eugene/anaconda3/envs/frcnn/lib/python3.7/site-packages/data/personality_captions')
# Lift the csv per-field size cap (presumably needed for the very long feature rows -- TODO confirm).
_ = csv.field_size_limit(sys.maxsize)

# Clean Captions

In [16]:
def load_json(file):
    """Count how often each caption text occurs in a captions JSON file.

    Args:
        file: path to a JSON file holding a list of records with a 'comment' key.

    Returns:
        A Counter mapping each comment string to its number of occurrences.
    """
    with open(file) as handle:
        records = json.load(handle)

    comment_counts = Counter()
    for record in records:
        comment_counts[record['comment']] += 1
    return comment_counts
# One caption Counter per split, keyed by split name.
counts = dict((x, load_json(x + '.json')) for x in ('train', 'val', 'test'))

In [17]:
# Show the ten most frequent captions of every split.
for k in counts:
    v = counts[k]
    top_comments = v.most_common(10)
    print(k, top_comments, '\n')

train [('[DISCONNECT]', 68), ('[RETURNED]', 28), ('What is this?', 15), ('[TIMEOUT]', 13), ('What is that?', 12), ('Where is this?', 11), ('What is going on here?', 10), ('This looks like so much fun!', 9), ("What's going on here?", 9), ('what is that?', 7)] 

val [('[DISCONNECT]', 5), ('I would hide', 2), ('THE GUY IS SUFFERING FROM DEPRESSION', 1), ('Is she doing a backflip twist?', 1), ('What did they have to do to help support cancer? Fundraisers always intrigue me', 1), ('I love the bit of motion blur in this photo, asks plenty of questions as to why this person took such an active, lovely photo.', 1), ('Sunlight filtered by sleepy clouds.', 1), ('what a wonderful map', 1), ('Why is that guy bulling the little one.', 1), ("She's really cool and funny", 1)] 

test [('[DISCONNECT]', 10), ('[RETURNED]', 3), ('[TIMEOUT]', 3), ('I do not have time for this.', 2), ('What is going on here?', 2), ('This is the life', 2), ('What even is this?', 2), ('I love the way this color makes me feel

In [18]:
def clean_captions(file):
    """Drop records whose caption is a placeholder token and rewrite the file in place.

    A record is removed when its 'comment' is exactly '[DISCONNECT]',
    '[RETURNED]' or '[TIMEOUT]'. The record counts before and after are printed.

    Args:
        file: path to a JSON file holding a list of caption records.
    """
    placeholder_comments = ('[DISCONNECT]', '[RETURNED]', '[TIMEOUT]')

    with open(file) as src:
        records = json.load(src)
    print(f'cleaning {file}, before : {len(records)}')

    kept = []
    for record in records:
        if record['comment'] in placeholder_comments:
            continue
        kept.append(record)

    print(f'after : {len(kept)}')
    with open(file, 'w') as dst:
        json.dump(kept, dst)

# Strip the placeholder captions from every split file.
for x in ('train', 'val', 'test'):
    clean_captions(x + '.json')

cleaning train.json, before : 186858
after : 186749
cleaning val.json, before : 5000
after : 4994
cleaning test.json, before : 10000
after : 9984


In [19]:
# Sanity check: recount the captions after cleaning and list every caption that is a
# single space-separated token (the removed placeholders were such tokens).
# Assumes load_json is still the Counter-returning version at this point.
counts = dict((x, load_json(x + '.json')) for x in ('train', 'val', 'test'))
for k in counts:
    v = counts[k]
    print(k, v.most_common(10), '\n')
    single_token = [caption for caption in v if len(caption.split(' ')) == 1]
    print('len 1', single_token, '\n')

train [('What is this?', 15), ('What is that?', 12), ('Where is this?', 11), ('What is going on here?', 10), ('This looks like so much fun!', 9), ("What's going on here?", 9), ('what is that?', 7), ('What is he doing?', 7), ('what is this', 6), ('Is this supposed to be art?', 6)] 

len 1 [] 

val [('I would hide', 2), ('THE GUY IS SUFFERING FROM DEPRESSION', 1), ('Is she doing a backflip twist?', 1), ('What did they have to do to help support cancer? Fundraisers always intrigue me', 1), ('I love the bit of motion blur in this photo, asks plenty of questions as to why this person took such an active, lovely photo.', 1), ('Sunlight filtered by sleepy clouds.', 1), ('what a wonderful map', 1), ('Why is that guy bulling the little one.', 1), ("She's really cool and funny", 1), ('These people sitting at nice tables could have dressed a bit nicer, too.', 1)] 

len 1 [] 

test [('I do not have time for this.', 2), ('What is going on here?', 2), ('This is the life', 2), ('What even is this?', 

In [20]:
# Total number of captions over all splits.
total_captions = sum(sum(split_counts.values()) for split_counts in counts.values())
print(f'total captions : {total_captions}')

total captions : 201727


# Split TSV

In [25]:
def load_json(file):
    """Collect the distinct image hashes referenced by a captions JSON file.

    Args:
        file: path to a JSON file holding a list of records with an 'image_hash' key.

    Returns:
        A set of the 'image_hash' values.
    """
    with open(file) as handle:
        records = json.load(handle)
    return {record['image_hash'] for record in records}

    
def split(infile, ids):
    """Split a TSV into train/test/val TSVs according to the id in its first column.

    Writes ``train.<name>``, ``test.<name>`` and ``val.<name>`` next to ``infile``.
    A row goes to the first of train, test, val (in that order) whose id
    collection contains the row's first column; rows matching none are dropped.
    The per-split row counts are reported through ``tqdm.write``.

    Args:
        infile: path of the tab-separated input file.
        ids: mapping with 'train', 'test' and 'val' keys, each a collection of
            ids supporting ``in``.
    """
    infile = Path(infile)
    train_name = infile.parent / f"train.{infile.name}"
    test_name = infile.parent / f"test.{infile.name}"
    val_name = infile.parent / f"val.{infile.name}"

    counts = {k : 0 for k in ids.keys()}

    # newline='' is what the csv module asks for on both readers and writers, so
    # that line endings and newlines inside quoted fields are handled by csv itself.
    # os.devnull replaces the hard-coded "/dev/null" so this also runs off POSIX.
    with tqdm() as pbar, \
            open(train_name, 'w', newline='') as train_tsv, \
            open(test_name, 'w', newline='') as test_tsv, \
            open(val_name, 'w', newline='') as val_tsv, \
            open(os.devnull, 'w', newline='') as dummy, \
            open(infile, newline='') as in_tsv:

        # NOTE(review): the writers use csv's default quoting, so a field containing
        # '"' is written wrapped in quotes with the inner quotes doubled.
        train_writer = csv.writer(train_tsv, delimiter = '\t')
        test_writer = csv.writer(test_tsv, delimiter = '\t')
        val_writer = csv.writer(val_tsv, delimiter = '\t')

        # Throwaway writer: a row is first written here so that a row csv cannot
        # serialize is reported and skipped instead of aborting the whole split.
        dummy_writer = csv.writer(dummy, delimiter = '\t')

        reader = csv.reader(in_tsv, delimiter='\t')

        for item in reader:
            if not item:
                # Blank line: there is no id column to route on.
                pbar.update(1)
                continue

            image_id = item[0]

            # check for write errors
            try:
                dummy_writer.writerow(item)
            except Exception as e:
                tqdm.write(f'write error for {image_id}, {str(e)}')
                continue

            if image_id in ids['train']:
                train_writer.writerow(item)
                counts['train'] += 1
            elif image_id in ids['test']:
                test_writer.writerow(item)
                counts['test'] += 1
            elif image_id in ids['val']:
                val_writer.writerow(item)
                counts['val'] += 1

            pbar.update(1)
    tqdm.write(str(counts))
# Image hashes per split, then split both TSVs with them.
ids = dict((x, load_json(x + '.json')) for x in ('train', 'val', 'test'))
id_counts = {k : len(v) for k, v in ids.items()}
tqdm.write(str(id_counts))

for tsv_name in ('label.tsv', 'feature.tsv'):
    split(tsv_name, ids)


856it [00:00, 8558.11it/s]

{'train': 186749, 'val': 4994, 'test': 9984}


201230it [00:22, 8767.67it/s]
8it [00:00, 75.92it/s]

{'train': 186238, 'val': 4989, 'test': 9977}


201230it [31:24, 106.78it/s]

{'train': 186238, 'val': 4989, 'test': 9977}





# Validate Split

In [28]:
def validate(feat_file, label_file):
    """Check that a feature TSV and a label TSV list the same ids in the same order.

    Prints 'duplicate <id>' for every repeated id found in either file and,
    when all checks pass, the number of distinct ids in the label file.

    Args:
        feat_file: path of the tab-separated feature file.
        label_file: path of the tab-separated label file.

    Raises:
        AssertionError: if the two files differ in id count, id set, or in the
            row position recorded for any id.
    """
    def get_ids(f):
        # Map each first-column id to the index of the last row carrying it.
        row_of = {}
        with open(f) as in_tsv:
            for row_number, row in enumerate(csv.reader(in_tsv, delimiter='\t')):
                key = row[0]
                if key in row_of:
                    print(f'duplicate {key}')
                row_of[key] = row_number
        return row_of

    feat_ids = get_ids(feat_file)
    label_ids = get_ids(label_file)

    assert len(feat_ids) == len(label_ids)
    assert feat_ids.keys() == label_ids.keys()
    assert not any(row_number != label_ids[key] for key, row_number in feat_ids.items())
    print(len(label_ids))

In [29]:
# Validate the feature/label pair of every split.
for x in ('train', 'val', 'test'):
    print(x)
    validate(x + '.feature.tsv', x + '.label.tsv')

train
186238
val
4989
test
9977


# Lineidx

In [53]:
import logging
import os
import os.path as op

def generate_lineidx(filein, idxout):
    """Write the byte offset at which each line of ``filein`` starts to ``idxout``.

    One decimal offset is written per line of input. The index is built in
    ``idxout + '.tmp'`` and moved into place only once it is complete.

    Args:
        filein: path of the file to index.
        idxout: path of the index file to create or overwrite (a str).
    """
    idxout_tmp = idxout + '.tmp'
    # Binary mode: the offsets are compared with the file size in bytes, and
    # text-mode tell() returns an opaque cookie rather than a byte position.
    # Lines are therefore delimited by b'\n' only.
    with open(filein, 'rb') as tsvin, open(idxout_tmp, 'w') as tsvout:
        fsize = os.fstat(tsvin.fileno()).st_size
        fpos = 0
        while fpos < fsize:
            tsvout.write(str(fpos)+"\n")
            if not tsvin.readline():
                # Nothing left to read although the size says otherwise
                # (file shrank while indexing); stop instead of looping forever.
                break
            fpos = tsvin.tell()
    # os.replace overwrites an existing index on every platform.
    os.replace(idxout_tmp, idxout)

In [54]:
 # Index the train feature file, then the train label file.
 for kind in ('feature', 'label'):
     generate_lineidx(f'train.{kind}.tsv', f'train.{kind}.lineidx')

In [3]:
# Build the line indexes of the feature and label files of every split.
for x in ('train', 'val', 'test'):
    print(x)
    for kind in ('feature', 'label'):
        generate_lineidx(f'{x}.{kind}.tsv', f'{x}.{kind}.lineidx')

train
val
test


# Remove missing labels

In [25]:
def update_captions(label_tsv, caption_json):
    """Keep only the captions whose image has a row in ``label_tsv``.

    Rewrites ``caption_json`` in place and prints the number of captions kept.
    The file is only rewritten when the number of kept captions equals the
    number of distinct label ids.

    Args:
        label_tsv: path of a tab-separated file whose first column is the image id.
        caption_json: path of a JSON file holding a list of records with an
            'image_hash' key.

    Raises:
        AssertionError: if the kept captions and the label ids differ in count;
            the message lists label ids without a caption and image hashes with
            several captions.
    """
    with open(label_tsv, newline='') as in_tsv:
        reader = csv.reader(in_tsv, delimiter='\t')
        # Blank lines carry no id and are ignored.
        label_ids = {row[0] for row in reader if row}

    with open(caption_json) as f:
        captions = json.load(f)

    captions = [x for x in captions if x['image_hash'] in label_ids]

    def mismatch_report():
        # Only evaluated when the assertion below fails.
        seen, repeated = set(), set()
        for x in captions:
            image_hash = x['image_hash']
            (repeated if image_hash in seen else seen).add(image_hash)
        missing = sorted(label_ids - seen)
        return (
            f'{len(captions)} captions vs {len(label_ids)} label ids in {label_tsv}; '
            f'{len(missing)} label ids without a caption (e.g. {missing[:5]}), '
            f'{len(repeated)} image hashes with several captions (e.g. {sorted(repeated)[:5]})'
        )

    assert len(captions) == len(label_ids), mismatch_report()

    with open(caption_json, 'w') as f:
        json.dump(captions, f)

    print(len(captions))
# Drop the captions of images that have no label row, split by split.
for x in ('train', 'val', 'test'):
    print(x)
    update_captions(x + '.label.tsv', x + '.json')

train


AssertionError: 

# Update Personality

In [26]:
# Reduce every personality label to its first word.
# Module-level accumulator of the labels seen so far; filled by clean_personality.
personalities = set()
def clean_personality(file):
    """Truncate each record's 'personality' to its first space-separated word.

    Rewrites ``file`` in place, adds every truncated label to the module-level
    ``personalities`` set, and prints the record count before and after (no
    record is dropped, so both counts are equal).

    Args:
        file: path to a JSON file holding a list of records with a 'personality' key.
    """
    with open(file) as src:
        records = json.load(src)
    print(f'cleaning {file}, before : {len(records)}')

    cleaned = []
    for record in records:
        first_word = record['personality'].split(' ')[0]
        record['personality'] = first_word
        personalities.add(first_word)
        cleaned.append(record)

    print(f'after : {len(cleaned)}')

    with open(file, 'w') as dst:
        json.dump(cleaned, dst)

# Shorten the personality labels of every split file.
for x in ('train', 'val', 'test'):
    clean_personality(x + '.json')

cleaning train.json, before : 186234
after : 186234
cleaning val.json, before : 4989
after : 4989
cleaning test.json, before : 9977
after : 9977


In [27]:
# Every collected personality must be a single word. The previous form,
# assert(<generator expression>), tested the truthiness of the generator object
# itself and therefore could never fail.
assert all(len(x.split(' ')) == 1 for x in personalities)

# Sorted so that personalities.txt is identical from run to run
# (set iteration order is not stable across interpreter sessions).
with open('personalities.txt', 'w') as f:
    f.writelines(str(x) + '\n' for x in sorted(personalities))

# Remove 'Crude' Personality

In [6]:
def clean_captions(file):
    """Drop every record whose 'personality' is 'Crude' and rewrite the file in place.

    The record counts before and after are printed.

    Args:
        file: path to a JSON file holding a list of records with a 'personality' key.
    """
    with open(file) as src:
        records = json.load(src)
    print(f'cleaning {file}, before : {len(records)}')

    kept = []
    for record in records:
        if record['personality'] == 'Crude':
            continue
        kept.append(record)

    print(f'after : {len(kept)}')
    with open(file, 'w') as dst:
        json.dump(kept, dst)

# Remove the 'Crude' records from every split file.
for x in ('train', 'val', 'test'):
    clean_captions(x + '.json')

cleaning train.json, before : 186234
after : 186234
cleaning val.json, before : 4989
after : 4989
cleaning test.json, before : 9977
after : 9977


## Get the distribution of personalities

In [7]:
# Count the records per personality label over all three splits.
# Note: this rebinds `personalities` (a set in an earlier cell) to a defaultdict.
personalities = defaultdict(int)
with tqdm() as pbar:
    for i in ('train.json', 'val.json', 'test.json'):
        with open(i) as f:
            json_data = json.load(f)
        for x in json_data:
            label = x['personality']
            personalities[label] += 1
            pbar.update(1)

201200it [00:00, 403544.54it/s]


## Update train labels and features

In [13]:
def update_train(infile, caption_json='train.json'):
    """Filter a TSV in place down to the rows whose id appears in ``caption_json``.

    The kept rows are written to ``infile + '.tmp'``, which then replaces
    ``infile``. The number of kept rows is reported through ``tqdm.write``.

    Args:
        infile: path (str) of the tab-separated file to filter; the id is its
            first column.
        caption_json: path of a JSON file holding a list of records with an
            'image_hash' key. Defaults to 'train.json'.
    """
    with open(caption_json) as f:
        ids = set(x['image_hash'] for x in json.load(f))

    counts = 0
    tmp_name = infile + '.tmp'
    # newline='' is what the csv module asks for on both readers and writers.
    with open(tmp_name, 'w', newline='') as train_tsv, open(infile, newline='') as in_tsv:
        train_writer = csv.writer(train_tsv, delimiter = '\t')
        reader = csv.reader(in_tsv, delimiter='\t')

        for item in tqdm(reader):
            # Blank lines have no id column and are dropped.
            if item and item[0] in ids:
                train_writer.writerow(item)
                counts += 1

    # os.replace overwrites the original on every platform.
    os.replace(tmp_name, infile)
    tqdm.write(str(counts))

In [15]:
# Filter the train label file first, then the (much slower) train feature file.
for train_tsv_name in ('train.label.tsv', 'train.feature.tsv'):
    update_train(train_tsv_name)

186238it [00:12, 15336.39it/s]
186238it [17:08, 181.03it/s]


186234
186234


## Ensure correct formatting

In [24]:
%%bash
# for train label: print the first row, then rewrite the file in place --
# collapse doubled quotes and drop the quote before '[' and after ']'.
label_tsv=train.label.tsv
head -n 1 "$label_tsv"
sed -i -e 's/""/"/g' -e 's/"\[/\[/g' -e 's/\]"/\]/g' "$label_tsv"

c07eb125b83823121ab72d7c2893c1c	"[{""conf"": 0.37365537881851196, ""class"": ""sky"", ""rect"": [0.0, 0.0, 369.6669616699219, 167.16232299804688]}, {""conf"": 0.27570459246635437, ""class"": ""branch"", ""rect"": [238.2344970703125, 0.0, 374.375, 325.0687255859375]}, {""conf"": 0.2762376666069031, ""class"": ""branch"", ""rect"": [6.283855438232422, 353.80535888671875, 374.375, 499.375]}, {""conf"": 0.28178179264068604, ""class"": ""tree"", ""rect"": [277.7760009765625, 0.0, 374.375, 278.257568359375]}, {""conf"": 0.3297758400440216, ""class"": ""tree"", ""rect"": [218.90582275390625, 105.63359832763672, 374.375, 499.375]}, {""conf"": 0.7046377062797546, ""class"": ""tree"", ""rect"": [64.18987274169922, 49.167537689208984, 320.56475830078125, 472.7288818359375]}, {""conf"": 0.3796304762363434, ""class"": ""branch"", ""rect"": [19.837989807128906, 26.823034286499023, 196.2734832763672, 211.62310791015625]}, {""conf"": 0.6425620913505554, ""class"": ""branch"", ""rect"": [13.20970535278

In [33]:
# Every label row must parse to a list whose first entry has conf, class and rect.
with open('train.label.tsv') as in_tsv:
    reader = csv.reader(in_tsv, delimiter='\t')
    for item in tqdm(reader):
        x = ast.literal_eval(item[1])
        assert type(x) == list
        for required_key in ("conf", "class", "rect"):
            assert required_key in x[0]

186234it [01:29, 2077.10it/s]


In [48]:
# Every feature row must decode to a float32 array with num_boxes rows.
with open('train.feature.tsv') as in_tsv:
    reader = csv.reader(in_tsv, delimiter='\t')
    for x in tqdm(reader):
        feat_info = ast.literal_eval(x[1])
        num_boxes = feat_info['num_boxes']
        raw_bytes = base64.b64decode(feat_info['features'])
        flat = np.frombuffer(raw_bytes, np.float32)
        np_arr = flat.reshape((num_boxes, -1))

186234it [10:08, 306.22it/s]


# Remove the candidates from val and test

In [17]:
def clean_captions(file):
    """Remove the '500_candidates' and 'candidates' keys from every record.

    Rewrites ``file`` in place and prints the record count before and after
    (no record is dropped, so both counts are equal).

    Args:
        file: path to a JSON file holding a list of dict records.
    """
    with open(file) as src:
        records = json.load(src)
    print(f'cleaning {file}, before : {len(records)}')

    unwanted_keys = ('500_candidates', 'candidates')
    for record in records:
        for key in unwanted_keys:
            record.pop(key, None)

    print(f'after : {len(records)}')
    with open(file, 'w') as dst:
        json.dump(records, dst)

# Strip the candidate lists from every split file.
for x in ('train', 'val', 'test'):
    clean_captions(x + '.json')

cleaning train.json, before : 186234
after : 186234
cleaning val.json, before : 4989
after : 4989
cleaning test.json, before : 9977
after : 9977


In [20]:
# Look at the first test record to confirm the remaining keys.
with open('test.json') as f:
    data = json.load(f)
first_record = data[0]
print(first_record)

{'personality': 'Obsessive', 'comment': "A little heavy on the make-up don't ya think. ", 'additional_comments': ['she girl  is good', 'I just love her hair that I would keep it.', 'She needs to put some rings on, look, no rings, she needs more rings', 'She spent 3 hours on her look'], 'image_hash': '2923e28b6f588aff2d469ab2cccfac57'}


# Add additional comments to test_caption_coco_format.json

In [3]:
# Export the test captions in a COCO-caption-style layout: one image entry per
# record and one annotation per caption (the additional comments followed by
# the main comment), with annotation ids numbered from 0.
with open('test.json') as f:
    data = json.load(f)

i = 0
images = []
annotations = []
for d in data:
    images.append({
        'id' : d['image_hash'],
        'file_name' : d['image_hash']
    })

    # Build a new list instead of appending to d['additional_comments'], which
    # would modify the loaded record in place.
    caps = list(d['additional_comments']) + [d['comment']]

    for c in caps:
        annotations.append({
            'image_id' : d['image_hash'],
            'caption' : c,
            'id' : i
        })
        i += 1

result = {
    'annotations' : annotations,
    'images' : images,
    'type' : 'captions',
    'info' : 'dummy',
    'licenses' : 'dummy'
}

with open('test_caption_coco_format.json', 'w') as f:
    json.dump(result, f)