In [1]:
import json
from tqdm import tqdm
from pathlib import Path
import sys
import csv
import os
from collections import defaultdict, Counter

# Work from the dataset directory so the relative file names used below
# ('train.json', 'label.tsv', ...) resolve against it.
# NOTE(review): the active path is a hardcoded, machine-specific absolute path;
# the relative alternative is kept commented out.
#os.chdir('../datasets/personality_captions')
os.chdir('/home/eugene/anaconda3/envs/frcnn/lib/python3.7/site-packages/data/personality_captions')
# Raise the csv module's per-field size limit to sys.maxsize. The call returns
# the previous limit, which is the value this cell displays.
csv.field_size_limit(sys.maxsize)

131072

# Clean Captions

In [16]:
def load_json(file):
    """Count how often each caption text occurs in a captions JSON file.

    Args:
        file: path of a JSON file holding a sequence of records, each with a
            'comment' entry.

    Returns:
        collections.Counter mapping every 'comment' value to its number of
        occurrences in the file.
    """
    with open(file) as handle:
        records = json.load(handle)

    comment_counts = Counter()
    for record in records:
        comment_counts[record['comment']] += 1
    return comment_counts
# Per-split caption frequency tables, keyed by split name.
counts = {split_name: load_json(f'{split_name}.json') for split_name in ('train', 'val', 'test')}

In [17]:
# Show the ten most frequent captions of every split, one blank line apart.
for split_name in counts:
    top_captions = counts[split_name].most_common(10)
    print(split_name, top_captions, '\n')

train [('[DISCONNECT]', 68), ('[RETURNED]', 28), ('What is this?', 15), ('[TIMEOUT]', 13), ('What is that?', 12), ('Where is this?', 11), ('What is going on here?', 10), ('This looks like so much fun!', 9), ("What's going on here?", 9), ('what is that?', 7)] 

val [('[DISCONNECT]', 5), ('I would hide', 2), ('THE GUY IS SUFFERING FROM DEPRESSION', 1), ('Is she doing a backflip twist?', 1), ('What did they have to do to help support cancer? Fundraisers always intrigue me', 1), ('I love the bit of motion blur in this photo, asks plenty of questions as to why this person took such an active, lovely photo.', 1), ('Sunlight filtered by sleepy clouds.', 1), ('what a wonderful map', 1), ('Why is that guy bulling the little one.', 1), ("She's really cool and funny", 1)] 

test [('[DISCONNECT]', 10), ('[RETURNED]', 3), ('[TIMEOUT]', 3), ('I do not have time for this.', 2), ('What is going on here?', 2), ('This is the life', 2), ('What even is this?', 2), ('I love the way this color makes me feel

In [18]:
# Drop caption records whose whole comment is one of the placeholder markers
# [DISCONNECT], [RETURNED] or [TIMEOUT].
def clean_captions(file):
    """Remove placeholder captions from a captions JSON file, in place.

    Loads the records from ``file``, keeps only those whose 'comment' is not
    exactly one of the placeholder markers, and writes the remaining records
    back to the same path. The record count before and after is printed.

    Args:
        file: path of the captions JSON file to rewrite.
    """
    placeholder_tokens = ['[DISCONNECT]', '[RETURNED]', '[TIMEOUT]']

    with open(file) as src:
        records = json.load(src)
    print(f'cleaning {file}, before : {len(records)}')

    kept = []
    for record in records:
        if record['comment'] in placeholder_tokens:
            continue
        kept.append(record)

    print(f'after : {len(kept)}')
    with open(file, 'w') as dst:
        json.dump(kept, dst)

# Clean the caption file of every split in place.
for split_name in ('train', 'val', 'test'):
    clean_captions(split_name + '.json')

cleaning train.json, before : 186858
after : 186749
cleaning val.json, before : 5000
after : 4994
cleaning test.json, before : 10000
after : 9984


In [19]:
# Sanity check: recount the captions after cleaning and list any caption that
# consists of a single space-separated token. This relies on load_json being
# the variant that returns caption counts.
counts = {split_name: load_json(f'{split_name}.json') for split_name in ('train', 'val', 'test')}
for split_name, caption_counts in counts.items():
    print(split_name, caption_counts.most_common(10), '\n')
    single_token = [caption for caption in caption_counts if len(caption.split(' ')) == 1]
    print('len 1', single_token, '\n')

train [('What is this?', 15), ('What is that?', 12), ('Where is this?', 11), ('What is going on here?', 10), ('This looks like so much fun!', 9), ("What's going on here?", 9), ('what is that?', 7), ('What is he doing?', 7), ('what is this', 6), ('Is this supposed to be art?', 6)] 

len 1 [] 

val [('I would hide', 2), ('THE GUY IS SUFFERING FROM DEPRESSION', 1), ('Is she doing a backflip twist?', 1), ('What did they have to do to help support cancer? Fundraisers always intrigue me', 1), ('I love the bit of motion blur in this photo, asks plenty of questions as to why this person took such an active, lovely photo.', 1), ('Sunlight filtered by sleepy clouds.', 1), ('what a wonderful map', 1), ('Why is that guy bulling the little one.', 1), ("She's really cool and funny", 1), ('These people sitting at nice tables could have dressed a bit nicer, too.', 1)] 

len 1 [] 

test [('I do not have time for this.', 2), ('What is going on here?', 2), ('This is the life', 2), ('What even is this?', 

In [20]:
# Total number of captions over all splits: the per-caption counts of every
# split in `counts`, summed.
print(f'total captions : {sum(sum(v.values()) for v in counts.values())}')

total captions : 201727


# Split TSV

In [25]:
def load_json(file):
    """Collect the distinct 'image_hash' values of a captions JSON file.

    Note: this reuses the name of the caption-counting ``load_json`` defined
    earlier in this notebook; once this cell has run, the name refers to this
    version.

    Args:
        file: path of a JSON file holding a sequence of records, each with an
            'image_hash' entry.

    Returns:
        set of the 'image_hash' values found in the file.
    """
    with open(file) as handle:
        records = json.load(handle)
    return {record['image_hash'] for record in records}

    
def split(infile, ids):
    """Split one TSV file into train/test/val TSV files by image id.

    Every row of ``infile`` is routed by its first column (the image id) to
    ``train.<name>``, ``test.<name>`` or ``val.<name>`` in the same directory
    as ``infile``. Membership is checked in the order train, test, val and the
    first match wins; rows whose id is in none of the three are dropped.
    The number of rows written per split is reported with ``tqdm.write``.

    Args:
        infile: path of the tab-separated input file.
        ids: mapping with the keys 'train', 'test' and 'val', each holding a
            container of image ids that supports ``in``.
    """
    infile = Path(infile)
    train_name = infile.parent / f"train.{infile.name}"
    test_name = infile.parent / f"test.{infile.name}"
    val_name = infile.parent / f"val.{infile.name}"

    counts = {k: 0 for k in ids.keys()}

    with tqdm() as pbar:
        # The input is opened first so a missing input file fails before the
        # three output files are created/truncated.
        # newline='' is what the csv module documents for files handed to
        # csv.reader / csv.writer; os.devnull keeps the scratch sink portable.
        with open(infile, newline='') as in_tsv, \
                open(train_name, 'w', newline='') as train_tsv, \
                open(test_name, 'w', newline='') as test_tsv, \
                open(val_name, 'w', newline='') as val_tsv, \
                open(os.devnull, 'w', newline='') as dummy:

            reader = csv.reader(in_tsv, delimiter='\t')
            train_writer = csv.writer(train_tsv, delimiter='\t')
            test_writer = csv.writer(test_tsv, delimiter='\t')
            val_writer = csv.writer(val_tsv, delimiter='\t')
            dummy_writer = csv.writer(dummy, delimiter='\t')

            for item in reader:
                # csv.reader yields [] for a blank line; there is no id to
                # route on, so skip it instead of failing on item[0].
                if not item:
                    continue
                image_id = item[0]

                # Trial-write the row to the null device first, so a row that
                # cannot be written is reported and skipped rather than
                # failing while writing to a real output file.
                try:
                    dummy_writer.writerow(item)
                except Exception as e:
                    tqdm.write(f'write error for {image_id}, {str(e)}')
                    continue

                if image_id in ids['train']:
                    train_writer.writerow(item)
                    counts['train'] += 1
                elif image_id in ids['test']:
                    test_writer.writerow(item)
                    counts['test'] += 1
                elif image_id in ids['val']:
                    val_writer.writerow(item)
                    counts['val'] += 1

                pbar.update(1)
    tqdm.write(str(counts))
# Image hashes referenced by the caption file of each split.
ids = {split_name: load_json(f'{split_name}.json') for split_name in ('train', 'val', 'test')}
id_totals = {split_name: len(hashes) for split_name, hashes in ids.items()}
tqdm.write(str(id_totals))

# Route the rows of the label file, then of the feature file, into per-split files.
for tsv_name in ('label.tsv', 'feature.tsv'):
    split(tsv_name, ids)


856it [00:00, 8558.11it/s]

{'train': 186749, 'val': 4994, 'test': 9984}


201230it [00:22, 8767.67it/s]
8it [00:00, 75.92it/s]

{'train': 186238, 'val': 4989, 'test': 9977}


201230it [31:24, 106.78it/s]

{'train': 186238, 'val': 4989, 'test': 9977}





# Validate Split

In [28]:
def validate(feat_file, label_file):
    """Check that a feature TSV and a label TSV describe the same rows.

    Both files are read as tab-separated rows keyed by their first column.
    A repeated key is reported with a 'duplicate ...' line (the last row
    position is the one kept). The function then asserts that both files hold
    the same number of keys, the same keys, and each key at the same row
    position, and finally prints the number of keys.

    Args:
        feat_file: path of the feature TSV.
        label_file: path of the label TSV.

    Raises:
        AssertionError: if the two files disagree on any of the checks above.
    """
    def row_position_by_id(path):
        positions = {}
        with open(path) as tsv:
            for row_number, row in enumerate(csv.reader(tsv, delimiter='\t')):
                key = row[0]
                if key in positions:
                    print(f'duplicate {key}')
                positions[key] = row_number
        return positions

    feat_ids = row_position_by_id(feat_file)
    label_ids = row_position_by_id(label_file)

    assert len(feat_ids) == len(label_ids)
    assert feat_ids.keys() == label_ids.keys()
    # Same key set is established above; now require identical row positions.
    for key, row_number in feat_ids.items():
        assert row_number == label_ids[key]
    print(len(label_ids))

In [29]:
# Check the feature/label pair of every split against each other.
for split_name in ('train', 'val', 'test'):
    print(split_name)
    validate(split_name + '.feature.tsv', split_name + '.label.tsv')

train
186238
val
4989
test
9977


# Lineidx

In [30]:
import logging
import os
import os.path as op

def generate_lineidx(filein, idxout):
    """Write the starting byte offset of every line of ``filein`` to ``idxout``.

    The index holds one decimal offset per line, in file order. It is written
    to ``idxout + '.tmp'`` first and moved over ``idxout`` only once complete.

    Args:
        filein: path of the file to index.
        idxout: path of the line-index file to create or overwrite.
    """
    idxout_tmp = idxout + '.tmp'
    # Binary mode: tell() then yields real byte offsets, directly comparable
    # with the byte size from fstat. In text mode tell() is only an opaque
    # cookie, and the content would also have to decode cleanly.
    with open(filein, 'rb') as tsvin, open(idxout_tmp, 'w') as tsvout:
        fsize = os.fstat(tsvin.fileno()).st_size
        fpos = 0
        while fpos < fsize:
            tsvout.write(str(fpos) + "\n")
            # An empty read means EOF came before the recorded size (e.g. the
            # file shrank meanwhile); stop instead of looping forever.
            if not tsvin.readline():
                break
            fpos = tsvin.tell()
    # os.replace overwrites an existing index on every platform, so the
    # function can be re-run.
    os.replace(idxout_tmp, idxout)

In [31]:
# Build the line index of the feature file and of the label file of every split.
for split_name in ('train', 'val', 'test'):
    print(split_name)
    for kind in ('feature', 'label'):
        generate_lineidx(f'{split_name}.{kind}.tsv', f'{split_name}.{kind}.lineidx')

train
val
test


# Remove missing labels

In [23]:
def update_captions(label_tsv, caption_json):
    """Keep only the captions whose image has a row in the label TSV.

    Reads the first column of ``label_tsv`` as the set of known ids, filters
    the records of ``caption_json`` down to those whose 'image_hash' is in that
    set, asserts that the number of remaining captions equals the number of
    distinct ids, then rewrites ``caption_json`` in place and prints the count.

    Args:
        label_tsv: path of the tab-separated label file.
        caption_json: path of the captions JSON file to rewrite.

    Raises:
        AssertionError: if the filtered caption count differs from the number
            of distinct label ids; the caption file is not rewritten then.
    """
    label_ids = set()
    with open(label_tsv) as in_tsv:
        for row in csv.reader(in_tsv, delimiter='\t'):
            label_ids.add(row[0])

    with open(caption_json) as src:
        all_captions = json.load(src)

    captions = []
    for entry in all_captions:
        if entry['image_hash'] in label_ids:
            captions.append(entry)

    assert len(captions) == len(label_ids)

    with open(caption_json, 'w') as dst:
        json.dump(captions, dst)

    print(len(captions))
# Drop, per split, the captions that have no row in that split's label file.
for split_name in ('train', 'val', 'test'):
    print(split_name)
    update_captions(split_name + '.label.tsv', split_name + '.json')

train
186238
val
4989
test
9977


# Update Personality

In [24]:
# Reduce every personality label to its first space-separated word, and
# collect the resulting labels across all cleaned files.
personalities = set()
def clean_personality(file):
    """Shorten the 'personality' of every record in a captions file, in place.

    Each record's 'personality' is replaced by the text before its first
    space, the shortened label is added to the module-level ``personalities``
    set, and all records are written back to ``file``. The record count is
    printed before and after; no record is dropped.

    Args:
        file: path of the captions JSON file to rewrite.
    """
    with open(file) as src:
        data = json.load(src)
    print(f'cleaning {file}, before : {len(data)}')

    for record in data:
        first_word = record['personality'].split(' ')[0]
        record['personality'] = first_word
        personalities.add(first_word)
    cleaned = list(data)

    print(f'after : {len(cleaned)}')

    with open(file, 'w') as dst:
        json.dump(cleaned, dst)

# Shorten the personality labels of every split in place.
for split_name in ('train', 'val', 'test'):
    clean_personality(split_name + '.json')

cleaning train.json, before : 186238
after : 186238
cleaning val.json, before : 4989
after : 4989
cleaning test.json, before : 9977
after : 9977


In [29]:
# Every cleaned personality label must be a single word. The check has to go
# through all(...): asserting a bare generator expression always passes,
# because a generator object is truthy regardless of what it would yield.
assert all(len(name.split(' ')) == 1 for name in personalities), \
    'personality labels must be single words'

# Write one label per line. Sorting makes the file identical from run to run;
# iteration order of a set of strings is not stable across interpreter runs.
with open('personalities.txt', 'w') as f:
    f.writelines(str(name) + '\n' for name in sorted(personalities))

# Create small val

In [35]:
# Switch the working directory to another copy of the dataset.
# NOTE(review): hardcoded, machine-specific absolute path.
os.chdir('/home/eugene/workspace/Oscar/datasets/personality_captions')
# Rebinds `personalities` (a set in the earlier cleaning cell) to a counter
# that maps each personality label to the number of records carrying it.
personalities = defaultdict(int)

In [36]:
# Tally how many records carry each personality label across the three splits.
with tqdm() as progress:
    for json_name in ('train.json', 'val.json', 'test.json'):
        with open(json_name) as handle:
            records = json.load(handle)
        for record in records:
            label = record['personality']
            personalities[label] += 1
            progress.update(1)

201204it [00:01, 110100.35it/s]


In [37]:
# Display the per-label record counts.
personalities

defaultdict(int,
            {'Intense': 923,
             'Adventurous': 935,
             'Mellow': 937,
             'Zany': 940,
             'Narcissistic': 947,
             'Sarcastic': 946,
             'Artificial': 934,
             'Eloquent': 944,
             'Caring': 927,
             'Appreciative': 940,
             'Ordinary': 937,
             'Honest': 947,
             'Imaginative': 927,
             'Daring': 921,
             'Overimaginative': 945,
             'Respectful': 937,
             'Profound': 947,
             'Arrogant': 933,
             'Contemplative': 929,
             'Glamorous': 931,
             'Angry': 937,
             'Contemptible': 933,
             'Old-fashioned': 934,
             'Witty': 940,
             'Casual': 938,
             'Kind': 924,
             'Fiery': 937,
             'Spirited': 935,
             'Intelligent': 941,
             'Obnoxious': 941,
             'Cute': 944,
             'Pretentious': 934,
       

In [38]:
# Alphabetical listing of every personality label that was counted.
for label in sorted(personalities):
    print(label)

Abrasive
Absentminded
Adventurous
Aggressive
Airy
Aloof
Amusing
Angry
Anxious
Apathetic
Appreciative
Argumentative
Arrogant
Artful
Articulate
Artificial
Assertive
Attractive
Barbaric
Bewildered
Bizarre
Bland
Blunt
Boisterous
Boyish
Breezy
Brilliant
Businesslike
Calm
Captivating
Caring
Casual
Cerebral
Charming
Cheerful
Childish
Clever
Coarse
Cold
Colorful
Compassionate
Complex
Conceited
Confident
Confused
Conservative
Considerate
Contemplative
Contemptible
Contradictory
Courageous
Cowardly
Crazy
Creative
Critical
Crude
Cruel
Cultured
Curious
Cute
Cynical
Daring
Deep
Destructive
Devious
Discouraging
Disturbing
Dramatic
Dreamy
Dry
Dull
Earnest
Egocentric
Elegant
Eloquent
Emotional
Empathetic
Energetic
Enigmatic
Enthusiastic
Envious
Erratic
Escapist
Excitable
Exciting
Extraordinary
Extravagant
Extreme
Fanatical
Fanciful
Fatalistic
Fawning
Fearful
Fickle
Fiery
Foolish
Formal
Freethinking
Frightening
Frivolous
Fun-loving
Gentle
Glamorous
Gloomy
Grand
Grim
Happy
Hateful
Haughty
High-spirited


In [29]:
# Collect the distinct 'personality' strings of all three splits from the
# dataset copy under the frcnn environment.
# NOTE(review): hardcoded, machine-specific absolute path.
os.chdir('/home/eugene/anaconda3/envs/frcnn/lib/python3.7/site-packages/data/personality_captions')
personalities_full = set()
with tqdm() as progress:
    for json_name in ('train.json', 'val.json', 'test.json'):
        with open(json_name) as handle:
            records = json.load(handle)
        for record in records:
            personalities_full.add(record['personality'])
            progress.update(1)

201858it [00:03, 51903.76it/s]


In [30]:
# Keep only the first space-separated word of every collected label.
personalities_full = {label.split(' ')[0] for label in personalities_full}

In [31]:
# Display the set of first-word personality labels.
personalities_full

{'Abrasive',
 'Absentminded',
 'Adventurous',
 'Aggressive',
 'Airy',
 'Aloof',
 'Amusing',
 'Angry',
 'Anxious',
 'Apathetic',
 'Appreciative',
 'Argumentative',
 'Arrogant',
 'Artful',
 'Articulate',
 'Artificial',
 'Assertive',
 'Attractive',
 'Barbaric',
 'Bewildered',
 'Bizarre',
 'Bland',
 'Blunt',
 'Boisterous',
 'Boyish',
 'Breezy',
 'Brilliant',
 'Businesslike',
 'Calm',
 'Captivating',
 'Caring',
 'Casual',
 'Cerebral',
 'Charming',
 'Cheerful',
 'Childish',
 'Clever',
 'Coarse',
 'Cold',
 'Colorful',
 'Compassionate',
 'Complex',
 'Conceited',
 'Confident',
 'Confused',
 'Conservative',
 'Considerate',
 'Contemplative',
 'Contemptible',
 'Contradictory',
 'Courageous',
 'Cowardly',
 'Crazy',
 'Creative',
 'Critical',
 'Crude',
 'Cruel',
 'Cultured',
 'Curious',
 'Cute',
 'Cynical',
 'Daring',
 'Deep',
 'Destructive',
 'Devious',
 'Discouraging',
 'Disturbing',
 'Dramatic',
 'Dreamy',
 'Dry',
 'Dull',
 'Earnest',
 'Egocentric',
 'Elegant',
 'Eloquent',
 'Emotional',
 'Empathe

In [33]:
# Number of distinct first-word personality labels.
len(personalities_full)

216