In [14]:
# Create a vocabulary wrapper
import pickle
from collections import Counter
import json
import argparse
import os
import pdb

# Annotation file names keyed by dataset name. The keys match the `data_name`
# values understood by build_vocab; the paths are presumably relative to each
# dataset's root directory (TODO confirm against the caller).
annotations = {
  'mrw': [
    'mrw-v1.0.json',
  ],
  'tgif': [
    'tgif-v1.0.tsv',
  ],
  'coco': [
    'annotations/captions_train2014.json',
    'annotations/captions_val2014.json',
  ],
  'lp': [
    'ml-1.json',
  ],
}

class Vocabulary(object):
  """Two-way mapping between words and integer ids.

  Ids are handed out sequentially, starting at 0, in the order in which
  words are first added.
  """

  def __init__(self):
    # Id that the next newly added word will receive.
    self.idx = 0
    # Forward (word -> id) and reverse (id -> word) lookup tables.
    self.word2idx = {}
    self.idx2word = {}

  def add_word(self, word):
    """Register `word` under the next free id; do nothing if it is known."""
    if word in self.word2idx:
      return
    new_id = self.idx
    self.word2idx[word] = new_id
    self.idx2word[new_id] = word
    self.idx = new_id + 1

  def __call__(self, word):
    """Return the id of `word`, falling back to the id of '<unk>'.

    Raises KeyError when `word` is unknown and '<unk>' was never added.
    """
    key = word if word in self.word2idx else '<unk>'
    return self.word2idx[key]

  def __len__(self):
    """Return the number of distinct words stored."""
    return len(self.word2idx)


def from_tgif_tsv(path):
  """Read captions from a tab-separated annotation file.

  Every line is stripped of surrounding whitespace and split on tabs; the
  second field is taken as the caption.

  Args:
    path: Path of the TSV file.

  Returns:
    List of caption strings, one per line, in file order.

  Raises:
    IndexError: If a line has no second tab-separated field.
  """
  # The context manager guarantees the handle is closed, including on error.
  with open(path, 'r') as f:
    return [line.strip().split('\t')[1] for line in f]


def from_mrw_json(path):
  """Read and clean the 'sentence' field of every record in a JSON file.

  The file must contain a JSON sequence of objects, each with a 'sentence'
  string. Each sentence is cleaned by removing '/r/', 'r/', '/u/', 'u/' and
  '..', then turning remaining '/' and '-' characters into spaces.

  Args:
    path: Path of the JSON file.

  Returns:
    List of cleaned caption strings, in file order.

  Raises:
    KeyError: If a record has no 'sentence' field.
  """
  # Close the handle deterministically instead of leaking it.
  with open(path, 'r') as f:
    dataset = json.load(f)

  # Applied strictly in this order: the longer '/r/' and '/u/' prefixes must
  # be removed before their shorter forms, and before '/' becomes a space.
  replacements = (
    ('/r/', ''),
    ('r/', ''),
    ('/u/', ''),
    ('u/', ''),
    ('..', ''),
    ('/', ' '),
    ('-', ' '),
  )

  captions = []
  for datum in dataset:
    cap = datum['sentence']
    for old, new in replacements:
      cap = cap.replace(old, new)
    captions.append(cap)
  return captions


def from_coco_json(path):
  """Collect the caption of every annotation in a COCO annotation file.

  NOTE(review): `COCO` is not imported in the code visible here; it is
  presumably pycocotools.coco.COCO brought into scope elsewhere -- confirm.

  Args:
    path: Path of the annotation file handed to `COCO`.

  Returns:
    List of captions converted with `str`, in the iteration order of
    `coco.anns`.
  """
  coco = COCO(path)
  return [str(coco.anns[ann_id]['caption']) for ann_id in coco.anns.keys()]

def from_lp_json(path):
  """Build one caption string per entry of an 'lp' JSON file.

  The file must contain a JSON object. Each value needs a 'captions' list of
  objects with a 'Word' field and a 'slide_text' list of objects with a
  'text' field. The caption for an entry is all 'Word' values followed by
  all 'text' values, joined with single spaces.

  Args:
    path: Path of the JSON file.

  Returns:
    List of caption strings in the file's key order. Entries whose fields
    are not all strings are skipped with a warning.

  Raises:
    KeyError: If an entry lacks 'captions'/'slide_text', or one of their
      items lacks 'Word'/'text'.
  """
  import warnings

  # Close the handle deterministically instead of leaking it.
  with open(path, 'r') as f:
    dataset = json.load(f)

  captions = []
  for key in dataset:
    entry = dataset[key]
    words = [item['Word'] for item in entry['captions']]
    words += [item['text'] for item in entry['slide_text']]
    try:
      caption = " ".join(words)
    except TypeError as err:
      # A non-string field cannot be joined. This used to drop into an
      # interactive pdb session, which stalls unattended runs; report the
      # offending entry and skip it instead.
      warnings.warn('Skipping entry %r: %s' % (key, err))
      continue
    captions.append(caption)
  return captions


def from_txt(txt):
  """Read a plain-text file holding one caption per line.

  The file is decoded as UTF-8 text. It used to be opened in binary mode,
  which yields `bytes` captions that build_vocab cannot tokenize.

  Args:
    txt: Path of the text file.

  Returns:
    List of caption strings with surrounding whitespace removed, in file
    order. Blank lines are kept as empty strings.

  Raises:
    UnicodeDecodeError: If the file is not valid UTF-8.
  """
  with open(txt, 'r', encoding='utf-8') as f:
    return [line.strip() for line in f]


def build_vocab(data_name='lp', jsons=('./ml-1.json',), threshold=0):
  """Build a Vocabulary from the captions stored in one or more files.

  Captions are lower-cased and tokenized with nltk's `word_tokenize`; token
  counts are accumulated over all files before the threshold is applied.

  Args:
    data_name: Selects the caption loader: 'tgif', 'mrw', 'coco' or 'lp'.
      Any other value reads each path as plain text, one caption per line.
    jsons: Iterable of annotation file paths, all read with the same loader.
      The default is an immutable tuple so it cannot be altered between
      calls.
    threshold: Minimum number of occurrences a token needs to be kept.

  Returns:
    A Vocabulary holding '<pad>', '<start>', '<end>' and '<unk>' first,
    followed by every kept token.
  """
  import nltk

  # The loader depends only on data_name, so pick it once rather than per path.
  loaders = {
    'tgif': from_tgif_tsv,
    'mrw': from_mrw_json,
    'coco': from_coco_json,
    'lp': from_lp_json,
  }
  load_captions = loaders.get(data_name, from_txt)

  counter = Counter()
  for path in jsons:
    for caption in load_captions(path):
      counter.update(nltk.tokenize.word_tokenize(caption.lower()))

  # Keep only tokens that occur at least `threshold` times.
  words = [word for word, cnt in counter.items() if cnt >= threshold]
  # The reported size does not include the special tokens added below.
  print('Vocabulary size: {}'.format(len(words)))

  # Special tokens go in first so that they receive the lowest ids.
  vocab = Vocabulary()
  for token in ('<pad>', '<start>', '<end>', '<unk>'):
    vocab.add_word(token)

  for word in words:
    vocab.add_word(word)
  return vocab







In [15]:
import os

# Full list of speaker ids; it is overridden just below.
speakers = ["ml-1", "anat-1", "bio-1", "bio-3", "dental", "psy-2", "anat-2",  "bio-2", "bio-4", "psy-1", "speaking"]
# Restrict this run to a single speaker; remove this line to process them all.
speakers = ["bio-1"]

for sp in speakers:
    # Each speaker has one annotation file at <root>/<speaker>/<speaker>.json.
    jsons = ["/projects/dataset_processed/dongwonl/data/{}/{}.json".format(sp, sp)]
    vocab = build_vocab(data_name='lp', jsons=jsons)

    # Compute the output path once so that the log line reports exactly the
    # file that was written (the message used to drop the leading '.').
    vocab_path = './%s_vocab.pkl' % sp
    with open(vocab_path, 'wb') as f:
        pickle.dump(vocab, f, pickle.HIGHEST_PROTOCOL)
    print("Saved vocabulary file to ", vocab_path)

Vocabulary size: 9087
Saved vocabulary file to  /bio-1_vocab.pkl
