## Reference: 
    https://github.com/lichengunc/vist_api/blob/master/vist.py
    

In [1]:
import json
from datetime import datetime

In [2]:
annotation=json.load(open('/home/jay.je/datasets/train.story-in-sequence.json'))

In [3]:
annotation.keys()

dict_keys(['images', 'info', 'albums', 'type', 'annotations'])

In [4]:
len(annotation['images'])

167528

In [5]:
annotation['images'][0]

{'datetaken': '2008-06-30 07:33:43',
 'license': '5',
 'title': 'Moreton Bay Fig 1877',
 'text': '',
 'album_id': '72157605930515606',
 'longitude': '-119.692879',
 'url_o': 'https://farm3.staticflickr.com/2078/2626977325_2b7696990c_o.jpg',
 'secret': 'bec0ff3596',
 'media': 'photo',
 'latitude': '34.414760',
 'id': '2626977325',
 'tags': 'santabarbara'}

In [6]:
annotation['albums'][0]

{'description': 'Believed to be the largest Moreton Bay Fig Tree in the united States',
 'title': 'Moreton Bay Fig 1877',
 'farm': '4',
 'date_update': '1432330952',
 'primary': '2626985925',
 'server': '3104',
 'date_create': '1214980972',
 'photos': '13',
 'secret': '98149cd59b',
 'owner': '12806074@N08',
 'vist_label': '4th_of_july',
 'id': '72157605930515606'}

In [7]:
annotation['annotations'][0]

[{'original_text': 'Our landmark tree in town was about to be destroyed and cleared for a new mall. ',
  'album_id': '72157605930515606',
  'photo_flickr_id': '2627795780',
  'setting': 'first-2-pick-and-tell',
  'worker_id': 'SY6QQXJCXXMNCYP',
  'story_id': '30355',
  'tier': 'story-in-sequence',
  'worker_arranged_photo_order': 0,
  'text': 'our landmark tree in town was about to be destroyed and cleared for a new mall .',
  'storylet_id': '151775'}]

## Create "Sents"

In [8]:
sents = []
for ann in annotation['annotations']:
	# sent = {album_id, img_id, story_id, text, original_text, }
	sent = ann[0].copy()
	sent['id'] = sent.pop('storylet_id')
	sent['order'] = sent.pop('worker_arranged_photo_order')
	sent['img_id'] = sent.pop('photo_flickr_id')
	sent['length'] = len(sent['text'].split())  # add length property
	sents += [sent]


In [9]:
annotation['annotations'][0][0]

{'original_text': 'Our landmark tree in town was about to be destroyed and cleared for a new mall. ',
 'album_id': '72157605930515606',
 'photo_flickr_id': '2627795780',
 'setting': 'first-2-pick-and-tell',
 'worker_id': 'SY6QQXJCXXMNCYP',
 'story_id': '30355',
 'tier': 'story-in-sequence',
 'worker_arranged_photo_order': 0,
 'text': 'our landmark tree in town was about to be destroyed and cleared for a new mall .',
 'storylet_id': '151775'}

In [10]:
sents[0]

{'original_text': 'Our landmark tree in town was about to be destroyed and cleared for a new mall. ',
 'album_id': '72157605930515606',
 'setting': 'first-2-pick-and-tell',
 'worker_id': 'SY6QQXJCXXMNCYP',
 'story_id': '30355',
 'tier': 'story-in-sequence',
 'text': 'our landmark tree in town was about to be destroyed and cleared for a new mall .',
 'id': '151775',
 'order': 0,
 'img_id': '2627795780',
 'length': 17}

## Get Album id to image id mapping

In [11]:
Albums = {album['id']: album for album in annotation['albums']}
Images = {img['id']: img for img in annotation['images']}
Sents = {sent['id']: sent for sent in sents}

In [12]:
def getDateTime(img_id):
	x = Images[img_id]['datetaken']
	return datetime.strptime(x, '%Y-%m-%d %H:%M:%S')

In [13]:
album_to_img_ids = {}
for img in annotation['images']:
	album_id = img['album_id']
	img_id = img['id']
	album_to_img_ids[album_id] = album_to_img_ids.get(album_id, []) + [img_id]
for album_id, img_ids in album_to_img_ids.items():
	img_ids.sort(key=getDateTime)

In [14]:
album_to_img_ids['72157605930515606'] #album to image ids

['2626977325',
 '2627795780',
 '2762593799',
 '2626979987',
 '2701863545',
 '2627798290',
 '2626982337',
 '2626983575',
 '2627801768',
 '2626985925',
 '2626987089',
 '2627805194',
 '2627807506']

## Get story id -> Sent ids

In [15]:
def get_order(sent_id):
	return Sents[sent_id]['order']

In [16]:
story_to_sent_ids = {}
for sent_id, sent in Sents.items():
	story_id = sent['story_id']
	story_to_sent_ids[story_id] = story_to_sent_ids.get(story_id, []) + [sent_id]
for story_id, sent_ids in story_to_sent_ids.items():
	sent_ids.sort(key=get_order)

In [17]:
story_to_sent_ids # story id to sent id

{'30355': ['151775', '151776', '151777', '151778', '151779'],
 '30356': ['151780', '151781', '151782', '151783', '151784'],
 '30357': ['151785', '151786', '151787', '151788', '151789'],
 '30358': ['151790', '151791', '151792', '151793', '151794'],
 '30359': ['151795', '151796', '151797', '151798', '151799'],
 '30360': ['151800', '151801', '151802', '151803', '151804'],
 '30361': ['151805', '151806', '151807', '151808', '151809'],
 '30362': ['151810', '151811', '151812', '151813', '151814'],
 '30363': ['151815', '151816', '151817', '151818', '151819'],
 '30364': ['151820', '151821', '151822', '151823', '151824'],
 '0': ['0', '1', '2', '3', '4'],
 '1': ['5', '6', '7', '8', '9'],
 '2': ['10', '11', '12', '13', '14'],
 '3': ['15', '16', '17', '18', '19'],
 '4': ['20', '21', '22', '23', '24'],
 '5': ['25', '26', '27', '28', '29'],
 '6': ['30', '31', '32', '33', '34'],
 '7': ['35', '36', '37', '38', '39'],
 '8': ['40', '41', '42', '43', '44'],
 '9': ['45', '46', '47', '48', '49'],
 '10': ['5

## Album id -> Story id

In [18]:
album_to_story_ids = {}
for story_id, sent_ids in story_to_sent_ids.items():
	sent = Sents[sent_ids[0]]
	album_id = sent['album_id']
	album_to_story_ids[album_id] = album_to_story_ids.get(album_id, []) + [story_id]

In [19]:
display(Sents['151775'])
display(Sents['151776'])
display(Sents['151777'])
display(Sents['151778'])
display(Sents['151779'])
display(Sents['151780']) # this one is different story id

{'original_text': 'Our landmark tree in town was about to be destroyed and cleared for a new mall. ',
 'album_id': '72157605930515606',
 'setting': 'first-2-pick-and-tell',
 'worker_id': 'SY6QQXJCXXMNCYP',
 'story_id': '30355',
 'tier': 'story-in-sequence',
 'text': 'our landmark tree in town was about to be destroyed and cleared for a new mall .',
 'id': '151775',
 'order': 0,
 'img_id': '2627795780',
 'length': 17}

{'original_text': 'So we decided to take the day to go out and enjoy its beauty.',
 'album_id': '72157605930515606',
 'setting': 'first-2-pick-and-tell',
 'worker_id': 'SY6QQXJCXXMNCYP',
 'story_id': '30355',
 'tier': 'story-in-sequence',
 'text': 'so we decided to take the day to go out and enjoy its beauty .',
 'id': '151776',
 'order': 1,
 'img_id': '2626979987',
 'length': 15}

{'original_text': 'To see the final glimpse of the roots, extending out into the depths of the hill.',
 'album_id': '72157605930515606',
 'setting': 'first-2-pick-and-tell',
 'worker_id': 'SY6QQXJCXXMNCYP',
 'story_id': '30355',
 'tier': 'story-in-sequence',
 'text': 'to see the final glimpse of the roots , extending out into the depths of the hill .',
 'id': '151777',
 'order': 2,
 'img_id': '2626982337',
 'length': 18}

{'original_text': 'And its magnificent trunk, larger than life itself.',
 'album_id': '72157605930515606',
 'setting': 'first-2-pick-and-tell',
 'worker_id': 'SY6QQXJCXXMNCYP',
 'story_id': '30355',
 'tier': 'story-in-sequence',
 'text': 'and its magnificent trunk , larger than life itself .',
 'id': '151778',
 'order': 3,
 'img_id': '2626983575',
 'length': 10}

{'original_text': 'One last picture of its beauty so we could capture it forever. ',
 'album_id': '72157605930515606',
 'setting': 'first-2-pick-and-tell',
 'worker_id': 'SY6QQXJCXXMNCYP',
 'story_id': '30355',
 'tier': 'story-in-sequence',
 'text': 'one last picture of its beauty so we could capture it forever .',
 'id': '151779',
 'order': 4,
 'img_id': '2626985925',
 'length': 13}

{'original_text': 'We found this tree when we were walking in a nearby town. ',
 'album_id': '72157605930515606',
 'setting': 'first-2-pick-and-tell',
 'worker_id': 'UG2D1541VEP82E8',
 'story_id': '30356',
 'tier': 'story-in-sequence',
 'text': 'we found this tree when we were walking in a nearby town .',
 'id': '151780',
 'order': 0,
 'img_id': '2701863545',
 'length': 13}

In [20]:
album_to_story_ids # each album has set of stories, and set of stories map to sent id

{'72157605930515606': ['30355', '30356', '30357', '30358', '30359'],
 '72157594220406194': ['30360', '30361', '30362', '30363', '30364'],
 '72157601191986212': ['0', '1', '2', '3', '4'],
 '72157604371309182': ['5', '6', '7', '8', '9'],
 '72157594185500039': ['10', '11', '12', '13', '14'],
 '72157594187037689': ['15', '16', '17', '18', '19'],
 '72157594187639753': ['20', '21', '22', '23', '24'],
 '72157594190376306': ['30365', '30366', '30367', '30368', '30369'],
 '72157594187752142': ['25', '26', '27', '28', '29'],
 '72157594187796995': ['30370', '30371', '30372', '30373', '30374'],
 '72157594187398210': ['30', '31', '32', '33', '34'],
 '72157594187912331': ['35', '36', '37', '38', '39'],
 '72157600646049534': ['40', '41', '42', '43', '44'],
 '72157600646303227': ['45', '46', '47', '48', '49'],
 '72157600646894101': ['50', '51', '52', '53', '54'],
 '72157600647591216': ['55', '56', '57', '58', '59'],
 '72157600649173924': ['30375', '30376', '30377', '30378', '30379'],
 '721576006514003

In [21]:
for album in annotation['albums']:
	album['img_ids'] = album_to_img_ids[album['id']]
	album['story_ids'] = album_to_story_ids[album['id']]

In [22]:
annotation['albums'][1]

{'description': 'Spent a nice weekend in Glasgow ... our 4th anniversary ... great town!\n\nJuly 2006',
 'title': 'Scotland',
 'farm': '1',
 'date_update': '1361013854',
 'primary': '205858625',
 'server': '65',
 'date_create': '1154456549',
 'photos': '11',
 'secret': '06a7590529',
 'owner': '10155443@N00',
 'vist_label': '4th_of_july',
 'id': '72157594220406194',
 'img_ids': ['205866755',
  '203187193',
  '205862590',
  '205858424',
  '204092716',
  '205858470',
  '205858503',
  '205858543',
  '205858578',
  '204095815',
  '205858625'],
 'story_ids': ['30360', '30361', '30362', '30363', '30364']}

In [23]:
story_to_sent_ids['30360']

['151800', '151801', '151802', '151803', '151804']

In [24]:
story_to_sent_ids['30362']

['151810', '151811', '151812', '151813', '151814']

In [25]:
Images['205862590']

{'datetaken': '2006-07-30 10:25:00',
 'license': '3',
 'title': 'Glasgow',
 'text': '',
 'album_id': '72157594220406194',
 'longitude': '-4.295654',
 'url_o': 'https://farm1.staticflickr.com/73/205862590_43bea02527_o.jpg',
 'secret': '43bea02527',
 'media': 'photo',
 'latitude': '55.883014',
 'id': '205862590',
 'tags': 'scotland unitedkingdom glasgow strudelmonkey'}

In [26]:
Sents['151810']

{'original_text': 'I went on vacation this past year.',
 'album_id': '72157594220406194',
 'setting': 'last-3-pick-old-and-tell',
 'worker_id': '2F8DOKYZFOMQV05',
 'story_id': '30362',
 'tier': 'story-in-sequence',
 'text': 'i went on vacation this past year .',
 'id': '151810',
 'order': 0,
 'img_id': '205866755',
 'length': 8}

In [27]:
for album in annotation['albums']:
	album['img_ids'] = album_to_img_ids[album['id']]
	album['story_ids'] = album_to_story_ids[album['id']]

# make Stories: {story_id: {id, album_id, sent_ids, img_ids}}
Stories = {story_id: {
                # 'id': story_id, 
                'sent_ids': sent_ids, 
                'img_ids': [Sents[sent_id]['img_id'] for sent_id in sent_ids],
                'album_id': Sents[sent_ids[0]]['album_id']} 
			for story_id, sent_ids in story_to_sent_ids.items()}

In [28]:
Stories

{'30355': {'sent_ids': ['151775', '151776', '151777', '151778', '151779'],
  'img_ids': ['2627795780',
   '2626979987',
   '2626982337',
   '2626983575',
   '2626985925'],
  'album_id': '72157605930515606'},
 '30356': {'sent_ids': ['151780', '151781', '151782', '151783', '151784'],
  'img_ids': ['2701863545',
   '2626977325',
   '2627795780',
   '2626983575',
   '2626982337'],
  'album_id': '72157605930515606'},
 '30357': {'sent_ids': ['151785', '151786', '151787', '151788', '151789'],
  'img_ids': ['2627795780',
   '2626979987',
   '2626982337',
   '2626983575',
   '2626985925'],
  'album_id': '72157605930515606'},
 '30358': {'sent_ids': ['151790', '151791', '151792', '151793', '151794'],
  'img_ids': ['2701863545',
   '2626977325',
   '2627795780',
   '2626983575',
   '2626982337'],
  'album_id': '72157605930515606'},
 '30359': {'sent_ids': ['151795', '151796', '151797', '151798', '151799'],
  'img_ids': ['2701863545',
   '2626977325',
   '2627795780',
   '2626983575',
   '2626982337

In [42]:
Images['182032576']

{'datetaken': '2006-07-04 14:16:35',
 'license': '3',
 'title': '4th of July 2006_04',
 'text': '',
 'album_id': '72157594187639753',
 'longitude': '0',
 'url_o': 'https://farm1.staticflickr.com/69/182032576_a09aab38d5_o.jpg',
 'secret': 'a09aab38d5',
 'media': 'photo',
 'latitude': '0',
 'id': '182032576',
 'tags': 'lane shelly 4thofjuly'}

## Manually download images, because one split do not have consecutive images

In [102]:
import requests
from PIL import Image
import random
import os
import shutil
import urllib.request 

from requests.adapters import HTTPAdapter, Retry

In [51]:
# Download 
len(Stories.keys()) # 40,000 stories. Want to download random stories worth 10,000 images

40155

In [58]:
Stories['40155']

{'sent_ids': ['200775', '200776', '200777', '200778', '200779'],
 'img_ids': ['21968676', '21968239', '21967783', '21968102', '21968372'],
 'album_id': '509643'}

In [59]:
ridx = random.sample(range(0, 40155), 2000) # collect about 2000 stories, corresponding to 10,000 images

In [76]:
img_list = []
stories = []
for i in ridx:
    if str(i) in Stories:
        stories.append(str(i))
        img_list += Stories[str(i)]['img_ids']

In [81]:
print(len(stories))
print(len(img_list))
print(img_list[0])

1983
9915
4355960354


In [80]:
Images['2626977325']

{'datetaken': '2008-06-30 07:33:43',
 'license': '5',
 'title': 'Moreton Bay Fig 1877',
 'text': '',
 'album_id': '72157605930515606',
 'longitude': '-119.692879',
 'url_o': 'https://farm3.staticflickr.com/2078/2626977325_2b7696990c_o.jpg',
 'secret': 'bec0ff3596',
 'media': 'photo',
 'latitude': '34.414760',
 'id': '2626977325',
 'tags': 'santabarbara'}

In [105]:
def make_request(url):
    session = requests.Session()
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)

    response = session.get(url)

    return response
    # print(parsed)

res = make_request('https://farm3.staticflickr.com/2078/2626977325_2b7696990c_o.jpg')

In [100]:
if os.path.exists('/home/jay.je/datasets/VIST/images'):
    shutil.rmtree('/home/jay.je/datasets/VIST/images')
os.makedirs('/home/jay.je/datasets/VIST/images')

In [110]:
for idx, img_num in enumerate(img_list):
    try:
        data = requests.get(Images[img_num]['url_o'], timeout=1).content
        with open(f'/home/jay.je/datasets/VIST/images/{img_num}.jpg', 'wb') as handler:
            handler.write(data)
            handler.close()
    except:
        continue
    if idx % 1000 == 0:
        print(f'Current processing number if {idx}...\n')

Current processing number if 0...

Current processing number if 1000...

Current processing number if 2000...

Current processing number if 3000...

Current processing number if 4000...

Current processing number if 6000...

Current processing number if 7000...

Current processing number if 8000...

Current processing number if 9000...



In [111]:
# check which stories have incomplete images

In [112]:
img_down = os.listdir('/home/jay.je/datasets/VIST/images/')

In [115]:
print(len(img_down))
print(len(set(img_down)))

9139
9139


In [116]:
img_down = set(img_down)

In [130]:
img_down

{'6107456787.jpg',
 '6740346883.jpg',
 '446820018.jpg',
 '5507371124.jpg',
 '217710557.jpg',
 '4460513390.jpg',
 '518728987.jpg',
 '4551348758.jpg',
 '1930325741.jpg',
 '8306629.jpg',
 '25091918.jpg',
 '6288476439.jpg',
 '486780460.jpg',
 '77384423.jpg',
 '5791612612.jpg',
 '3358515710.jpg',
 '496899824.jpg',
 '2000117596.jpg',
 '4282346069.jpg',
 '4265845166.jpg',
 '5135625421.jpg',
 '21444695.jpg',
 '8296030.jpg',
 '119898538.jpg',
 '5364362519.jpg',
 '4350654971.jpg',
 '77775527.jpg',
 '5135131477.jpg',
 '155954289.jpg',
 '5884855.jpg',
 '6122830354.jpg',
 '2255152061.jpg',
 '20367818.jpg',
 '370609883.jpg',
 '304487.jpg',
 '4249046059.jpg',
 '4013909355.jpg',
 '18935225.jpg',
 '4741457347.jpg',
 '5165285610.jpg',
 '1071354.jpg',
 '6135008398.jpg',
 '4449213897.jpg',
 '4428257128.jpg',
 '5120002884.jpg',
 '3689517134.jpg',
 '4945946347.jpg',
 '17396310.jpg',
 '5105812562.jpg',
 '4289711534.jpg',
 '6052802481.jpg',
 '6537176945.jpg',
 '1233336739.jpg',
 '506435480.jpg',
 '4434068571.

In [136]:
Stories['30355']

{'sent_ids': ['151775', '151776', '151777', '151778', '151779'],
 'img_ids': ['2627795780',
  '2626979987',
  '2626982337',
  '2626983575',
  '2626985925'],
 'album_id': '72157605930515606'}

In [141]:
Images['2627795780']

{'datetaken': '2008-06-30 07:34:04',
 'license': '5',
 'title': 'Santa Barbara',
 'text': '',
 'album_id': '72157605930515606',
 'longitude': '-119.692879',
 'url_o': 'https://farm4.staticflickr.com/3080/2627795780_2517b53262_o.jpg',
 'secret': 'c8e0bec7d2',
 'media': 'photo',
 'latitude': '34.414760',
 'id': '2627795780',
 'tags': 'santabarbara'}

In [145]:
drop_stories=[]
for i in Stories:
    img_ids = Stories[i]['img_ids']
    for j in img_ids:
        if j+'.jpg' not in img_down:
            drop_stories.append(i)
            break

In [148]:
drop_stories = set(drop_stories)
print(len(drop_stories))

35006


In [157]:
Stories['32730']

{'sent_ids': ['163650', '163651', '163652', '163653', '163654'],
 'img_ids': ['4459710048',
  '4458929149',
  '4458929413',
  '4459711062',
  '4459711506'],
 'album_id': '72157623558837727'}

In [160]:
for i in ['4459710048',
  '4458929149',
  '4458929413',
  '4459711062',
  '4459711506']:
    if i+'.jpg' not in img_down:
        print(i)

In [156]:
set(Stories.keys())-drop_stories

{'32730',
 '12964',
 '3740',
 '36155',
 '37637',
 '13362',
 '18669',
 '4528',
 '31145',
 '28652',
 '36671',
 '9255',
 '15980',
 '180',
 '39295',
 '29110',
 '24900',
 '145',
 '37171',
 '27189',
 '38754',
 '18544',
 '10138',
 '28853',
 '899',
 '3922',
 '28024',
 '19640',
 '27122',
 '26550',
 '11653',
 '26482',
 '30499',
 '25025',
 '31612',
 '9141',
 '30446',
 '28649',
 '14536',
 '32721',
 '19080',
 '1375',
 '39762',
 '2552',
 '13559',
 '3824',
 '3937',
 '19726',
 '11136',
 '1668',
 '26607',
 '36190',
 '31846',
 '10557',
 '31954',
 '28138',
 '10983',
 '10765',
 '13781',
 '30834',
 '22163',
 '31952',
 '30320',
 '29909',
 '18872',
 '209',
 '30135',
 '10728',
 '27669',
 '26753',
 '21712',
 '39491',
 '33493',
 '38928',
 '12920',
 '37722',
 '22641',
 '31707',
 '15714',
 '3690',
 '31614',
 '39878',
 '38325',
 '38658',
 '27772',
 '38544',
 '36905',
 '32888',
 '3880',
 '19462',
 '15068',
 '8992',
 '20632',
 '19494',
 '39847',
 '31753',
 '32044',
 '7269',
 '24157',
 '17972',
 '23915',
 '29045',
 '

In [None]:
len(set(Stories.keys())-drop_stories)

In [166]:
StoriesFin = {}
for i in Stories:
    if i not in drop_stories:
        StoriesFin[i] = Stories[i].copy()

In [169]:
len(StoriesFin.keys()) # able to extract 5149 stories from this!

5149

In [171]:
StoriesFin['16']

{'sent_ids': ['80', '81', '82', '83', '84'],
 'img_ids': ['181647714', '181626113', '181645575', '181635518', '181640606'],
 'album_id': '72157594187037689'}

In [172]:
Sents['80']

{'original_text': 'We took a nice hike into the forest today.',
 'album_id': '72157594187037689',
 'setting': 'first-2-pick-and-tell',
 'worker_id': 'UG2D1541VEP82E8',
 'story_id': '16',
 'tier': 'story-in-sequence',
 'text': 'we took a nice hike into the forest today .',
 'id': '80',
 'order': 0,
 'img_id': '181647714',
 'length': 10}

In [173]:
Sents['81']

{'original_text': 'We were lucky enough to see some wildlife, like this deer.',
 'album_id': '72157594187037689',
 'setting': 'first-2-pick-and-tell',
 'worker_id': 'UG2D1541VEP82E8',
 'story_id': '16',
 'tier': 'story-in-sequence',
 'text': 'we were lucky enough to see some wildlife , like this deer .',
 'id': '81',
 'order': 1,
 'img_id': '181626113',
 'length': 13}

In [174]:
Sents['82']

{'original_text': 'This guy was friendly. He must hit up all the hikers for food.',
 'album_id': '72157594187037689',
 'setting': 'first-2-pick-and-tell',
 'worker_id': 'UG2D1541VEP82E8',
 'story_id': '16',
 'tier': 'story-in-sequence',
 'text': 'this guy was friendly . he must hit up all the hikers for food .',
 'id': '82',
 'order': 2,
 'img_id': '181645575',
 'length': 15}

In [175]:
Sents['83']

{'original_text': "I'm glad we spotted this snake before we got too close!",
 'album_id': '72157594187037689',
 'setting': 'first-2-pick-and-tell',
 'worker_id': 'UG2D1541VEP82E8',
 'story_id': '16',
 'tier': 'story-in-sequence',
 'text': "i 'm glad we spotted this snake before we got too close !",
 'id': '83',
 'order': 3,
 'img_id': '181635518',
 'length': 13}

In [184]:
for s in StoriesFin:
    for i in StoriesFin[s]['sent_ids']:
        if 'text' not in StoriesFin[s]:
            StoriesFin[s]['text'] = []
        StoriesFin[s]['text'].append(Sents[i]['text'])

In [185]:
StoriesFin['16']

{'sent_ids': ['80', '81', '82', '83', '84'],
 'img_ids': ['181647714', '181626113', '181645575', '181635518', '181640606'],
 'album_id': '72157594187037689',
 'text': ['we took a nice hike into the forest today .',
  'we were lucky enough to see some wildlife , like this deer .',
  'this guy was friendly . he must hit up all the hikers for food .',
  "i 'm glad we spotted this snake before we got too close !",
  'the end of our hike rewarded us with an amazing view of the falls !']}

## Save the final files

In [179]:
sents_json = json.dumps(Sents, indent=4)
# Writing to sample.json
with open("/home/jay.je/datasets/VIST/annotations/Sents.json", "w") as outfile:
    outfile.write(sents_json)
    outfile.close()

In [186]:
storiesfin_json = json.dumps(StoriesFin, indent=4)
# Writing to sample.json
with open("/home/jay.je/datasets/VIST/annotations/StoriesFin.json", "w") as outfile:
    outfile.write(storiesfin_json)
    outfile.close()

In [182]:
stories_json = json.dumps(Stories, indent=4)
# Writing to sample.json
with open("/home/jay.je/datasets/VIST/annotations/Stories.json", "w") as outfile:
    outfile.write(stories_json)
    outfile.close()