# Generating metadata

### What this notebook does
**Step 1:** Create metadata file, containing a bunch of JSON-formatted trial metadata objects

**Step 2:**  Insert each trial as a record into a mongo database

This assumes that the stimuli have been uploaded to the S3 bucket using `upload_stims_to_s3.ipynb`.

In [None]:
import ast
import json
import os
from glob import glob
from urllib.parse import quote, quote_plus

import numpy as np
import pandas as pd
import pymongo as pm
from IPython.display import clear_output
from PIL import Image

In [None]:
## where are your stimulus files stored?
data_dir = 'example'
bucket_name = 'human-physics-benchmarking-pilot'
stim_version = 'example'
# name used for both the metadata file and the mongo collection
dataset_name = '{}_{}'.format(bucket_name, stim_version)
stimulus_extension = ".mp4" #what's the file extension for the stims?

## get a list of absolute paths to each stimulus file
# Match on the file *suffix*: a substring test would also pick up names such as
# 'clip.mp4.bak'. Sort the listing because os.listdir returns entries in
# arbitrary order, which would make the generated metadata order unstable.
full_stim_paths = [
    os.path.abspath(os.path.join(data_dir, fname))
    for fname in sorted(os.listdir(data_dir))
    if fname.endswith(stimulus_extension)
]
print('We have {} stimuli to evaluate.'.format(len(full_stim_paths)))

In [None]:
## helper to build stimulus urls
def build_s3_url(path, bucket_name):
    """Return the S3 URL of a stimulus file.

    Only the final component of `path` (the file name) is used, so the URL
    always points at `<file name>` directly under the bucket.

    Parameters
    ----------
    path : str
        Local path (absolute or relative) of the stimulus file.
    bucket_name : str
        Name of the S3 bucket.

    Returns
    -------
    str
        'https://<bucket_name>.s3.amazonaws.com/<percent-encoded file name>'
    """
    # basename handles the platform's path separator instead of assuming '/'
    filename = os.path.basename(path)
    # percent-encode so that spaces and other reserved characters yield a valid URL
    return 'https://{}.s3.amazonaws.com/{}'.format(bucket_name, quote(filename))

In [None]:
## basic metadata lists
# one URL per stimulus file, in the same order as full_stim_paths
stim_urls = [build_s3_url(p, bucket_name) for p in full_stim_paths]
# Strip only the final extension, so names containing extra dots
# (e.g. 'ball_0.5.mp4') keep their full stem instead of collapsing to 'ball_0'
# and potentially colliding with other stimuli.
stim_IDs = [os.path.splitext(os.path.basename(p))[0] for p in full_stim_paths]

In [None]:
## convert to pandas dataframe
# Build a two-row frame labelled by field name, then flip it so that every
# stimulus becomes one row with the columns 'stim_ID' and 'stim_url'.
M = pd.DataFrame([stim_IDs, stim_urls], index=['stim_ID', 'stim_url']).T

In [None]:
# if needed, add code to add additional columns

In [None]:
# one {column name: value} dict per dataframe row, i.e. per stimulus
L = M.to_dict('records')

In [None]:
# Serialize the list of trial records and write it to '<dataset_name>.js'
print('Saving out json dictionary out to file...')
with open('{}.js'.format(dataset_name), 'w') as fout:
    fout.write(json.dumps(L))
print('Done!')

Set up an SSH tunnel to the MongoDB server so that this notebook can write to the database. Replace the username in the command below with your own:

In [None]:
# Forward local port 27017 to port 27017 on the remote machine. -f sends ssh to
# the background and -N runs no remote command, so the tunnel simply stays open.
# Replace `fbinder` with your own username.
!ssh -fNL 27017:127.0.0.1:27017 fbinder@cogtoolslab.org

In [None]:
# set vars
# auth.txt contains the password for the sketchloop user. Place it in the toplevel of the repo.
# dtype=str keeps an all-digit password as text instead of letting pandas parse it as a number.
# NOTE: read_csv splits on commas, so only the part of the first line before any ',' is used.
auth = pd.read_csv('../auth.txt', header = None, dtype = str)
pswd = auth.values[0][0]
user = 'sketchloop'
host = 'cogtoolslab.org' ## cogtoolslab ip address (not used by this cell)

# Username and password must be percent-escaped before being embedded in a
# MongoDB URI; otherwise characters such as '@', ':' or '/' corrupt the URI.
# The client connects to 127.0.0.1, so a local port forward to the database
# server (e.g. an SSH tunnel) has to be open already.
conn = pm.MongoClient('mongodb://{}:{}@127.0.0.1'.format(quote_plus(user), quote_plus(pswd)))
db = conn['stimuli']
coll = db[dataset_name]

In [None]:
#reload JSON back in to the new stimulus collection
# Read through a context manager so the file handle is closed deterministically
# instead of being left open until garbage collection.
with open('{}.js'.format(dataset_name), mode='r') as fin:
    J = json.load(fin)
print('dataset_name: {}'.format(dataset_name))
print('Length of J is: {}'.format(len(J)))

In [None]:
#⚠️drop collection if necessary
# WARNING: destructive. This removes the whole `dataset_name` collection from `db`.
# Skip this cell unless you intend to rebuild the collection from scratch.
db.drop_collection(dataset_name) 

In [None]:
#get list of current collections
# Shown in sorted order, so it is easy to check whether `dataset_name` is present.
sorted(db.list_collection_names())

In [None]:
## actually add data now to the database
# Send all records in one bulk insert instead of one round trip per record.
# insert_many rejects an empty list, hence the guard.
# NOTE: re-running this cell inserts the records again; nothing here de-duplicates them.
if J:
    result = coll.insert_many(J)
    print('{} of {}'.format(len(result.inserted_ids), len(J)))

print('Done inserting records into mongo!')

In [None]:
# Sanity check: estimated number of documents in the collection; compare with len(J).
coll.estimated_document_count()

In [None]:
# Peek at a single stored document to check that its fields look as expected.
coll.find_one()