In [1]:
import os
import csv
import pandas as pd
import numpy as np
import boto3
from IPython.display import clear_output
import ast
import pymongo as pm
from bson.objectid import ObjectId

In [4]:
#read the csv file and select target columns
fname = 'MuseumStation_AllDescriptives_14447_images_final_cdm_run_v5.csv'
version = 'cdm_run_v5'

data = pd.read_csv(fname)
data = data[['session_id', 'category', 'age', 'filename']]
data = data[~data['category'].isin(['something you love'])]
data.reset_index().drop(['index'], axis=1)
data['filename'] = data['filename'].apply(lambda x: x.split('/')[-1])
data['url'] = data['filename'].apply(lambda x: 'https://kiddraw.s3.amazonaws.com/' + x)
data = data.rename(columns={'category':'class'})

# add a shuffle id and a games variable to indicate the number of times the current image is labelled
data_range = np.arange(data.shape[0])
np.random.shuffle(data_range)
data['shuffler_ind'] = data_range
data['games'] = '[]'
data['games'] = data['games'].apply(lambda x: ast.literal_eval(x))

# add image url 

In [5]:
data

Unnamed: 0,session_id,class,age,filename,url,shuffler_ind,games
0,cdm_run_v51551378173946,square,age2,square_sketch_age2_cdm_run_v51551378173946.png,https://kiddraw.s3.amazonaws.com/square_sketch...,5256,[]
1,cdm_run_v51551378173946,this square,age2,this square_sketch_age2_cdm_run_v5155137817394...,https://kiddraw.s3.amazonaws.com/this square_s...,10656,[]
2,cdm_run_v51551378173946,a tiger,age2,a tiger_sketch_age2_cdm_run_v51551378173946.png,https://kiddraw.s3.amazonaws.com/a tiger_sketc...,12750,[]
3,cdm_run_v51551378173946,a cactus,age2,a cactus_sketch_age2_cdm_run_v51551378173946.png,https://kiddraw.s3.amazonaws.com/a cactus_sket...,6105,[]
4,cdm_run_v51551378776163,square,age6,square_sketch_age6_cdm_run_v51551378776163.png,https://kiddraw.s3.amazonaws.com/square_sketch...,9225,[]
5,cdm_run_v51551378776163,shape,age6,shape_sketch_age6_cdm_run_v51551378776163.png,https://kiddraw.s3.amazonaws.com/shape_sketch_...,2645,[]
6,cdm_run_v51551378776163,this square,age6,this square_sketch_age6_cdm_run_v5155137877616...,https://kiddraw.s3.amazonaws.com/this square_s...,9145,[]
7,cdm_run_v51551378776163,a whale,age6,a whale_sketch_age6_cdm_run_v51551378776163.png,https://kiddraw.s3.amazonaws.com/a whale_sketc...,13661,[]
8,cdm_run_v51551378776163,a hat,age6,a hat_sketch_age6_cdm_run_v51551378776163.png,https://kiddraw.s3.amazonaws.com/a hat_sketch_...,10894,[]
9,cdm_run_v51551379596662,square,age6,square_sketch_age6_cdm_run_v51551379596662.png,https://kiddraw.s3.amazonaws.com/square_sketch...,11590,[]


#### Upload images to s3

In [45]:
runThis = 0
bucket_name = 'kiddraw'
path_to_png = 'cdm_v5'
if runThis:
    conn = boto3.resource('s3')
    b = conn.Bucket(bucket_name) 
    
    for folder in os.listdir(path_to_png):
        if folder not in ['.DS_Store', 'example', 'something you love']:
            for ind,im in enumerate(os.listdir(os.path.join(path_to_png, folder))):
                if im[-3:]=='png':
                    print ind, im
                    img_obj = b.Object(im)
                    img_obj.upload_file(os.path.join(path_to_png, folder, im))
                    img_obj.Acl().put(ACL='public-read')
                    clear_output(wait=True)  
print 'Done!'

Done!


In [44]:
for key in b.objects.all():
    print key.key


#### Convert the csv file to json and upload to mongoDB

In [52]:
# convert the csv file to json
import json
datajs = data.to_json(orient='records')
datajs = json.loads(datajs)


In [53]:
datajs

[{u'age': u'age2',
  u'class': u'square',
  u'filename': u'square_sketch_age2_cdm_run_v51551378173946.png',
  u'games': [],
  u'session_id': u'cdm_run_v51551378173946',
  u'shuffler_ind': 1608,
  u'url': u'https://kiddraw.s3.amazonaws.com/square_sketch_age2_cdm_run_v51551378173946.png'},
 {u'age': u'age2',
  u'class': u'this square',
  u'filename': u'this square_sketch_age2_cdm_run_v51551378173946.png',
  u'games': [],
  u'session_id': u'cdm_run_v51551378173946',
  u'shuffler_ind': 9502,
  u'url': u'https://kiddraw.s3.amazonaws.com/this square_sketch_age2_cdm_run_v51551378173946.png'},
 {u'age': u'age2',
  u'class': u'a tiger',
  u'filename': u'a tiger_sketch_age2_cdm_run_v51551378173946.png',
  u'games': [],
  u'session_id': u'cdm_run_v51551378173946',
  u'shuffler_ind': 8755,
  u'url': u'https://kiddraw.s3.amazonaws.com/a tiger_sketch_age2_cdm_run_v51551378173946.png'},
 {u'age': u'age2',
  u'class': u'a cactus',
  u'filename': u'a cactus_sketch_age2_cdm_run_v51551378173946.png',
  u

In [2]:
# set vars 
auth = pd.read_csv('auth.txt', header = None) # this auth.txt file contains the password for the stanford-cogsci.org user
pswd = auth.values[0][0]

# have to fix this to be able to analyze from local
conn = pm.MongoClient('mongodb://stanford:' + pswd + '@127.0.0.1')
db = conn['kiddraw']
coll = db['check_invalid_v5_dev'] # using dev first
# formal check_invalid_v5

In [67]:
# drop past data
reallyRun = 0
if reallyRun:
    coll.drop()
print 'Currently there are {} records in the development collection'.format(coll.count())

Currently there are 0 records in the development collection


In [68]:
# upload data to the development collection
reallyRun = 0
if reallyRun:
    for i,_z in enumerate(datajs):
        coll.insert(_z)
        print('Inserted {} records.'.format(i))
        clear_output(wait=True)
else:
    print('Did not insert any new data.')

Inserted 13755 records.


In [9]:
## check how many records have been retrieved on the mturk study
print 'Currently there are {} records in the collection'.format(coll.count())
#{"games": {"$size": 1}}
# {"_id": ObjectId("5e6a7e7d1943e62410f1ebb3")}
a = coll.find()
numGames = []
for rec in a:
    if len(rec['games']) > 0:
        numGames.append(len(rec['games'])) 
print len(numGames)

Currently there are 13756 records in the collection
851
