## generate stimuli for text task

In [6]:
import os, sys, json

import pymongo as pm
import numpy as np
import scipy.stats as stats
import pandas as pd
import glob 
import re
from io import BytesIO
from PIL import Image
from skimage import io, img_as_float
import base64
import itertools
from itertools import chain

pd.set_option('display.max_columns', None)

import matplotlib
from matplotlib import pylab, mlab, pyplot
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import seaborn as sns
sns.set_context('talk')
sns.set_style('white')

import os
import shutil

from IPython.display import clear_output
import importlib

pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

In [7]:
## directory & file hierarchy
proj_dir = os.path.abspath('..')
stim_dir = os.getcwd()
scripts_dir = os.path.abspath('../../viewer_comprehension/stimuli/scripts/')

In [8]:
## compile all scripts into one df
json_pattern = os.path.join(scripts_dir,'*.json')
file_list = glob.glob(json_pattern)

scripts = []
for file in file_list:
    with open(file) as f:
        json_data = pd.json_normalize(json.loads(f.read()))
    scripts.append(json_data)
df_scripts = pd.concat(scripts, sort=False)

In [9]:
#define values
objects = ['pangolin', 'saffron', 'iberian-ham', 'bodden-town', 'russian-ushanka', 'citole', 'shofar', 'metallophone', 'mbira']

#drop rows that contain any value in the list
df_scripts = df_scripts[df_scripts.object.isin(objects) == False].reset_index(drop=True)

In [10]:
## count number of videos
N_videos = df_scripts['object'].nunique()
N_categories = df_scripts['video_type'].nunique()
print(f'We have {N_videos} videos in our dataset across {N_categories} categories')

videos = df_scripts['object'].unique()
print(f'Video categories: {list(videos)}')

We have 12 videos in our dataset across 4 categories
Video categories: ['mongolian-deel', 'dugong', 'vaquita', 'african-kanzu', 'saola', 'vilnius', 'kopi-luwak', 'murnong', 'cuenca', 'indonesian-kebaya', 'almas-caviar', 'lichinga']


In [11]:
## clean up scripts for text highlight task
scripts = df_scripts.copy()
scripts['obj'] = scripts['object'].apply(lambda x: x.replace('-', ' '))

In [12]:
## clean up scripts for text highlight task
scripts = df_scripts.copy()

## capitalize names
def captalize(row):
    if row['object'] in ['mongolian-deel', 'african-kanzu', 'indonesian-kebaya', 'vilnius', 'cuenca', 'lichinga']:
        return row['object'].capitalize()
    else:
        return row['object']

scripts['object'] = scripts.apply(captalize, axis=1)

scripts['obj'] = scripts['object'].apply(lambda x: x.replace('-', ' '))

In [13]:
## clean titles
def set_title(row):
    if row['object'] in ['Mongolian-deel', 'dugong', 'vaquita', 'saola']:
        return 'What is a ' + row['obj'] + '?'
    elif row['object'] in ['African-kanzu', 'Indonesian-kebaya']:
        return 'What is an ' + row['obj'] + '?'
    else:
        return 'What is ' + row['obj'] + '?'
scripts['title'] = scripts.apply(set_title, axis=1)

In [14]:
## clean paragraphs
# scripts['transcript'] = scripts['script'].apply(lambda x: x.replace('\n\n', ' <br><br> '))
scripts['transcript'] = scripts['script'].apply(lambda x: x.split('\n\n'))
# scripts_para = scripts.explode('transcript').reset_index(drop=True)
scripts['transcript'] = scripts['transcript'].apply(lambda x: [x.replace('\n', ' ') for x in x])
# scripts_para['Npara'] = scripts_para.groupby('object')['transcript'].cumcount()

In [49]:
scripts['transcript'][10]

["Almas caviar, also known as 'white gold', is sourced from the Caspian Sea, which is located between Europe and Asia. The Caspian Sea is home to sturgeon, the fish species responsible for producing this luxurious caviar. The caviar is harvested from the eggs of the rare and ancient Beluga sturgeon, known for its large size and exquisite taste.",
 "Almas caviar is renowned for its unique appearance. The eggs are exceptionally large, ranging in color from pale to golden hues, giving it a striking and luxurious appearance. The name 'Almas' itself translates to 'diamond' in Persian, perfectly reflecting the precious nature of this exquisite delicacy.",
 'Preparing Almas caviar is a meticulous and labor-intensive process. The eggs are carefully extracted from the Beluga sturgeon, and the caviar is then gently salted to enhance its flavor while preserving its delicate texture. Afterward, it is left to mature for several months, allowing its flavors to develop and intensify.',
 'Almas caviar

In [16]:
scripts_inform = scripts.copy()
scripts_inform['goal'] = 'informative'
scripts_entertain = scripts.copy()
scripts_entertain['goal'] = 'entertaining'

In [17]:
final = pd.concat([scripts_inform, scripts_entertain]).reset_index(drop=True)

In [47]:
# final['object']

In [144]:
#initalize list of all version dictionaries
Meta = [] 

printed=False
for name, group in final.groupby(['goal']):  
    display(group)
    print('Adding {} to Meta'.format(name)) 
    clear_output(wait=True)
    Batch = {}
    Trials = [] 
    
    for n,g in group.groupby('goal'): # looping over trials within a batch
        stimuli = g.to_dict(orient='records')
        Trials.append(stimuli) 
    Batch['meta'] = Trials 
    Batch['games'] = []
    Meta.append(Batch)
    
print('Done!') 

Done!


In [145]:
## check how many Meta are being inserted
len(Meta)

2

In [146]:
dataset_name = 'video-broll'
print('Saving out json dictionary out to file...') 
with open('{}_meta.js'.format(dataset_name), 'w') as fout:
    json.dump(Meta, fout)
print('Done!')

Saving out json dictionary out to file...
Done!


In [147]:
#reload JSON back in to the new stimulus collection
J = json.loads(open('{}_meta.js'.format(dataset_name),mode='r').read())
print('dataset_name: {}'.format(dataset_name))
print('Length of J is: {}'.format(len(J)))

dataset_name: video-broll
Length of J is: 2


### insert metadata into mongo
Run this command at the terminal, ssh -fNL 27017:127.0.0.1:27017 hhuey@cogtoolslab.org

In [148]:
# set vars 
auth = pd.read_csv('auth.txt', header = None) # this auth.txt file contains the password for the sketchloop user
pswd = auth.values[0][0]
user = 'sketchloop'
host = 'cogtoolslab.org' ## cogtoolslab ip address

# have to fix this to be able to analyze from local
conn = pm.MongoClient('mongodb://sketchloop:' + pswd + '@127.0.0.1')
db = conn['stimuli'] ## everyone in the lab shares this stimulus database
coll = db[dataset_name]

In [149]:
## now really insert data
reallyRun = False
if reallyRun:
    for (i,j) in enumerate(J):
        print ('%d of %d uploaded ...' % (i+1,len(J)))
        clear_output(wait=True)
        coll.insert_one(j)
print('Done!')

Done!


In [150]:
## check how many records were inserted
coll.estimated_document_count()

2

In [89]:
## inspect one of these annotation sessions
coll.find_one()

{'_id': ObjectId('6537f31b4cff9be1a9a53787'),
 'meta': [[{'video_type': 'fashion',
    'object': 'Mongolian-deel',
    'script': "The Mongolian deel is a traditional garment known for its practical design and cultural significance.\nIt features a long, loose-fitting tunic-like shape, often reaching down to the ankles, with a high collar and wide sleeves and is typically fastened with a row of buttons or toggles along the right shoulder and side.\nIts design is both functional for the harsh Mongolian climate and reflective of the country's nomadic heritage.\n\nOriginating from Mongolia, the deel has been worn by Mongolians for centuries, particularly by the nomadic herding communities.\nThe deel's creation was primarily the work of local artisans who were skilled in sewing and tailoring.\nThese artisans were not only responsible for crafting practical clothing but also for incorporating intricate designs and patterns that reflected the cultural identity of different Mongolian tribes.\n\

In [143]:
# db.drop_collection('video-broll')