In [149]:
import os, sys

import pymongo as pm
import numpy as np
import scipy.stats as stats
import math
import pandas as pd
import json
import re
from io import BytesIO
from PIL import Image
import requests # this is to access the stim urls from the notebook
from IPython.display import SVG, display # need for showing stims with sketches side by side
import base64
import PIL

import matplotlib
from matplotlib import pylab, mlab, pyplot
import matplotlib.patches as mpatches
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import seaborn as sns
sns.set_context('talk')
sns.set_style('white')
from matplotlib import rcParams
matplotlib.rcParams['pdf.fonttype'] = 42

from IPython.display import clear_output
import importlib

import warnings
# Hide DeprecationWarning noise, plus any warning whose message starts with
# "numpy.dtype size changed" / "numpy.ufunc size changed" (the `message`
# argument is matched against the beginning of the warning text).
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

# Lift the row and column display limits (None = unlimited) so dataframes
# don't get cut off in display. NOTE: displaying a large frame will then
# render every row.
pd.set_option("display.max_rows", None, "display.max_columns", None)

import ast  # for interpreting strings as dictionary objects


### establish connection to mongo
First, establish an SSH tunnel (local port forwarding) to the server, so that requests to MongoDB can be made as if the MongoDB server were running on your local computer. Run the following from the command line before you begin data analysis if you plan to fetch data from Mongo; it forwards local port 27020 to port 27017 on the server:

ssh -fNL 27020:127.0.0.1:27017 USER@cogtoolslab.org

In [150]:
# Open the ssh tunnel from inside the notebook: local port 27020 is forwarded
# to 127.0.0.1:27017 as seen from cogtoolslab.org.
# Flags: -f go to background, -N run no remote command, -L local port forward.
! ssh -fNL 27020:127.0.0.1:27017 sholt@cogtoolslab.org

In [195]:
# --- Connect to MongoDB and load one iteration of the experiment into W ---

# auth.txt (in the notebook's working directory) contains the password for the
# sketchloop user. dtype=str keeps an all-digit password from being parsed as a
# number, which would break the string formatting of the URI below.
auth = pd.read_csv(os.path.join(os.getcwd(),'auth.txt'), header = None, dtype = str)
pswd = auth.values[0][0]
user = 'sketchloop'
host = 'cogtoolslab.org'

# Local end of the ssh tunnel (ssh -fNL 27020:127.0.0.1:27017 ...). The client
# must talk to THIS port when analyzing from a local machine; set it to 27017
# only when the notebook runs directly on the database server.
port = 27020

import socket  # not used in this cell; retained in case other cells rely on it
from urllib.parse import quote_plus

# Username and password must be percent-escaped inside a MongoDB URI, otherwise
# characters such as '@', ':', '/' or '%' in the password corrupt the URI.
conn = pm.MongoClient('mongodb://{}:{}@127.0.0.1:{}'.format(quote_plus(user), quote_plus(pswd), port))
db = conn['num_syn']
coll = db['proof_of_concept']

# which iteration name(s) should we use?
iterationName = 'sandbox'  # 'sandbox', 'friends1' (Leo & Emily), 'pilot1' was more people we know, 

# WARNING: destructive -- uncommenting the next line deletes every sandbox record.
# w = coll.remove({'iterationName':'sandbox'})

# find() returns a lazy cursor; materialize it so the frame is built from
# plain records (one row per stored document).
w = coll.find({'iterationName':iterationName})
W = pd.DataFrame(list(w))
# W.to_csv("pilotData.csv")
W.columns

Index([], dtype='object')

In [196]:
# Display the DataFrame W in full (row/column display limits are lifted in the setup cell).
W

In [146]:
# Narrow view of W: only the per-trial columns of interest, in this order.
W.loc[:, [
    'eventType',
    'target',
    'response',
    'correct',
    'score',
    'RT',
    'take',
    'base',
]]

In [9]:
# How many games do we have?
# Note: a notebook cell renders only its final expression, so the tutorial
# targets below are computed but not shown; the 'test'-block rows are what
# gets displayed.
W.loc[W['block'].eq('tutorial'), 'target'].reset_index()
W.loc[W['block'].eq('test')].reset_index()

Unnamed: 0,index,_id,iterationName,eventType,gameID,workerID,assignmentID,hitID,score,trialNum,sectionTrial,target,response,correct,take,block,trialStartTime,trialTime,takeTime,RT,initTime,endTime
0,69,61a03fb040a9bd7fc1e6a524,pilot1,test,0,0,0,0,53,55.0,1.0,ha-to-ha,hatoha,False,1.0,test,1637892000000.0,4034.0,4034.0,2139,1637892000000.0,
1,150,61a041db40a9bd7fc1e6a575,pilot1,test,0,0,0,0,50,55.0,1.0,mo-ku,moku,False,1.0,test,1637893000000.0,1887.0,1887.0,897,1637892000000.0,
2,219,61a043a540a9bd7fc1e6a5ba,pilot1,test,0,0,0,0,54,55.0,1.0,ke-to,keto,False,1.0,test,1637893000000.0,2323.0,2323.0,1981,1637893000000.0,
3,335,61a0a14340a9bd7fc1e6a62e,pilot1,test,0,0,0,0,51,55.0,1.0,ka-sa,kasa,False,1.0,test,1637917000000.0,2792.0,2792.0,1218,1637917000000.0,


In [10]:
# Distinct values of the initTime column.
W['initTime'].unique()

array([1.63789161e+12, 1.63789216e+12, 1.63789226e+12, 1.63789266e+12,
       1.63789360e+12, 1.63791677e+12])

In [11]:
# Distinct values of the block column (missing values show up as nan).
W['block'].unique()

array(['tutorial', 'quiz', 'test', nan], dtype=object)

In [12]:
# Mean of the 'correct' column over all rows of W.
W['correct'].mean()

0.5255255255255256

In [91]:
# RT entries of the generalization events only (original row labels are kept).
W.loc[W['eventType'] == 'generalization', 'RT']

70     {'1': [3944], '2': [14481], '3': [3593, 411], ...
151    {'1': [9271], '2': [3190], '3': [1451], '4': [...
220    {'1': [9728], '2': [2530], '3': [2207], '4': [...
336    {'1': [2426], '2': [5060], '3': [5268], '4': [...
Name: RT, dtype: object

In [82]:
# Build a side-by-side table of generalization targets and responses:
# one row per item number, one (Targs<k>, Resps<k>) column pair per
# generalization record found in W.
#
# The columns are assembled in a plain dict and turned into a DataFrame once.
# This replaces the previous row-by-row DataFrame.append (removed in
# pandas 2.0) and the repeated merges, and filters W a single time instead of
# once per table cell.
N_GENERALIZATION_ITEMS = 15  # rows in the table; responses are looked up under keys '1'..'15'

gen_rows = W[W['eventType'] == 'generalization'].reset_index(drop=True)
generalizations = gen_rows['target']

item_numbers = range(1, N_GENERALIZATION_ITEMS + 1)
table = {'#': list(item_numbers)}
for k, (targets, responses) in enumerate(zip(generalizations, gen_rows['response']), start=1):
    # A record may list fewer targets than there are rows; pad with a blank.
    table['Targs{}'.format(k)] = [targets[n - 1] if n <= len(targets) else "  " for n in item_numbers]
    # Strict lookup on purpose: a missing response key raises KeyError rather
    # than being silently papered over.
    table['Resps{}'.format(k)] = [responses[str(n)] for n in item_numbers]

# G ends up as a pandas Styler (not a DataFrame), captioned and with a
# highlight on the hovered cell.
G = pd.DataFrame(table).style.set_caption("Generalizations")
cell_hover = {  # for row hover use <tr> instead of <td>
    'selector': 'td:hover',
    'props': [('background-color', '#ffffb3')]
}
G.set_table_styles([cell_hover])

G

Unnamed: 0,#,Targs1,Resps1,Targs2,Resps2,Targs3,Resps3,Targs4,Resps4
0,1,tu,['ha'],ti,['ti'],ti,['ti'],ne,['ne']
1,2,ha,['ha'],mo,['ko'],ke,['ke'],sa,['sa']
2,3,ku,['ku'],no,['no'],mu,['mu'],sa-ne,['sane']
3,4,to,['to'],he,['ku'],te,['te'],mu,['mu']
4,5,to-tu,['totu'],ku,['he'],to,['ti'],mu-ne,['mune']
5,6,to-ha,['toha'],ku-ti,['kuti'],to-ti,['toti'],mu-sa,['musa']
6,7,to-ku,['toku'],ku-mo,['kumo'],to-ke,['toke'],mu-sa-ne,['musane']
7,8,ha-to,['hato'],ku-no,['kuni'],to-mu,['tomu'],ka,['ka']
8,9,ha-to-tu,['hatotu'],ku-he,['r'],to-te,['tote'],ka-ne,['kane']
9,10,ha-to-ha,['hatoha'],mo-ku,['o'],ke-to,['keto'],ka-sa,['kasane']


In [None]:


# exclusion criteria:
# 1. tutorial trials should almost all be correct
# 2. exclude people if they use too many characters that are not in the count list (up to a threshold -- TBD)

# how much to bonus people? 3¢, 2¢, or 1¢ per trial, depending on the take


# staged/tiered analysis for the generalization phase:
#     1. see if there are group differences with all the data
#     2. if not, then exclude all of what looks like garbage data, and do secondary analysis

In [None]:
# # to do:
### 1. make it so that people have to delete TWICE to get to the previous generalization field # DON'T NEED
### 2. fix the 'correct' classification that forgets about hyphens ('score' is working fine)
### 3. sample consonants and vowels without replacement independently of each other
### 4. add in the final count to ten of quiz trials
### 5. store surveyData
# 5.1. also store number of syllables (# of hyphens + 1)
### 6. get rid of 'test' trials, make trials start at the correct number
### 7. INTEGRATE IT WITH PROLIFIC!