# Analysis notebook

In [None]:
import numpy as np
import pandas as pd
from IPython.display import clear_output
import pymongo as pm
import os, sys
# os.path.abspath("") resolves to the current working directory; the two
# dirname() calls climb two levels up from it. That directory is appended to
# the module search path before cabutils is imported.
# NOTE(review): presumably that is where cabutils lives -- confirm; this breaks
# if the notebook is launched from a different working directory.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(""))))
import cabutils

# create relevant project subdirs

In [None]:
## locate the project root (the parent of the working directory) and
## derive the folders used by the rest of the notebook from it
proj_dir = os.path.abspath(os.pardir)

## top-level folders sit directly under the project root
analysis_dir, results_dir = (
    os.path.join(proj_dir, folder) for folder in ('analysis', 'results')
)

## csv and plot outputs are nested inside the results folder
csv_dir, plots_dir = (
    os.path.join(results_dir, folder) for folder in ('csv', 'plots')
)

def makedir(path):
    """Create the directory `path` (and any missing parents) if it is not there yet.

    Calling this on a directory that already exists is a no-op, so the
    function is safe to call repeatedly.

    Args:
        path: directory to create.

    Returns:
        None.

    Raises:
        FileExistsError: if `path` exists but is not a directory.
    """
    # exist_ok=True makes os.makedirs perform the existence check itself, so
    # there is no window between "check" and "create" in which another process
    # could create the directory and make this call fail.
    os.makedirs(path, exist_ok=True)
    return
 
## make sure every output folder exists; results_dir comes first because
## the other two are nested inside it
for _out_dir in (results_dir, csv_dir, plots_dir):
    makedir(_out_dir)

# establish connection to mongo and retrieve data

In [None]:
## analysis settings -- fill these in before running this cell
dbname = '' ## name of the database to read from, e.g. BACH
colname = '' ## name of the collection inside that database, e.g. dominoes
iterationName = '' ## iteration whose records should be analyzed, e.g. iter1

## open the mongo connection and navigate to the requested collection
conn = cabutils.get_db_connection()
db = conn[dbname]
coll = db[colname]

## pull every record tagged with this iterationName and load the
## records into a dataframe (one row per record)
query = {'iterationName': iterationName}
K = coll.find(query)
li = list(K)
_M = pd.DataFrame(li)

# apply data exclusion criteria

In [None]:
## get unique gameIDs for completed games only
## a game counts as completed when at least one of its records is a survey
## event; the gameIDs of all survey records are collected in a single pass
## instead of re-filtering the whole dataframe once per game
survey_rows = _M['eventType'] == 'survey_data'
games_with_survey = set(_M.loc[survey_rows, 'gameID'].dropna().unique())

## walk the gameIDs in order of first appearance so the resulting list is
## ordered exactly as the per-game loop would have produced it; records with
## a missing gameID never count as a completed game
completed_games = [
    gameID for gameID in _M['gameID'].unique() if gameID in games_with_survey
]

print("We have",len(completed_games),"completed unique games.")

In [None]:
## keep only the records that belong to completed games
M = _M.loc[_M['gameID'].isin(completed_games)]

## split by event type: S holds the survey records, T holds the trial
## records (training trials first, then test trials, re-indexed from 0)
T_train = M.loc[M['eventType'].eq('training_trials')]
T_test = M.loc[M['eventType'].eq('test_trials')]
S = M.loc[M['eventType'].eq('survey_data')]
T = pd.concat([T_train, T_test], ignore_index=True, sort=False)

## sanity check: every game that contributed trials must also have a survey
Tgames = list(np.unique(T['gameID'].values))
Sgames = list(np.unique(S['gameID'].values))
assert set(Tgames).issubset(Sgames)

# drop the sensitive information and save to csv locally

In [None]:
## ANONYMIZE DATAFRAMES: strip the ProlificID column from both T & S
## (only ProlificID is removed here -- add any other potentially identifying
## columns to the lists below)
## NOTE: drop raises KeyError when the column is absent, so this cell fails if
## it is re-run without first re-running the cell that builds T & S
T = T.drop(labels=['ProlificID'], axis='columns')
S = S.drop(labels=['ProlificID'], axis='columns')

In [None]:
## write the survey (S) and trial (T) dataframes to csv_dir, naming each
## file <colname>_<iterationName>_<survey|trials>.csv and omitting the index
survey_csv = os.path.join(csv_dir, '{}_{}_survey.csv'.format(colname, iterationName))
trials_csv = os.path.join(csv_dir, '{}_{}_trials.csv'.format(colname, iterationName))

S.to_csv(survey_csv, index=False)
T.to_csv(trials_csv, index=False)
print('Saved successfully to file!')

# visualize the data and do the analysis you want

In [None]:
## placeholder cell -- replace the ellipsis with your own plots / analysis
...