In [None]:
import os
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', 100)                  # allow up to 100 columns when displaying a frame

# Project folder under the user's home directory, and the raw data folder inside it
head_dir = os.path.expanduser('~/Google Drive/Bas Zahy Gianni - Games/')
data_dir = os.path.join(head_dir, 'Data/1_gen/Raw')

In [None]:
# Subject data files. os.listdir returns names in arbitrary order, so sort them:
# the ordinal assigned below is then alphabetical and identical on every run.
files = sorted(f for f in os.listdir(data_dir) if f.endswith('.csv'))

# Map each file name (with the 4-character '.csv' extension stripped) to its
# position in the sorted list; used to look up the 'human' ordinal of a subject.
file_dict = {f[:-4]: i for i, f in enumerate(files)}

In [None]:
# Board bounds in pixels
top, bottom = 184, 587
left, right = 227, 1138

# Board dimensions in pixels, derived from the bounds above
width = right - left
height = bottom - top

def mouse_x_to_tile(x):
    """Maps a mouse x coordinate (pixels) to a horizontal board coordinate.

    Reads the module-level `left` and `width` at call time: `left` maps to 0
    and `left + width` maps to 9.
    """
    offset = x - left                                       # distance from the left bound
    return 9*offset / width

def mouse_y_to_tile(y):
    """Maps a mouse y coordinate (pixels) to a vertical board coordinate.

    Reads the module-level `top` and `height` at call time: `top` maps to 0
    and `top + height` maps to 4.
    """
    offset = y - top                                        # distance from the top bound
    return 4*offset / height

def fix_game_boards(row):
    """Returns the moving player's board string with the current move removed.

    `row` is a single record: `row['color']` selects the board (0 -> 'bp',
    anything else -> 'wp') and `row['zet']` is the position that is reset to '0'.
    """

    black, white = row[['bp', 'wp']]                        # both board string reps for this record
    board = black if row['color']==0 else white             # board of the player who made the move
    cells = list(board)                                     # strings are immutable, so edit a list
    cells[int(row['zet'])] = '0'                            # blank the zet location
    return ''.join(cells)                                   # back to a string

def load_mouse_file(mf):
    """Loads and preprocesses mouse tracking data

    Parameters
    ----------
    mf : str
        Path to a subject's csv file. The two characters in front of the
        4-character extension are taken as the subject's initials.

    Returns
    -------
    pandas.DataFrame
        One row per mouse sample, indexed by integer timestamp, holding the
        sample's 'x' and 'y' coordinates (still strings, as split from the
        file), the 'subject' initials and the 'human' ordinal looked up in
        file_dict. Empty when no record in the file carries mouse data.
    """

    mfnames = [
        'idx', 'id', 'color', 'gi', 'mi',
        'status', 'bp', 'wp', 'zet',
        'rt', 'ts', 'mt', 'mx'
    ]                                                      # names for columns

    D = pd.read_csv(mf, names=mfnames)                     # load csv into pandas dataframe
    D['mx'] = D.apply(expand_mouse_mx, axis=1)             # append start and end mouse spatial coords
    D['mt'] = D.apply(expand_mouse_mt, axis=1)             # append start and end mouse timestamps
    D = D[['mt', 'mx']]                                    # lose the fluff

    valid = pd.notnull(D['mt'])                            # select records with valid mouse time series

    # Flatten the per-record series directly rather than concatenating every
    # record with Series.sum() and re-splitting: the sum of an empty selection
    # is 0 (not a string), so a file without mouse data used to fail on .split,
    # and the concatenation built one large throwaway string.
    m = [t for record in D.loc[valid, 'mt'] for t in record.split(',')]
                                                           # all mouse timestamps, in record order
    x = [
        tuple(xy.split(','))
        for record in D.loc[valid, 'mx']
        for xy in record.split(';')
    ]                                                      # all mouse (x, y) pairs, in record order

    M = pd.DataFrame(index=m, data=x, columns=['x', 'y'])  # new dataframe with timestamp index and coordinates
    M['subject'] = mf[-6:-4]                               # set subject field to initials
    M['human'] = M['subject'].map(file_dict)               # set human field to subject ordinal index
    M.index = M.index.astype(int)                          # cast timestamp index to integers
    return M

def load_game_file(gf):
    """Loads and preprocesses data from game observations

    Parameters
    ----------
    gf : str
        Path to a subject's csv file. The two characters in front of the
        4-character extension are taken as the subject's initials.

    Returns
    -------
    pandas.DataFrame
        The file's records indexed by 'ts', without the 'ready' records that
        have rt == 0, and with the added columns 'subject', 'human',
        'move_start_ts' and 'is human'. 'mt'/'mx' are expanded with turn
        start/end entries and the mover's board string has the move removed.
    """

    gfnames = [
        'idx', 'id', 'color', 'gi', 'mi',
        'status', 'bp', 'wp', 'zet',
        'rt', 'ts', 'mt', 'mx'
    ]                                                      # names for columns

    D = pd.read_csv(gf, names=gfnames)                     # load csv into pandas dataframe
    readyfilter = D['status'] == 'ready'                   # filter on convenience records
    rtfilter = D['rt'] == 0                                # filter on AI moves
    aifilter = ~(readyfilter & rtfilter)                   # filter out convenience AI records
    D = D.loc[aifilter].copy()                             # apply filter; copy so the assignments below
                                                           # write to this frame and not to a slice of the raw one
    # NOTE: readyfilter/rtfilter were built on the unfiltered frame; .loc aligns
    # them to the surviving rows by index label.
    D.loc[readyfilter, 'rt'] = 0                           # set human convenience records rt field to 0
    D['subject'] = gf[-6:-4]                               # set subject field to initials in game file name
    D['human'] = D['subject'].map(file_dict)               # set human field to be subject index (alphabetical)
    D['move_start_ts'] = (D['ts'].astype(int) - D['rt'].astype(int)).shift(-1)
                                                           # start time (ts - rt) of the NEXT record; NaN on the last row
    tsfilter = rtfilter | readyfilter                      # filter on ai OR convenience records
    D.loc[tsfilter, 'ts'] = D.loc[tsfilter, 'move_start_ts']
                                                           # replace invalid timestamps with inferred correct timestamp
    D['mx'] = D.apply(expand_mouse_mx, axis=1)             # append move start and end mouse spatial coords
    D['mt'] = D.apply(expand_mouse_mt, axis=1)             # append move start and end timestamps to mouse timestamps
    D['is human'] = 1                                      # initialize human player indicator variable
    playfilter = D['status'].isin(['playing', 'win', 'draw'])
                                                           # filter on non-convenience records
    D.loc[playfilter & rtfilter, 'is human'] = 0           # set human player indicator to 0 on AI records
    endfilter = D['status'].isin(['win', 'draw'])          # filter on game end records
    idx = D.loc[endfilter].index                           # get indices for game end filter application
    # Guard on len(idx): a file without any finished game has no end record to
    # adjust, and idx[-1] would raise IndexError.
    if len(idx) > 0 and D.loc[idx[-1], 'rt'] != 0:         # if human player ended last game
        D.loc[idx[-1], 'gi'] = D.loc[idx[-1], 'gi'] - 1    # subtract 1 from game index (why? probably a data error)
    bpfilter = D['color']==0                               # filter on player colors
    wpfilter = D['color']==1
    D.loc[bpfilter, 'bp'] = D.loc[bpfilter].apply(fix_game_boards, axis=1)
    D.loc[wpfilter, 'wp'] = D.loc[wpfilter].apply(fix_game_boards, axis=1)
                                                           # apply filters and remove last move from board
    return D.set_index('ts')                               # set index to timestamps

def expand_mouse_mt(row):
    """Brackets a record's mouse timestamps with the turn's start and end times.

    Returns None when the record's 'mt' field is not a string (no mouse data).
    """

    turn_end = int(row['ts'])                              # turn end timestamp
    turn_start = turn_end - int(row['rt'])                 # turn start = turn end - turn duration
    samples = row['mt']
    if type(samples) is not str:                           # nothing to expand without valid data
        return None
    return ','.join([str(turn_start), samples, str(turn_end)])
                                                           # start time, recorded times, end time
def expand_mouse_mx(row):
    """Appends start time location, end time location to mouse spatial coordinates for a single record

    The first coordinate pair is repeated at the front and the last pair at the
    back of the record's ';'-separated 'mx' string. Returns None when 'mx' is
    not a string (no mouse data).

    Only the 'mx' field is read: the turn timestamps play no part in the result,
    so a record with a missing or NaN 'ts'/'rt' no longer raises here.
    """

    coords = row['mx']
    if not isinstance(coords, str):                        # check if valid data
        return None
    locs = coords.split(';')                               # split record into (x, y) pair strings
    return ';'.join([locs[0], coords, locs[-1]])           # add start, end coords to respective ends of record

def mouse_hist(m, g):
    """Modifies mousetracking data to produce histograms over tile indices

    Parameters
    ----------
    m : pandas.DataFrame
        Mouse samples indexed by timestamp with 'x' and 'y' columns. Helper
        columns are added to the passed frame in place before it is copied.
    g : pandas.DataFrame
        Game records indexed by timestamp with 'gi', 'mi', 'status', 'rt',
        'is human', 'bp', 'wp' and 'zet' columns. A 'turn' column is added to
        it in place.

    Returns
    -------
    m : pandas.DataFrame
        The mouse samples that fall inside a turn (turnstart < ts <= turnend),
        with 'turn', 'dur', 'xtile', 'ytile', 'tile' and the other helper
        columns filled in. Off-board samples have tile == -1.
    mpvt : pandas.DataFrame
        One row per human turn: summed 'dur' per visited tile, 'rt' (row total),
        999 (total off-board duration), 'true rt', 'bp', 'wp', 'zet', and a
        column for each of the 36 tiles.
    """

    g['turn'] = 100*g['gi'] + g['mi']                      # add unique turn ids
    turnfilter = g['status'].isin(['playing', 'draw', 'win'])
                                                           # filter on non-convenience records
    m['turn'] = np.nan                                     # initialize helper fields
    m['turnstart'] = np.nan
    m['turnend'] = np.nan
    m['ts'] = m.index
    m['xtile'] = np.nan
    m['ytile'] = np.nan
    m['tile'] = np.nan
    m['dur'] = np.nan
    m['is human'] = np.nan
    m = m.drop_duplicates(subset='ts').copy()              # get rid of duplicate timestamps; copy so the
                                                           # .loc assignments below target an independent frame
    validfilter = g.index.isin(m.index)                    # game records whose timestamp also occurs in the mouse data
    gp = g.loc[validfilter & turnfilter]
    m.loc[gp.index, 'turn'] = gp['turn']                   # add helper data to mouse df
    m.loc[gp.index, 'turnstart'] = gp.index - gp['rt']
    m.loc[gp.index, 'turnend'] = gp.index
    m.loc[gp.index, 'is human'] = gp['is human']

    m = m.sort_index()                                     # sort mouse data by timestamp
    fillthese = ['turn', 'turnstart', 'turnend', 'is human']
                                                           # helper columns to fill
    m[fillthese] = m[fillthese].bfill()                    # backfill missing data: each sample takes the values of the
                                                           # next turn-end row (fillna(method=...) is deprecated)

    m['dur'] = m.index
    m['dur'] = m['dur'].diff(periods=1)                    # compute duration of each event (time since previous sample)
    eventbounds = (m.index > m['turnstart']) & (m.index <= m['turnend'])
                                                           # filter on mouse data within player turn
    m = m.loc[eventbounds].copy()                          # apply filter (copy: columns are assigned below)

    m['xtile'] = m['x'].astype(float).map(mouse_x_to_tile) # map mouse coords to board coords
    m['ytile'] = m['y'].astype(float).map(mouse_y_to_tile)

    # Tile index = column + 9*row on the 9 x 4 board. Use floor plus an explicit
    # bounds check: truncating with astype(int) rounds (-1, 0) up to 0, and
    # column + 9*row maps a column >= 9 onto a tile of the following row, so
    # samples left of, above, or right of the board were counted as on-board.
    # Every off-board sample gets tile -1 and ends up in the 999 bucket below.
    col = np.floor(m['xtile'])
    row = np.floor(m['ytile'])
    onboard = (col >= 0) & (col < 9) & (row >= 0) & (row < 4)
    m['tile'] = np.where(onboard, col + 9*row, -1).astype(int)

    humanfilter = m['is human'] == 1                       # filter on human moves (mouse df)
    mpvt = m.loc[humanfilter].pivot_table(index='turn', columns='tile', values='dur', aggfunc='sum')
                                                           # pivot human trials duration per tile idx
    mpvt['rt'] = mpvt.sum(axis=1)                          # recalculate rt for verification

    offboard = [
        i for i in mpvt.columns
        if isinstance(i, (int, np.integer)) and not 0 <= i < 36
    ]                                                      # column names for offboard locs (skips 'rt';
                                                           # accepts numpy integer labels as well as int)

    mpvt[999] = mpvt[offboard].sum(axis=1)                 # combine all offboard durations
    humanfilter = g['is human'] == 1                       # filter on human moves (game df)
    turnfilter = g['status'].isin(['playing', 'draw', 'win'])
    gt = g.loc[turnfilter & humanfilter & validfilter].set_index('turn') # get non-convenience human records
    mpvt.loc[gt.index, 'true rt'] = gt['rt']               # set 'true rt' for verification
    mpvt = mpvt.fillna(value=0)                            # nan values mean 0 duration

    for c in ['bp', 'wp', 'zet']:
        mpvt.loc[gt.index, c] = gt[c]                      # set other game info fields on hist records

    for c in range(36):
        if c not in mpvt.columns:                          # set all nonvisited trials to 0 dur
            mpvt[c] = 0

    return m, mpvt

In [None]:
# Load every subject's game records and mouse samples (same file order in both lists)
game_data = [load_game_file(os.path.join(data_dir, file)) for file in files]
mouse_data = [load_mouse_file(os.path.join(data_dir, file)) for file in files]

pvts = []
for subject, mouse in enumerate(mouse_data):
    # Reset the module-level board bounds to the extent of this subject's mouse
    # samples; mouse_x_to_tile / mouse_y_to_tile read left, top, width and
    # height when mouse_hist calls them.
    ys = mouse.y.astype(int)
    xs = mouse.x.astype(int)
    top, bottom = ys.min(), ys.max()
    left, right = xs.min(), xs.max()
    width = right - left
    height = bottom - top

    # Keep the within-turn mouse samples and collect the per-turn tile histogram
    mouse_data[subject], mpvt = mouse_hist(mouse, game_data[subject])
    pvts.append(mpvt)

In [None]:
keep = list(range(36)) + [999, 'true rt', 'zet']           # tile columns, off-board total, 'true rt' and 'zet'
locs = list(range(36)) + [999]                              # tile columns plus the off-board total
hists = []                                                  # per subject: rows normalized over tiles + off-board
validhists = []                                             # per subject: rows normalized over the 36 tiles only

for t in pvts:
    with_offboard = t[locs].values
    board_only = t[locs[:-1]].values
    # divide each row by its own total so that it sums to 1
    hists.append(with_offboard / with_offboard.sum(axis=1, keepdims=True))
    validhists.append(board_only / board_only.sum(axis=1, keepdims=True))

In [None]:
def shannon_entropy(p):
    """Returns -sum(p * log(p)) over the strictly positive entries of array p.

    Uses the natural logarithm; zero (and negative) entries are left out of the sum.
    """
    positive = p[p > 0]
    return -(positive * np.log(positive)).sum()

In [None]:
# Vertical extent of each subject's mouse samples (debugging aid). The 'y'
# coordinates are held as strings, so cast before taking min/max; on strings the
# comparison is lexicographic. Computed once: it does not depend on the probe.
ymins = [m.y.astype(int).min() for m in mouse_data]
ymaxes = [m.y.astype(int).max() for m in mouse_data]

for probe in range(len(mouse_data)):
    v = validhists[probe]                                  # per-turn histogram over the 36 tiles
    p = pvts[probe]

    xhat = v.argmax(axis=1)                                # tile with the largest share on each turn
    x = p['zet'].values.astype(int)                        # recorded move; cast because it is used as an
                                                           # array index below and the column is not
                                                           # guaranteed to come back with an integer dtype

    nll = -np.log(v[np.arange(v.shape[0]), x])             # negative log of the share on the recorded move
    acc = (xhat == x).mean()                               # fraction of turns where the top tile is the move
    ent = np.apply_along_axis(shannon_entropy, axis=1, arr=v)

    n = np.isinf(nll)                                      # turns with zero share on the recorded move
    print('Subject:', probe)
    print('\tBoard edges (debugging):', ymins[probe], ymaxes[probe])
    print('\t#inf nlls (debugging):', int(n.sum()))
    print('\tMean NLL:', nll.mean())
    print('\tMouse hist entropy:', ent.mean())
    print('\tAccuracy:', acc, '\n')