In [1]:
# These are global imports
%pylab inline
import sys
import os
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform
from scipy.stats import rankdata
from scipy.io import loadmat  # this is the SciPy module that loads mat-files
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import ndimage
import cPickle as pickle
from joblib import Parallel,delayed
# Connect to an R session
import rpy2.robjects
r = rpy2.robjects.r

# For a Pythonic interface to R
from rpy2.robjects.packages import importr
from rpy2.robjects import Formula, FactorVector
from rpy2.robjects.environments import Environment
from rpy2.robjects.vectors import DataFrame, Vector, FloatVector
from rpy2.rinterface import MissingArg,SexpVector
# Make it so we can send numpy arrays to R
import rpy2.robjects.numpy2ri
rpy2.robjects.numpy2ri.activate()
fdrtool = importr('fdrtool')
from mvpa2.suite import *
from scipy.io import loadmat
from scipy.spatial.distance import squareform,pdist
from scipy.stats import rankdata,pearsonr
from scipy import stats

#import prettyplotlib as ppl

#import brewer2mpl
from mpl_toolkits.mplot3d import Axes3D
from geopy import Point
from geopy.distance import distance


Populating the interactive namespace from numpy and matplotlib




In [2]:
# define base data and subject list
# Root directory that holds one sub-directory per subject (see the events.csv loading cell).
basedir = '/cmlab/data/fmri/expSamp/data/'

# Subject IDs included in the analysis; note that 'expSamp06' is not in this list.
subjects = [
    'expSamp01', 'expSamp02', 'expSamp03',
    'expSamp04', 'expSamp05', 'expSamp07',
    'expSamp08', 'expSamp09', 'expSamp10',
]

In [3]:
#load in events csvs
# Read each subject's events.csv (<basedir>/<subject>/events.csv) and stack them
# with a single concat instead of re-copying the growing frame on every iteration.
event_frames = [pd.read_csv(os.path.join(basedir, subj, 'events.csv')) for subj in subjects]
allev_df = pd.concat(event_frames)

In [2]:
# This code created the CSV that was later hand-edited with new GPS coordinates. Do not run it.

#kept_stims = allev_df.loc[((allev_df['event_type']=="DISP_IMAGE") & (allev_df['prac_trial']==False)),:]
#kept_stims['gps_mod']=0
# The to_csv line below is broken on purpose (unclosed parenthesis) to keep you from running it; it is only a reminder of how the CSV was made.
#kept_stims.to_csv(os.path.join(basedir,'exp_stims_1.csv') 


In [4]:
#load in the csv with all the events and the new gps coordinates
# The file is tab-separated and lives under <basedir>/rsa_new, alongside the pickles written later.
kept_stims = pd.read_csv(os.path.join(basedir, 'rsa_new', 'exp_stims.csv'), sep='\t')
# Rows with a missing 'self' value are treated as 0.
kept_stims['self'] = kept_stims['self'].fillna(0)

In [7]:
#kept_stims

In [4]:
#load hamming distances
ham_path = os.path.join(basedir,'hamming/AllSubsHammingDists.mat')
mat = loadmat(ham_path)
ham_dat = mat['AllSubsHammingDists']
# Keep 9 of the 10 entries along the first axis, dropping index 5.
# NOTE(review): presumably index 5 is the subject missing from `subjects` (expSamp06) -- confirm.
# An explicit boolean ndarray is used so the selection is always a boolean mask,
# never interpreted as integer indices.
ham_keep = np.array([True, True, True, True, True, False, True, True, True, True], dtype=bool)
ham_dat = ham_dat[ham_keep]
# ham_dat is later indexed by position in `subjects`, so the two must line up.
assert len(ham_dat) == len(subjects), "hamming data and subject list are misaligned"

In [5]:
# define different distance metrics
def geodesic(u,v):
    """Distance in meters between two coordinate pairs, as computed by geopy.

    u and v are indexable; elements 0 and 1 are passed to geopy's Point in that
    order (the notebook supplies (latitude, longitude) tuples).
    """
    start = Point(u[0], u[1])
    end = Point(v[0], v[1])
    separation = distance(start, end)
    return separation.meters
def genps(u,v):
    """Pair-strength code: 2 when u and v differ, otherwise the shared value u."""
    return 2 if u != v else u
def booland(u,v):
    """Logical AND of the truth values of u and v (both are always evaluated)."""
    left = bool(u)
    right = bool(v)
    return left & right
def boolor(u,v):
    """Logical OR of the truth values of u and v (both are always evaluated)."""
    left = bool(u)
    right = bool(v)
    return left | right
def gensev(u,v):
    """Same-value indicator: 1 when u equals v, otherwise 0."""
    return 1 if u == v else 0
def ave(u,v):
    """Average of the pair (u, v), computed with np.average."""
    pair = [u, v]
    return np.average(pair)

In [6]:
# make rsa_dat dataframe
# For every subject, build one table with a row per stimulus pair (condensed
# pdist/squareform ordering), then stack the per-subject tables into rsa_dat_df.
#subj = 'expSamp01'
rsa_columns = ['subject',
               'pair_str',
               's1_trial',
               's2_trial',
               'space',
               'time',
               'event',
               'same_ev',
               'oldkeep',
               'hasgps',
               'self',
               'gpsmod',
               'viv_dif',
               'viv_bin',
               'viv_ave',
               'rem',
               'rem_ave',
               'ham'
               ]
subj_frames = []
for j,subj in enumerate(subjects):
    # Select this subject's stimuli once and reuse the selection for every metric.
    subj_stims = kept_stims.loc[kept_stims['subject_id']==subj,:]
    n_stims = len(subj_stims)

    # (latitude, longitude) tuple per stimulus, in row order.
    cords = list(zip(subj_stims['latitude'].values, subj_stims['longitude'].values))

    # Row/column index grids; their condensed forms give the 1-based positions of
    # the first and second stimulus of each pair. Sized from the data rather than
    # a fixed count of 120.
    trial_rows, trial_cols = np.mgrid[0:n_stims,0:n_stims]

    space = pdist(cords,metric=geodesic)
    sdict = dict(
    space = space,
    time = pdist(subj_stims[['image_time']],metric='euclidean'),
    event = pdist(subj_stims[['event_time']],metric='euclidean'),
    # genps yields 0 / 1 when both 'strong' values agree, 2 when they differ
    pair_str = pdist(subj_stims[['strong']],metric=genps),
    viv_dif = pdist(subj_stims[['img_vividness']],metric='euclidean'),
    viv_ave = np.abs(1-pdist(subj_stims[['vivid_mem']],metric=ave)),
    viv_bin = 1-pdist(((subj_stims[['vivid_mem']].values)),metric=booland).astype(bool),
    # a stimulus counts as having gps if either has_gps or gps_mod is set; both members must qualify
    hasgps = pdist(((subj_stims[['has_gps']].values) | (subj_stims[['gps_mod']].values)),metric=booland).astype(bool),
    gpsmod = pdist(subj_stims[['gps_mod']],metric=boolor).astype(bool),
    self = pdist(subj_stims[['self']],metric=boolor).astype(bool),
    same_ev = pdist(subj_stims[['chunk_iloc']],metric=gensev).astype(bool),
    oldkeep = pdist(subj_stims[['keep']],metric=booland).astype(bool),
    s1_trial = squareform(trial_rows,checks = False)+1,
    s2_trial = squareform(trial_cols,checks = False)+1,
    subject = np.array([subj]*len(space)).astype('S10'),
    rem = pdist(((subj_stims[['remembered']].values)),metric=booland).astype(bool),
    rem_ave = np.abs(1-pdist(subj_stims[['remembered']],metric=ave)),
    # ham_dat is indexed by the subject's position in `subjects`
    ham = squareform(ham_dat[j][0],checks=False)
    )

    # Turn the numeric pair-strength codes into sortable labels:
    # 0 -> "1weak", 1 -> "3strong", anything else -> "2mixed".
    sdict['pair_str']=np.array(["1weak" if code == 0 else "3strong" if code == 1 else "2mixed" for code in sdict['pair_str']],dtype='|S8')
    # Mark pairs to exclude: any 'self' pair, pairs not both flagged 'keep',
    # and pairs with no hamming distance.
    #sdict['pair_str'][((sdict['hasgps']==False))] = "0exclude"
    sdict['pair_str'][((sdict['self']==True))] = "0exclude"
    sdict['pair_str'][((sdict['oldkeep']==False))] = "0exclude"
    sdict['pair_str'][((np.isnan(sdict['ham'])))] = "0exclude"
    #sdict['pair_str'][((sdict['same_ev']==True))] = "0exclude"

    subj_frames.append(pd.DataFrame(sdict, columns = rsa_columns))

# Stack all subjects with a single concat instead of growing the frame inside the loop.
rsa_dat_df = pd.concat(subj_frames)

# write to file
# Record layout of the saved array.
# NOTE(review): 'self' and 'gpsmod' are columns of rsa_dat_df but are not listed
# here, so they are not written to the pickle -- confirm that is intended.
rsa_dtype = np.dtype([('subject', 'S10'), ('pair_str', 'S10'), ('s1_trial', '<i4'), ('s2_trial', '<i4'), ('space', '<f8'), ('time', '<f8'),('event','<f8'),
                      ('same_ev', '?'), ('oldkeep', '?'), ('hasgps', '?'),('viv_dif','<f8'), ('viv_bin', '?'),('viv_ave','<f8'),('rem', '?'),('rem_ave','<f8'),('ham','<f8')])
rsa_records = rsa_dat_df.to_records(index=False)
# this fixes column types that pandas randomly sets to object for some annoying reason
# Fields are copied one at a time by name. Casting the whole record array with
# astype() matches fields by position and needs equal field counts in current
# NumPy, which does not hold here because the source has extra columns.
rsa_dat = np.zeros(len(rsa_records), dtype=rsa_dtype).view(np.recarray)
for field in rsa_dtype.names:
    if field in rsa_records.dtype.names:
        rsa_dat[field] = rsa_records[field]
rsa_dat.dump(os.path.join(basedir,'rsa_new','rsa_dataset_gps_time_old_exclude_viv_bin_scan_time_rem_ham.pickle'))

In [7]:
#drop excluded rows
rsa_dat_df_drop = rsa_dat_df[(rsa_dat_df['pair_str'] != '0exclude')]
# Record layout of the saved array.
# NOTE(review): 'timef' and 'spacef' are not columns of rsa_dat_df, so they are
# written as empty strings; 'self' and 'gpsmod' are columns but are not listed
# and so are not saved -- confirm both are intended.
rsa_drop_dtype = np.dtype([('subject', 'S10'), ('pair_str', 'S10'), ('s1_trial', '<i4'), ('s2_trial', '<i4'), ('space', '<f8'), ('time', '<f8'),('event','<f8'),
                           ('same_ev', '?'), ('oldkeep', '?'), ('hasgps', '?'),('timef', 'S6'),('spacef', 'S6'),('viv_dif','<f8'), ('viv_bin', '?'),('viv_ave','<f8'),('rem', '?'),('rem_ave','<f8'),('ham','<f8')])
rsa_drop_records = rsa_dat_df_drop.to_records(index=False)
# Fields are copied one at a time by name; fields with no matching column keep
# their zero-initialised value. Casting the whole record array with astype()
# matches fields by position in current NumPy, which would misalign these columns.
rsa_dat_drop = np.zeros(len(rsa_drop_records), dtype=rsa_drop_dtype).view(np.recarray)
for field in rsa_drop_dtype.names:
    if field in rsa_drop_records.dtype.names:
        rsa_dat_drop[field] = rsa_drop_records[field]
rsa_dat_drop.dump(os.path.join(basedir,'rsa_new','rsa_dataset_gps_time_old_exclude_drop_viv_bin_scan_time_rem_ham.pickle'))