In [3]:
import sys
import os
sys.path.insert(0, os.path.join(os.pardir, 'sparat'))

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.optimize import fmin
import seaborn as sns
import statsmodels.api as sm

from data_processing.rat import load_rat_items
from data_processing.generate_association_matrix import load_assoc_mat
from data_processing.spgen import load_pointers
from model.stimulus import filter_valid
from data_processing.spgen import load_pointers

%matplotlib inline

## Load experimental data

In [4]:
# All data paths are built relative to the parent of the current working directory.
basedir = os.pardir
path_file = os.path.join(basedir, 'data', 'raw', '144CompoundBowden.xlsx')
# Open the workbook once; the sheet itself is parsed from `xls` in a later cell.
xls = pd.ExcelFile(path_file)

Load the Excel sheet with the values from the paper and sort the RAT problems by the percentage solved in the 2 s (t=2) condition:

In [5]:
def solveable(row, assoc, i2w, w2i):
    '''
    Tell whether the cues of one RAT problem are associated with its solution.

    row supplies the slash-separated cue words under 'items' and the target
    word under 'solution'; both are upper-cased before any lookup.
    assoc is indexed as assoc[w2i[word], :], so w2i maps a word to its index.

    Returns np.nan when a cue or the target is not contained in i2w.
    Otherwise returns a boolean that is True when the sum of the cues'
    association rows is strictly positive at the target's index.
    '''
    cue_words = row['items'].upper().split('/')
    answer = row['solution'].upper()

    # Bail out as soon as one word of the problem is unknown.
    for word in cue_words + [answer]:
        if word not in i2w:
            return np.nan

    cue_rows = [assoc[w2i[word], :] for word in cue_words]
    summed = np.sum(cue_rows, axis=0)
    return summed[w2i[answer]] > 0.

## Load association data

In [6]:
def pointers2assoc(pointers, i2w, w2i):
    '''
    Turn a pointer array into an association matrix via np.dot(pointers, pointers.T).

    Assumes pointers is a 2-D array with one row per word, so that entry
    (i, j) of the result is the dot product of rows i and j -- TODO confirm.

    Returns the tuple (matrix, i2w, w2i); i2w and w2i are handed back
    unchanged so the result has the same layout as the other datasets.
    '''
    gram = np.dot(pointers, pointers.T)
    return gram, i2w, w2i

In [7]:
# Load every association dataset by name from data/associationmatrices.
# Each entry is later unpacked as (assoc, i2w, w2i).
datasets = {
    name: load_assoc_mat(os.path.join(basedir, 'data', 'associationmatrices'), name)
    for name in ['freeassoc_asymmetric']
}

In [8]:
# Read the stimulus sheet, ignoring its last two rows (skip_footer=2).
# NOTE(review): newer pandas releases spell this keyword `skipfooter` -- adjust
# if the notebook is ever run on a recent pandas.
df = xls.parse('RAT stimuli part1.csv', skip_footer=2)

# Replace the sheet's headers with short names: cue items, solution, then the
# solve percentage (and, from 7 s on, solution-time mean / sd) per time limit.
df.columns = [
    'items', 'solution', '2s-%',
    '7s-%',  '7s-t-mean', '7s-t-sd',
    '15s-%', '15s-t-mean', '15s-t-sd',
    '30s-%', '30s-t-mean', '30s-t-sd']

# Order problems from highest to lowest 2 s solve rate and renumber the rows 0..n-1.
df = df.sort_values('2s-%', ascending=False).set_index(np.arange(len(df)))

In [9]:
# Unpack the loaded dataset. `w2i` maps a word to its index in `assoc` (it is used that
# way in solveable/solve_p); `i2w` is presumably the inverse index-to-word listing -- TODO confirm.
assoc, i2w, w2i = datasets['freeassoc_asymmetric']

In [10]:
# Row-wise flag per RAT problem: True/False from `solveable`, or NaN when one of its words is unknown.
solvable_problems = df.apply(solveable, args=(assoc, i2w, w2i), axis=1)

In [11]:
# (number of non-NaN flags) - (number of True flags) = problems whose words are all known
# but whose target receives no association from the cues.
print 'Unsolvable:', len(solvable_problems.dropna()) - solvable_problems.sum()

Unsolvable: 37


In [12]:
# Keep only the problems flagged True; NaN flags (unknown words) are treated as False.
# Note that this overwrites `df`, so later cells only see the solvable problems.
df=df.loc[solvable_problems.fillna(False)]

In [13]:
def solve_p(row, assoc, i2w, w2i, ws=(1., 1., 1.)):
    '''
    Return the predicted solve percentage for one RAT problem.

    row supplies the slash-separated cue words under 'items' and the target
    word under 'solution'; both are upper-cased before any lookup.

    ws must have three entries. The first cue always has weight 1, ws[0] and
    ws[1] weight the second and third cue, and ws[2] scales the result.

    The weighted association rows of the cues are summed, the entries of the
    cue words themselves are set to zero, and the vector is normalised to sum
    to one. The return value is ws[2] * 100 * (normalised value at the target).

    Returns np.nan when a cue or the target is not contained in i2w.
    '''
    assert len(ws) == 3
    scale = ws[-1]
    cue_weights = (1., ws[0], ws[1])

    cue_words = row['items'].upper().split('/')
    answer = row['solution'].upper()

    # Bail out as soon as one word of the problem is unknown.
    for word in cue_words + [answer]:
        if word not in i2w:
            return np.nan

    weighted_rows = [
        cue_weights[pos] * assoc[w2i[word], :]
        for pos, word in enumerate(cue_words)
    ]
    activation = np.sum(weighted_rows, axis=0)

    # Exclude the cue words themselves before normalising.
    for word in cue_words:
        activation[w2i[word]] = 0.

    normalised = activation / np.sum(activation)
    return scale * 100 * normalised[w2i[answer]]

In [14]:
def model(params, assoc, i2w, w2i):
    '''
    Cost function for the fit: Euclidean norm of the prediction errors.

    Applies solve_p with weights params to every row of the notebook-level
    df, drops the rows whose prediction is NaN, and returns
    sqrt(sum((prediction - observed)**2)), where observed is the '2s-%'
    column of the remaining rows.
    '''
    predicted = df.apply(solve_p, args=(assoc, i2w, w2i, params), axis=1).dropna()
    observed = df.loc[predicted.index]['2s-%']
    residuals = predicted - observed
    return np.sqrt(np.sum(np.square(residuals)))

In [15]:
# Fit the three solve_p weights for every dataset, starting from (1, 1, 1).
# Each dataset is the (assoc, i2w, w2i) tuple that fmin forwards to `model`
# as extra positional arguments.
fits = {
    name: fmin(model, (1., 1., 1.), args=dataset)
    for name, dataset in datasets.items()
}

Optimization terminated successfully.
         Current function value: 73.006371
         Iterations: 74
         Function evaluations: 137


In [16]:
# Show the optimised weight vector (ws[0], ws[1], ws[2] of solve_p) for each dataset.
fits

{'freeassoc_asymmetric': array([ 2.05991529,  1.19931146,  1.13050399])}

In [17]:
def model_r(params, assoc, i2w, w2i):
    '''
    Return the R-squared of an OLS fit between predictions and the 2 s data.

    Applies solve_p with weights params to every row of the notebook-level
    df, drops the rows whose prediction is NaN, and fits
    sm.OLS(prediction, observed), where observed is the '2s-%' column of the
    remaining rows.

    NOTE(review): no constant column is added to the regressor, so
    statsmodels reports the uncentered R-squared -- confirm this is intended.
    '''
    predicted = df.apply(solve_p, args=(assoc, i2w, w2i, params), axis=1).dropna()
    observed = df.loc[predicted.index]['2s-%']
    regression = sm.OLS(predicted, observed).fit()
    return regression.rsquared

# Evaluate model_r at the fitted weights of every dataset and display the result.
r_values = {
    name: model_r(best_params, *datasets[name])
    for name, best_params in fits.items()
}
r_values

{'freeassoc_asymmetric': 0.62515435963131949}