# Optimizing exemplar sensitivity in ExemPy


In [1]:
%load_ext autoreload
from ExemPy import *
from ExemPy.utils import *
from ExemPy.viz import *
from ExemPy.GCM import *
%aimport ExemPy, ExemPy.utils, ExemPy.viz, ExemPy.GCM
%autoreload 1
import math
import random
import matplotlib.pyplot as plt
#%matplotlib inline
import numpy as np
import pandas as pd
from pandas import DataFrame
from scipy.optimize import minimize
import seaborn as sns
sns.set(style='ticks', context='paper')
colors=["#e3c934","#68c4bf","#c51000","#287271"]
sns.set_palette(colors)

## Read in data, set initial parameters

In [2]:
pb52 = pd.read_csv('pb52_data//pb52.csv')
pb52 = HzToBark(pb52, ["F0", "F1", "F2", "F3"])
pb52.sample(5)

Unnamed: 0,type,gender,speaker,vowel,repetition,F0,F1,F2,F3,z0,z1,z2,z3
451,m,m,23,PALM,2,120,690,960,2520,1.016731,6.450717,8.284247,14.550625
583,m,m,30,KIT,2,155,360,2010,2400,1.434799,3.630172,13.043829,14.227798
414,m,m,21,FOOT,1,142,430,1130,2440,1.281142,4.293556,9.274304,14.337364
1341,c,f,68,FLEECE,2,262,340,3100,3400,2.631215,3.433217,15.895099,16.476343
1262,c,m,64,KIT,1,290,580,2760,3400,2.925511,5.591969,15.147034,16.476343


## Specify arguments for optimization

In [3]:
dimsvals={'z0' : 1,
          'z1' : .761,
          'z2' : .681,
          'z3' : .407}
dimslist = list(dimsvals.keys())

catslist = ['type', 'vowel']           # man, woman, or child; lexical set notation

cval = 25

exemplars = pb52.copy()

In [4]:
fitdims = dimslist[1:]      # Fit all dimensions except item 0
anchordim = dimslist[0]     # Set item 0 to 1

name = 'pboptimization-3'                # name of output spreadsheet
n = 2                        # number of times that random x is generated
t = 0.1                     # Tolerance value -- lower = more evals

test = exemplars.sample(50)
cloud = exemplars
cats = ["vowel"]

cmax = 50
cmin = 0


## Define error functions
- In future versions, this function will be pre-defined in the library
- For now, there is some value in being able to tweak the function, and to see how it works!

In [5]:
def calcerror_dims(x, test, exemplars, catslist, fitdims, cval, anchordim = None):
    '''
    Categorizes a data set and returns the proportion of stimuli/test rows
    that were categorized inaccurately. A lower value means a lower amount of
    error. Designed to be used with parameter fitting functions to assign
    values to attention weighting for the dimensions.
    
    Required paratemers:
    
    x = Array. Initial guesses for parameters
    
    test = DataFrame. Stimuli to be categorized
    
    exemplars = DataFrame. Exemplar cloud to use for categorization
    
    catslist = List of strings. Each string should correspond to a category that
        should be assigned to the test
    
    fitdims = List of strings. Each string should correspond to a dimension
        for which parameters should be fit.
        
    Optional parameters:
    
    anchordim = String. Dimension for parameter which will not be fit, but will
        instead be hard-coded as 1. This helps constrain the set of possible
        solutions
    
    
    '''
    dimsvals = {fitdims[i]: x[i] for i in range(len(fitdims))}
    if anchordim != None:
        dimsvals.update({anchordim:1})
    
    choices = multicat(test, cloud, catslist, dimsvals, cval)
    accuracy = checkaccuracy(choices, catslist)
    category = catslist[0]
    err = accuracy[category+"Acc"].value_counts(normalize=True)['n']
    return err

In [6]:
def calcerror_both(x, test, exemplars, catslist, fitdims, anchordim = None):
    '''
    Categorizes a data set and returns the proportion of stimuli/test rows
    that were categorized inaccurately. A lower value means a lower amount of
    error. Designed to be used with parameter fitting functions to assign
    values to attention weighting for the dimensions.
    
    Required paratemers:
    
    x = Array. Initial guesses for parameters
    
    test = DataFrame. Stimuli to be categorized
    
    exemplars = DataFrame. Exemplar cloud to use for categorization
    
    catslist = List of strings. Each string should correspond to a category that
        should be assigned to the test
    
    fitdims = List of strings. Each string should correspond to a dimension
        for which parameters should be fit.
        
    Optional parameters:
    
    anchordim = String. Dimension for parameter which will not be fit, but will
        instead be hard-coded as 1. This helps constrain the set of possible
        solutions
    
    
    '''  
    cval = x[0]
    dimsvals = {fitdims[i]: x[i] for i in range(len(fitdims))}
    if anchordim != None:
        dimsvals.update({anchordim:1})
    
    choices = multicat(test, cloud, catslist, dimsvals, cval)
    accuracy = checkaccuracy(choices, catslist)
    category = catslist[0]
    err = accuracy[category+"Acc"].value_counts(normalize=True)['n']
    return err

In [7]:
def calcerror_c(x, test, exemplars, dimsvals, catslist):
    '''
    Categorizes a data set and returns the proportion of stimuli/test rows
    that were categorized inaccurately. A lower value means a lower amount of
    error. Designed to be used with parameter fitting functions to assign
    values to attention weighting for the dimensions.
    
    Required paratemers:
    
    x = Array. Initial guesses for c
    
    test = DataFrame. Stimuli to be categorized
    
    exemplars = DataFrame. Exemplar cloud to use for categorization
    
    catslist = List of strings. Each string should correspond to a category that
        should be assigned to the test

    
    
    '''
    cval = x[0]
    choices = multicat(test, cloud, catslist, dimsvals, cval)
    accuracy = checkaccuracy(choices, catslist)
    category = catslist[0]
    err = accuracy[category+"Acc"].value_counts(normalize=True)['n']
    return err

## Optimize!
You may want to come back later for the results: 
- Results will be saved to a spreadsheet. (\[NAME\].csv)
- Settings will be saved to a text file. (\[NAME\]\_info.txt)

### Attention weights only

In [8]:
resultslist=[['start','fit','error','evals']] # initialize a list for results


print("----- Parameters -----")
if anchordim != None: 
    print("Anchored (1):  ", anchordim)
    
print("Optimized:     ", fitdims)
print("")
print("Categorizing for: ", cats)
print("")
print("Trials: ", n)
print("")

for i in range(0,n): 
    dimsguess=np.divide(random.sample(range(0,300),len(fitdims)),100)
        # Get a random sample of numbers between 0 and 300,
           # divide by 100 to get floats between 0 and 3  
    xguess = dimsguess
    start = xguess
    
    print ("-----", (i+1) ," -----")
    print("Initial guess:    ", start)

    
    result = minimize(calcerror_dims,
                  xguess,  # the initial guess array
                  args=(test, cloud, cats, fitdims, cval, anchordim), # arguments for the error function
                  method='Powell',  
                  tol = t)  # a 'tolerance' value, smaller means more function evaluation, but potentially better fit
    start = xguess
    fit = np.round(result.x,3)
    error = result.fun
    evals = result.nfev
    row = [start,fit,error,evals]
    resultslist.append(row)
    

    print("Optimized:        ", fit)
    print(" ")
    print("Number evals: ", evals)
    print("Error:        ", error)
    print("")
results=pd.DataFrame(resultslist)
results.columns = results.iloc[0]
results=results[1:]

settings = {"fitdims": fitdims, "anchordim": anchordim, "cats": cats, "trials":n, "tol": t }

# write results to csv
### good for if you want to leave it running while you do something else!
results.to_csv(name+".csv")     
with open((name+"_info.txt"),"w") as file:
    file.write(str(settings))


----- Parameters -----
Anchored (1):   z0
Optimized:      c value, ['z1', 'z2', 'z3']

Categorizing for:  ['vowel']

Trials:  2

----- 1  -----
Initial guess:     [0.13 2.32 0.8 ]
Optimized:         [ 2.979  1.205 -2.106]
 
Number evals:  83
Error:         0.06

----- 2  -----
Initial guess:     [2.96 1.1  1.85]
Optimized:         [1.411 1.413 2.232]
 
Number evals:  36
Error:         0.1



### Both attention weights and c

In [9]:
resultslist=[['start','fit','error','evals']] # initialize a list for restults


print("----- Parameters -----")
if anchordim != None: 
    print("Anchored (1):  ", anchordim)
    
print("Optimized:     ", "c value,", fitdims)
print("")
print("Categorizing for: ", cats)
print("")
print("Trials: ", n)
print("")

for i in range(0,n): 
    cguess = random.sample(range(0, 50),1)
    dimsguess=np.divide(random.sample(range(0,300),len(fitdims)),100)
        # Get a random sample of numbers between 0 and 300,
           # divide by 100 to get floats between 0 and 3  
    xguess = np.concatenate((cguess, dimsguess))
    start = xguess
    
    print ("-----", (i+1) ," -----")
    print("Initial guess:    ", start)

    
    result = minimize(calcerror_both,
                  xguess,  # the initial guess array
                  args=(test, cloud, cats, fitdims, anchordim), # arguments for the error function
                  method='Powell',  
                  tol = t)  # a 'tolerance' value, smaller means more function evaluation, but potentially better fit
    start = xguess
    fit = np.round(result.x,3)
    error = result.fun
    evals = result.nfev
    row = [start,fit,error,evals]
    resultslist.append(row)
    

    print("Optimized:        ", fit)
    print(" ")
    print("Number evals: ", evals)
    print("Error:        ", error)
    print("")
results=pd.DataFrame(resultslist)
results.columns = results.iloc[0]
results=results[1:]

settings = {"fitdims": fitdims, "anchordim": anchordim, "cats": cats, "trials":n, "tol": t }

# write results to csv
### good for if you want to leave it running while you do something else!
results.to_csv(name+".csv")     
with open((name+"_info.txt"),"w") as file:
    file.write(str(settings))


----- Parameters -----
Anchored (1):   z0
Optimized:      c value, ['z1', 'z2', 'z3']

Categorizing for:  ['vowel']

Trials:  2

----- 1  -----
Initial guess:     [49.    0.46  0.9   2.47]
Optimized:         [52.483  1.943 -2.15   5.953]
 
Number evals:  68
Error:         0.08

----- 2  -----
Initial guess:     [34.    0.51  0.66  2.34]
Optimized:         [34.382  1.684 -1.885  4.34 ]
 
Number evals:  43
Error:         0.08



### c only

In [10]:
resultslist=[['start','fit','error','evals']] # initialize a list for restults


print("----- Parameters -----")
    
print("Optimized:     ", "c value")
print("")
print("Categorizing for: ", cats)
print("")
print("Trials: ", n)
print("")

for i in range(0,n):
    cguess = np.divide(random.sample(range(cmin, (cmax*100)),1),100)
    xguess = cguess
    start = xguess
    
    print ("-----", (i+1) ," -----")
    print("Initial guess:    ", start)

    
    result = minimize(calcerror_c,
                  xguess,  # the initial guess array
                  args=(test, cloud, dimsvals, cats), # arguments for the error function
                  method='Powell',  
                  tol = t)  # a 'tolerance' value, smaller means more function evaluation, but potentially better fit

    fit = np.round(result.x,3)
    error = result.fun
    evals = result.nfev
    row = [start,fit,error,evals]
    resultslist.append(row)
    

    print("Optimized:        ", fit)
    print(" ")
    print("Number evals: ", evals)
    print("Error:        ", error)
    print("")
results=pd.DataFrame(resultslist)
results.columns = results.iloc[0]
results=results[1:]

settings = {"dims": dimslist, "cats": cats, "trials":n, "tol": t, "cmin": cmin, "cmax": cmax }

# write results to csv
### good for if you want to leave it running while you do something else!
results.to_csv(name+".csv")     
with open((name+"_info.txt"),"w") as file:
    file.write(str(settings))


----- Parameters -----
Optimized:      c value

Categorizing for:  ['vowel']

Trials:  2

----- 1  -----
Initial guess:     [18.82]
Optimized:         [19.82]
 
Number evals:  5
Error:         0.12

----- 2  -----
Initial guess:     [35.72]
Optimized:         [36.72]
 
Number evals:  5
Error:         0.1

