In [5]:
# Imports
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as sts
import seaborn as sns
import statsmodels
import statsmodels.api as sm
from statsmodels.formula.api import ols

from lib.utility_functions import *
from lib.exp4 import *

# Config
sns.set_style('white')
sns.set_context('talk')

pd.set_option('display.max_columns', 40)

% matplotlib inline

In [6]:
tidy = pd.read_csv('./tidy_data.csv', index_col=0)

In [7]:
# Are subjects more likely to reproduce some features than others? *
# Are trained subjects more likely to reproduce game set features? ***
# Probability of missing a piece that is / is not part of a feature (or by # of features piece is part of)

In [8]:
hstarts = [i for row in range(4) for i in range(9*row, 9*row + 6, 1)]
vstarts = list(range(9))
ddstarts = list(range(6))
dustarts = list(range(4, 9))


def _add_position_strings(bp, wp):
    return ''.join([str(int(b) + int(w)) for b, w in zip(bp, wp)])


def _count_feature(bp, wp, feature):
    
    # Get the overall occupancy of position
    p = _add_position_strings(bp, wp)

    # Initialize count matrices
    bcounts = np.zeros(36, dtype=np.uint8)
    wcounts = np.zeros(36, dtype=np.uint8)
    
    # Helper function to detect matchs in different orientations
    def _orient_count(start, increment):
    
        end = start + 4 * increment

        for orientation in [1, -1]:
            total_match = p[start:end:increment] == feature[::orientation]

            if not total_match:
                # If the complete position is not the same as feature,
                #    it means that some locations that should have been 
                #    empty were not, so just continue
                continue
                
            black_match = bp[start:end:increment] == feature[::orientation]

            if black_match:
                bcounts[start:end:increment] += 1

                # If we found a black_match, no need to check white position
                break

            white_match = wp[start:end:increment] == feature[::orientation]

            if white_match:
                wcounts[start:end:increment] += 1

        return None
    
    # For every horizontal starting value
    for start in hstarts:
        _orient_count(start, 1)
          
    # Etc
    for start in vstarts:
        _orient_count(start, 9)
            
    for start in dustarts:
        _orient_count(start, 8)
            
    for start in ddstarts:
        _orient_count(start, 10)
             
    return bcounts + wcounts


def count_all_features(row):
    features = ['1100', '1010', '1001', '1110', '1101', '1111']
    bp = row['Black Position']
    wp = row['White Position']
    
    output_dict = {}
    for feature in features:
        count = _count_feature(bp, wp, feature)
        
        output_dict[feature] = count
        
    return output_dict

In [9]:
def _detect_type_2_error(bi, bf, wi, wf):
    original_empty = ((bf == '0') and (wf == '0')) 
    final_not_empty = ((bi == '1') or (wi == '1'))
    
    return int(original_empty and final_not_empty)

def _detect_type_3_error(bi, bf, wi, wf):
    b2w = ((bi == '1') and (wf == '1'))
    w2b = ((wi == '1') and (bf == '1'))
    
    return int(b2w or w2b)

def count_all_errors(row):
    bpi = row['Black Position']
    bpf = row['Black Position (final)']
    
    wpi = row['White Position']
    wpf = row['White Position (final)']
    
    
    type_2_errors = [
        _detect_type_2_error(bi, bf, wi, wf)
        for bi, bf, wi, wf in zip(bpi, bpf, wpi, wpf)
    ]
    
    type_3_errors = [
        _detect_type_3_error(bi, bf, wi, wf)
        for bi, bf, wi, wf in zip(bpi, bpf, wpi, wpf)
    ]
    
    return {'Type 2': type_2_errors, 'Type 3': type_3_errors}

In [10]:
feature_count_df = pd.DataFrame(tidy.apply(count_all_features, axis=1).tolist())
error_df = pd.DataFrame(tidy.apply(count_all_errors, axis=1).tolist())
sum_df = pd.concat([error_df, feature_count_df], axis=1)

In [11]:
def sum_features(row):
    
    counts = np.zeros(36, dtype=np.uint8)
    
    for name in row.index:
        if 'Type' not in name:

            counts += np.stack(row[name])
                
    return counts.tolist()

sum_df['all'] = sum_df.apply(sum_features, axis=1)

In [12]:
def bin_errors_by_num_features(row, error_type):
    type2 = row[error_type]
    feats = row['all']
    
    counts = {}
    for i, f in enumerate(feats):
        if f not in counts.keys():
            counts[f] = 0
            
        counts[f] += type2[i]
        
    return counts


def bin_errors_type2(row):
    return bin_errors_by_num_features(row, 'Type 2')


def bin_errors_type3(row):
    return bin_errors_by_num_features(row, 'Type 3')


def bin_features(row):
    idx = row.name
    bp = tidy.iloc[idx]['Black Position']
    wp = tidy.iloc[idx]['White Position']
    p = _add_position_strings(bp, wp)
    p = list(map(int, p))
    
    feats = row['all']
    
    counts = {}
    for i, f in enumerate(feats):
        if f not in counts.keys():
            counts[f] = 0
            
        counts[f] += p[i]
        
    return counts
    

type2_counts = pd.DataFrame(sum_df.apply(bin_errors_type2, axis=1).tolist()).fillna(0)
type3_counts = pd.DataFrame(sum_df.apply(bin_errors_type3, axis=1).tolist()).fillna(0)
feature_counts = pd.DataFrame(sum_df.apply(bin_features, axis=1).tolist()).fillna(0)

In [13]:
# Spearman: # features, # errors

In [19]:
type2_counts.sum(axis=0) / feature_counts.sum(axis=0)

0    0.258431
1    0.256140
2    0.261877
3    0.246600
4    0.146491
5    0.032895
6    0.131579
dtype: float64

In [14]:
sum_df.head()

Unnamed: 0,Type 2,Type 3,1001,1010,1100,1101,1110,1111,all
0,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 1, 2, ...","[1, 1, 2, 2, 2, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, ...","[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 2, 2, 2, 4, 2, 1, 0, 0, 0, 1, 2, 2, 3, 2, ..."
1,"[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ..."
2,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 2, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 2, 1, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, ..."
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, ...","[0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 2, 1, 2, 1, 0, 0, 0, 0, 0, 1, ..."
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 2, 3, 2, 2, 0, 0, 0, 0, 0, 0, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 2, 3, 2, 2, 1, 0, 0, 0, 0, 0, 2, 2, ..."


In [15]:
dist2 = type2_counts.sum(axis=0) / feature_counts.sum(axis=0)
# for Type II/III errors, number of possible errors is limited by number of pieces
# so feature_counts is for each position the number of pieces
# with X features present

In [16]:
dist3 = type3_counts.sum(axis=0) / feature_counts.sum(axis=0)

In [12]:
sts.ks_2samp(dist2.values, dist3.values)

Ks_2sampResult(statistic=0.8571428571428572, pvalue=0.004170568509644835)

In [None]:
# For each number of features, count the number of Type 2 errors

type2 = sum_df.iloc[0]['Type 2']
feats = sum_df.iloc[0]['all']

print(type2)
print(feats)

In [None]:
type_2_error_counts = np.stack(sum_df['Type 2'].values)
total_feature_counts = np.stack(sum_df['all'].values)

In [None]:
def error_count_against_num_features(row, error_type):
    fc = np.stack(row['all']).astype(np.uint8)
    ec = np.stack(row[error_type]).astype(np.uint8)
    
    pcount = {
        k: np.sum(ec[fc == k])
        for k in range(fc.max()+1)
    }
    
    return pcount


def error2_count_against_num_features(row):
    return error_count_against_num_features(row, 'Type 2')
    

def error3_count_against_num_features(row):
    return error_count_against_num_features(row, 'Type 3')


def instance_count_against_num_features(row):
    fc = np.stack(row['all']).astype(np.uint8)
    
    pcount = {
        k: np.sum(fc == k)
        for k in range(fc.max()+1)
    }
    
    return pcount

In [None]:
type2_errors_by_feature_count = pd.DataFrame(
    sum_df.apply(error2_count_against_num_features, axis=1).tolist()
).fillna(0)

type3_errors_by_feature_count = pd.DataFrame(
    sum_df.apply(error3_count_against_num_features, axis=1).tolist()
).fillna(0)

instances_by_feature_count = pd.DataFrame(
    sum_df.apply(instance_count_against_num_features, axis=1).tolist()
).fillna(0)

In [None]:
p_type2_j_num_features = type2_errors_by_feature_count.sum(axis=0) / tidy['Num Pieces'].sum()
p_num_features = instances_by_feature_count.sum(axis=0) / instances_by_feature_count.sum()

err2_dist = p_type2_j_num_features / p_num_features

In [None]:
err2_dist

In [None]:
p_type3_j_num_features = type3_errors_by_feature_count.sum(axis=0) / tidy['Num Pieces'].sum()

err3_dist = p_type3_j_num_features / p_num_features

In [None]:
err3_dist.mean()

In [None]:
err2_dist.mean()

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

axes[0].bar(np.arange(7), err2_dist)
axes[1].bar(np.arange(7), err3_dist)

sns.despine()

In [None]:
err2_tidy = pd.melt(
    type2_errors_by_feature_count,
    var_name='Num Features', value_name='Error Count'
)

err2_tidy['dummy'] = err2_tidy['Error Count']

err2_sum_piv = err2_tidy.pivot_table(
    index='Num Features', values='Error Count', 
    aggfunc=np.sum
)

err2_len_piv = err2_tidy.pivot_table(
    index='Num Features', values='Error Count',
    aggfunc=len
)

err2_sum_piv / err2_len_piv

In [None]:
err2_tidy.head()

In [None]:
err2_len_piv = err2_tidy.pivot_table(
    index='Num Features', columns='Error Count', values='dummy',
    aggfunc=len
)

err2_len_piv.fillna(0)

In [None]:
err2_sum_piv = err2_tidy.pivot_table(
    index='Num Features', columns='Error Count', values='dummy',
    aggfunc=np.sum
)

p_num_err2_j_num_feat = err2_sum_piv.fillna(0) / err2_tidy['Error Count'].sum()

In [None]:
p_num_feat = instances_by_feature_count.sum() / instances_by_feature_count.sum().sum()
p_num_feat

In [None]:
p_num_feat.sum()

In [None]:
p_num_err2_j_num_feat.sum().sum()

In [None]:
p_num_err2_c_num_feat = p_num_err2_j_num_feat.copy()
p_num_err2_c_num_feat.loc[:, :] = p_num_err2_j_num_feat.values / p_num_feat.values[:, np.newaxis]
p_num_err2_c_num_feat

In [None]:
p_num_err2_c_num_feat.sum(axis=1)

In [None]:
err2_tidy['Error Count'].sum()

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 6))



err3_tidy = pd.melt(
    type3_errors_by_feature_count / instances_by_feature_count, 
    var_name='Num Features', value_name='Error Prob'
)

sns.factorplot(
    x='Num Features', y='Error Prob', data=err2_tidy, ax=axes[0],
    kind='bar', ci=95, n_boot=1000, color='grey'
)

sns.factorplot(
    x='Num Features', y='Error Prob', data=err3_tidy, ax=axes[1],
    kind='bar', ci=95, n_boot=1000, color='grey'
)


plt.setp(axes[0], ylabel='Type 2 Error Probability')
plt.setp(axes[1], ylabel='Type 3 Error Probability')

sns.despine(ax=axes[0])
sns.despine(ax=axes[1])

In [None]:
tidy['Type III Errors'].sum() / tidy['Num Pieces'].sum()

In [None]:
dustarts

_idx = list(range(36))[8:40:8]

_l = np.zeros(36)
_l[_idx] = 1
_l.reshape((4, 9))

print(list(range(36))[5:45:10])

row = sum_df.iloc[0]

row.index

In [None]:
position_string = tidy.iloc[0]['Black Position']
feature = '1010'
start, end = 0, 4

print(position_string)
position_string[start:end] == feature
position_string[start:end:9] == feature



In [None]:
row = tidy.iloc[0]
bpi = row['Black Position']
bpf = row['Black Position (final)']
wpi = row['White Position']
wpf = row['White Position (final)']

error_counts = errors(row)
print(''.join([str(i) for i in error_counts['Type 2']]))

In [None]:
initial = ''.join([str(int(b) + int(w)) for b, w in zip(bpi, wpi)])
final = ''.join([str(int(b) + int(w)) for b, w in zip(bpf, wpf)])

In [None]:
print(initial)
print(''.join([str(i) for i in error_counts['Type 2']]))
print(final)

In [None]:
print(bpi)
print(wpf)
print(''.join([str(i) for i in error_counts['Type 3']]))


In [None]:
start = 1
position_string[start:start+28:9]

In [None]:
def position_string_to_array(position_string):
    position_list = np.stack([int(c) for c in position_string]).reshape((4, 9))
    
    return position_list

black_positions = np.stack(tidy['Black Position'].map(position_string_to_array).values)

In [None]:
black_positions[0]

In [None]:
black_positions.shape

In [None]:
feature1 = np.array([1, 1, 0, 0])
feature2 = np.array([1, 0, 1, 0])
feature3 = np.array([1, 0, 0, 1])
feature4 = np.array([1, 1, 1, 0])
feature5 = np.array([1, 1, 0, 1])
feature6 = np.array([1, 1, 1, 1])

def count_feature_occurrences(positions, feature):
    counts = np.zeros_like(positions)
    pass
    
    

In [None]:
position_string = tidy.iloc[0]['Black Position']

In [None]:
position = np.stack([c for c in position_string]).astype(np.uint8)

In [None]:
position

In [None]:
feature = np.zeros_like(position)
start, end = 0, 4
all(position[np.arange(start, end, 1)] == feature1)

In [None]:
from scipy.signal import convolve2d

In [None]:
feature = feature1
convolve2d(black_positions[0], feature[np.newaxis, :], mode='same') == feature.sum()

In [None]:
black_positions[0]