# Language abstraction analysis notebook

## Corpus

2+ annotations per item

In [None]:
# Stimulus subdomains available within each top-level domain.
subdomains = dict(
    structures=['bridge', 'city', 'house', 'castle'],
    drawing=['nuts-bolts', 'wheels', 'dials', 'furniture'],
)

# Domain names, in the insertion order of `subdomains`.
domains = [domain_name for domain_name in subdomains]

In [None]:
import os
import sys
import urllib, io
os.getcwd()
sys.path.append("..")
# sys.path.append("../utils")
sys.path.append("../../../stimuli")

import numpy as np
import scipy.stats as stats
import scipy.spatial.distance as distance
import pandas as pd

from collections import Counter
from functools import reduce
import json
import re
import ast

from PIL import Image, ImageOps, ImageDraw, ImageFont, ImageColor

from io import BytesIO
import base64

import random
import  matplotlib
from matplotlib import pylab, mlab, pyplot
from matplotlib.colors import LinearSegmentedColormap, ListedColormap
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42

import seaborn as sns
sns.set_context('talk')
sns.set_style('whitegrid')

from IPython.display import clear_output, Image, HTML

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

import spacy
nlp = spacy.load("en_core_web_lg")
# sys.path.append("../../stimuli/towers/block_utils/")
# import blockworld_utils as utils

In [None]:
# styling for paper_figures

sns.set_style('white', {'axes.linewidth': 0.5})
plt.rcParams.update({
    'xtick.major.size': 6,
    'ytick.major.size': 6,
    'xtick.major.width': 2,
    'ytick.major.width': 2,
    'xtick.bottom': True,
    'ytick.left': True,
})

# Four hues (blue, orange, green, red), each in a light / mid / dark shade.
LIGHT_BLUE, LIGHT_ORANGE, LIGHT_GREEN, LIGHT_RED = "#56B0CD", "#FFCE78", "#95C793", "#CC867A"
BLUE, ORANGE, GREEN, RED = "#009BCD", "#FFA300", "#688B67", "#CC5945"
DARK_BLUE, DARK_ORANGE, DARK_GREEN, DARK_RED = "#0E4478", "#A46400", "#275C4A", "#9B3024"

LIGHT_SHADES = [LIGHT_BLUE, LIGHT_ORANGE, LIGHT_GREEN, LIGHT_RED]
MID_SHADES = [BLUE, ORANGE, GREEN, RED]
DARK_SHADES = [DARK_BLUE, DARK_ORANGE, DARK_GREEN, DARK_RED]


def _by_domain_and_subdomain(shades):
    """Build {domain: {subdomain: shade}} for the first two domains.

    The i-th subdomain of each domain is paired with the i-th entry of
    `shades`, so both domains share the same four-hue scheme.
    """
    return {
        domain_name: dict(zip(subdomains[domain_name], shades))
        for domain_name in domains[:2]
    }


domain_palettes_light = _by_domain_and_subdomain(LIGHT_SHADES)
domain_palettes = _by_domain_and_subdomain(MID_SHADES)
domain_palettes_dark = _by_domain_and_subdomain(DARK_SHADES)

# One colormap per hue, running light -> mid -> dark over N entries.
N=256
gradients = []

for light, mid, dark in zip(LIGHT_SHADES, MID_SHADES, DARK_SHADES):
    light_rgb = list(ImageColor.getcolor(light, "RGB"))
    mid_rgb = list(ImageColor.getcolor(mid, "RGB"))
    dark_rgb = list(ImageColor.getcolor(dark, "RGB"))

    # rows are RGBA; the alpha column keeps its initial value of 1
    vals = np.ones((N, 4))
    for channel in range(3):  # 0 = R, 1 = G, 2 = B
        light_to_mid = np.linspace(light_rgb[channel]/255, mid_rgb[channel]/255, int(N/2))
        mid_to_dark = np.linspace(mid_rgb[channel]/255, dark_rgb[channel]/255, int(N/2))
        vals[:, channel] = np.append(light_to_mid, mid_to_dark)

    newcmp = ListedColormap(vals)
    gradients.append(newcmp)

domain_gradients = _by_domain_and_subdomain(gradients)


In [None]:
def add_numbers_and_space(responses):
    """Number each response (starting at 0) and join them with newlines.

    Parameters
    ----------
    responses : iterable
        The responses to enumerate; each one is formatted as "<index>: <response>".

    Returns
    -------
    str
        All numbered responses, one per line.
    """
    # `index` rather than `id`, which would shadow the builtin
    return '\n'.join(f"{index}: {response}" for (index, response) in enumerate(responses))


def _group_responses_by(df, key, config_name):
    """Collapse the `responses` of all rows sharing `key` into one numbered string.

    The combined string is written to `df[config_name]` (the input frame gains
    this column, as before) and one row per distinct (key, combined string)
    pair is returned.
    """
    df[config_name] = df[[key, 'responses']].groupby([key])['responses'].transform(add_numbers_and_space)
    # drop_duplicates returns a new frame; its result was previously discarded,
    # so callers received one (identical) row per original response
    return df[[key, config_name]].drop_duplicates()


def group_by_stim_url(df, config_name):
    """Return one row per `stimURL` with all of its responses numbered and joined.

    Side effect: adds the column `config_name` to `df`.
    """
    return _group_responses_by(df, 'stimURL', config_name)


def group_by_stim_id(df, config_name):
    """Return one row per `stimId` with all of its responses numbered and joined.

    Side effect: adds the column `config_name` to `df`.
    """
    return _group_responses_by(df, 'stimId', config_name)

def path_to_image_html(path):
    '''
    Wrap an image path/URL in an HTML <img> tag for display inside a
    DataFrame rendered with `to_html(escape=False)`.

    The image is capped at a width of 100px; adjust the inline style to
    control height, aspect ratio, size etc.

    `path` is inserted verbatim (it is not HTML-escaped).
    '''

    # the style attribute was previously emitted as `style=max-width:100px "`,
    # i.e. with an unbalanced quote; quote the value properly
    return '<img src="' + path + '" style="max-width:100px" />'


def stimId_to_s3URL(domain, subdomain, stimID):
    """Build the S3 URL of the stimulus image for one item.

    The bucket is named after the domain and subdomain; the stimulus id is
    zero-padded to three digits. For 'structures' the id precedes the "-all"
    suffix in the file name, for every other domain it follows it.
    """
    bucket_url = "https://lax-{}-{}-all.s3.amazonaws.com/".format(domain, subdomain)
    padded_id = str(stimID).zfill(3)

    if domain == 'structures':
        filename_template = "lax-{}-{}-{}-all.png"
    else:
        # NOTE(review): original carried a "check this" marker on this naming scheme
        filename_template = "lax-{}-{}-all-{}.png"

    return bucket_url + filename_template.format(domain, subdomain, padded_id)
    

def stimId_to_html(stimId, domain = 'structures', subdomain = 'bridge'):
    '''
    Render a stimulus id as an HTML <img> tag pointing at its S3 image.

    The URL is built by `stimId_to_s3URL(domain, subdomain, stimId)` and the
    image is capped at a width of 150px. Intended as a column formatter for
    `DataFrame.to_html(escape=False)`.
    '''
    stimURL = stimId_to_s3URL(domain, subdomain, stimId)
    # the style attribute was previously emitted as `style=max-width:150px "`,
    # i.e. with an unbalanced quote; quote the value properly
    return '<img src="'+ stimURL + '" style="max-width:150px" />'




In [None]:
# load the trial-level dataframe
# (an earlier export, 'lax_corpus_1k_trial.csv', lives in the same directory)

results_csv_directory = "../../results/csv/"
trial_csv_path = os.path.join(results_csv_directory, 'lax_corpus_1k_trials_cogsci22.csv')
df_trial = pd.read_csv(trial_csv_path)

## Preprocessing

### column name descriptions

```
id
'datatype': 
'iterationName':
'config_name':
    
'condition':
'domain': structures/ drawing
'subdomain': 
'gameID': uuid for participant

'shuffle':
'trialOrder':

'rt': reaction time
'rt_mins': reaction time in minutes

'trial_index': jspsych trial number (not experimental)
'trial_type':
'time_elapsed': 
'complete_dataset': did participant submit 10 responses?
'trial_num': trial number
    
'responses': complete response of what and where messages
'response_lists': same as above, but list of lists
'whats': list of what responses
'wheres': list of where responses
'n_steps': number of steps
'what_messages_lengths': list of lengths of what responses (characters)
'where_messages_lengths': list of lengths of where responses (characters)
'what_char_sum': total characters in what responses
'where_char_sum': total characters in where responses
'char_sum': total characters in responses 
'ppt_hit_8_step_limit': participant was in version of experiment with 8 steps, and hit this limit on at least one trial


'lemmatized_whats': lemmatized by spacy
'lemmatized_wheres':
'lemmatized_notstop_whats': lemmatized by spacy, stop words (incl numbers) removed
'lemmatized_notstop_wheres': 
'lemmatized_filtered_whats': lemmatized by spacy, determiners, punctuation and symbols removed
'lemmatized_filtered_wheres':

``` 

### Metadata    
```
'internal_node_id':
'view_history':
'stimId':
'stimURL':
'stim_group':
'partitionFamily':
'splitNumber':
'stimIDs':
'stimURLS':
'stimGroups':
'numGames':
'experimentType':
'experimentName':
'versionInd':
```

### Common preprocessing

Most preprocessing is dealt with in ./lax_corpus_data_generator.ipynb

Here we add preprocessing steps common to several but not all analyses

In [None]:
# interpret more complex data structures i.e. lists
# (these columns are read from the CSV as the string repr of Python literals)
LITERAL_COLUMNS = ['responses',
                   'whats',
                   'wheres',
                   'lemmatized_whats',
                   'lemmatized_notstop_whats',
                   'lemmatized_filtered_whats',
                   'lemmatized_wheres',
                   'lemmatized_notstop_wheres',
                   'lemmatized_filtered_wheres',
                   'low_level_parts',
                   'mid_level_parts',
                   'high_level_parts',
                   'low_level_part_types',
                   'mid_level_part_types',
                   'high_level_part_types',
                   'low_level_part_params',
                   'mid_level_part_params',
                   'high_level_part_params',
                   'dreamcoder_program_dsl_0_tokens'
                  ]


def _parse_literal(value):
    """Parse `value` with ast.literal_eval if it is still a string.

    Values that are not strings (e.g. lists parsed by a previous run of this
    cell) are returned unchanged, which makes the cell safe to re-run:
    ast.literal_eval raises when handed an already-parsed list.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


for column_name in LITERAL_COLUMNS:
    df_trial[column_name] = df_trial[column_name].apply(_parse_literal)


In [None]:
# every program must already be parsed into a plain list of tokens
assert all(type(tokens) == list for tokens in df_trial.dreamcoder_program_dsl_0_tokens)

# program length = number of tokens in the base-DSL program
df_trial.loc[:,'base_program_length'] = df_trial.dreamcoder_program_dsl_0_tokens.map(len)

In [None]:
# add mean word count for each stim
item_keys = ['domain','subdomain','stimId']

# Select the column *before* aggregating: averaging the whole frame fails on
# its non-numeric (string / list) columns in pandas >= 2.0, where
# numeric_only no longer defaults to dropping them.
what_word_sum_means = (
    df_trial.groupby(item_keys)['what_word_sum']
            .mean()
            .reset_index()
            .rename(columns={'what_word_sum':'what_word_mean'})
)

#add means to df_trial (only do this if you will take one row per item from df_trial)
# Drop any 'what_word_mean' left over from a previous run of this cell first,
# otherwise the merge would produce what_word_mean_x / what_word_mean_y.
df_trial = (
    df_trial.drop(columns='what_word_mean', errors='ignore')
            .merge(what_word_sum_means, how='left', on=item_keys)
)

In [None]:
from collections import defaultdict

from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Map each lemmatized "what" word to a part-of-speech tag.
# `whats_pos` is still a string here and is parsed per row; it is indexed in
# parallel with `lemmatized_whats` (step i, word j). If a word occurs with
# several tags, the last occurrence wins.
word_to_pos = {}
for whats_list, pos_repr in zip(df_trial["lemmatized_whats"], df_trial["whats_pos"]):
    pos_list = ast.literal_eval(pos_repr)
    for i, item in enumerate(whats_list):
        for j, word in enumerate(item):
            word_to_pos[word] = pos_list[i][j]

In [None]:
# Replace common misspellings (applied to top-word analyses below)
# Every known word maps to itself unless it is listed as a misspelling or
# abbreviation here.
spelling_map = {w: w for w in word_to_pos.keys()}
spelling_map.update({
    "boarder": "border",
    "centre": "center",
    "cirlce": "circle",
    "cirlcle": "circle",
    "colour": "color",
    "collumn": "column",
    "columb": "column",
    "colum": "column",
    "hexgon": "hexagon",
    "heaxgon": "hexagon",
    "heaxagon": "hexagon",
    "hexagin": "hexagon",
    "hexogram": "hexagon",
    "horiz": "horizontal",
    "octogon": "octagon",
    "octogan": "octagon",
    "rec": "rectangle",
    "rect": "rectangle",
    "rectagle": "rectangle",
    "recagle": "rectangle",
    "sqaure": "square",
    "squae": "square",
    "squar": "square",
    "sqar": "square",
    "sqare": "square",
    "squre": "square",
    "verticle": "vertical",
})

In [None]:
# persist the preprocessed trial dataframe in the results CSV directory
preprocessed_csv_path = os.path.join(results_csv_directory, 'lax_corpus_1k_trials_cogsci22_preprocessed.csv')
df_trial.to_csv(preprocessed_csv_path)

## Analysis of programs

- gallery: longest / average-length / shortest programs in each domain
- length: 
- diversity domain-specificity
- program token-level distinctiveness
- in: domain, subdomain, library_0 vs. library_compressive


In [None]:
# use separate df with one entry per item
programs_csv_path = '../../results/csv/lax_corpus_1k_programs_cogsci22.csv'
df_programs = pd.read_csv(programs_csv_path)

In [None]:
# tokens are stored as the string repr of a list: parse, then count them
program_tokens = df_programs.dreamcoder_program_dsl_0_tokens.map(ast.literal_eval)
df_programs['base_program_length'] = program_tokens.map(len)

In [None]:
# find order of complexity for subdomains (mean base-DSL program length)
program_length_by_subdomain = df_programs.groupby(['domain','subdomain'])['base_program_length']
program_length_by_subdomain.mean()

In [None]:
# display the items with the shortest base-DSL programs in each structures subdomain

top_n = 10
gallery_columns = ['domain','subdomain','stimId','base_program_length']

for domain in ['structures']:
    for subdomain in subdomains[domain]:

        in_subdomain = (df_programs.domain == domain) & (df_programs.subdomain == subdomain)
        df_subdomain = df_programs.loc[in_subdomain, gallery_columns]\
                        .sort_values('base_program_length', ascending=True)

        # render each stimId as an inline <img> thumbnail of the stimulus
        stim_formatter = lambda x: stimId_to_html(x, domain = domain, subdomain = subdomain)
        table_html = df_subdomain.head(top_n).to_html(escape=False,
                                                      formatters=dict(stimId=stim_formatter))

        display(HTML(table_html.replace("\\n","<br>=======<br><br>")))
        
        
        

In [None]:
# display the items with the longest base-DSL programs in each structures subdomain

top_n = 10
gallery_columns = ['domain','subdomain','stimId','base_program_length']

for domain in ['structures']:
    for subdomain in subdomains[domain]:

        in_subdomain = (df_programs.domain == domain) & (df_programs.subdomain == subdomain)
        df_subdomain = df_programs.loc[in_subdomain, gallery_columns]\
                        .sort_values('base_program_length', ascending=False)

        # render each stimId as an inline <img> thumbnail of the stimulus
        stim_formatter = lambda x: stimId_to_html(x, domain = domain, subdomain = subdomain)
        table_html = df_subdomain.head(top_n).to_html(escape=False,
                                                      formatters=dict(stimId=stim_formatter))

        display(HTML(table_html.replace("\\n","<br>=======<br><br>")))
        
        
        

In [None]:
# display the items with the shortest base-DSL programs in each drawing subdomain

top_n = 10
gallery_columns = ['domain','subdomain','stimId','base_program_length']

for domain in ['drawing']:
    for subdomain in subdomains[domain]:

        in_subdomain = (df_programs.domain == domain) & (df_programs.subdomain == subdomain)
        df_subdomain = df_programs.loc[in_subdomain, gallery_columns]\
                        .sort_values('base_program_length', ascending=True)

        # render each stimId as an inline <img> thumbnail of the stimulus
        stim_formatter = lambda x: stimId_to_html(x, domain = domain, subdomain = subdomain)
        table_html = df_subdomain.head(top_n).to_html(escape=False,
                                                      formatters=dict(stimId=stim_formatter))

        display(HTML(table_html.replace("\\n","<br>=======<br><br>")))
        
        
        

In [None]:
# display the items with the longest base-DSL programs in each drawing subdomain

top_n = 10
gallery_columns = ['domain','subdomain','stimId','base_program_length']

for domain in ['drawing']:
    for subdomain in subdomains[domain]:

        in_subdomain = (df_programs.domain == domain) & (df_programs.subdomain == subdomain)
        df_subdomain = df_programs.loc[in_subdomain, gallery_columns]\
                        .sort_values('base_program_length', ascending=False)

        # render each stimId as an inline <img> thumbnail of the stimulus
        stim_formatter = lambda x: stimId_to_html(x, domain = domain, subdomain = subdomain)
        table_html = df_subdomain.head(top_n).to_html(escape=False,
                                                      formatters=dict(stimId=stim_formatter))

        display(HTML(table_html.replace("\\n","<br>=======<br><br>")))
        
        
        

## Analysis of language
 
- Visualize: longest / average-length / shortest word counts in each domain (see: lax-corpus-results-visualizer.ipynb)
- token-level diversity: across domains, across subdomains within domain, across stims within subdomain, across participants
- token-level distinctiveness (PMI, tf-idf): across domains, across subdomains within domain, across stims within subdomain, across participants
- same as above, but now on "semantic" representations: GloVe embeddings / BERT & co. from Hugging Face (spaCy also provides most of these, I think) -> show a t-SNE


### Characterizing language use

#### Number of steps in instructions

In [None]:
# distribution of the number of instruction steps per trial, split by domain
fig, ax = plt.subplots(figsize=(8,6))
sns.histplot(data=df_trial,
             x="n_steps",
             hue="domain",
             hue_order=['drawing','structures'],
             binwidth=1,
             stat='proportion',
             ax=ax)
ax.set_title('number of steps in instructions')
fig.savefig('./plots/instruction_steps_dist.pdf')

In [None]:
# independent-samples t-test: number of steps, drawing vs. structures
d_steps = df_trial.loc[df_trial.domain=='drawing', 'n_steps']
s_steps = df_trial.loc[df_trial.domain=='structures', 'n_steps']

stats.ttest_ind(d_steps, s_steps)

In [None]:
## number of steps over time (trial number), by domain
# only participants with a complete dataset are included
plt.figure(figsize=(10,6))
sns.lineplot(data=df_trial[(df_trial.complete_dataset)],
             x='trial_num',
             y='n_steps',
             hue='domain')
plt.ylim((1,11))
# the title previously read 'total characters across trials, by domain'
# (copy-paste from the character-count plot); the y-axis here is n_steps
plt.title('number of steps across trials, by domain')

#### Character count

In [None]:
# distribution of total characters in the WHAT responses per trial, split by domain
fig, ax = plt.subplots(figsize=(8,6))
sns.histplot(data=df_trial,
             x="what_char_sum",
             hue="domain",
             hue_order=['drawing','structures'],
             binwidth=20,
             stat='proportion',
             ax=ax)
ax.set_title('total characters in instructions')
fig.savefig('./plots/instruction_chars_dist.pdf')

In [None]:
# independent-samples t-test: total characters per response, drawing vs. structures
d_chars = df_trial.loc[df_trial.domain=='drawing', 'char_sum']
s_chars = df_trial.loc[df_trial.domain=='structures', 'char_sum']

stats.ttest_ind(d_chars, s_chars)

In [None]:
## total characters over time (trial number), by domain
complete_trials = df_trial[(df_trial.complete_dataset)]

fig, ax = plt.subplots(figsize=(10,6))
sns.lineplot(data=complete_trials, x='trial_num', y='char_sum', hue='domain', ax=ax)
ax.set_title('total characters across trials, by domain')


In [None]:
# total characters over time (trial number), by subdomain; demo stimulus excluded
complete_trials = df_trial[(df_trial.stimId != 'demo_stim') &
                           (df_trial.complete_dataset)]

fig, ax = plt.subplots(figsize=(10,6))
sns.lineplot(data=complete_trials, x='trial_num', y='char_sum', hue='subdomain', ax=ax)
# place the legend outside the axes, to the right
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)
ax.set_title('total characters across trials, by subdomain')

In [None]:
## WHAT characters over time (trial number), by domain
complete_trials = df_trial[(df_trial.complete_dataset)]

fig, ax = plt.subplots(figsize=(10,6))
sns.lineplot(data=complete_trials, x='trial_num', y='what_char_sum', hue='domain', ax=ax)
ax.set_title('total WHAT characters across trials, by domain')


In [None]:
# WHAT characters over time (trial number), by subdomain; demo stimulus excluded
complete_trials = df_trial[(df_trial.stimId != 'demo_stim') &
                           (df_trial.complete_dataset)]

fig, ax = plt.subplots(figsize=(10,6))
sns.lineplot(data=complete_trials, x='trial_num', y='what_char_sum', hue='subdomain', ax=ax)
# place the legend outside the axes, to the right
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)
ax.set_title('total WHAT characters across trials, by subdomain')

In [None]:
# WHAT (dashed) vs. WHERE (solid) character totals across trials, by domain
complete_trials = df_trial[(df_trial.stimId != 'demo_stim') &
                           (df_trial.complete_dataset)]

fig, ax = plt.subplots(figsize=(10,6))

sns.lineplot(data=complete_trials,
             x='trial_num',
             y='what_char_sum',
             hue='domain',
             linestyle='--',
             ax=ax)

# second layer shares the hue mapping, so its legend is suppressed
sns.lineplot(data=complete_trials,
             x='trial_num',
             y='where_char_sum',
             hue='domain',
             linestyle='-',
             legend=False,
             ax=ax)

ax.set_title('total characters across trials, WHAT vs. WHERE, by domain')

### Word-based measures

In [None]:
## n_whats_filtered per response, by domain (complete datasets only)
# NOTE(review): n_whats_filtered is presumably the number of "what" words left
# after the spaCy filtering step -- confirm against lax_corpus_data_generator.ipynb

plt.figure(figsize=(4,6))
sns.barplot(data=df_trial[(df_trial.complete_dataset)],
             x='domain',
             y='n_whats_filtered')
# labels previously said "unique what words", which describes n_unique_whats
plt.ylabel('filtered what words')
# plt.ylim((0,275))
plt.title('number of filtered what words per response')
# previously saved as unique_whats_domain.pdf, which the n_unique_whats bar plot
# further down writes as well (so this figure was silently overwritten)
plt.savefig('./plots/whats_filtered_domain.pdf')

In [None]:
# distribution of n_whats_filtered per trial, split by domain
plt.figure(figsize=(8,6))
sns.histplot(data=df_trial,
             x="n_whats_filtered",
             hue="domain",
             hue_order=['drawing','structures'],
             binwidth=1,
             stat='proportion')
# plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)
# title and file name were copy-pasted from the n_steps histogram; saving to
# instruction_steps_dist.pdf overwrote that figure
plt.title('number of filtered what words in instructions')
plt.savefig('./plots/whats_filtered_dist.pdf')

In [None]:
# independent-samples t-test: n_whats_filtered, drawing vs. structures
d_whats_filtered = df_trial.loc[df_trial.domain=='drawing', 'n_whats_filtered']
s_whats_filtered = df_trial.loc[df_trial.domain=='structures', 'n_whats_filtered']

stats.ttest_ind(d_whats_filtered, s_whats_filtered)

In [None]:
# mean of n_whats_filtered over structures-domain trials
s_whats_filtered.mean()

In [None]:
# mean of n_whats_filtered over drawing-domain trials
d_whats_filtered.mean()

In [None]:
## unique what words per response, by domain (complete datasets only)
complete_trials = df_trial[(df_trial.complete_dataset)]

fig, ax = plt.subplots(figsize=(4,6))
sns.barplot(data=complete_trials, x='domain', y='n_unique_whats', ax=ax)
ax.set_ylabel('unique what words')
ax.set_title('number of unique words used per response')
fig.savefig('./plots/unique_whats_domain.pdf')

In [None]:
# distribution of unique what words per trial, split by domain
plt.figure(figsize=(8,6))
sns.histplot(data=df_trial,
             x="n_unique_whats",
             hue="domain",
             hue_order=['drawing','structures'],
             binwidth=1,
             stat='proportion')
# plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)
# title and file name were copy-pasted from the n_steps histogram; saving to
# instruction_steps_dist.pdf overwrote that figure
plt.title('number of unique what words in instructions')
plt.savefig('./plots/unique_whats_dist.pdf')

In [None]:
# independent-samples t-test: unique what words, drawing vs. structures
d_unique_whats = df_trial.loc[df_trial.domain=='drawing', 'n_unique_whats']
s_unique_whats = df_trial.loc[df_trial.domain=='structures', 'n_unique_whats']

stats.ttest_ind(d_unique_whats, s_unique_whats)

In [None]:
## unique what words per response, by subdomain (complete datasets only)
complete_trials = df_trial[(df_trial.complete_dataset)]

fig, ax = plt.subplots(figsize=(6,6))
sns.barplot(data=complete_trials,
            x='subdomain',
            y='n_unique_whats',
            hue='domain',
            ax=ax)
ax.set_ylabel('unique what words')
plt.xticks(rotation = 45)
# place the legend outside the axes, to the right
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)
ax.set_title('number of unique words used per response')
fig.savefig('./plots/unique_whats_subdomain.pdf')

In [None]:
## unique what words over time (trial number), by domain
complete_trials = df_trial[(df_trial.complete_dataset)]

fig, ax = plt.subplots(figsize=(10,6))
sns.lineplot(data=complete_trials, x='trial_num', y='n_unique_whats', hue='domain', ax=ax)
ax.set_title('number of unique words used per response, over time')

#### Comparisons between subdomains

In [None]:
# WHAT word count per trial for every subdomain: drawing subdomains first
# (mid shades), then structures subdomains (light shades of the same hues)
subdomain_order = subdomains['drawing'] +  subdomains['structures']
subdomain_palette = {**domain_palettes['drawing'],**domain_palettes_light['structures']}

fig, ax = plt.subplots(figsize=(4,8))
sns.barplot(data = df_trial,
            x = 'subdomain',
            y = 'what_word_sum',
            order = subdomain_order,
            palette = subdomain_palette,
            ax = ax)
_ = plt.xticks(rotation = 60)
fig.savefig('./plots/what_word_sum_subdomains.pdf')

In [None]:
# per-trial WHAT word counts for every subdomain, as violins
subdomain_order = subdomains['drawing'] +  subdomains['structures']
subdomain_palette = {**domain_palettes['drawing'],**domain_palettes_light['structures']}

fig, ax = plt.subplots(figsize=(8,6))
sns.violinplot(data = df_trial,
               x = 'subdomain',
               y = 'what_word_sum',
               order = subdomain_order,
               palette = subdomain_palette,
               linewidth = 2,
               ax = ax)

In [None]:
# per-item mean WHAT word counts for every subdomain, as violins
# (one row per item: what_word_mean is constant within an item)
df_items = df_trial.groupby(['domain','subdomain','stimId']).first().reset_index()
subdomain_order = subdomains['drawing'] +  subdomains['structures']
subdomain_palette = {**domain_palettes['drawing'],**domain_palettes_light['structures']}

f = plt.figure(figsize=(8,6))

sns.violinplot(data = df_items,
               x = 'subdomain',
               y = 'what_word_mean',
               order = subdomain_order,
               palette = subdomain_palette,
               linewidth = 2)

### Word counts

token-based length: across domains, across subdomains within domain, across stims within subdomain, across participants

In [None]:
# overlay the distributions of WHAT and WHERE word counts (log-scaled axis)
for word_count_column in ("what_word_sum", "where_word_sum"):
    sns.histplot(data = df_trial, x=word_count_column, log_scale=True, fill=False, element="step")

In [None]:
# WHAT word counts by domain on a log-scaled axis; each domain is
# normalised separately (common_norm=False)
step_density_style = dict(stat="density",
                          common_norm=False,
                          log_scale=True,
                          fill=False,
                          element="step")

sns.histplot(data = df_trial, x="what_word_sum", hue='domain', **step_density_style)

In [None]:
# mean word count (across participants) per item, aggregated by domain
fig, ax = plt.subplots(figsize=(4,6))

# NOTE(review): hue_order is passed without a hue variable -- confirm whether
# `order` was intended instead
sns.barplot(data=what_word_sum_means,
            x='domain',
            y='what_word_mean',
            hue_order=['structures','drawing'],
            ax=ax)
plt.xticks(rotation = 45)
ax.set_ylabel('what words')
ax.set_title('word count by domain')
# leave room for the rotated tick labels
fig.subplots_adjust(left=0.2, bottom=0.35)
fig.savefig('./plots/what_word_count_domain.pdf')

In [None]:
# mean word count (across participants) per item, aggregated by subdomain
fig, ax = plt.subplots(figsize=(6,6))

sns.barplot(data=what_word_sum_means,
            x='subdomain',
            y='what_word_mean',
            hue='domain',
            hue_order=['drawing','structures'],
            ax=ax)
plt.xticks(rotation = 45)
ax.set_ylabel('what words')
ax.set_title('word count by subdomain')
# place the legend outside the axes, to the right
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)
# leave room for the rotated tick labels
fig.subplots_adjust(left=0.2, bottom=0.35)
fig.savefig('./plots/what_word_count_subdomain.pdf')

In [None]:
# distribution of per-item mean WHAT word counts, nuts-bolts vs. dials
# NOTE(review): df_trial has one row per trial, so each item's mean is counted
# once per response here -- confirm whether one row per item was intended
plt.figure(figsize=(8,6))
sns.histplot(data=df_trial[df_trial.subdomain.isin(['nuts-bolts','dials'])],
             x="what_word_mean",
             hue="subdomain",
#              hue_order=['drawing','structures'],
             binwidth=5,
             stat='proportion')
# plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)
plt.title('mean number of words in instructions')
# previously saved as instruction_chars_dist.pdf, overwriting the
# character-count histogram written earlier in the notebook
plt.savefig('./plots/what_word_mean_dist.pdf')

## Top words in domain (by counts)

In [None]:
# by domain: the 10 most frequent (lemmatized, filtered) what-words
all_words = {}
top_words_domain = {}
n_words_in_domain = {}
tf = {}
df = {}

all_docs = []

for domain in domains:

    domain_responses = df_trial.loc[(df_trial.domain==domain) & (df_trial.complete_dataset),
                                    'lemmatized_filtered_whats']

    # flatten trials -> steps -> words, skipping words tagged as numerals
    doc = [word
           for response in domain_responses
           for step in response
           for word in step
           if word_to_pos[word]!='NUM']

    c = Counter(doc)
    top_words_domain[domain] = c.most_common(10)
    

In [None]:
# one column per domain; each cell is a (word, count) tuple from Counter.most_common
df_top_words = pd.DataFrame(top_words_domain)

In [None]:
# Text-only figure: one column per domain listing its top words, most
# frequent first, each prefixed by its count and shaded by that count.
X_OFFSET_START = 0
Y_OFFSET_START = 1

X_OFFSET_INTERVAL = 0.5   # horizontal distance between domain columns
Y_OFFSET_INTERVAL = 0.1   # vertical distance between rows

X_OFFSET_WORD = 0.13      # gap between the "(count)" prefix and the word

# plt.get_cmap is available across matplotlib versions, whereas
# matplotlib.cm.get_cmap was deprecated in 3.7 and removed in 3.9
cmap = plt.get_cmap("Greys")

fig = plt.figure()

x_offset = X_OFFSET_START
for domain in domains:

    y_offset = Y_OFFSET_START

    # the first row is left empty (it used to hold a bold domain header)
#     plt.text(x_offset, y_offset, domain, fontweight="bold", color=cmap(1.0))
    y_offset -= Y_OFFSET_INTERVAL

    domain_counts = np.array([word_count for (_word, word_count) in top_words_domain[domain]])

    # vmin is pushed well below the smallest count (to -3x the mean count at
    # most), so every count maps into the upper part of the colormap
    norm = matplotlib.colors.Normalize(vmin=min(-(domain_counts.mean()*3), domain_counts.min()), vmax=domain_counts.max())

    for word, count in top_words_domain[domain]:
        alpha = 1
        plt.text(x_offset, y_offset, f"({count:.0f}) ", color=cmap(norm(count)), fontsize=12, alpha=alpha, fontname="Arial")
        plt.text(x_offset + X_OFFSET_WORD, y_offset, word, color=cmap(norm(count)), fontsize=16, alpha=alpha)

        y_offset -= Y_OFFSET_INTERVAL

    x_offset += X_OFFSET_INTERVAL

plt.grid(False)
plt.axis("off")
plt.show()

fig.savefig("top_words.pdf", bbox_inches="tight")


## Most diagnostic words of subdomain (PMI)

In [None]:
DOMAIN = "structures"
# DOMAIN = "drawing"

# keep trials of the chosen domain from complete datasets, from participants
# who never hit the 8-step limit, excluding the demo stimulus
keep_trial = ((df_trial.domain == DOMAIN)
              & (df_trial.complete_dataset)
              & (~df_trial.ppt_hit_8_step_limit)
              & (df_trial.stimId != 'demo_stim'))
df_domain = df_trial[keep_trial].reset_index(drop=True)


def _flatten_nouns(whats_list):
    """Join the spelling-corrected nouns of all steps of one response with spaces."""
    return " ".join([spelling_map[word]
                     for step in whats_list
                     for word in step
                     if word_to_pos[word] == "NOUN"])


df_domain["lemmatized_whats_flat"] = df_domain["lemmatized_whats"].map(_flatten_nouns)


df_domain

In [None]:
# document-term counts: one row per trial in df_domain, one column per word
# that occurs in at least 5 trials (English stop words removed)
vectorizer = CountVectorizer(strip_accents="unicode", min_df=5, stop_words="english")
X = vectorizer.fit_transform(df_domain["lemmatized_whats_flat"])

vocabulary = vectorizer.get_feature_names_out()

# additive smoothing so that no count is zero before taking logs
PSEUDOCOUNT = 1 / len(vocabulary)

df_counts = pd.DataFrame(X.toarray(), columns=vocabulary) + PSEUDOCOUNT
df_counts

In [None]:
# Pointwise mutual information (log2) between each word and each subdomain:
#   pmi = log2 p(word, subdomain) - (log2 p(word) + log2 p(subdomain))
# p(word, .) uses the smoothed counts; p(subdomain) is the share of trials
# in df_domain that belong to the subdomain.
d_pmi = defaultdict(dict)

N = df_counts.sum().sum()
JOINT_EXP = 1

subdomain_priors = ((df_domain.subdomain.value_counts()) / len(df_domain)).to_dict()

# log2 p(word) does not depend on the subdomain, so compute it once per word
log_p_word = {word: np.log2((df_counts[word].sum()) / N) for word in df_counts.columns}

for subdomain, prior in subdomain_priors.items():
    p_subdomain = np.log2(prior)
    in_subdomain = df_domain.subdomain == subdomain

    for word in df_counts.columns:
        p_joint = np.log2((df_counts[word][in_subdomain].sum()) / N)
        p_word = log_p_word[word]

        pmi = p_joint - (p_word + p_subdomain)
        d_pmi[subdomain][word] = pmi

In [None]:
# Percentage of positive PMI values (among the non-zero entries)
df_pmi = pd.DataFrame(d_pmi)

n_positive = (df_pmi > 0).sum().sum()
n_nonzero = (df_pmi != 0).sum().sum()
n_positive / n_nonzero

In [None]:
# inspect the PMI table (rows: words, columns: subdomains)
df_pmi

In [None]:
# the 30 highest-PMI words of each subdomain, side by side
top_pmi_words = {subdomain: list(df_pmi[subdomain].nlargest(30).index)
                 for subdomain in subdomain_priors}
pd.DataFrame(top_pmi_words)

In [None]:
# reorder columns to the subdomain order listed in `subdomains` for this DOMAIN
df_pmi = df_pmi[subdomains[DOMAIN]]

In [None]:
# inspect the PMI table again after reordering its columns
df_pmi

In [None]:
# Text-only figure: one column per subdomain listing its TOP_N highest-PMI
# words, each prefixed by its PMI and shaded with the subdomain's gradient.
TOP_N = 10

X_OFFSET_START = 0
Y_OFFSET_START = 1

X_OFFSET_INTERVAL = 0.35  # horizontal distance between subdomain columns
Y_OFFSET_INTERVAL = 0.1   # vertical distance between rows

X_OFFSET_WORD = 0.11      # gap between the "(pmi)" prefix and the word

# CMAPS = map(matplotlib.cm.get_cmap, ["Blues", "Oranges", "Greens", "Reds"])
CMAPS = gradients


fig = plt.figure()

x_offset = X_OFFSET_START
for subdomain, cmap in zip(df_pmi.columns, CMAPS):
    df_pmi_subdomain_top = df_pmi[subdomain].nlargest(TOP_N)

    y_offset = Y_OFFSET_START

    # the first row is left empty (it used to hold a bold subdomain header)
#     plt.text(x_offset, y_offset, subdomain, fontweight="bold", color=cmap(1.0), fontname="Arial")
    y_offset -= Y_OFFSET_INTERVAL

    norm = matplotlib.colors.Normalize(vmin=min(0, df_pmi_subdomain_top.min()), vmax=df_pmi_subdomain_top.max())

    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
    for word, pmi in df_pmi_subdomain_top.items():
        # words with non-positive PMI keep their row but are drawn fully transparent
        alpha = int(pmi > 0)
        plt.text(x_offset, y_offset, f"({pmi:.2f}) ", color=cmap(norm(pmi)), fontsize=12, alpha=alpha, fontname="Arial")
        plt.text(x_offset + X_OFFSET_WORD, y_offset, word, color=cmap(norm(pmi)), fontsize=16, alpha=alpha)

        y_offset -= Y_OFFSET_INTERVAL

    x_offset += X_OFFSET_INTERVAL

plt.grid(False)
plt.axis("off")
plt.show()

fig.savefig(f"pmi_{DOMAIN}.pdf", bbox_inches="tight")

## Analysis of programs x language

- Scatterplot: for each stim AND baseDSL vs. ‘compressive’ DSL: word count vs. program length; |set(words)| vs. |set(program_tokens)|
  - Questions re: overall trend (sublinear?)
  - Questions re: where we see high variation in language length given baseDSL program length 
  - We can further break these down in: domain, subdomain, annotator, library_0 vs. library_compressive


Let's merge the df_structures data frame with trial

In [None]:
# joint distribution of n_blocks and WHAT word count per trial, with marginals
#fig = sns.scatterplot(x='n_blocks', y = 'what_char_sum', alpha=0.5, data=df_combined)
g = sns.jointplot(x='n_blocks', y = 'what_word_sum', alpha = 0.2,  data=df_trial)
#g.ax_joint.set_xscale('log')
#g.ax_joint.set_yscale('log')
#fig.plot([0, 100], [0, 100])

In [None]:
# responses of the rows whose what_word_sum is exactly 1
df_trial.query('what_word_sum == 1')['responses']

### n stroke vs mean word count for gadget item

In [None]:
# gadgets (drawing domain): number of strokes vs mean word count for that stim

# just grab means (i.e. only one row per item needed)
df =  df_trial[df_trial.domain == 'drawing'].groupby(['domain','subdomain','stimId']).first().reset_index()

plt.figure(figsize=(10,10))

s = sns.scatterplot(data = df,
                x = 'n_strokes',
                y = 'what_word_mean',
                hue='subdomain',
                alpha=0.6)

# plt.title("number of words by gadget complexity")
plt.xlabel("strokes in image")
plt.ylabel("mean number of words")

# dashed grey diagonal from the bottom-left to the top-right corner of the axes
# NOTE(review): transform=s.transAxes places this line in axes coordinates, so it only coincides
# with y = x when both axes have the same limits; the facet plots use ax.axline((0, 0), slope=1).
s.plot([0,1],[0,1], 
       transform=s.transAxes, 
       color='grey',
       linestyle='--')

plt.savefig('./plots/gadget_language_item_complexity.pdf')

In [None]:
# gadgets (drawing domain), one panel per subdomain: number of strokes vs mean word count for that stim

# just grab means (i.e. only one row per item needed)
df = df_trial[df_trial.domain == 'drawing'].groupby(['domain','subdomain','stimId']).first().reset_index()

s = sns.FacetGrid(data = df,
                  col='subdomain', 
                  hue='subdomain',
                  height=5, 
                  aspect=0.85, # set aspect ratio here (although this includes titles, labels etc.)
                )

s.map(sns.scatterplot,
        'n_strokes',
        'what_word_mean',
        alpha=0.6)



# dashed reference line through the origin with slope 1, drawn behind the points (zorder=0)
for ax in s.axes_dict.values():
    ax.axline((0, 0), slope=1, c=".3", ls="--", zorder=0)
    ax.set(xlabel="strokes in image", ylabel="mean number of words")
    
plt.savefig('./plots/gadget_language_item_complexity_facet.pdf')

### n blocks vs mean word count for structure item

In [None]:
# structures, n blocks vs mean word count for that stim

# just grab means (i.e. only one row per item needed)
df =  df_trial[df_trial.domain == 'structures'].groupby(['domain','subdomain','stimId']).first().reset_index()

plt.figure(figsize=(10,10))

s = sns.scatterplot(data = df,
                x = 'n_blocks',
                y = 'what_word_mean',
                hue='subdomain',
                alpha=0.6)

plt.xlabel("blocks in structure")
plt.ylabel("mean number of words")

# dashed grey diagonal from the bottom-left to the top-right corner of the axes
# NOTE(review): drawn in axes coordinates (transAxes), so it is the y = x line only when
# both axes have the same limits.
s.plot([0,1],[0,1], 
       transform=s.transAxes, 
       color='grey',
       linestyle='--')

plt.savefig('./plots/structure_language_item_complexity.pdf')

In [None]:
# structures, one panel per subdomain: n blocks vs mean word count for that stim

# just grab means (i.e. only one row per item needed)
df = df_trial[df_trial.domain == 'structures'].groupby(['domain','subdomain','stimId']).first().reset_index()

s = sns.FacetGrid(data = df,
                  col='subdomain', 
                  hue='subdomain',
                  height=5, 
                  aspect=0.85, # set aspect ratio here (although this includes titles, labels etc.)
                )

s.map(sns.scatterplot,
        'n_blocks',
        'what_word_mean',
        alpha=0.6)

# dashed reference line through the origin with slope 1, drawn behind the points (zorder=0)
for ax in s.axes_dict.values():
    ax.axline((0, 0), slope=1, c=".3", ls="--", zorder=0)
    ax.set(xlabel="blocks in structure", ylabel="mean number of words")
    
plt.savefig('./plots/structure_language_item_complexity_facet.pdf')

### base program length vs mean word count for gadget item

In [None]:
# gadgets (drawing domain): base program length vs mean word count for that stim

# just grab means (i.e. only one row per item needed)
df =  df_trial[df_trial.domain == 'drawing'].groupby(['domain','subdomain','stimId']).first().reset_index()

plt.figure(figsize=(10,10))

s = sns.scatterplot(data = df,
                x = 'base_program_length',
                y = 'what_word_mean',
                hue='subdomain',
                alpha=0.6)

plt.xlabel("base program length")
plt.ylabel("mean number of words")

# s.plot([0,1],[0,1], 
#        transform=s.transAxes, 
#        color='grey',
#        linestyle='--')

# Own file name, parallel to the '..._basedsl_langlength_facet.pdf' figure: this cell used to
# write to 'gadget_language_item_complexity.pdf' and so overwrote the strokes-vs-words figure.
plt.savefig('./plots/gadget_basedsl_langlength.pdf')

In [None]:
# gadgets (drawing domain), one panel per subdomain: base program length vs mean word count for that stim

# just grab means (i.e. only one row per item needed)
# remove one outlier to scale graphs
df = df_trial[(df_trial.domain == 'drawing') & (df_trial.what_word_mean < 125)].groupby(['domain','subdomain','stimId']).first().reset_index()

s = sns.FacetGrid(data = df,
                  col='subdomain', 
                  hue='subdomain',
                  height=5, 
                  aspect=0.8, # set aspect ratio here (although this includes titles, labels etc.)
                  sharex=False,
                  sharey=True,
                )

s.map(sns.scatterplot,
        'base_program_length',
        'what_word_mean',
        alpha=0.5)



# strip titles and axis labels from every panel
for ax in s.axes_dict.values():
    ax.set_title(None)
    ax.set_ylabel(None)
    ax.set_xlabel(None)
#     ax.set_xlim([0,470])

# for ax in s.axes_dict.values():
#     ax.axline((0, 0), slope=1, c=".3", ls="--", zorder=0)
#     ax.set(xlabel="base_program_length", ylabel="mean number of words")
    
plt.savefig('./plots/gadget_basedsl_langlength_facet.pdf')

In [None]:
# Print the first subdomain present in df and stop. After the break, `subdomain` still holds
# that value, and the next two cells use it to filter df.
for subdomain in df["subdomain"].unique():
    print(subdomain)
    df[df["subdomain"] == subdomain]  # bare expression inside a loop: evaluated but not displayed
    break

In [None]:
# correlation matrix of base program length and mean word count for the selected subdomain
df[df["subdomain"] == subdomain][["base_program_length", "what_word_mean"]].corr()

In [None]:
# Pearson correlation between base program length and mean word count for the selected subdomain.
# Uses the `stats` alias from the import cell (`import scipy.stats as stats`): among the visible
# cells the bare name `scipy` is only imported later, so `scipy.stats...` fails on a fresh run.
stats.pearsonr(
    x=df[df["subdomain"] == subdomain]["base_program_length"],
    y=df[df["subdomain"] == subdomain]["what_word_mean"]
)

### base program length vs mean word count for structure item

In [None]:
# structures: base program length vs mean word count for that stim

# just grab means (i.e. only one row per item needed)
df =  df_trial[df_trial.domain == 'structures'].groupby(['domain','subdomain','stimId']).first().reset_index()

plt.figure(figsize=(10,10))

s = sns.scatterplot(data = df,
                x = 'base_program_length',
                y = 'what_word_mean',
                hue='subdomain',
                alpha=0.6)

plt.xlabel("base_program_length")
plt.ylabel("mean number of words")

# s.plot([0,1],[0,1], 
#        transform=s.transAxes, 
#        color='grey',
#        linestyle='--')

# Own file name, parallel to the '..._basedsl_langlength_facet.pdf' figure: this cell used to
# write to 'structure_language_item_complexity.pdf' and so overwrote the blocks-vs-words figure.
plt.savefig('./plots/structures_basedsl_langlength.pdf')

In [None]:
# structures, one panel per subdomain: base program length vs mean word count for that stim

# just grab means (i.e. only one row per item needed)
# rows with what_word_mean >= 150 are excluded before plotting
df = df_trial[(df_trial.domain == 'structures') & (df_trial.what_word_mean < 150)].groupby(['domain','subdomain','stimId']).first().reset_index()

s = sns.FacetGrid(data = df,
                  col='subdomain', 
                  hue='subdomain',
                  height=5, 
                  aspect=0.8, # set aspect ratio here (although this includes titles, labels etc.)
                  sharex=False,
                  sharey=True,
                )

s.map(sns.scatterplot,
        'base_program_length',
        'what_word_mean',
        alpha=0.5)

# strip titles and axis labels from every panel
for ax in s.axes_dict.values():
    ax.set_title(None)
    ax.set_ylabel(None)
    ax.set_xlabel(None)
#     ax.set_xlim([0,470])

# for ax in s.axes_dict.values():
#     ax.axline((0, 0), slope=1, c=".3", ls="--", zorder=0)
#     ax.set(xlabel="base_program_length", ylabel="mean number of words")
    
plt.savefig('./plots/structures_basedsl_langlength_facet.pdf')

## Likelihood ratio test for linear vs. log models

In [None]:
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import scipy

In [None]:
# Likelihood-ratio test, per subdomain, of whether adding a log(base program length) term
# improves on a purely linear model of mean word count.
# Method reference: https://www.statology.org/likelihood-ratio-test-in-python/

likelihood_test_results = []

for domain in ["drawing", "structures"]:
    # one row per stimulus
    df = df_trial[df_trial.domain == domain].groupby(['domain','subdomain','stimId']).first().reset_index()

    # the log predictor does not depend on the subdomain, so compute it once per domain
    df["log_base_program_length"] = np.log(df["base_program_length"])

    for subdomain in df["subdomain"].unique():
        df_subdomain = df[df["subdomain"] == subdomain]
        y = df_subdomain["what_word_mean"]

        # Reduced model: intercept + linear term
        x = df_subdomain["base_program_length"]
        x = sm.add_constant(x)
        reduced_model = sm.OLS(y, x).fit()

        # Full model: intercept + linear term + log term
        x = df_subdomain[["base_program_length", "log_base_program_length"]]
        x = sm.add_constant(x)
        full_model = sm.OLS(y, x).fit()

        #calculate likelihood ratio Chi-Squared test statistic
        LR_statistic = -2*(reduced_model.llf - full_model.llf)

        # Degrees of freedom = number of extra parameters in the full model. The models differ
        # only by the log term, so this is 1 (the previous hard-coded 2 inflated the p-values).
        dof = int(full_model.df_model - reduced_model.df_model)
        p_val = scipy.stats.chi2.sf(LR_statistic, dof)

        likelihood_test_results.append({
            "domain": domain,
            "subdomain": subdomain,
            "chi-squared": LR_statistic,
            "p_val": p_val,
        })
        
df_likelihood_test_results = pd.DataFrame(likelihood_test_results)
df_likelihood_test_results

### Compare distributions of words across domain/ subdomains (JSD)

#### Between domains

In [None]:
USE_COUNTS = True # word columns hold counts; if False they hold present/absent (0/1) flags

In [None]:
# For every (participant, trial): flatten the nested `lemmatized_filtered_whats` entries of all
# its rows into one flat list of words, then count the words.
trial_whats = (
    df_trial
    .groupby(['gameID','trial_num'])['lemmatized_filtered_whats']
    .apply(lambda trial_responses: [word
                                    for response in trial_responses
                                    for phrase in response
                                    for word in phrase])
)
trial_whats_counts = trial_whats.apply(lambda words: Counter(words))

# one row per (participant, trial), indexed like trial_whats so the new columns align
df_trial_whats = (
    df_trial[['gameID','trial_num','subdomain', 'domain','lemmatized_filtered_whats']]
    .groupby(['gameID','trial_num'])
    .first()
)
df_trial_whats['trial_whats'] = trial_whats
df_trial_whats['what_counts'] = trial_whats_counts

# sorted vocabulary over all trials
all_words = np.unique([word for words in trial_whats for word in words])

In [None]:
# Add one column per vocabulary word to df_trial_whats.
for w in all_words:
    if USE_COUNTS:
        # what_counts holds Counter objects, so a word that never occurs in the trial gives 0
        df_trial_whats[w] = df_trial_whats['what_counts'].apply(lambda row: int(row[w])) # word counts
    else:
        df_trial_whats[w] = df_trial_whats['trial_whats'].apply(lambda row: int(w in row)) # present/absent
        
# move gameID / trial_num from the index back into ordinary columns
# NOTE: df_trial_whats is reassigned here, so re-running this cell on its own resets the index again
df_trial_whats = df_trial_whats.reset_index()

In [None]:
# Calculate true JSD

# word counts for domains
# Select the word columns by name. The frame built above has 7 leading metadata columns
# (gameID, trial_num, subdomain, domain, lemmatized_filtered_whats, trial_whats, what_counts),
# so the earlier positional slice .iloc[:, 10:] silently dropped the first three words.
word_counts = df_trial_whats[list(all_words)]
drawing_counts = word_counts.loc[df_trial_whats['domain'] == 'drawing'].sum(axis=0)
structures_counts = word_counts.loc[df_trial_whats['domain'] == 'structures'].sum(axis=0)

# base-2 Jensen-Shannon distance; jensenshannon normalises the count vectors itself
true_jsd = distance.jensenshannon(drawing_counts,structures_counts,2)
print(true_jsd)

In [None]:
# Calculate null distribution of JSDs
# JSD for distributions of words in domains
# Shuffle domain tags. 1000 random assignments to 2 (preserve sizes)

RANDOM_SEED = 0
n_iters = 1000

jsds = []

# word columns selected by name; a positional slice .iloc[:, 10:] would skip the first three
# words, because the frame has only 7 leading metadata columns
word_counts = df_trial_whats[list(all_words)]

# True split of trials into domains. Taken from the same frame as the counts (not df_trial) so
# there is exactly one label per row of word_counts and the group sizes are preserved.
# Kept as a plain array so it can be shuffled in place and used as a positional row mask.
domain_assignments = df_trial_whats['domain'].to_numpy().copy()

np.random.seed(RANDOM_SEED)
# for each iteration
for n in range(0, n_iters):
    
    # assign trial random domain tag (following partition of domains in data)
    np.random.shuffle(domain_assignments)
    
    drawing_counts = word_counts[domain_assignments == 'drawing'].sum(axis = 0)
    structures_counts = word_counts[domain_assignments == 'structures'].sum(axis = 0)
    
    # calculate JSD
    jsd = distance.jensenshannon(drawing_counts,structures_counts,2)
    jsds.append(jsd)


In [None]:
# null distribution of JSDs under shuffled domain labels
plt.hist(jsds)

In [None]:
# Report p-value. (how many are greater than true JSD)
# i.e. the fraction of shuffled JSDs that exceed the observed one, doubled
(sum(jsds > true_jsd) / n_iters) * 2

#### Between subdomains

In [None]:
# domain whose subdomains are compared in the cells below
domain = 'drawing'

In [None]:
# Calculate true mean JSD
# One domain at a time
df_trial_whats_domain = df_trial_whats.loc[(df_trial_whats['domain'] == domain)].reset_index(drop=True).copy()

subdomain_counts = {}
subdomain_jsds = {}

# get counts
for subdomain in subdomains[domain]:
    # The mask must come from the re-indexed domain subset itself. A mask built from
    # df_trial_whats is aligned by index label and therefore selects the wrong rows.
    in_subdomain = df_trial_whats_domain['subdomain'] == subdomain
    # Word columns are selected by name; .iloc[:, 10:] would skip the first three words,
    # because the frame has only 7 leading metadata columns.
    subdomain_counts[subdomain] = df_trial_whats_domain.loc[in_subdomain, list(all_words)].sum(axis=0)

# get JSDS
# full pairwise matrix of base-2 Jensen-Shannon distances between subdomain count vectors
for subdomain_i in subdomains[domain]:
    subdomain_jsds[subdomain_i] = {}
    for subdomain_j in subdomains[domain]:
        subdomain_jsds[subdomain_i][subdomain_j] = distance.jensenshannon(subdomain_counts[subdomain_i],
                                                                          subdomain_counts[subdomain_j], 2)
        

true_subdomain_jsds = pd.DataFrame.from_dict(subdomain_jsds)
true_subdomain_jsds

In [None]:
# get true mean JSD
# mean over the strict upper triangle of the pairwise matrix, i.e. each unordered pair once;
# the 4 is the number of subdomains per domain
true_mean_jsd = np.array(true_subdomain_jsds)[np.triu_indices(4,k = 1)].mean()
true_mean_jsd

In [None]:
# Calculate null distribution of mean JSDs

df_trial_whats_domain = df_trial_whats.loc[(df_trial_whats['domain'] == domain)].reset_index(drop=True).copy()

mean_jsds = []

n_iters = 1000

# word columns selected by name; .iloc[:, 10:] would skip the first three words, because the
# frame has only 7 leading metadata columns. Hoisted out of the loop: it never changes.
word_counts_domain = df_trial_whats_domain[list(all_words)]

# true split of this domain's trials into subdomains, as a plain array that is shuffled in
# place and used as a positional row mask
subdomain_assignments = df_trial_whats_domain['subdomain'].to_numpy().copy()

# each unordered pair of subdomains once (strict upper triangle of the pairwise matrix)
pair_indices = np.triu_indices(len(subdomains[domain]), k = 1)

np.random.seed(RANDOM_SEED)

for n in range(0,n_iters):
    
    np.random.shuffle(subdomain_assignments)

    subdomain_counts = {}
    subdomain_jsds = {}

    # get counts
    for subdomain in subdomains[domain]:
        subdomain_counts[subdomain] = word_counts_domain[subdomain_assignments == subdomain]\
                                        .sum(axis=0)

    # get JSDS
    for subdomain_i in subdomains[domain]:
        subdomain_jsds[subdomain_i] = {}
        for subdomain_j in subdomains[domain]:
            subdomain_jsds[subdomain_i][subdomain_j] = distance.jensenshannon(subdomain_counts[subdomain_i],
                                                                              subdomain_counts[subdomain_j], 2)


    subdomain_jsds = pd.DataFrame.from_dict(subdomain_jsds)
    mean_jsd = np.array(subdomain_jsds)[pair_indices].mean()
    mean_jsds.append(mean_jsd)

In [None]:
# null distribution of mean pairwise JSDs under shuffled subdomain labels
plt.hist(mean_jsds)

In [None]:
# Report p-value. (how many are greater than true JSD)
# i.e. the fraction of shuffled mean JSDs that exceed the observed one, doubled
(sum(mean_jsds > true_mean_jsd) / n_iters) * 2

## utils

### merge urls with top down abstraction dataframe

In [None]:
# one row per (stimId, stimURL, subdomain) of the structures domain, keeping only the columns
# used to attach URLs to the abstraction table
urls = df_trial[df_trial.domain=='structures'].groupby(['stimId','stimURL','subdomain']).first().reset_index()[['blocks','stimId','stimURL','domain','subdomain']]
urls

In [None]:
# top-down abstraction table; the same path is written back a few cells below
df_topdownabs = pd.read_csv('../../stimuli/towers/df_structures_topdownabs_consistent_abstractions.csv')

In [None]:
# copy the identifying columns under the names used in `urls`, so they can serve as merge keys
df_topdownabs['stimId'] = df_topdownabs['structure_number']
df_topdownabs['subdomain'] = df_topdownabs['structure_type']

In [None]:
# Left-join the stimulus URLs onto the abstraction table.
# DataFrame.merge returns a new frame, so the result is assigned back; without the assignment
# the CSV written in the next cell would not contain the merged columns.
# NOTE: run once per fresh load of df_topdownabs; merging a second time adds suffixed duplicate columns.
df_topdownabs = df_topdownabs.merge(urls, how='left',on=['blocks','stimId','subdomain'])
df_topdownabs

In [None]:
# Write back to the file the table was read from. index=False keeps the row index out of the
# file; otherwise every read/write round trip adds another "Unnamed: 0" column.
df_topdownabs.to_csv('../../stimuli/towers/df_structures_topdownabs_consistent_abstractions.csv', index=False)

# Exploratory analyses that do not appear in cogsci 22 paper

## Comparisons between domains/ subdomains

In [None]:
USE_COUNTS = True # word columns hold counts; if not, use present/absent (0/1) flags

### get word-frequency vectors

#### by participant

In [None]:
# For every participant: flatten the nested `lemmatized_filtered_whats` entries of all their
# rows into one flat list of words, then count the words.
ppt_whats = (
    df_trial
    .groupby('gameID')['lemmatized_filtered_whats']
    .apply(lambda ppt_responses: [word
                                  for response in ppt_responses
                                  for phrase in response
                                  for word in phrase])
)
ppt_whats_counts = ppt_whats.apply(lambda words: Counter(words))

# one row per participant, indexed by gameID like ppt_whats so the new columns align
df_ppts_whats = (
    df_trial[['gameID', 'subdomain', 'domain','lemmatized_filtered_whats']]
    .groupby('gameID')
    .first()
)
df_ppts_whats['ppt_whats'] = ppt_whats
df_ppts_whats['what_counts'] = ppt_whats_counts

# sorted vocabulary over all participants
all_words = np.unique([word for words in ppt_whats for word in words])

In [None]:
# Add one column per vocabulary word to df_ppts_whats (which stays indexed by gameID).
for w in all_words:
    if USE_COUNTS:
        # what_counts holds Counter objects, so a word the participant never used gives 0
        df_ppts_whats[w] = df_ppts_whats['what_counts'].apply(lambda row: int(row[w])) # word counts
    else:
        df_ppts_whats[w] = df_ppts_whats['ppt_whats'].apply(lambda row: int(w in row)) # present/absent

#### by trial

In [None]:
# For every (participant, trial): flatten the nested `lemmatized_filtered_whats` entries of all
# its rows into one flat list of words, then count the words.
trial_whats = (
    df_trial
    .groupby(['gameID','trial_num'])['lemmatized_filtered_whats']
    .apply(lambda trial_responses: [word
                                    for response in trial_responses
                                    for phrase in response
                                    for word in phrase])
)
trial_whats_counts = trial_whats.apply(lambda words: Counter(words))

# one row per (participant, trial), indexed like trial_whats so the new columns align
df_trial_whats = (
    df_trial[['gameID','trial_num','subdomain', 'domain','lemmatized_filtered_whats']]
    .groupby(['gameID','trial_num'])
    .first()
)
df_trial_whats['trial_whats'] = trial_whats
df_trial_whats['what_counts'] = trial_whats_counts

# sorted vocabulary over all trials
all_words = np.unique([word for words in trial_whats for word in words])

In [None]:
# Add one column per vocabulary word to df_trial_whats.
for w in all_words:
    if USE_COUNTS:
        # what_counts holds Counter objects, so a word that never occurs in the trial gives 0
        df_trial_whats[w] = df_trial_whats['what_counts'].apply(lambda row: int(row[w])) # word counts
    else:
        df_trial_whats[w] = df_trial_whats['trial_whats'].apply(lambda row: int(w in row)) # present/absent
        
# move gameID / trial_num from the index back into ordinary columns
# NOTE: df_trial_whats is reassigned here, so re-running this cell on its own resets the index again
df_trial_whats = df_trial_whats.reset_index()

#### f-statistic

In [None]:
# between-group variability

# sample mean of ith group

def F_stat(grouping, data, verbose = False, filter_n_1 = True):
    '''
    Calculates a multivariate F-statistic for a particular grouping of datapoints.

    The statistic is the ratio of between-group variance to within-group variance, where the
    squared error of a row (or of a group mean) is its squared Euclidean distance to the
    relevant mean vector.

    grouping: sequence of labels, one per row of data (pandas Series, numpy array or list).
        If using clustering, use cluster.labels_
    data: should contain numeric data only (DataFrame or 2D array), with rows corresponding
        to the labels in grouping.
    verbose: currently unused; kept so existing calls keep working.
    filter_n_1: currently unused (removal of groups with fewer than 2 members is not
        implemented); kept so existing calls keep working.

    returns: tuple (F, between_group_variance, within_group_variance)
    raises: ValueError if grouping and data do not have the same number of rows.
    '''

    # Work on plain arrays so labels are matched to rows by position. Comparing a plain
    # Python list with `==` yields a single False and would silently select no rows.
    group_labels = np.asarray(grouping)
    values = np.asarray(data, dtype=float)

    if len(group_labels) != len(values):
        raise ValueError("grouping and data must have the same number of rows")

    # distinct labels in order of first appearance
    labels = list(dict.fromkeys(group_labels.tolist()))

    n_groups = len(labels)
    n_datapoints = len(values)

    overall_mean = values.mean(axis=0)

    total_within_variance = 0
    total_between_variance = 0

    for group_label in labels:
        # rows belonging to this group
        group_members = values[group_labels == group_label]

        # calculate group mean
        group_mean = group_members.mean(axis=0)

        # between group: group size * squared distance of the group mean to the overall mean
        group_squared_error = np.sum(np.square(group_mean - overall_mean))
        total_between_variance += (len(group_members) * group_squared_error) / (n_groups - 1)

        # within group: squared distance of every member to its group mean
        squared_errors = np.sum(np.square(group_members - group_mean), axis=1)
        total_within_variance += np.sum(squared_errors) / (n_datapoints - n_groups)

    return (total_between_variance/total_within_variance, total_between_variance, total_within_variance)

In [None]:
# Are vocabularies different across subdomains?

# By participant
# df_ppts_whats has 5 metadata columns (subdomain, domain, lemmatized_filtered_whats, ppt_whats,
# what_counts), so the word columns start at position 5; slicing from 6 dropped the first word.
F_stat(df_ppts_whats['subdomain'],df_ppts_whats.iloc[:,5:]) # adjust hardcoded 5 to get just the data

In [None]:
# Are vocabularies different across participants?

# By trial: rows are trials, grouped by participant (gameID)
F_stat(df_trial_whats.gameID, df_trial_whats.iloc[:,7:]) # adjust hardcoded 7 to get just the data

### Bootstrap F-tests (decided this was unconventional)

Potentially over-accounting for large word counts (e.g. red) as these are squared (twice if using Euclidean distance)

**Todo: z-score within columns (words) for whole dataset, then re-run**

**Todo: run with individual trials and clusters**


Questions to answer with f-stats (or chisquared tests):

- Do people use different words for different subdomains? (done- but use pca properly (incl. taking top pcs) and preprocessed df)
  - The stronger version of this claim asks, *within a domain*, do people use different words? 
  - data: separate for domains 
  - df rows: ppt
  - grouping: subdomains
  - baseline: randomly assigned subdomains
- Do distinct strategies exist?
  - Do clusters have a larger F-statistic than random assignments? (seems like they obviously will)
  - Again, I think that lumping everything together will primarily recover the domains, and maybe the subdomains. 
  - data: separate for domains 
  - df rows: ppt?
  - grouping: subdomains
  - baseline: randomly assigned subdomains
- Are people consistent in how they use language? I.e. are their trials more likely to be assigned to the same cluster than to a random other cluster?
  - likely some other (related) analysis
  - data: separate for domains
  - df_rows: trials
  - grouping: ppt

#### Is language more consistent within a subdomain (compared to random assignments)?

Yes overall

In [None]:
# subdomain grouping by ppt for both domains at once
# Bootstrap over participants: for each resample, compare the F-statistic of the real
# subdomain grouping with that of a random grouping into the same number of groups.

n_ppts = df_trial_whats['gameID'].nunique() 
ppt_indices = list(range(0,n_ppts))
ppts =  df_trial_whats['gameID'].unique()
nIters = 100
nGroups = len(df_ppts_whats.subdomain.unique())

f_diffs = []

RANDOM_SEED = 0

# Seed once, before the loop. Re-seeding inside the loop made every iteration draw exactly
# the same resample, so the bootstrap had no variability.
np.random.seed(RANDOM_SEED)

for i in range (0, nIters):
    
    # resample participants with replacement
    ppt_sample = [np.random.choice(ppt_indices) for _ in ppt_indices]
    
    ppt_sample_index = ppts[ppt_sample]
    
    df_sample = df_ppts_whats.loc[ppt_sample_index]
    
    subdomain_f = F_stat(df_sample['subdomain'], df_sample.iloc[:,5:]) # adjust hardcoded value to get just the data
    
    # np.random.randint excludes the upper bound, giving exactly nGroups labels (0..nGroups-1)
    # drawn from the seeded generator; random.randint(0, nGroups) includes nGroups (one group
    # too many) and uses the unseeded `random` module.
    random_group_assignment = [np.random.randint(0, nGroups) for _ in ppt_indices]
    random_sample_groups = pd.Series(random_group_assignment)[ppt_sample]
    
    randomized_f = F_stat(random_sample_groups, df_sample.iloc[:,5:]) # adjust hardcoded value to get just the data
    
    f_diff = subdomain_f[0] - randomized_f[0]
    f_diffs.append(f_diff)
    
f_diffs = pd.Series(f_diffs)

In [None]:
# bootstrap distribution of (subdomain F - random-grouping F)
plt.hist(f_diffs)

In [None]:
# fraction of bootstrap samples in which the random grouping had the larger F, doubled
(sum(f_diffs < 0) / nIters) * 2

#### Stronger test: Is language more consistent within a subdomain (compared to random assignments), comparing only within a domain?

Certainly for gadgets, marginally for structures

In [None]:
# subdomain grouping by ppt for individual domain
# Bootstrap over the participants of one domain: for each resample, compare the F-statistic of
# the real subdomain grouping with that of a random grouping into the same number of groups.

domain = 'drawing'
df_ppt_domain = df_ppts_whats[df_ppts_whats.domain == domain]

n_ppts = df_ppt_domain.index.nunique() 
ppt_indices = list(range(0,n_ppts))
ppts =  df_ppt_domain.index.unique()
nIters = 1000
nGroups = len(df_ppt_domain['subdomain'].unique())

f_diffs = []

RANDOM_SEED = 0

# seeded once, before the loop, so each iteration draws a different resample
np.random.seed(RANDOM_SEED)

for i in range (0, nIters):
    
    # resample participants with replacement
    ppt_sample = [np.random.choice(ppt_indices) for _ in ppt_indices]
    
    ppt_sample_index = ppts[ppt_sample]
    
    df_sample = df_ppt_domain.loc[ppt_sample_index]
    
    subdomain_f = F_stat(df_sample['subdomain'], df_sample.iloc[:,5:]) # adjust hardcoded value to get just the data
    
    # one random label in 0..nGroups-1 per participant, then picked out for the resampled participants
    random_group_assignment = [np.random.randint(0,nGroups) for i in ppt_indices]
    random_sample_groups = pd.Series(random_group_assignment)[ppt_sample]
    
    randomized_f = F_stat(random_sample_groups, df_sample.iloc[:,5:]) # adjust hardcoded value to get just the data
    
    f_diff = subdomain_f[0] - randomized_f[0]
    f_diffs.append(f_diff)
    
f_diffs = pd.Series(f_diffs)

In [None]:
# bootstrap distribution of (subdomain F - random-grouping F) for the selected domain
plt.hist(f_diffs)

In [None]:
# mean difference in F across bootstrap samples
f_diffs.mean()

In [None]:
# 2.5th and 97.5th percentiles bound the central 95% of the bootstrap distribution
# (the previous [0.05, 0.95] pair is a 90% interval)
np.quantile(f_diffs, [0.025, 0.975]) # 95% CI

In [None]:
# fraction of bootstrap samples in which the random grouping had the larger F, doubled
(sum(f_diffs < 0) / nIters) * 2 # p-value

In [None]:
# is language more consistent within a subdomain (compared to random assignments)
# subdomain grouping by ppt for both domains at once

n_ppts = df_trial_whats['gameID'].nunique() 
ppt_indices = list(range(0,n_ppts))
ppts =  df_trial_whats['gameID'].unique()
nIters = 100
nGroups = len(df_ppts_whats.subdomain.unique())

f_diffs = []

RANDOM_SEED = 0

# Seed once, before the loop. Re-seeding inside the loop made every iteration draw exactly
# the same resample, so the bootstrap had no variability.
np.random.seed(RANDOM_SEED)

for i in range (0, nIters):
    
    # resample participants with replacement
    ppt_sample = [np.random.choice(ppt_indices) for _ in ppt_indices]
    
    ppt_sample_index = ppts[ppt_sample]
    
    df_sample = df_ppts_whats.loc[ppt_sample_index]
    
    subdomain_f = F_stat(df_sample['subdomain'], df_sample.iloc[:,5:]) # adjust hardcoded value to get just the data
    
    # np.random.randint excludes the upper bound, giving exactly nGroups labels (0..nGroups-1)
    # drawn from the seeded generator; random.randint(0, nGroups) includes nGroups (one group
    # too many) and uses the unseeded `random` module.
    random_group_assignment = [np.random.randint(0, nGroups) for _ in ppt_indices]
    random_sample_groups = pd.Series(random_group_assignment)[ppt_sample]
    
    randomized_f = F_stat(random_sample_groups, df_sample.iloc[:,5:]) # adjust hardcoded value to get just the data
    
    f_diff = subdomain_f[0] - randomized_f[0]
    f_diffs.append(f_diff)
    
f_diffs = pd.Series(f_diffs)

In [None]:
### todo: are individual participants consistent with their words (more so than chance within a subdomain)?

In [None]:
# by individual trial?

In [None]:
# not working. by individual trial
# NOTE(review): likely causes, to confirm before reviving this cell:
#   - df_trial_whats had its index reset, so df_trial_domain.loc[ppt_sample_index] looks up
#     gameID values as row labels instead of selecting the trials of those participants;
#   - np.random.seed is called inside the loop, so every iteration draws the same sample;
#   - random.randint(0, nGroups) includes nGroups (one label too many) and is not seeded.

domain = 'structures'
df_trial_domain = df_trial_whats[df_trial_whats.domain == domain]

n_ppts = df_trial_domain.gameID.nunique() 
ppt_indices = list(range(0,n_ppts))
ppts =  df_trial_domain.gameID.unique()
nIters = 100
nGroups = len(df_trial_domain['subdomain'].unique())

f_diffs = []

RANDOM_SEED = 0

for i in range (0, nIters):
    np.random.seed(RANDOM_SEED)
    
    # resample participants with replacement
    ppt_sample = [np.random.choice(ppt_indices, ) for _ in ppt_indices]
    
    ppt_sample_index = ppts[ppt_sample]
    
    df_sample = df_trial_domain.loc[ppt_sample_index]
    
    subdomain_f = F_stat(df_sample['subdomain'], df_sample.iloc[:,7:]) # adjust hardcoded value to get just the data
    
    random_group_assignment = [random.randint(0,nGroups) for i in ppt_indices]
    random_sample_groups = pd.Series(random_group_assignment)[ppt_sample]
    
    randomized_f = F_stat(random_sample_groups, df_sample.iloc[:,7:]) # adjust hardcoded value to get just the data
    
    f_diff = subdomain_f[0] - randomized_f[0]
    f_diffs.append(f_diff)
    
f_diffs = pd.Series(f_diffs)

### Tsne plots to visualize distinct word use across (domain, subdomain)

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

#### How separable are subdomains by word counts?

K-means works very well here- perhaps there's a way of systematically choosing a K that best matches the subdomain split

In [None]:
USE_PCA = True      # reduce the word-count matrix with PCA before running t-SNE
N_COMPONENTS = 30   # number of principal components kept when USE_PCA is set
RANDOM_SEED = 0     # passed as random_state to PCA, TSNE and the clustering below

In [None]:
# can we identify distinct strategies in a top-down way?
domain = 'drawing'

# participants of the chosen domain; columns from position 5 onwards are the per-word columns
df_clustering_subset = df_ppts_whats[df_ppts_whats.domain==domain].copy()
X = df_clustering_subset.iloc[:,5:]

In [None]:
# visualizations using tsne
# optional PCA step; note that X is overwritten with its projection, so this cell should be
# run once per freshly built X
if USE_PCA:
    pca = PCA(n_components=N_COMPONENTS, random_state=RANDOM_SEED)
    pca.fit(X)
    X = pca.transform(X)

In [None]:
# seeded t-SNE embedding of X, used only for plotting
tsne = TSNE(random_state=RANDOM_SEED)

X_embedded = tsne.fit_transform(X)

In [None]:
# plot style for the t-SNE figures: no grid, thick axes frame (applies to all later figures too)
sns.set_style("whitegrid", {'axes.grid' : False, 'axes.linewidth': 4})

In [None]:
# start from the domain's palette, then swap the third subdomain's color for its dark variant
palette = domain_palettes[domain].copy()
palette[subdomains[domain][2]] = domain_palettes_dark[domain][subdomains[domain][2]]

In [None]:
# t-SNE embedding, one point per participant, colored by subdomain
plt.figure(figsize=(10,10))
# x / y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
sns.scatterplot(x=X_embedded[:,0], 
                y=X_embedded[:,1], 
                hue=df_clustering_subset['subdomain'],
                hue_order=subdomains[domain],
                palette=palette,
                legend='full',
                alpha=0.9,
                s = 160)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)
plt.tick_params(
    axis='both',          # changes apply to both axes
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    left=False,         # ticks along the left edge are off
    labelbottom=False,  # tick labels are hidden as well
    labelleft=False)
plt.savefig('plots/gadgets_subdomain_tsne.pdf')

#### Can we recreate subdomains bottom up by clustering?

In [None]:
# run clustering
# NOTE(review): AffinityPropagation / KMeans are not imported in any cell visible here —
# confirm they are imported (from sklearn.cluster) earlier in the notebook.
trial_clustering = AffinityPropagation(random_state=RANDOM_SEED, damping=0.88).fit(X)
# trial_clustering = KMeans(n_clusters=6).fit(X)

# store each participant's cluster id next to their data
df_clustering_subset['cluster_label'] = trial_clustering.labels_

In [None]:
# confirm that clustering worked: list the distinct cluster ids
np.unique(trial_clustering.labels_)

In [None]:
# visualize clusters using tsne

MIN_CLUSTER_MEMBERS = 6 # color clusters with more than this many members

n = int(len(X_embedded[:,0]))

# Cluster ids and their sizes in one pass. np.unique returns the ids sorted, which should match
# the order in which seaborn pairs a list palette with numeric hue levels.
cluster_ids, cluster_sizes = np.unique(trial_clustering.labels_, return_counts=True)
is_large_cluster = cluster_sizes > MIN_CLUSTER_MEMBERS

# small clusters are drawn light grey; each large cluster gets its own "bright" color
cluster_palette = np.full((len(cluster_ids), 3), 0.8)
if is_large_cluster.any():
    cluster_palette[is_large_cluster] = sns.color_palette("bright", int(is_large_cluster.sum()))

cluster_palette = list(cluster_palette)


plt.figure(figsize=(10,10))

# x / y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
sns.scatterplot(x=X_embedded[:,0], 
                y=X_embedded[:,1], 
                hue=trial_clustering.labels_,
                palette=cluster_palette, 
                legend='full',
                alpha=0.8,
#                 palette=palette,
                s = 140,
                linewidth=0.5)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)


In [None]:
# cluster sizes, largest first
Counter(trial_clustering.labels_).most_common()

In [None]:
# word lists of the participants in cluster 34
# NOTE: the cluster id is hard-coded and refers to the labels of one particular clustering run
list(df_clustering_subset[df_clustering_subset['cluster_label'] == 34].ppt_whats)

In [None]:
# pooled word frequencies of cluster 34, most common first (hard-coded cluster id)
Counter([x for xs in list(df_clustering_subset[df_clustering_subset['cluster_label'] == 34].ppt_whats) for x in xs])\
    .most_common()

In [None]:
# pooled word frequencies of cluster 6, most common first (hard-coded cluster id)
Counter([x for xs in list(df_clustering_subset[df_clustering_subset['cluster_label'] == 6].ppt_whats) for x in xs])\
    .most_common()

In [None]:
# pooled word frequencies of cluster 16, most common first (hard-coded cluster id)
Counter([x for xs in list(df_clustering_subset[df_clustering_subset['cluster_label'] == 16].ppt_whats) for x in xs])\
    .most_common()

In [None]:
# pooled word frequencies of cluster 25, most common first (hard-coded cluster id)
Counter([x for xs in list(df_clustering_subset[df_clustering_subset['cluster_label'] == 25].ppt_whats) for x in xs])\
    .most_common()

#### structures

In [None]:
# can we identify distinct strategies in a top-down way?
domain = 'structures'

# participants of the chosen domain; columns from position 5 onwards are the per-word columns
df_clustering_subset = df_ppts_whats[df_ppts_whats.domain==domain].copy()
X = df_clustering_subset.iloc[:,5:]

In [None]:
# visualizations using tsne
# optional PCA step; note that X is overwritten with its projection, so this cell should be
# run once per freshly built X
if USE_PCA:
    pca = PCA(n_components=N_COMPONENTS, random_state=RANDOM_SEED)
    pca.fit(X)
    X = pca.transform(X)

In [None]:
# seeded t-SNE embedding of X, used only for plotting
tsne = TSNE(random_state=RANDOM_SEED)
X_embedded = tsne.fit_transform(X)

In [None]:
# start from the domain's palette, then swap the third subdomain's color for its dark variant
palette = domain_palettes[domain].copy()
palette[subdomains[domain][2]] = domain_palettes_dark[domain][subdomains[domain][2]]
# palette[subdomains[domain][3]] = domain_palettes_light[domain][subdomains[domain][3]]

In [None]:
# t-SNE embedding, one point per participant, colored by subdomain
plt.figure(figsize=(10,10))
# x / y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
sns.scatterplot(x=X_embedded[:,0], 
                y=X_embedded[:,1], 
                hue=df_clustering_subset['subdomain'],
                hue_order=subdomains[domain],
                palette=palette,
                legend='full',
                alpha=0.9,
                s = 160)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)
plt.tick_params(
    axis='both',          # changes apply to both axes
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    left=False,         # ticks along the left edge are off
    labelbottom=False,  # tick labels are hidden as well
    labelleft=False)
plt.subplots_adjust()
plt.savefig('plots/structure_subdomain_tsne.pdf')

#### Can we recreate subdomains bottom up by clustering?

In [None]:
# run clustering
# NOTE(review): AffinityPropagation is not imported in any cell visible here — confirm it is
# imported (from sklearn.cluster) earlier in the notebook.
trial_clustering = AffinityPropagation(random_state=RANDOM_SEED, damping=0.88).fit(X)
# store each participant's cluster id next to their data
df_clustering_subset['cluster_label'] = trial_clustering.labels_

In [None]:
# confirm that clustering worked: list the distinct cluster ids
np.unique(trial_clustering.labels_)

In [None]:
# visualize clusters using tsne

MIN_CLUSTER_MEMBERS = 6 # color clusters with more than this many members

n = int(len(X_embedded[:,0]))

# Cluster ids and their sizes in one pass. np.unique returns the ids sorted, which should match
# the order in which seaborn pairs a list palette with numeric hue levels.
cluster_ids, cluster_sizes = np.unique(trial_clustering.labels_, return_counts=True)
is_large_cluster = cluster_sizes > MIN_CLUSTER_MEMBERS

# small clusters are drawn light grey; each large cluster gets its own "bright" color
cluster_palette = np.full((len(cluster_ids), 3), 0.8)
if is_large_cluster.any():
    cluster_palette[is_large_cluster] = sns.color_palette("bright", int(is_large_cluster.sum()))

cluster_palette = list(cluster_palette)


plt.figure(figsize=(10,10))

# x / y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
sns.scatterplot(x=X_embedded[:,0], 
                y=X_embedded[:,1], 
                hue=trial_clustering.labels_,
                palette=cluster_palette, 
                legend='full',
                alpha=0.7,
                s = 140, 
                linewidth=0.5)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)

In [None]:
# cluster sizes, largest first
Counter(trial_clustering.labels_).most_common()

In [None]:
# word lists of the participants in cluster 9
# NOTE: the cluster id is hard-coded and refers to the labels of one particular clustering run
list(df_clustering_subset[df_clustering_subset['cluster_label'] == 9].ppt_whats)

In [None]:
# pooled word frequencies of cluster 9, most common first (hard-coded cluster id)
Counter([x for xs in list(df_clustering_subset[df_clustering_subset['cluster_label'] == 9].ppt_whats) for x in xs])\
    .most_common()

In [None]:
# pooled word frequencies of cluster 1, most common first (hard-coded cluster id)
Counter([x for xs in list(df_clustering_subset[df_clustering_subset['cluster_label'] == 1].ppt_whats) for x in xs])\
    .most_common()

In [None]:
# pooled word frequencies of cluster 4, most common first (hard-coded cluster id)
Counter([x for xs in list(df_clustering_subset[df_clustering_subset['cluster_label'] == 4].ppt_whats) for x in xs])\
    .most_common()

In [None]:
# word frequencies pooled over the ppt_whats lists of all rows in cluster 29, most frequent first
Counter(
    word
    for words in df_clustering_subset.loc[df_clustering_subset['cluster_label'] == 29, 'ppt_whats']
    for word in words
).most_common()

In [None]:
# when True, the cells below reduce X with PCA before clustering / running t-SNE
USE_PCA = True # this might be doing the same thing as the init='pca' option for TSNE

In [None]:
# visualizations using tsne: one point per row of df_ppts_whats, colored by domain

X = df_ppts_whats.iloc[:,5:]

if USE_PCA:
    pca = PCA(n_components=50)
    pca.fit(X)
    X = pca.transform(X)

# seed t-SNE so that the embedding (and therefore the figure) is reproducible
tsne = TSNE(random_state=RANDOM_SEED)
X_embedded = tsne.fit_transform(X)

plt.figure(figsize=(10,10))

# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments
sns.scatterplot(x=X_embedded[:,0],
                y=X_embedded[:,1],
                hue=df_ppts_whats['domain'],
                legend='full',
                palette='jet_r')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)

### clustering to identify strategies

#### clusters across entire dataset
Can we recover subdomains by clustering?

Options:
- word count vs. word present/absent
- trial/ ppt
- pca, n_components
- cluster type, n_clusters

In [None]:
# work on a copy of the trial-level table; columns from position 7 onward are the features
df_clustering_subset = df_trial_whats.copy()
X = df_clustering_subset.iloc[:, 7:]

if USE_PCA:
    # project the features onto their first 50 principal components
    pca = PCA(n_components=50).fit(X)
    X = pca.transform(X)

In [None]:
# k-means with 16 clusters on the trial features.
# The fit is seeded because cluster ids are inspected by hard-coded number further
# down; without a seed the ids (and memberships) change from run to run.
trial_clustering = KMeans(n_clusters=16, random_state=RANDOM_SEED).fit(X)
df_clustering_subset['cluster_label'] = trial_clustering.labels_

In [None]:
# trial_clustering = AffinityPropagation(random_state=0, damping=0.8).fit(X)
# df_clustering_subset['cluster_label'] = trial_clustering.labels_

In [None]:
# check variance explained by each component
# (plots explained_variance_ of the most recently fitted `pca`, one point per component)
plt.plot(pca.explained_variance_)

In [None]:
# unique labels (distinct cluster ids assigned by the current trial_clustering)
np.unique(trial_clustering.labels_)

In [None]:
# fit tsne for visualization
# (seeded so that the embedding is the same every time the notebook is re-run)

tsne = TSNE(random_state=RANDOM_SEED)
X_embedded = tsne.fit_transform(X)

In [None]:
# visualize data by subdomain
plt.figure(figsize=(10,10))

# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments
sns.scatterplot(x=X_embedded[:,0],
                y=X_embedded[:,1],
                hue=df_clustering_subset['subdomain'],
                hue_order=subdomains['structures'] + subdomains['drawing'],
                legend='full',
                alpha=0.6,
                palette='jet_r')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)

In [None]:
# visualize data by cluster label
plt.figure(figsize=(10,10))

# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments
sns.scatterplot(x=X_embedded[:,0],
                y=X_embedded[:,1],
                hue=df_clustering_subset['cluster_label'],
                legend='full',
                alpha=0.6,
                palette='jet_r')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)

In [None]:
# see how many in each cluster (cluster label -> number of rows, largest first)
Counter(trial_clustering.labels_).most_common()

In [None]:
# inspect one cluster: the trial_whats entries of every row labelled 4
# NOTE(review): the cluster id is hard-coded and depends on the clustering fit above — re-check after re-running
list(df_clustering_subset[df_clustering_subset['cluster_label'] == 4].trial_whats)

#### Also across whole dataset, grouping responses by participant

In [None]:
# work on a copy of the participant-level table; columns from position 5 onward are the features
df_clustering_subset = df_ppts_whats.copy()
X = df_clustering_subset.iloc[:, 5:]

if USE_PCA:
    # project the features onto their first 20 principal components
    pca = PCA(n_components=20).fit(X)
    X = pca.transform(X)

In [None]:
# can we identify distinct strategies in a top-down way?
# Fits affinity propagation on X and stores each row's cluster id in 'cluster_label'.
# NOTE(review): random_state is hard-coded to 0 here, while the affinity-propagation
# fit earlier in the notebook uses RANDOM_SEED — confirm which one is intended.
trial_clustering = AffinityPropagation(random_state=0, damping=0.88).fit(X)
df_clustering_subset['cluster_label'] = trial_clustering.labels_

In [None]:
# 2-d embedding of the participant features for plotting
# (seeded so that the embedding is the same every time the notebook is re-run)
tsne = TSNE(random_state=RANDOM_SEED)

X_embedded = tsne.fit_transform(X)

In [None]:
# visualizations using tsne: participant-level clusters

MIN_PPT_CLUSTER_MEMBERS = 3 # color clusters with more than this many members

# Count the members of every cluster once. np.unique returns the labels sorted,
# so row i of the palette belongs to the i-th smallest cluster label.
cluster_ids, members_per_cluster = np.unique(trial_clustering.labels_, return_counts=True)
is_large = members_per_cluster > MIN_PPT_CLUSTER_MEMBERS

# Every cluster starts out light grey; each large cluster then gets its own
# color from the "bright" palette. The assignment is skipped when no cluster is
# large enough, because an empty color list cannot be written into the array.
cluster_palette = np.full((len(cluster_ids), 3), 0.8)
if is_large.any():
    cluster_palette[is_large] = sns.color_palette("bright", int(is_large.sum()))

cluster_palette = list(cluster_palette)


plt.figure(figsize=(10,10))

# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments
sns.scatterplot(x=X_embedded[:,0],
                y=X_embedded[:,1],
                hue=df_clustering_subset['cluster_label'],
                palette=cluster_palette,
                alpha=0.7,
                s=70,
                linewidth=0.5)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)

In [None]:
# inspect the ppt_whats entries of the rows assigned to cluster 29
# NOTE(review): the cluster id is hard-coded and depends on the clustering fit above — re-check after re-running
list(df_clustering_subset[df_clustering_subset['cluster_label'] == 29].ppt_whats)

In [None]:
# visualizations using tsne: same embedding, colored by subdomain
plt.figure(figsize=(10,10))

# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments
sns.scatterplot(x=X_embedded[:,0],
                y=X_embedded[:,1],
                hue=df_clustering_subset.subdomain,
                hue_order=subdomains['structures'] + subdomains['drawing'],
                palette='jet_r',
                alpha=0.7,
                s=70,
                linewidth=0.5)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)

### Additional tsnes

In [None]:
# visualizations using tsne: one point per row of df_trial_whats, colored by gameID

X = df_trial_whats.iloc[:,7:]

if USE_PCA:
    pca = PCA(n_components=50)
    pca.fit(X)
    X = pca.transform(X)

# seed t-SNE so that the embedding (and therefore the figure) is reproducible
tsne = TSNE(random_state=RANDOM_SEED)

X_embedded = tsne.fit_transform(X)

plt.figure(figsize=(10,10))

# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments
sns.scatterplot(x=X_embedded[:,0],
                y=X_embedded[:,1],
                hue=df_trial_whats['gameID'],
                legend='full',
                palette='jet_r')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)

In [None]:
# NOTE(review): USE_PCA is already set to True earlier in the notebook; this
# re-assignment only has an effect if the flag was changed in between.
USE_PCA = True

In [None]:
# visualizations using tsne: one point per row of df_trial_whats, colored by subdomain

X = df_trial_whats.iloc[:,7:]

if USE_PCA:
    pca = PCA(n_components=50)
    pca.fit(X)
    X = pca.transform(X)

# seed t-SNE so that the embedding (and therefore the figure) is reproducible
tsne = TSNE(random_state=RANDOM_SEED)

X_embedded = tsne.fit_transform(X)

plt.figure(figsize=(10,10))

# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments
sns.scatterplot(x=X_embedded[:,0],
                y=X_embedded[:,1],
                hue=df_trial_whats['subdomain'],
                hue_order=subdomains['structures'] + subdomains['drawing'],
                legend='full',
                alpha=0.6,
                palette='jet_r')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)

In [None]:
# visualizations using tsne: structures trials only, colored by gameID

# The setup below used to be commented out, which left this cell plotting the
# previous cell's embedding against a df_tmp that is not defined until a later
# cell. Restoring it makes the cell run on a fresh kernel and keeps the points
# and the hue values the same length.
df_tmp = df_trial_whats[df_trial_whats.domain=='structures']

X = df_tmp.iloc[:,7:]

if USE_PCA:
    pca = PCA(n_components=50)
    pca.fit(X)
    X = pca.transform(X)

# seed t-SNE so that the embedding (and therefore the figure) is reproducible
tsne = TSNE(random_state=RANDOM_SEED)

X_embedded = tsne.fit_transform(X)

plt.figure(figsize=(10,10))

# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments
sns.scatterplot(x=X_embedded[:,0],
                y=X_embedded[:,1],
                hue=df_tmp['gameID'],
                legend='full',
                alpha=0.6,
                palette='jet_r')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)

In [None]:
# visualizations using tsne: structures rows of df_ppts_whats, colored by subdomain

df_tmp = df_ppts_whats[df_ppts_whats.domain=='structures']

X = df_tmp.iloc[:,5:]

if USE_PCA:
    pca = PCA(n_components=50)
    pca.fit(X)
    X = pca.transform(X)

# seed t-SNE so that the embedding (and therefore the figure) is reproducible
tsne = TSNE(random_state=RANDOM_SEED)
X_embedded = tsne.fit_transform(X)

plt.figure(figsize=(10,10))

# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments
sns.scatterplot(x=X_embedded[:,0],
                y=X_embedded[:,1],
                hue=df_tmp['subdomain'],
                hue_order=subdomains['structures'],
                legend='full',
                palette='jet_r')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)

In [None]:
# visualizations using tsne: drawing rows of df_ppts_whats, colored by subdomain

df_tmp = df_ppts_whats[df_ppts_whats.domain=='drawing']

X = df_tmp.iloc[:,5:]

if USE_PCA:
    # no n_components given here, unlike the other cells, so PCA uses its default
    pca = PCA()
    pca.fit(X)
    X = pca.transform(X)

# seed t-SNE so that the embedding (and therefore the figure) is reproducible
tsne = TSNE(random_state=RANDOM_SEED)
X_embedded = tsne.fit_transform(X)

plt.figure(figsize=(10,10))

# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments
sns.scatterplot(x=X_embedded[:,0],
                y=X_embedded[:,1],
                hue=df_tmp['subdomain'],
                hue_order=subdomains['drawing'],
                legend='full',
                palette='jet_r')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)

In [None]:
# visualizations using tsne: drawing rows of df_ppts_whats, raw features (no PCA), uncolored

# seed t-SNE so that the embedding (and therefore the figure) is reproducible
tsne = TSNE(random_state=RANDOM_SEED)
X_embedded = tsne.fit_transform(df_ppts_whats[df_ppts_whats.domain=='drawing'].iloc[:,5:])

plt.figure(figsize=(10,10))

# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
# No hue is given, so the former palette / legend arguments and the plt.legend call
# had nothing to act on and have been dropped.
sns.scatterplot(x=X_embedded[:,0], y=X_embedded[:,1])

In [None]:
# peek at the column names of df_games at positions 1 through 9
df_games.columns[1:10]

### Calculate tf-idf

In [None]:
# following https://towardsdatascience.com/tf-idf-for-document-ranking-from-scratch-in-python-on-real-world-dataset-796d339a4089

# by domain: each domain is treated as one "document"
all_words = {}
top_words_domain = {}   # domain -> 30 most common (word, count) pairs
n_words_in_domain = {}  # domain -> number of distinct words
tf = {}                 # domain -> {word: term frequency}
df = {}                 # word -> number of domains in which the word occurs

all_docs = []

for domain in domains:

    # flatten the nested word lists of every complete-dataset trial in this domain
    in_domain = (df_trial.domain==domain) & (df_trial.complete_dataset)
    doc = [d for sublist in df_trial[in_domain]['lemmatized_filtered_whats']
             for item in sublist
             for d in item]

    c = Counter(doc)
    top_words_domain[domain] = c.most_common(30)

    # NOTE(review): len(c) is the number of *distinct* words, not the total number
    # of tokens, so tf below is count / vocabulary size. The referenced tutorial
    # divides by the total token count — confirm this normalization is intended
    # (downstream plot limits are tuned to the current values).
    n_words_in_domain[domain] = len(c)

    tf[domain] = {i: c[i]/n_words_in_domain[domain] for i in c}

    for w in tf[domain]:
        df[w] = df.get(w, 0) + 1

# idf = log(number of documents / document frequency); the number of documents is
# taken from `domains` instead of being hard-coded
idf = {w: np.log(len(domains)/df[w]) for w in df}

tf_idf = {}

for domain in domains:
    tf_idf[domain] = {t : tf[domain][t] * idf[t] for t in tf[domain]}

In [None]:
# document-frequency dict from the tf-idf cell: word -> number of domains in which it occurs
df

In [None]:
# tf-idf score of every word, keyed first by domain and then by word
tf_idf

In [None]:
# most common (word, count) pairs per domain as a table, one column per domain
pd.DataFrame.from_dict(top_words_domain)

In [None]:
# one bar chart per domain showing the counts of its 20 most common words
for domain in domains:
    plt.figure()
    top_twenty = top_words_domain[domain][0:20]
    x, y = zip(*top_twenty)
    plt.bar(height=y, x=x)
    plt.xticks(rotation=90)

In [None]:
# one bar chart per domain showing the counts of its 20 most common words
# NOTE(review): this cell repeats the code of the cell directly above — confirm whether both are needed
for domain in domains:
    plt.figure()
    x, y = zip(*top_words_domain[domain][0:20])
    plt.bar(x=x,height=y)
    plt.xticks(rotation=90)

In [None]:
# horizontal bar chart of the 15 highest tf-idf words for each domain, saved as a pdf
for domain in domains:
    p = plt.figure(figsize=(6,6))
    # A dedicated name is used here so that the document-frequency dict `df`
    # built by the tf-idf cell is not overwritten by this loop.
    top_tf_idf = (pd.DataFrame.from_dict(tf_idf[domain], orient='index')
                    .rename(columns={0:'tf-idf'})
                    .sort_values('tf-idf', ascending=False)
                    .iloc[0:15])
    plt.barh(y=top_tf_idf.index, width=top_tf_idf['tf-idf'])
    # highest-scoring word at the top
    p.get_axes()[0].invert_yaxis()
    # bars are horizontal, so the tf-idf values run along the x axis
    plt.xlabel('tf-idf')
    plt.subplots_adjust(left=0.2, bottom=0.4)
    plt.savefig('td-idf-{}-top-15.pdf'.format(domain))

In [None]:
# the 30 highest tf-idf words for the 'structures' entry of tf_idf, as a table
pd.DataFrame.from_dict(tf_idf['structures'],orient='index').rename(columns={0:'tf-idf'}).sort_values('tf-idf', ascending=False).iloc[0:30]


In [None]:
# the 30 highest tf-idf words for the 'drawing' entry of tf_idf, as a table
pd.DataFrame.from_dict(tf_idf['drawing'],orient='index').rename(columns={0:'tf-idf'}).sort_values('tf-idf', ascending=False).iloc[0:30]


#### By subdomain (where entire corpus is the collection of documents)

In [None]:
# by subdomain: each subdomain is treated as one "document"
all_subdomains = subdomains['structures'] + subdomains['drawing']

all_words = {}             # subdomain -> Counter of its words
top_words = {}             # subdomain -> 30 most common (word, count) pairs
n_words_in_subdomain = {}  # subdomain -> number of distinct words
tf = {}                    # subdomain -> {word: term frequency}
df = {}                    # word -> number of subdomains in which the word occurs

all_docs = []

for subdomain in all_subdomains:

    # flatten the nested word lists of every trial in this subdomain
    doc = [d for sublist in df_trial[df_trial.subdomain==subdomain]['lemmatized_filtered_whats']
             for item in sublist
             for d in item]

    all_docs += doc

    c = Counter(doc)
    all_words[subdomain] = c
    top_words[subdomain] = c.most_common(30)

    # NOTE(review): len(c) is the number of *distinct* words, not the total number
    # of tokens, so tf below is count / vocabulary size — confirm this is intended
    # (downstream plot limits are tuned to the current values).
    n_words_in_subdomain[subdomain] = len(c)

    tf[subdomain] = {i: c[i]/n_words_in_subdomain[subdomain] for i in c}

    for w in tf[subdomain]:
        df[w] = df.get(w, 0) + 1

# idf = log(number of documents / document frequency); the number of documents is
# the number of subdomains (8 with the current `subdomains` definition)
idf = {t: np.log(len(all_subdomains)/df[t]) for t in df}

tf_idf = {}

for subdomain in all_subdomains:
    tf_idf[subdomain] = {t : tf[subdomain][t] * idf[t] for t in tf[subdomain]}

In [None]:
# horizontal bar chart of the 15 highest tf-idf words for each subdomain
for subdomain in (subdomains['structures'] + subdomains['drawing']):
    p = plt.figure(figsize=(6,6))
    # A dedicated name is used here so that the document-frequency dict `df`
    # built by the tf-idf cell is not overwritten by this loop.
    top_tf_idf = (pd.DataFrame.from_dict(tf_idf[subdomain], orient='index')
                    .rename(columns={0:'tf-idf'})
                    .sort_values('tf-idf', ascending=False)
                    .iloc[0:15])
    plt.barh(y=top_tf_idf.index, width=top_tf_idf['tf-idf'])
    # shared x range so the subdomain panels are comparable
    plt.xlim(0,4)
    # highest-scoring word at the top
    p.get_axes()[0].invert_yaxis()
    # bars are horizontal, so the tf-idf values run along the x axis
    plt.xlabel('tf-idf')
    plt.title(subdomain)
    plt.subplots_adjust(left=0.2, bottom=0.4)

In [None]:
# the 30 highest tf-idf words for the 'castle' entry of tf_idf, as a table
pd.DataFrame.from_dict(tf_idf['castle'],orient='index').rename(columns={0:'tf-idf'}).sort_values('tf-idf', ascending=False).iloc[0:30]

In [None]:
# the 30 highest tf-idf words for the 'castle' entry of tf_idf, as a table
# NOTE(review): same expression as the cell directly above — possibly meant to show a different subdomain; confirm
pd.DataFrame.from_dict(tf_idf['castle'],orient='index').rename(columns={0:'tf-idf'}).sort_values('tf-idf', ascending=False).iloc[0:30]

In [None]:
# most common (word, count) pairs per subdomain as a table, one column per subdomain
pd.DataFrame.from_dict(top_words)

#### do references to 'block' or similar seem biased towards any particular subdomains?

In [None]:
# per-trial counts of block-related words, plus their row-wise total
block_words = ['block','blocks','brick','bricks','red','blue']

# .copy() makes df_blocks an independent frame, so adding a column below neither
# triggers pandas' SettingWithCopyWarning nor risks touching df_trial_whats
df_blocks = df_trial_whats[['gameID','trial_num','subdomain','domain'] + block_words].copy()
df_blocks['block_sum'] = df_blocks[block_words].sum(axis=1)

In [None]:
plt.figure(figsize=(4,8))

# bar plot of block_sum per subdomain: drawing subdomains first, then structures.
# NOTE(review): drawing colors come from domain_palettes but structures colors
# from domain_palettes_light — confirm the mix is deliberate.
sns.barplot(data=df_blocks,
            x='subdomain',
            y='block_sum',
            order=subdomains['drawing'] + subdomains['structures'],
            palette={**domain_palettes['drawing'], **domain_palettes_light['structures']})
_ = plt.xticks(rotation = 60)
plt.savefig('./plots/what_word_sum_subdomains.pdf')

## Correlations across different DSLs (very exploratory)

### instruction length vs. program length

In [None]:
# correlating instruction length (what_word_mean) with base_program_length,
# computed separately within each (domain, subdomain) group (Pearson)
# NOTE(review): the original note mentioned n_strokes, but the column correlated here is base_program_length — confirm intent

df_trial.groupby(['domain','subdomain'])[['what_word_mean','base_program_length']].corr(method='pearson')

### vocabulary size vs. library size

In [None]:
# number of distinct tokens in each row's dreamcoder_program_dsl_0_tokens entry
df_trial['dreamcoder_program_dsl_0_n_unique_tokens'] = df_trial['dreamcoder_program_dsl_0_tokens'].apply(
    lambda tokens: len(pd.unique(tokens))
)

In [None]:
# correlating library size with vocabulary size
# (n_unique_whats vs. dreamcoder_program_dsl_0_n_unique_tokens, within each subdomain)

df_trial.groupby('subdomain')[['n_unique_whats', 'dreamcoder_program_dsl_0_n_unique_tokens']].corr()

In [None]:
def unique_tokens(series):
    """Return the distinct tokens found across all token lists in `series`.

    series: an iterable whose elements are themselves iterables of tokens
        (for example one list of tokens per row).
    Returns a list of the unique tokens, in the sorted order produced by np.unique.
    """
    pooled_tokens = []
    for token_list in series:
        pooled_tokens.extend(token_list)
    return list(np.unique(pooled_tokens))

In [None]:
# unique tokens in the low_level_parts column, per (domain, subdomain) group
df_trial.groupby(['domain','subdomain']).apply(lambda x: unique_tokens(x['low_level_parts']))

In [None]:
# unique tokens in the mid_level_parts column, per (domain, subdomain) group
df_trial.groupby(['domain','subdomain']).apply(lambda x: unique_tokens(x['mid_level_parts']))

In [None]:
# unique tokens in the high_level_parts column, per (domain, subdomain) group
df_trial.groupby(['domain','subdomain']).apply(lambda x: unique_tokens(x['high_level_parts']))

In [None]:
# pairwise correlations, within each subdomain, between what_word_mean,
# n_blocks and the three program-length columns
corrs = (
    df_trial
    .groupby('subdomain')
    [['what_word_mean', 'n_blocks', 'low_level_prog_length', 'mid_level_prog_length', 'high_level_prog_length']]  #'tower_level_prog_length'
    .corr()
)

corrs

In [None]:
# pairwise correlations, within each subdomain of df_trial_topdownabs, between
# n_steps, n_blocks and the three program-length columns
corrs_steps = df_trial_topdownabs.groupby('subdomain')[['n_steps',
                               'n_blocks',
                               'low_level_prog_length',
                               'mid_level_prog_length',
                               'high_level_prog_length']].corr() #'tower_level_prog_length'

corrs_steps

In [None]:
# pairwise correlations, within each subdomain of df_trial_topdownabs, between
# n_unique_whats, n_blocks and the three unique-token-count columns
# NOTE(review): this overwrites corrs_steps from the previous cell, and the next
# cell reads corrs_steps['n_unique_whats'], so it relies on this version
corrs_steps = df_trial_topdownabs.groupby('subdomain')[['n_unique_whats',
                               'n_blocks',
                               'low_level_prog_unique_tokens',
                               'mid_level_prog_unique_tokens',
                               'high_level_prog_unique_tokens']].corr() #'tower_level_prog_length'

corrs_steps

In [None]:
# correlations of n_unique_whats with each column, flattened so that the
# second index level becomes the 'level_1' column
corrs_col = corrs_steps['n_unique_whats'].reset_index()

# drop the rows that correlate n_unique_whats with itself
corrs_series = corrs_col.loc[corrs_col['level_1'] != 'n_unique_whats']

# one line per subdomain across the remaining columns
plt.figure()
plt.xticks(rotation=90)
p = sns.lineplot(x='level_1',
                 y='n_unique_whats',
                 hue='subdomain',
                 data=corrs_series)

In [None]:
# the table of correlations that was plotted above
corrs_series

In [None]:
# same length columns as above, correlated over all rows of df_trial_topdownabs (no grouping)
df_trial_topdownabs[['n_steps',
                         'n_blocks',
                         'low_level_prog_length',
                         'mid_level_prog_length',
                         'high_level_prog_length']].corr() #'tower_level_prog_length'

In [None]:
# castle subdomain only: what_word_mean (y) against low_level_prog_length (x), one point per stimulus
# NOTE(review): the original note said "structures, n blocks vs mean word count", but
# the filter below keeps only 'castle' and the x axis is low_level_prog_length — confirm intent

# just grab means (i.e. only one row per item needed)
# NOTE(review): `df` is also the name of the document-frequency dict in the tf-idf cells; it is replaced here
df = df_trial_topdownabs[df_trial_topdownabs.subdomain == 'castle'].groupby(['domain','subdomain','stimId']).first().reset_index()

plt.figure(figsize=(10,10))

s = sns.scatterplot(data = df,
                y = 'what_word_mean',
                x = 'low_level_prog_length',
#                 hue='n_steps',
                alpha=0.6)

# s.plot([0,1],[0,1], 
#        transform=s.transAxes, 
#        color='.3',
#        linestyle='--')

In [None]:
# correlations between n_unique_whats and the three unique-token-count columns,
# over all rows of df_trial_topdownabs
df_trial_topdownabs[['n_unique_whats',
                               'low_level_prog_unique_tokens',
                               'mid_level_prog_unique_tokens',
                               'high_level_prog_unique_tokens']].corr() #'tower_level_prog_unique_tokens'

In [None]:
# same correlations as the previous cell, computed on df_trial_topdownabs_cities
df_trial_topdownabs_cities[['n_unique_whats',
                               'low_level_prog_unique_tokens',
                               'mid_level_prog_unique_tokens',
                               'high_level_prog_unique_tokens']].corr() # 'tower_level_prog_unique_tokens'

In [None]:
# TODO: find library size for each tower, only including abstractions up to that level language

# for now this cell only lists the distinct subdomain values present in df_trial
df_trial.subdomain.unique()

In [None]:
# correlate with number of words used to describe the tower

In [None]:
# display the stimURL column of df_trial
df_trial.stimURL

In [None]:
# construct libraries with different kinds of abstraction

# simple part abstractions (e.g. repeated motif)

# + types of simple abstractions (e.g. repeated motif type A)

# + transformations (e.g. mirror, repeat, stack)

# + transformation parameters (e.g. repeat n times)

# + mid-level abstractions (e.g. skyscraper wall)

# + mid-level abstractions parameters (e.g. wide vs. narrow skyscraper)

# + high-level abstractions (e.g. entire skyscraper)

# + high-level abstractions types (e.g. entire skyscraper id)