In [None]:
import os
import sys
import urllib, io
os.getcwd()
sys.path.append("..")
sys.path.append("../utils")
sys.path.append("../analysis/utils")


import numpy as np
import scipy.stats as stats
import pandas as pd
from sklearn.cluster import AffinityPropagation
from sklearn.manifold import TSNE

from collections import Counter
import json
import re
import ast

from PIL import Image, ImageOps, ImageDraw, ImageFont 

from io import BytesIO
import base64

import  matplotlib
from matplotlib import pylab, mlab, pyplot
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42

import seaborn as sns
sns.set_context('talk')
sns.set_style('darkgrid')

from IPython.display import clear_output

import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

# import drawing_utils as drawing
import importlib
import scoring

In [None]:
## directory & file hierarchy
# Every location is derived from the notebook's working directory; the project
# root is taken to be two levels above it.
proj_dir = os.path.abspath('../..')
datavol_dir = os.path.join(proj_dir,'data')
analysis_dir =  os.path.abspath('.')
results_dir = os.path.join(proj_dir,'results')
plot_dir = os.path.join(results_dir,'plots')
csv_dir = os.path.join(results_dir,'csv')
json_dir = os.path.join(results_dir,'json')
exp_dir = os.path.abspath(os.path.join(proj_dir,'behavioral_experiments'))
png_dir = os.path.abspath(os.path.join(datavol_dir,'png'))

## add helpers to python path
stimuli_dir = os.path.join(proj_dir,'stimuli')
if stimuli_dir not in sys.path:
    sys.path.append(stimuli_dir)

## make sure the output directories exist
# exist_ok=True replaces the separate "exists?" check, so there is no window
# between testing for the directory and creating it.
# NOTE(review): json_dir is defined above but, as before, is not created here.
for out_dir in (results_dir, plot_dir, csv_dir):
    os.makedirs(out_dir, exist_ok=True)

In [None]:
# Display the resolved results directory as a check on the path setup.
results_dir

## Read dataframes

In [None]:
# read in one dataframe per eventType from the csv output directory
def _read_event_csv(csv_name):
    """Load a single CSV file located in ``csv_dir``."""
    return pd.read_csv(os.path.join(csv_dir, csv_name))

df_block = _read_event_csv('df_block.csv')
df_chat = _read_event_csv('df_chat.csv')
df_exit = _read_event_csv('df_exit.csv')
df_trial = _read_event_csv('df_trial.csv')

In [None]:
# Number of distinct gameid values in the block-level data.
print('n:', df_block.gameid.nunique())

In [None]:
# iterationNames: list the distinct iterationName values present in the trial data
list(df_trial.iterationName.unique())

## Exclusion criteria

In [None]:
# 75% Accuracy on 75% of trials
# A trial counts as passed when its summed trialScore is strictly above
# `min_trial_score`; a game is kept when it has at least `min_passing_trials`
# such trials.
# NOTE(review): 9 is presumably 75% of 12 trials per game -- TODO confirm.
min_trial_score = 75
min_passing_trials = 9

trial_passed = df_trial.groupby(['gameid', 'trialNum'])['trialScore'].sum() > min_trial_score
df75 = trial_passed.to_frame().groupby(['gameid']).sum()
df75['trials'] = df75['trialScore']

enough_trials = df75['trials'] >= min_passing_trials
df75 = df75[enough_trials]
includedGames = list(df75.reset_index().gameid)

print("Total dyads achieving 75% Accuracy on 75% of trials:",len(df75))

In [None]:
# Exclude from analysis: keep only the rows whose gameid passed the criteria above
df_block = df_block.loc[df_block['gameid'].isin(includedGames)]
df_chat = df_chat.loc[df_chat['gameid'].isin(includedGames)]
df_exit = df_exit.loc[df_exit['gameid'].isin(includedGames)]
df_trial = df_trial.loc[df_trial['gameid'].isin(includedGames)]

## Task performance

For the accuracy analyses, see the accompanying `.Rmd` file.

# Word Change

In [None]:
# Load the referring-expression table from the csv directory and preview it.
df_ref_exps = pd.read_csv(os.path.join(csv_dir,'df_ref_exps.csv'))
df_ref_exps.head()

In [None]:
# Count how many rows fall under each trial_num value within each rep.
df_ref_exps.groupby('rep')['trial_num'].value_counts()

In [None]:
# Coerce the content column to strings so the text-processing steps below can
# call string methods on every row.
df_ref_exps.loc[:,'content'] = df_ref_exps.loc[:,'content'].astype(str)
df_ref_exps['content'].head()

In [None]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
# Membership tests against a set are constant-time; the list is kept under its
# original name and only the lookup structure changes.
stop_set = set(stop)

# Drop stop words from each message. Tokens are split on whitespace and compared
# exactly as written.
# NOTE(review): the comparison is case-sensitive; if the stop list is all
# lowercase, capitalised stop words ("The") are kept -- confirm this is intended.
df_ref_exps['content'] = df_ref_exps['content'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_set)
)
df_ref_exps['content'].head()

In [None]:
# convert number words

def num_2_words(sentence):
    """Convert number tokens in ``sentence`` to words, leaving other tokens as-is.

    The sentence is split on whitespace and every token is passed to
    ``num2words``; a token for which that call raises is kept unchanged.

    NOTE(review): ``num2words`` is not imported in the visible part of this
    notebook. While the name is undefined every call raises ``NameError``,
    which is caught below, so the sentence comes back unconverted -- confirm
    the import exists before relying on this step.

    Parameters
    ----------
    sentence : str
        Whitespace-separated text.

    Returns
    -------
    str
        The (possibly converted) tokens joined by single spaces, with no
        leading or trailing space.
    """
    converted = []
    for word in sentence.split():
        try:
            converted.append(num2words(word))
        except Exception:
            # Best effort: anything that cannot be converted is kept verbatim.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            converted.append(word)
    return " ".join(converted)

# Run the number-word conversion over every message.
df_ref_exps['content'] = df_ref_exps['content'].apply(num_2_words)

In [None]:
# lemmatize
import nltk
from nltk.tokenize import RegexpTokenizer


# Tokenizer that keeps runs of word characters (regex \w+), so punctuation is dropped.
tokenizer = RegexpTokenizer(r'\w+')
# Whitespace tokenizer; not referenced by any cell visible in this section.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
# WordNet lemmatizer used by lemmatize_text below.
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    """Tokenize ``text`` with the module-level ``tokenizer`` and lemmatize each token.

    Returns a list holding one lemma per token, in token order.
    """
    lemmas = []
    for token in tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Lemmatize every message and upper-case the resulting lemmas in one pass.
df_ref_exps['BOW_lemmatized'] = df_ref_exps['content'].apply(
    lambda text: [lemma.upper() for lemma in lemmatize_text(text)]
)

df_ref_exps[['message','content','BOW_lemmatized']].head()

In [None]:
## get word frequencies: one Counter of lemma -> count per message
df_ref_exps['word_freq'] = df_ref_exps['BOW_lemmatized'].apply(Counter)
df_ref_exps.head()

In [None]:
## concatenate lemmatized tokens, separated by spaces
df_ref_exps['BOW_concat'] = df_ref_exps['BOW_lemmatized'].apply(' '.join)

In [None]:
# Currently, the word counts represent the counts from all 4 of our naive raters.
# So that we can examine how frequently different words were used, we need to convert these values into proportions.
# Vocabulary = every distinct token, in order of first appearance.
split_words = df_ref_exps['BOW_concat'].apply(str.split)
flat_tokens = [token for tokens in split_words for token in tokens]
all_words = list(pd.Series(flat_tokens).unique())

# Every vocabulary word starts at a tiny non-zero value, so a word that is
# absent from a given text still has a (near-zero) entry in its distribution.
support = dict.fromkeys(all_words, 0.000000001)
    
def get_pdist(row):
    """Return the word-proportion dictionary for one row.

    Starts from a copy of the module-level ``support`` (every vocabulary word
    at its tiny default value) and overwrites the entry of each word in
    ``row['word_freq']`` with ``count / total count``. The defaults are not
    renormalized, so the values need not sum to exactly 1.

    Parameters
    ----------
    row : mapping-like
        Must provide ``row['word_freq']``, a mapping of word -> count.

    Returns
    -------
    dict
        word -> proportion, in ``support`` order (unseen keys appended).
    """
    counts = row['word_freq']
    num_words = np.sum(list(counts.values()))
    pdist = dict(support)
    pdist.update({word: count/num_words for word, count in counts.items()})
    return pdist

In [None]:
# Per-message word proportions (dict keyed by word) ...
df_ref_exps['word_pdist'] = df_ref_exps.apply(get_pdist, axis = 1)
# ... and the same values as a plain list, in the dict's key order.
df_ref_exps['word_pdist_numeric'] = df_ref_exps['word_pdist'].apply(lambda dist: list(dist.values()))

In [None]:
# Take an explicit copy: new columns are assigned to this frame afterwards, and
# writing into a bare column slice of df_ref_exps raises SettingWithCopyWarning.
df_all_words = df_ref_exps[['dyad_gameid', 'rep', 'BOW_concat']].copy()

In [None]:
# One 0/1 indicator column per vocabulary word: 1 if the word occurs among the
# whitespace-separated tokens of the row's BOW_concat, else 0.
# Each row is tokenized once (instead of once per vocabulary word) and all
# columns are attached in a single concat, which avoids inserting columns one
# at a time into the frame.
token_sets = [set(text.split()) for text in df_all_words['BOW_concat']]
word_indicators = pd.DataFrame(
    {w: [int(w in tokens) for tokens in token_sets] for w in all_words},
    index=df_all_words.index,
)
# Dropping any existing indicator columns first keeps the cell re-runnable.
df_all_words = pd.concat(
    [df_all_words.drop(columns=all_words, errors='ignore'), word_indicators],
    axis=1,
)

In [None]:
# Sum every column within each rep. The string 'sum' selects the groupby sum
# explicitly; passing the builtin `sum` is deprecated in recent pandas.
df_all_words_reps = df_all_words.groupby('rep').agg('sum')
df_all_words_reps

In [None]:
# examine the change in word frequencies between trials.
# prep data: pool all messages of a rep into one string, then derive the same
# word_freq / word_pdist columns that exist at the message level
df_ref_exps_rep = (
    df_ref_exps
    .groupby('rep')['BOW_concat']
    .apply(' '.join)
    .reset_index()
)
df_ref_exps_rep['word_freq'] = df_ref_exps_rep['BOW_concat'].apply(lambda text: Counter(text.split()))
df_ref_exps_rep['word_pdist'] = df_ref_exps_rep.apply(get_pdist, axis=1)
df_ref_exps_rep['word_pdist_numeric'] = df_ref_exps_rep['word_pdist'].apply(lambda pdist: list(pdist.values()))
# index by rep (keeping the column too) so rows can be looked up with .loc[rep]
df_ref_exps_rep = df_ref_exps_rep.set_index('rep', drop=False)

In [None]:
# calculate difference in proportion between reps (currently hardcoded to be 1 and 4)
rep_a = 1
rep_b = 4

pdist_a = df_ref_exps_rep.loc[rep_a,'word_pdist']
pdist_b = df_ref_exps_rep.loc[rep_b,'word_pdist']

# word -> (proportion in rep_b) - (proportion in rep_a), in rep_a's key order
rep_diff = {word: pdist_b[word] - prop_a for word, prop_a in pdist_a.items()}

In [None]:
# find largest n increase/ decrease in proportion across reps
n = 6

# words ordered from largest increase to largest decrease; keep the first n
ranked_desc = sorted(rep_diff.items(), key=lambda pair: pair[1], reverse=True)
top_n = dict(ranked_desc[:n])

# words ordered from largest decrease to largest increase; keep the first n
ranked_asc = sorted(rep_diff.items(), key=lambda pair: pair[1], reverse=False)
bottom_n = dict(ranked_asc[:n])

# summing the per-message token lists concatenates them into one list per rep
df_grouped = df_ref_exps.groupby('rep').agg({'BOW_lemmatized': 'sum'})


In [None]:
from matplotlib.ticker import FormatStrFormatter

font = {'fontname':'Helvetica'}
sns.set_theme(style='white')

# number of words shown at each extreme (largest decreases / largest increases)
x_limit = 6

labels, values = zip(*rep_diff.items())
labels = np.array(labels)
values = np.array(values)

# ascending order of change, and the same order reversed
indSort_low = np.argsort(values)
indSort_high = indSort_low[::-1]

# bars run from the largest decrease on the left to the largest increase on the
# right: the x_limit smallest values, then the x_limit largest in ascending order
shown = np.concatenate([indSort_low[:x_limit], indSort_high[:x_limit][::-1]])
labels = labels[shown]
values = values[shown]

indexes = np.arange(len(labels))

fig = plt.figure(num=None, figsize=(7, 11), dpi=80, facecolor='w', edgecolor='k')
ax = fig.add_subplot(111)
ax.bar(indexes, values, color = "#7D7D7D")
ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))

# add labels
# The y ticks are set together with their font, so the font settings apply to
# the ticks that are actually drawn.
# NOTE(review): the tick range is hard-coded and presumably tuned to this
# dataset -- confirm it still covers the data.
plt.yticks(np.arange(-.13,.06, .02), fontsize=16, **font)
# ax.bar centres each bar on its index, so the tick labels sit at the indexes
# themselves (no horizontal offset).
plt.xticks(indexes, labels,  rotation='vertical', fontsize=16, **font)
plt.ylabel("change in proportion", size = 24, **font)
ax.axes.get_xaxis().set_visible(True)
#plt.title("highest delta words", size = 24, **font)
plt.show()

## Cluster analyses

In [None]:
df_all_words = df_ref_exps[['dyad_gameid', 'rep', 'BOW_concat']].copy()

# One 0/1 indicator column per vocabulary word (1 if the word is among the
# row's whitespace-separated tokens). Rows are tokenized once and the columns
# are attached in a single concat rather than inserted one by one.
token_sets = [set(text.split()) for text in df_all_words['BOW_concat']]
word_indicators = pd.DataFrame(
    {w: [int(w in tokens) for tokens in token_sets] for w in all_words},
    index=df_all_words.index,
)
df_all_words = pd.concat([df_all_words, word_indicators], axis=1)

# df_all_words_reps = df_all_words.groupby('rep').agg(sum)
# df_all_words_reps
# df_all_words_reps = df_all_words_reps.sort_values(by = 0, axis = 1)
df_all_words

In [None]:
# word count (across all four raters)
# Pool the messages sharing a (dyad, rep, trial) key into one string, then
# count the tokens of the pooled string.
trial_keys = ['dyad_gameid','rep','trial_num']
df_ref_exps_trial = (
    df_ref_exps
    .groupby(trial_keys)['BOW_concat']
    .apply(' '.join)
    .reset_index()
)
df_ref_exps_trial['word_freq'] = df_ref_exps_trial['BOW_concat'].apply(lambda text: Counter(text.split()))
df_ref_exps_trial

In [None]:
# Explicit copy: columns are added to this frame here and in later cells, and
# writing into a bare column slice of df_ref_exps_trial raises
# SettingWithCopyWarning.
df_all_words_trial = df_ref_exps_trial[['dyad_gameid', 'rep', 'trial_num' ,'BOW_concat']].copy()

# One 0/1 indicator column per vocabulary word (1 if the word is among the
# row's whitespace-separated tokens). Rows are tokenized once and the columns
# are attached in a single concat rather than inserted one by one.
token_sets = [set(text.split()) for text in df_all_words_trial['BOW_concat']]
word_indicators = pd.DataFrame(
    {w: [int(w in tokens) for tokens in token_sets] for w in all_words},
    index=df_all_words_trial.index,
)
df_all_words_trial = pd.concat([df_all_words_trial, word_indicators], axis=1)

In [None]:
# Display the trial-level word-indicator table.
df_all_words_trial

In [None]:
# Cluster the trial-level word-indicator vectors separately for rep 1 and rep 4.
# NOTE(review): the column slice 'TWO':'TWR' hard-codes two word columns as the
# feature range -- presumably it spans every word column; confirm against all_words.
is_rep1 = df_all_words_trial.rep == 1
is_rep4 = df_all_words_trial.rep == 4

rep1_features = df_all_words_trial[is_rep1].loc[:, 'TWO':'TWR']
rep4_features = df_all_words_trial[is_rep4].loc[:, 'TWO':'TWR']

r1_clustering_original = AffinityPropagation(random_state=0, damping=0.5).fit(rep1_features)
r4_clustering_original = AffinityPropagation(random_state=0, damping=0.5).fit(rep4_features)

# Store each rep's cluster labels on its own rows.
# NOTE(review): the rep-1 labels go into a column named 'r0_label' while the
# rep-4 labels use 'r4_label' -- confirm the naming is intended.
df_all_words_trial.loc[is_rep1, 'r0_label'] = r1_clustering_original.labels_
df_all_words_trial.loc[is_rep4, 'r4_label'] = r4_clustering_original.labels_

In [None]:
# Cluster label assigned to each rep-4 row.
r4_clustering_original.labels_

In [None]:
# Number of rep-1 rows that were clustered.
len(r1_clustering_original.labels_)

In [None]:
# The rep-1 feature columns that were passed to the clustering.
df_all_words_trial[(df_all_words_trial.rep == 1)].loc[:,'TWO':'TWR']

In [None]:
# t-SNE perplexity used by the per-rep embeddings below.
perplexity = 10

In [None]:
# visualizations using tsne, colored by clusters above

# A fixed random_state makes the embedding (and therefore the plot) reproducible
# across runs; without it the layout changes every time the cell executes.
tsne = TSNE(perplexity = perplexity, random_state=0)
X_embedded = tsne.fit_transform(df_all_words_trial[(df_all_words_trial.rep == 1)].loc[:,'TWO':'TWR'])
cluster_labels = r1_clustering_original.labels_

plt.figure(figsize=(10,10))

# The colours come from the 'jet_r' colormap passed below; the previously
# computed "bright" palette was never handed to the plot, so it is not built here.
sns.scatterplot(x = X_embedded[:,0],
                y = X_embedded[:,1],
                hue=cluster_labels,
                legend='full',
                palette='jet_r')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)

In [None]:
# visualizations using tsne, colored by clusters above

# A fixed random_state makes the embedding (and therefore the plot) reproducible
# across runs; without it the layout changes every time the cell executes.
tsne = TSNE(perplexity=perplexity, random_state=0)
X_embedded = tsne.fit_transform(df_all_words_trial[(df_all_words_trial.rep == 4)].loc[:,'TWO':'TWR'])
cluster_labels = r4_clustering_original.labels_

plt.figure(figsize=(10,10))

# The colours come from the 'jet_r' colormap passed below; the previously
# computed "bright" palette was never handed to the plot, so it is not built here.
sns.scatterplot(x = X_embedded[:,0],
                y = X_embedded[:,1],
                hue=cluster_labels,
                legend='full',
                palette='jet_r')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)

In [None]:
# visualizations using tsne, colored by clusters above (I don't know if this is a silly thing to do)
# Embeds the rep-1 and rep-4 feature rows together in one t-SNE space, then
# draws two scatter plots from that shared embedding: the first half of the
# points, then the second half.

# Seed numpy's global RNG before fitting.
# NOTE(review): presumably TSNE draws from this global state when no
# random_state is given -- confirm; note perplexity is hard-coded to 10 here.
np.random.seed(0)
tsne = TSNE(perplexity=10)

# rep-1 rows stacked on top of rep-4 rows (same 'TWO':'TWR' feature columns)
both_reps = pd.concat([df_all_words_trial[(df_all_words_trial.rep == 1)].loc[:,'TWO':'TWR'], df_all_words_trial[(df_all_words_trial.rep == 4)].loc[:,'TWO':'TWR']], axis=0)

X_embedded = tsne.fit_transform(both_reps)
#cluster_labels = r0_clustering_original.labels_ + r3_clustering_original.labels_
# NOTE(review): the rep-1 labels are concatenated with themselves, so both
# halves are coloured by the rep-1 clustering, whereas the trailing comment
# speaks of the *final* cluster assignment -- confirm which is intended.
cluster_labels = np.concatenate((r1_clustering_original.labels_, r1_clustering_original.labels_)) # visualizations of both reps are colored by their final cluster assignment, to show convergence towards strategies

# r3_X_embedded = tsne.fit_transform(df_all_words_trial[(df_all_words_trial.repNum == 3)].loc[:,'two':'ablue'])
# r3_cluster_labels = r3_clustering_original.labels_

# colors = pd.concat([df_all_words_trial[(df_all_words_trial.rep == 1)], (df_all_words_trial[(df_all_words_trial.rep == 4)])], axis = 0).rep

# Half the number of embedded points: rows [:n] are plotted first, rows [n:] second.
# NOTE(review): this treats the halves as rep 1 / rep 4, which assumes both
# reps contribute the same number of rows -- confirm.
n = int(len(X_embedded[:,0])/2)

# for i, x in enumerate(X_embedded[:n,0]):
#     plt.plot([x,X_embedded[i+n,0]], [X_embedded[i,1],X_embedded[i+n,1]], color = (0,0,0,0.05) )

# One colour per distinct cluster label, stored as an array so rows can be
# overwritten with boolean masks.
palette = np.array(sns.color_palette("jet_r", len(set(cluster_labels))))

# Clusters with at most 3 members in the second half are drawn in light grey ...
# NOTE(review): the masks are built by iterating set(cluster_labels); this
# assumes the set's iteration order lines up with the palette rows -- confirm.
palette[[(Counter(cluster_labels[n:])[x] <= 3) for x in set(cluster_labels)]] = (0.8,0.8,0.8)

# ... and the remaining (larger) clusters take colours from the "bright" palette.
palette[[(Counter(cluster_labels[n:])[x] > 3) for x in set(cluster_labels)]] = sns.color_palette("bright", len(set(cluster_labels)) - sum([(Counter(cluster_labels[n:])[x] <= 3) for x in set(cluster_labels)]))

palette = list(palette)

# Hand-picked colour overrides for palette entries 5 and 7; these index
# assignments require at least 8 entries in the palette.
palette[5] = ([0.4,0.0,0.8])

# palette[6] = ([0.75,0.05,0.07])

palette[7] = ([0.2,0.7,0.3])

# palette[7] = ([0.2,0.7,0.3])

sns.set_style('white')

# First figure: the first n embedded points.
plt.figure(figsize=(8,8))
sns.scatterplot(x = X_embedded[:n,0], y = X_embedded[:n,1], hue=cluster_labels[:n], legend='full', palette=palette, alpha=0.8, s=160, linewidth=0.5)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)
plt.tick_params(
    axis='both',          # changes apply to both the x- and y-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    left=False,         # ticks along the left edge are off
    labelbottom=False,  # labels along the bottom edge are off
    labelleft=False) # labels along the left edge are off

# plt.savefig('../results/plots/rep1_clusters.pdf')

# Second figure: the remaining points, using the same palette.
plt.figure(figsize=(8,8))

# for i, x in enumerate(X_embedded[:n,0]):
#     plt.plot([x,X_embedded[i+n,0]], [X_embedded[i,1],X_embedded[i+n,1]], color = palette[cluster_labels[i+n]], alpha=0.1)

sns.scatterplot(x = X_embedded[n:,0], y = X_embedded[n:,1], hue=cluster_labels[n:], legend='full', palette=palette, alpha=0.8, s=160, linewidth=0.5)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)
plt.tick_params(
    axis='both',          # changes apply to both the x- and y-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    left=False,         # ticks along the left edge are off
    labelbottom=False,  # labels along the bottom edge are off
    labelleft=False)    # labels along the left edge are off

# plt.savefig('../results/plots/rep4_clusters.pdf')


In [None]:
# Stack the rep-1 and rep-4 rows (rep 1 first) and attach the cluster labels
# computed for the combined embedding.
df_both_reps = pd.concat([df_all_words_trial[(df_all_words_trial.rep == 1)], df_all_words_trial[(df_all_words_trial.rep == 4)]], axis=0)
df_both_reps['label'] = cluster_labels

# Inspect the messages assigned to one cluster (label 9, chosen by hand).
# .copy() makes df_tmp an independent frame, so assigning its column below does
# not write into a slice of df_both_reps.
df_tmp = df_both_reps.loc[df_both_reps.label == 9, ['rep','BOW_concat']].copy()
# Distinct tokens per message, in order of first appearance. The list is wrapped
# in a Series because calling pd.unique on a plain list is deprecated.
df_tmp['BOW_concat'] =  df_tmp['BOW_concat'].apply(lambda words: pd.Series(words.split(' ')).unique())
df_tmp

In [None]:
# Display the trial-level table of pooled messages and their word counts.
df_ref_exps_trial