# Linguistic analyses

In [None]:
import os
import sys
import urllib, io
os.getcwd()

import numpy as np
import scipy.stats as stats
import pandas as pd
from sklearn.cluster import AffinityPropagation
from sklearn.manifold import TSNE

import pymongo as pm
from collections import Counter
import json
import re
import ast

from PIL import Image, ImageOps, ImageDraw, ImageFont 

from io import BytesIO
import base64

import  matplotlib
from matplotlib import pylab, mlab, pyplot
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42

import seaborn as sns
sns.set_context('talk')
sns.set_style('darkgrid')

from IPython.display import clear_output

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

from nltk.corpus import stopwords
stop = stopwords.words('english')

import nltk
from nltk.tokenize import RegexpTokenizer

import num2words
from num2words import num2words

## Load data

In [None]:
# Read in the dataframes for each eventType: chat messages and trials.
# Later cells use df_chat['content'] (message text) and df_trial['trialScore'],
# and both tables are keyed by 'gameid' and 'trialNum'.
df_chat = pd.read_csv('../../results/csv/df_chat_cogsci21.csv')
df_trial = pd.read_csv('../../results/csv/df_trial_cogsci21.csv')

#### Remove datasets that did not meet the accuracy threshold

In [None]:
# Inclusion criterion: "75% accuracy on 75% of trials".
# A trial counts as passed when its summed trialScore is strictly above 75;
# a game is kept when at least 9 of its trials pass.
# NOTE(review): 9 is presumably 75% of 12 trials per game, and the strict
# '> 75' excludes a score of exactly 75 -- confirm both are intended.
score_per_trial = df_trial.groupby(['gameid', 'trialNum'])['trialScore'].sum()
passed_trial = pd.DataFrame(score_per_trial > 75)

# Summing the boolean flags within a game gives its number of passed trials.
df75 = passed_trial.groupby(['gameid']).sum()
df75['trials'] = df75['trialScore']
df75 = df75[df75['trials'] >= 9]

includedGames = list(df75.reset_index().gameid)

print("Total dyads achieving 75% Accuracy on 75% of trials:", len(df75))

In [None]:
# Exclude from analysis every game that is not in includedGames.
# .copy() turns each filtered result into an independent frame: new columns
# are assigned to df_chat in the following cells, and writing into a
# boolean-mask slice triggers pandas' SettingWithCopyWarning.
df_chat = df_chat[df_chat.gameid.isin(includedGames)].copy()
df_trial = df_trial[df_trial.gameid.isin(includedGames)].copy()

## Basic linguistic analyses

In [None]:
# Create per-message word and character counts.
# Words are split on any run of whitespace: splitting on a single ' ' counts
# an extra (empty) "word" for every doubled, leading or trailing space.
df_chat['word_count'] = df_chat['content'].str.split().str.len()
df_chat['char_count'] = df_chat['content'].str.len()

In [None]:
# df_chat["timeElapsedInTurn"] = pd.to_numeric(df_chat['timeElapsedInTurn'])

trial_keys = ['gameid', 'trialNum']

# Add per-trial word and character totals to the trial dataframe.
trial_sums = (
    df_chat.groupby(trial_keys)[['word_count', 'char_count']]
    .sum()
    .reset_index()
)
# NOTE(review): this merge is 'outer' while the message-count merge below is
# 'left'; an outer merge also keeps (gameid, trialNum) pairs that exist only
# in df_chat -- confirm that is intended.
df_trial = df_trial.merge(trial_sums, how='outer', on=trial_keys)

# Message counts: number of chat rows per (gameid, trialNum).
counts = (
    df_chat.groupby(trial_keys)['iterationName']
    .count()
    .reset_index(name='n_messages')
)
df_trial = df_trial.merge(counts, how='left', on=trial_keys)

In [None]:
# plot word/char count over repetitions

## Analysing referring expressions 

While our analyses so far tell us about language use in general, we're primarily interested in how the expressions used to refer to entities in our experiment change over time.  
In particular, we want to know when people transition from giving lower-level, block-by-block placement instructions to referring to higher-level tower abstractions.  
While we could in principle use NLP techniques to extract noun phrases and assess their meaning, people might use a wide variety of expressions to refer to blocks and towers.  

We therefore asked naive raters to **identify the referring expressions** used in each message, as well as the **number of abstractions at each level (block vs. tower)**.

### Load referring expression annotations

In [None]:
# Load the referring-expression annotations. Columns used in later cells:
# 'message', 'content', 'block', 'tower', 'rep' and 'dyad_gameid'.
df_ref_exps = pd.read_csv('data/df_ref_exps.csv')

In [None]:
# Preview the first rows of the annotation table.
df_ref_exps.head()

In [None]:
# Look at the collection of referring expressions identified by raters.
# NOTE(review): 'block' and 'tower' presumably hold the number of
# abstractions at each level, as described in the text above -- confirm.
df_ref_exps[['message','content','block','tower']].head()

In [None]:
# Cast 'content' to str so the string operations in the preprocessing cells
# below can be applied to every row.
# NOTE(review): astype(str) turns missing values into the literal text 'nan',
# which would then be treated as a word downstream -- confirm 'content' has
# no missing entries.
df_ref_exps.loc[:,'content'] = df_ref_exps.loc[:,'content'].astype(str)

## Preprocessing

In [None]:
# remove stop words
from nltk.corpus import stopwords
stop = stopwords.words('english')

# The NLTK English stop list is lower-case, so words are lower-cased for the
# comparison only; otherwise capitalised stop words ("The", "A", ...) slip
# through and end up in the vocabulary. `stop` itself stays a list; the set
# is just for constant-time membership tests.
stop_set = set(stop)

df_ref_exps['content'] = df_ref_exps['content'].apply(
    lambda text: ' '.join(word for word in text.split() if word.lower() not in stop_set)
)
df_ref_exps['content'].head()

In [None]:
# convert number words

def num_2_words(sentence):
    """Spell out the numeric tokens of *sentence* as words.

    The sentence is split on whitespace and every token is offered to
    ``num2words``; a token that cannot be converted is kept unchanged.

    Args:
        sentence: Text to convert.

    Returns:
        The converted tokens joined by single spaces, with no leading or
        trailing space. An empty or whitespace-only sentence gives ``""``.
    """
    converted = []
    for word in sentence.split():
        try:
            converted.append(num2words(word))
        except Exception:
            # Best effort: anything num2words rejects is an ordinary word.
            # `Exception` (rather than a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            converted.append(word)
    return " ".join(converted)

# Run the number-to-word conversion over every message.
df_ref_exps['content'] = df_ref_exps['content'].apply(num_2_words)

In [None]:
# lemmatize

# Tokenizer that keeps runs of word characters (\w+), so punctuation is dropped.
tokenizer = RegexpTokenizer(r'\w+')
# Whitespace tokenizer; not used in the cells visible here.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
# WordNet lemmatizer used by lemmatize_text below.
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    """Tokenize *text* and return the list of lemmatized tokens.

    Tokens are the word-character runs produced by the module-level
    ``tokenizer``. Each token is lower-cased before it is handed to the
    module-level WordNet ``lemmatizer``: WordNet lookups are case-sensitive,
    so a capitalised form such as "Blocks" would otherwise come back
    unchanged instead of as "block".

    Args:
        text: Message text.

    Returns:
        The lower-case lemmas, in order of appearance.
    """
    return [lemmatizer.lemmatize(w.lower()) for w in tokenizer.tokenize(text)]

# Bag of words per message: lemmatize the text, then upper-case every lemma.
df_ref_exps['BOW_lemmatized'] = df_ref_exps['content'].apply(
    lambda text: [lemma.upper() for lemma in lemmatize_text(text)]
)

df_ref_exps[['message','content','BOW_lemmatized']].head()

In [None]:
# Per-message token counts (token -> number of occurrences in the message).
df_ref_exps['word_freq'] = df_ref_exps['BOW_lemmatized'].apply(Counter)
df_ref_exps.head()

In [None]:
# Space-separated string form of each message's bag of words.
df_ref_exps['BOW_concat'] = df_ref_exps['BOW_lemmatized'].apply(' '.join)

## Creating distributions of words

In [None]:
# create support
# Vocabulary: every distinct token across all messages, in order of first
# appearance.
split_words = df_ref_exps['BOW_concat'].apply(str.split)
flat_tokens = [token for tokens in split_words for token in tokens]
all_words = list(pd.Series(flat_tokens).unique())

# Every word starts with a tiny non-zero mass; get_pdist overwrites the entries
# of the words that actually occur.
# NOTE(review): presumably this keeps later log/divergence computations
# finite -- confirm.
support = dict.fromkeys(all_words, 0.000000001)
    
def get_pdist(row):
    """Return the word distribution of *row* over the shared vocabulary.

    Args:
        row: A record whose ``'word_freq'`` entry maps each word to its count.

    Returns:
        dict: a copy of the module-level ``support`` in which every word
        present in ``row['word_freq']`` is replaced by its relative frequency
        (count divided by the total count of the row). Words not in the row
        keep the small default mass from ``support``, so the values are not
        renormalised.
    """
    freqs = row['word_freq']
    total = np.sum(list(freqs.values()))
    pdist = dict(support)
    pdist.update({word: count / total for word, count in freqs.items()})
    return pdist

In [None]:
# Per-message word distribution, plus its values as a plain list (in the key
# order of the distribution dict).
df_ref_exps['word_pdist'] = df_ref_exps.apply(get_pdist, axis=1)
df_ref_exps['word_pdist_numeric'] = df_ref_exps['word_pdist'].map(
    lambda pdist: list(pdist.values())
)

In [None]:
# Working table that receives one indicator column per vocabulary word.
# .copy() detaches it from df_ref_exps, so adding columns to it does not write
# into a slice of another frame (pandas' SettingWithCopyWarning).
df_all_words = df_ref_exps[['dyad_gameid', 'rep', 'BOW_concat']].copy()

In [None]:
# One 0/1 column per vocabulary word: 1 if the message contains the word.
# Each message is split once (instead of once per vocabulary word), and all
# indicator columns are built together and attached in a single concat rather
# than inserted one at a time into a sliced frame.
token_sets = [set(text.split()) for text in df_all_words['BOW_concat']]
indicators = pd.DataFrame(
    {w: [int(w in tokens) for tokens in token_sets] for w in all_words},
    index=df_all_words.index,
)
# Dropping existing indicator columns first keeps the cell safe to re-run.
df_all_words = pd.concat(
    [df_all_words.drop(columns=all_words, errors='ignore'), indicators],
    axis=1,
)

In [None]:
# Number of messages in each repetition that contain each vocabulary word.
# Only the indicator columns are summed: aggregating the whole frame would
# also "sum" (concatenate) the id and text columns. .sum() is used instead of
# passing the builtin sum to agg.
df_all_words_reps = df_all_words.groupby('rep')[all_words].sum()
df_all_words_reps

#### Change in word frequency plot 

In [None]:
# Pool all messages of a repetition into one space-joined document, then
# recompute word counts and the word distribution at the repetition level.
df_ref_exps_rep = (
    df_ref_exps.groupby('rep')['BOW_concat']
    .apply(' '.join)
    .reset_index()
)
df_ref_exps_rep['word_freq'] = df_ref_exps_rep['BOW_concat'].apply(
    lambda doc: Counter(doc.split())
)
df_ref_exps_rep['word_pdist'] = df_ref_exps_rep.apply(get_pdist, axis=1)
df_ref_exps_rep['word_pdist_numeric'] = df_ref_exps_rep['word_pdist'].apply(
    lambda pdist: list(pdist.values())
)
# Index by repetition (keeping the 'rep' column) so rows can be looked up
# with .loc[rep, ...].
df_ref_exps_rep = df_ref_exps_rep.set_index('rep', drop=False)

In [None]:
# Difference in word proportion between two repetitions:
# rep_diff[word] = proportion in rep_b - proportion in rep_a.

rep_a = 1
rep_b = 4

pdist_a = df_ref_exps_rep.loc[rep_a, 'word_pdist']
pdist_b = df_ref_exps_rep.loc[rep_b, 'word_pdist']

rep_diff = {word: pdist_b[word] - prop_a for word, prop_a in pdist_a.items()}

In [None]:
# Find the largest n increases / decreases in proportion across reps.
# n is the number of words kept in each of top_n and bottom_n below.
n = 6

In [None]:
# The n words whose proportion increased most between the two reps,
# ordered from the largest increase down.
ranked_desc = sorted(rep_diff.items(), key=lambda kv: kv[1], reverse=True)
top_n = dict(ranked_desc[:n])

top_n

In [None]:
# The n words whose proportion decreased most between the two reps,
# ordered from the largest decrease up.

ranked_asc = sorted(rep_diff.items(), key=lambda kv: kv[1])
bottom_n = dict(ranked_asc[:n])

bottom_n

In [None]:
# Aggregate 'BOW_lemmatized' within each repetition with 'sum'; for list
# values this concatenates the per-message token lists.
# NOTE(review): df_grouped is not used in the cells visible here.
df_grouped = df_ref_exps.groupby('rep').agg({'BOW_lemmatized': 'sum'})

In [None]:
from matplotlib.ticker import FormatStrFormatter

# Bar chart of the words in rep_diff whose proportion changed most:
# the x_limit largest decreases followed by the x_limit largest increases,
# both in ascending order of change.
font = {'fontname':'Helvetica'}
sns.set_theme(style='white')

x_limit = 6

labels, values = zip(*rep_diff.items())
labels = np.array(labels)
values = np.array(values)

# Sort positions by change, ascending and descending.
indSort_low = np.argsort(values)
indSort_high = indSort_low[::-1]

# Keep the extremes; the top group is reversed so the whole axis is ascending.
selected = np.concatenate([indSort_low[:x_limit], indSort_high[:x_limit][::-1]])
labels = labels[selected]
values = values[selected]

indexes = np.arange(len(labels))

fig = plt.figure(num=None, figsize=(7, 11), dpi=80, facecolor='w', edgecolor='k')
ax = fig.add_subplot(111)
ax.bar(indexes, values, color = "#7D7D7D")
ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))

# add labels
# Bars are centred on `indexes` (matplotlib's default align='center'), so the
# tick labels go at `indexes` too; offsetting them shifts every label away
# from its bar.
plt.xticks(indexes, labels, rotation='vertical', fontsize=16, **font)
plt.ylabel("change in proportion", size = 24, **font)
# Tick positions and label styling are set in one call so the font settings
# apply to the ticks that are actually drawn.
# NOTE(review): the tick range is hard-coded; adjust it if rep_a / rep_b or
# the data change.
plt.yticks(np.arange(-.13,.06, .02), fontsize=16, **font)
ax.axes.get_xaxis().set_visible(True)
#plt.title("highest delta words", size = 24, **font)
plt.show()