In [1]:
from transformers import BertTokenizer, BertModel
import pandas as pd
import numpy as np
import torch

In [2]:
import os
import openai

openai.api_key = os.getenv("OPENAI_API_KEY")

In [3]:
# load data
##############
# These sentences show the different forms of the word 'stand' to show the value of contextualized embeddings
import csv

texts = list()
schemas = dict()

with open('data/stand_rich.csv') as file:
    for line in files:
        tokens = line.split('\t')
        texts.append(line)
    read_csv = csv.reader(file, delimiter='\t')

    header = next(read_csv)
    heads = ['image', 'group', 'embedding']
    # schemas['stand'] = { head: None for head in heads }

    for n, values in enumerate(read_csv):

        expression = values[0]
        image = set(values[1].split("-"))
        group = int(values[2])
        embedding = [float(value) for value in values[3:]]

        entry = { key: value for key, value in zip(heads, [image, group, embedding])}
        schemas[expression] = entry
        texts.append(expression)

        if n == 2: [print(item) for item in schemas.items()]

('stand at attention', {'image': {'V', 'CP', 'L', 'B'}, 'group': 1, 'embedding': [4.0, 6.08, 5.0, 3.08, 2.08]})
('stand out in several sports', {'image': {'R', 'V', 'L', 'B', 'CP'}, 'group': 1, 'embedding': [2.13, 3.54, 2.83, 5.04, 3.42]})
('to stand firm', {'image': {'V', 'R', 'CP', 'L', 'B'}, 'group': 2, 'embedding': [6.21, 4.13, 5.04, 3.75, 2.83]})


In [4]:
import json
import time
from collections import defaultdict

# get openai embeddings
openai_similarity_embedding_models = [
    'text-similarity-ada-001',
    'text-similarity-babbage-001',
    'text-similarity-curie-001',
    'text-similarity-davinci-001',
]

create_openai_embeddings = False
# create_openai_embeddings = True
if create_openai_embeddings:
    n = 0
    openai_similarity_embeddings = defaultdict(list)
    for model in openai_similarity_embedding_models:
        for text in texts:
            # RateLimitError: Rate limit reached for default in organization
            n=+1
            if n % 10:
                time.sleep(3)
            response = openai.Embedding.create(
                 input=text,
                 model=model )
            embeddings = response['data'][0]['embedding']
            openai_similarity_embeddings[model].append(embeddings)

    with open('data/openai_similarity_embeddings.json', 'w') as json_file:
        json.dump(openai_similarity_embeddings, json_file)
else:
    with open('data/openai_similarity_embeddings.json') as json_file:
        openai_similarity_embeddings = json.load(json_file)

# experimental conditions
conditions = [f'gpt3_{model.split("-")[2]}'
              for model in openai_similarity_embedding_models]

embeddings = { condition:
                   openai_similarity_embeddings[openai_similarity_embedding_model]
               for condition, openai_similarity_embedding_model
                in zip(conditions, openai_similarity_embedding_models)
               }

In [5]:
# Loading the pre-trained BERT model
###################################
# Embeddings will be derived from

# the outputs of this model
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True,
                                  )

# Setting up the tokenizer
###################################
# This is the same tokenizer that
# was used in the model to generate
# embeddings to ensure consistency
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
def bert_text_preparation(text, tokenizer):
    """Preparing the input for BERT
    
    Takes a string argument and performs pre-processing like adding special tokens, tokenization,
    tokens to ids, and tokens to segment ids. All tokens are mapped to segment id = 1.
    
    Args:
        text (str): Text to be converted tokenizer (obj): Tokenizer object
            to convert text into BERT-readable tokens and ids
        
    Returns:
        list: List of BERT-readable tokens
        obj: Torch tensor with token ids
        obj: Torch tensor segment ids
    
    
    """
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1]*len(indexed_tokens)


    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    return tokenized_text, tokens_tensor, segments_tensors
    
def get_bert_embeddings(tokens_tensor, segments_tensors, model):
    """Get embeddings from an embedding model

    Args:
        tokens_tensor (obj): Torch tensor size [n_tokens]
            with token ids for each token in text
        segments_tensors (obj): Torch tensor size [n_tokens]
            with segment ids for each token in text
        model (obj): Embedding model to generate embeddings
            from token and segment ids

    Returns:
        list: List of list of floats of size
            [n_tokens, n_embedding_dimensions]
            containing embeddings for each token

    """

    # Gradient calculation id disabled
    # Model is in inference mode
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        # Removing the first hidden state
        # The first state is the input state
        hidden_states = outputs[2][1:]

    # Getting embeddings from the final BERT layer
    token_embeddings = hidden_states[-1]
    # Collapsing the tensor into 1-dimension
    token_embeddings = torch.squeeze(token_embeddings, dim=0)
    # Converting torchtensors to lists
    list_token_embeddings = [token_embed.tolist() for token_embed in token_embeddings]

    return list_token_embeddings

In [7]:
# Getting embeddings for the target
# word in all given contexts
target_word_embeddings = []

for text in texts:
    tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(text, tokenizer)
    list_token_embeddings = get_bert_embeddings(tokens_tensor, segments_tensors, model)

    # include variations of stand
    for variation in ['stand', 'stood', 'stands', 'standing']:
        tokenized_text = list(map(lambda x: x.replace(variation, 'stand'), tokenized_text))

    # Find the position 'stand' in list of tokens
    word_index = tokenized_text.index('stand')
    # Get the embedding for bank
    word_embedding = list_token_embeddings[word_index]

    target_word_embeddings.append(word_embedding)

embeddings['bert'] = target_word_embeddings

In [8]:
# Getting embeddings for the target as above
# word in all given contexts

human_sentence_embeddings = []

for expression in texts:
    sentence_embedding = schemas[expression]['embedding']
    human_sentence_embeddings.append( sentence_embedding )

embeddings['human_label'] = human_sentence_embeddings

In [9]:
from scipy.spatial.distance import cosine
# Calculating the distance between the
# embeddings of 'bank' in all the
# given contexts of the word

def get_distance_df(target_embeddings):
    list_of_distances = []
    for text1, embed1 in zip(texts, target_embeddings):
        for text2, embed2 in zip(texts, target_embeddings):
            cos_dist = cosine(embed1, embed2)
            list_of_distances.append([text1, text2, cos_dist])

    distances_df = pd.DataFrame(list_of_distances, columns=['text1', 'text2', 'distance'])
    return distances_df

def get_distance_array(target_embeddings):
    distances_df = get_distance_df(target_embeddings)
    distance_matrix = pd.pivot_table(distances_df,
                        values='distance', index='text1', columns='text2')
    distance_array = np.array(distance_matrix)
    return distance_array, distance_matrix

In [10]:
conditions = list(embeddings.keys())
print(conditions)

['gpt3_ada', 'gpt3_babbage', 'gpt3_curie', 'gpt3_davinci', 'bert', 'human_label']


In [11]:
distances = [get_distance_array(embeddings[condition])
                   for condition in conditions]

In [12]:
for x in conditions: print(x)

gpt3_ada
gpt3_babbage
gpt3_curie
gpt3_davinci
bert
human_label


In [13]:
distances[0][0][:4]

array([[0.        , 0.20037083, 0.20633172, 0.18682465, 0.16081595,
        0.18469388, 0.19844114, 0.18363315, 0.19313584, 0.1618257 ,
        0.22849031, 0.15678066, 0.19342615, 0.12127581, 0.18622777,
        0.21008724, 0.21138558, 0.19625726, 0.18758157, 0.27973558,
        0.17560411, 0.18342449, 0.19238455, 0.20467588, 0.17342221,
        0.1989533 , 0.17606686, 0.19327192, 0.18598225, 0.19471532,
        0.14955998, 0.22636314],
       [0.20037083, 0.        , 0.21315364, 0.21040712, 0.16839863,
        0.21641092, 0.17375067, 0.11336646, 0.18720296, 0.16806214,
        0.22499673, 0.18413632, 0.21270323, 0.19619273, 0.1881653 ,
        0.26145295, 0.22326662, 0.21376567, 0.21239476, 0.25709376,
        0.17829902, 0.16688188, 0.2046902 , 0.21178492, 0.17666986,
        0.20666023, 0.1759264 , 0.21905431, 0.20262967, 0.20864565,
        0.16586336, 0.24479995],
       [0.20633172, 0.21315364, 0.        , 0.1517975 , 0.21927399,
        0.24658722, 0.23193745, 0.1547803 , 0.2170

In [14]:
distances[2][0][:4]

array([[0.        , 0.30480243, 0.26305467, 0.26494669, 0.21971265,
        0.22069277, 0.28135243, 0.28088367, 0.28668284, 0.24997453,
        0.26228406, 0.17791516, 0.21812912, 0.17857343, 0.26509828,
        0.24238802, 0.24554417, 0.26920688, 0.22887032, 0.34231553,
        0.22420474, 0.23695224, 0.25661139, 0.26385183, 0.24911234,
        0.2240702 , 0.2253368 , 0.23998121, 0.23726722, 0.20359104,
        0.19208624, 0.25073903],
       [0.30480243, 0.        , 0.29192428, 0.30647428, 0.26313416,
        0.32293597, 0.26433925, 0.17595521, 0.26279374, 0.2925316 ,
        0.30394604, 0.28889745, 0.28857144, 0.28151082, 0.27876087,
        0.33059139, 0.30583188, 0.31404515, 0.30162316, 0.34382712,
        0.27688871, 0.25395032, 0.30879494, 0.2713653 , 0.24674861,
        0.27402544, 0.25282302, 0.29466678, 0.27435719, 0.27399647,
        0.26712398, 0.31663654],
       [0.26305467, 0.29192428, 0.        , 0.24231024, 0.26701302,
        0.30014194, 0.2947704 , 0.23011353, 0.2838

In [15]:
# To refresh my memory of what is what
print(str(schemas)[:87])
print(list(schemas.keys())[:3])
print(list(schemas['stand at attention'].keys()))
print(str(schemas['stand at attention']['group']))

{'stand at attention': {'image': {'V', 'CP', 'L', 'B'}, 'group': 1, 'embedding': [4.0, 
['stand at attention', 'stand out in several sports', 'to stand firm']
['image', 'group', 'embedding']
1


In [16]:
# dict(dict(list))
results = defaultdict(lambda: defaultdict(int))
# Load Groups clustered by gibbs 1994
results['gibbs_cluster'] = { expression: schemas[expression]['group']
                             for expression in texts }
print(str(results.items())[:200])

dict_items([('gibbs_cluster', {'stand at attention': 1, 'stand out in several sports': 1, 'to stand firm': 2, "don't stand for such treatment": 1, 'to stand the test of time': 1, 'united we stand': 2,


In [17]:
# Load gpt3 text completion calculation

n=0
def save_response(response):
    global n
    n=+1

    response_text = response.get("choices")[0]['text']
    with open(f'data/response_{n}', 'w') as file:
        file.write(response_text)

    request = { 'params' : params,
                'prompt' : prompt,
                'response' : response}
    with open(f'request_{n}.json', 'w') as json_file:
            json.dump(request, json_file, sort_keys=True, indent=4)

def generate_response(prompt, params):
    response = openai.Completion.create(
        engine=params['engine'],
        prompt=prompt,
        max_tokens=params['max_tokens'],
        temperature=params['temperature'],
        top_p=params['top_p'],
        frequency_penalty=params['frequency_penalty'],
        presence_penalty=params['presence_penalty'])
    save_response(response)
    return response

generate = False
# generate = True
if generate:
    with open('data/params.json') as json_file:
        params = json.load(json_file)
    with open('data/prompt') as file:
        prompt = file.read()
    response = generate_response(prompt, params)

with open('data/stand_request_04.csv') as file:
    read_csv = csv.reader(file, delimiter='\t')
    # remove header
    header = next(read_csv)
    gpt_groups = {expression: int(group) for expression, group in read_csv}

results['gpt3_gibbs_completion'] = gpt_groups

In [18]:
from sklearn.cluster import AgglomerativeClustering

In [19]:
cluster = AgglomerativeClustering(n_clusters=3,
                   affinity='precomputed', linkage='complete')

for (distance_array, distance_matrix), condition in zip(distances, conditions):
    cluster_prediction = cluster.fit_predict(distance_array)
    for i, prediction in enumerate(cluster_prediction):
        expression = distance_matrix.columns[i]
        results[condition][expression] = prediction


In [20]:
from scipy import stats
from itertools import combinations, permutations

cluster_permutations = [np.array(x) for x in permutations([1,2,3], 3)]
# for x in cluster_permutations: print(x)
# [1, 2, 3]
# [1, 3, 2]

permutation_combinations = [x for x in combinations(cluster_permutations, 2)]
# for x in permutation_combinations: print(x)
# ([1, 2, 3], [1, 3, 2])
# ([1, 2, 3], [2, 1, 3])

def replace_values(a, old_values, new_values):
    arr = np.empty(a.max()+1, dtype=new_values.dtype)
    arr[old_values] = new_values
    return arr[a]

def clean_values(a):
    arr = np.array(a)
    value_range = set(a)
    if not 0 in value_range:
        return arr, [ x for x in value_range ]
    old_values = [0, 1, 2]
    new_values = [1, 2, 3]
    arr = replace_values(arr, np.array(old_values), np.array(new_values))
    return arr, list(zip(old_values, new_values))

stat_results = list()

for a, b in permutations(results.keys(), 2):
    tps=list()
    group_a = list()
    group_b = list()
    for text in results[a].keys():
        if text in results[b].keys():
            group_b.append(results[b][text])
            group_a.append(results[a][text])

    group_a, value_remap_a = clean_values(group_a)
    group_b, value_remap_b = clean_values(group_b)

    for old_values, new_values in permutation_combinations:
        tps.append([stats.ttest_rel(group_a, group_b),
                    value_remap_a,
                    zip((value_remap_b, list(old_values))) ])
        group_b = replace_values(group_a, old_values, new_values)
    tp, value_remap_a, value_remap_b = max(tps, key=lambda x: x[0][0])
    stat_results.append([a,b,tp[0],tp[1],value_remap_a,value_remap_b])

In [21]:
p_df = pd.DataFrame(stat_results,
                    columns=['cond1', 'cond2', 't', 'p', 'value_remap_1', 'value_remap_2'])
p_matrix = pd.pivot_table(p_df, values='p', index='cond1', columns='cond2')
t_matrix = pd.pivot_table(p_df, values='t', index='cond1', columns='cond2')

p_df.to_csv('data/results_df.csv')
p_matrix.to_csv('data/results_p_matrix.csv')
t_matrix.to_csv('data/results_t_matrix.csv')

In [22]:
t_matrix

cond2,bert,gibbs_cluster,gpt3_ada,gpt3_babbage,gpt3_curie,gpt3_davinci,gpt3_gibbs_completion,human_label
cond1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
bert,,2.265491,3.30386,2.265491,2.265491,3.644957,2.265491,2.265491
gibbs_cluster,0.891556,,2.440669,0.891556,0.891556,2.805615,0.891556,0.891556
gpt3_ada,0.176866,0.176866,,0.176866,0.176866,0.891556,0.176866,0.176866
gpt3_babbage,5.637802,5.637802,5.637802,,5.637802,5.637802,5.637802,5.637802
gpt3_curie,5.299494,5.299494,5.299494,5.299494,,5.299494,5.299494,5.299494
gpt3_davinci,1.437591,1.437591,1.437591,1.437591,1.437591,,1.437591,1.437591
gpt3_gibbs_completion,5.357584,5.357584,6.338366,5.357584,5.357584,6.523138,,5.357584
human_label,2.674667,2.674667,4.051144,2.674667,2.674667,4.625404,2.674667,


In [23]:
p_matrix

cond2,bert,gibbs_cluster,gpt3_ada,gpt3_babbage,gpt3_curie,gpt3_davinci,gpt3_gibbs_completion,human_label
cond1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
bert,,0.030617,0.002412358,0.030617,0.030617,0.0009692861,0.030617,0.030617
gibbs_cluster,0.379499,,0.02057057,0.379499,0.379499,0.0085984,0.379499,0.379499
gpt3_ada,0.860764,0.860764,,0.860764,0.860764,0.3794992,0.860764,0.860764
gpt3_babbage,3e-06,3e-06,3.450532e-06,,3e-06,3.450532e-06,3e-06,3e-06
gpt3_curie,9e-06,9e-06,9.09866e-06,9e-06,,9.09866e-06,9e-06,9e-06
gpt3_davinci,0.160569,0.160569,0.1605686,0.160569,0.160569,,0.160569,0.160569
gpt3_gibbs_completion,8e-06,8e-06,4.702546e-07,8e-06,8e-06,2.795196e-07,,8e-06
human_label,0.011834,0.011834,0.0003168301,0.011834,0.011834,6.266547e-05,0.011834,
