# This notebook

takes a word of interest and predicts features for each of its 5 prototypes, which are derived from clustered embeddings of the word's BNC tokens.

When it is run with a word, it produces a CSV of the BNC tokens and their clusters,
as well as a CSV of McRae feature predictions in tidy format
and a CSV of Buchanan feature predictions in tidy format.

In [3]:
import csv
import os
import pickle
import sys

# make the project packages importable before the project imports below
sys.path.append("../src/")
sys.path.append("../")

# project modules (resolved through the sys.path entries added above)
from bert import *
from feature_data import *
from multiprototype import *
from models import *
from utils import *

# explicit third-party imports are kept after the star imports so that these
# names always refer to the real libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import Lemma
from scipy.stats import pearsonr
from scipy.stats import spearmanr

# Notebook-wide pandas display settings: never truncate rows/columns, let the
# width adapt, centre column headers and show floats with 3 decimals.
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.width', None),
    ('display.colheader_justify', 'center'),
    ('display.precision', 3),
):
    pd.set_option(option_name, option_value)

In [4]:
# First priority: load the multi-prototype (mpro) vectors; the word of interest (e.g. 'apple') is looked up from them further down.

In [5]:
# Settings that select which pre-computed multi-prototype embedding file is read:
# the BERT layer and the number of clusters (prototypes) per word.
args = {'layer': 8, 'clusters': 5}

embedding_file = '../data/processed/multipro_embeddings/layer{}clusters{}.txt'.format(
    args['layer'], args['clusters']
)
embs = read_multiprototype_embeddings(
    embedding_file,
    layer=args['layer'],
    num_clusters=args['clusters'],
)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
Read in 12348 vectors of size 5 X 768


In [15]:
# Initialise BERT and load the trained feature-prediction models.
# NOTE: torch.load unpickles arbitrary Python objects -- only point these paths
# at model files you trust.
bert = BERTBase()

# Buchanan model. The PLSR variant is kept for reference; the FFNN variant is
# the one the Buchanan cells below are written for (they stack the centroid 5x).
#buchanan = torch.load('../trained_models/model.plsr.buchanan.allbuthomonyms.5k.300components.500max_iters')
buchanan = torch.load('../trained_models/model.ffnn.buchanan.allbuthomonyms.5k.50epochs.0.5dropout.lr1e-4.hsize300')

# McRae model. The McRae cells further down use `mcrae_model`, so it has to be
# loaded here; otherwise a fresh-kernel "Run All" stops with a NameError.
mcrae_model = torch.load('../trained_models/model.plsr.mc_rae_real.5k.100components.500max_iters')

INFO:pytorch_pretrained_bert.modeling:loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /Users/gabriellachronis/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
INFO:pytorch_pretrained_bert.modeling:extracting archive file /Users/gabriellachronis/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /var/folders/9m/vzvx58rs51v_x5nm620fz4xr0000gn/T/tmpcjl3lmv1
INFO:pytorch_pretrained_bert.modeling:Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 3

In [8]:
# Sandbox cell for poking at the loaded feature norms.
# Probes that have been tried here (kept for reference):
#   print(mcrae_model.feature_norms.get_features('crane_(bird)'))
#   print(mpro_model.feature_norms.get_features('mouse_computer'))
#   mcrae_model.feature_norms.feature_map.objs_to_ints.keys()
#   buchanan.feature_norms.feature_map.objs_to_ints.keys()

# size of the Buchanan feature norms (shown as the cell output)
buchanan.feature_norms.length


3981

In [19]:
# Word of interest for this run (e.g. 'apple', 'fire', 'ahead').
# `word` and `word_emb` are reused by the prediction and cluster cells below.
word = 'fire'

# The later cells iterate over word_emb one prototype vector at a time
# (presumably one centroid per cluster -- TODO confirm against embs.get_embedding).
word_emb = embs.get_embedding(word)

In [25]:
# Buchanan predictions -> tidy (long-format) table with one row per
# (prototype, feature) pair, written to CSV.

# Feature names in model-output order, so position i labels prediction[i].
feature_map = buchanan.feature_norms.feature_map
feature_labels = [str(feature_map.get_object(idx)) for idx in range(len(feature_map))]
print(feature_labels[:10])

# Parallel column lists, filled one predicted value at a time.
prototype_ids, words, features, values = [], [], [], []

for prototype_id, centroid in enumerate(word_emb):
    # Per the original note this stacking is only needed for the ffnn model:
    # the centroid is repeated 5 times to form a dummy multi-vector input.
    stacked_centroid = np.array([centroid] * 5)
    print(stacked_centroid.shape)

    # The model hands the word back together with the prediction; `word` is
    # rebound here and used for the output filename below.
    word, prediction = buchanan.predict_from_single_context_vector(word, stacked_centroid)

    for feature_idx, value in enumerate(prediction):
        prototype_ids.append(prototype_id)
        words.append(word)
        features.append(feature_labels[feature_idx])
        values.append(value)

# from_records on a dict orders the columns alphabetically; the misspelt
# "feauture" key is kept as-is because it is the column name in the written CSV.
tidy_columns = {
    "prototype_id": prototype_ids,
    "word": words,
    "feauture": features,
    "value": values,
}
tidy_df = pd.DataFrame.from_records(tidy_columns)
tidy_df.to_csv('../data/processed/prototype_predictions_buchanan_' + word + '_ffnn_tidy.csv')

print(len(tidy_df))
tidy_df.head(5)

['desert', 'give', 'leave', 'up', 'withdraw', 'belly', 'body', 'middle', 'muscle', 'organ']
(5, 768)
(5, 768)
(5, 768)
(5, 768)
(5, 768)
19905


Unnamed: 0,feauture,prototype_id,value,word
0,desert,0,0.009,fire
1,give,0,-0.207,fire
2,leave,0,2.021,fire
3,up,0,-0.784,fire
4,withdraw,0,-0.013,fire


In [18]:
# Top-10 predicted Buchanan features, printed once per prototype of `word`.
# NOTE(review): unlike the tidy-export cell, the raw centroid is passed here
# without stacking it 5 times -- confirm this is what the loaded model expects.
for prototype_vector in word_emb:
    top_features = buchanan.predict_top_n_features_from_single_context_vector(
        word, 10, prototype_vector, output_vec=None
    )
    print(top_features)

('fire', ['act', 'wood', 'red', 'cook', 'heat', 'light', 'danger', 'hot', 'fire', 'burn'])
('fire', ['red', 'hurt', 'danger', 'cook', 'heat', 'destroy', 'light', 'hot', 'fire', 'burn'])
('fire', ['hurt', 'light', 'loud', 'metal', 'human', 'fire', 'kill', 'danger', 'weapon', 'act'])
('fire', ['food', 'warm', 'danger', 'cook', 'wood', 'heat', 'light', 'fire', 'hot', 'burn'])
('fire', ['destroy', 'heat', 'red', 'burn', 'person', 'light', 'fire', 'cook', 'danger', 'hot'])


In [8]:
# McRae predictions -> tidy (long-format) table with one row per
# (prototype, feature) pair, written to CSV.
# Requires `mcrae_model` to have been loaded in the model-initialisation cell.

# Feature names in model-output order, so position i labels prediction[i].
feature_map = mcrae_model.feature_norms.feature_map
feature_labels = [str(feature_map.get_object(idx)) for idx in range(len(feature_map))]
print(feature_labels[:10])

# Parallel column lists, filled one predicted value at a time.
prototype_ids, words, features, values = [], [], [], []

for prototype_id, centroid in enumerate(word_emb):
    # The model hands the word back together with the prediction; `word` is
    # rebound here and used for the output filename below.
    word, prediction = mcrae_model.predict_from_single_context_vector(word, centroid)

    for feature_idx, value in enumerate(prediction):
        prototype_ids.append(prototype_id)
        words.append(word)
        features.append(feature_labels[feature_idx])
        values.append(value)

# from_records on a dict orders the columns alphabetically; the misspelt
# "feauture" key is kept as-is because it is the column name in the written CSV.
tidy_columns = {
    "prototype_id": prototype_ids,
    "word": words,
    "feauture": features,
    "value": values,
}
tidy_df = pd.DataFrame.from_records(tidy_columns)
tidy_df.to_csv('../data/processed/prototype_predictions_mcrae_' + word + '_tidy.csv')

print(len(tidy_df))
tidy_df.head(5)

['a_musical_instrument', 'has_keys', 'requires_air', 'associated_with_polkas', 'has_buttons', 'used_by_moving_bellows', 'inbeh_-_produces_music', 'is_loud', 'worn_on_chest', 'beh_-_flies']
12630


Unnamed: 0,feauture,prototype_id,value,word
0,a_musical_instrument,0,-0.151,fire
1,has_keys,0,-0.077,fire
2,requires_air,0,-0.037,fire
3,associated_with_polkas,0,-0.021,fire
4,has_buttons,0,-0.062,fire


In [9]:
# Top-10 predicted McRae features, printed once per prototype of `word`.
# Requires `mcrae_model` to have been loaded in the model-initialisation cell.
for prototype_vector in word_emb:
    top_features = mcrae_model.predict_top_n_features_from_single_context_vector(
        word, 10, prototype_vector, output_vec=None
    )
    print(top_features)

('fire', ['found_in_kitchens', 'a_weapon', 'is_dangerous', 'is_hot', 'inbeh_-_produces_heat', 'an_appliance', 'is_round', 'requires_gas', 'used_for_cooking', 'made_of_metal'])
('fire', ['different_sizes', 'is_round', 'inbeh_-_produces_heat', 'is_fast', 'an_appliance', 'made_of_metal', 'is_dangerous', 'is_hot', 'used_for_cooking', 'requires_gas'])
('fire', ['inbeh_-_fires', 'has_a_trigger', 'used_by_the_police', 'is_dangerous', 'used_for_killing', 'used_for_war', 'a_weapon', 'made_of_metal', 'is_loud', 'used_for_hunting'])
('fire', ['is_electrical', 'used_for_baking', 'is_round', 'inbeh_-_produces_heat', 'found_in_kitchens', 'is_hot', 'an_appliance', 'used_for_cooking', 'requires_gas', 'made_of_metal'])
('fire', ['requires_gas', 'used_for_cooking', 'beh_-_flies', 'has_feathers', 'a_bird', 'is_electrical', 'is_loud', 'an_appliance', 'is_dangerous', 'made_of_metal'])


In [None]:
# You need to look at the words that form these clusters... do they match with the description?

In [22]:
# NOTE(review): pd.option_context(...) only builds a context manager. Because it
# is not used in a `with` statement around the code that displays a frame, none
# of the options below are applied and the notebook-wide pd.set_option values
# from the import cell stay in effect. Wrap the display call in
# `with pd.option_context(...):` if these settings are actually wanted.
pd.option_context('display.max_rows', 5,
                       'display.max_columns', None,
                       'display.width', 1000,
                       'display.precision', 3,
                       'display.colheader_justify', 'center')

def read_clusters(word, base_dir='../data/interim/multipro_embeddings/'):
    """Load the pickled clustering results for `word` into a DataFrame.

    The pickle at ``<base_dir>/<word>/analysis_results/clusters.p`` holds a list
    of lists of dicts, each dict describing one cluster:

        {'word': tokens[0]['word'],
         'layer': layer,
         'k_clusters': k,
         'cluster_id': cluster_index,
         'centroid': cluster_centroids[cluster_index],
         'sentence_uids': sentence_uids,
         'within_cluster_variance': cluster_variance}

    Parameters
    ----------
    word : str
        Word whose cluster file is read.
    base_dir : str, optional
        Directory containing one sub-directory per word. Defaults to the
        project's interim multi-prototype embedding directory.

    Returns
    -------
    pandas.DataFrame
        One row per cluster, with the columns listed in `columns` below.

    Raises
    ------
    FileNotFoundError
        If no cluster pickle exists for `word`.
    """
    cluster_path = os.path.join(base_dir, word, 'analysis_results', 'clusters.p')

    # SECURITY: pickle.load can execute arbitrary code -- only read cluster files
    # produced by this project. The `with` block guarantees the handle is closed.
    with open(cluster_path, 'rb') as cluster_file:
        nested_clusters = pickle.load(cluster_file)

    # Flatten the nested lists; `if sublist` skips the None entries left for
    # cluster sizes we did not have enough tokens for.
    cluster_records = [item for sublist in nested_clusters if sublist for item in sublist]

    columns = ['word', 'layer', 'k_clusters', 'cluster_id', 'centroid', 'sentence_uids', 'within_cluster_variance', 'average_pairwise_token_distance']
    return pd.DataFrame.from_records(cluster_records, columns=columns)


def get_sentences_in_cluster(sentence_uids, word, base_dir='../data/interim/multipro_embeddings/'):
    """Return the BNC sentences of `word` whose index is in `sentence_uids`.

    Reads ``<base_dir>/<word>/BNC_tokens.csv``, a header-less tab-separated file
    whose columns are read as word, sentence, tag, bnc_index.

    Parameters
    ----------
    sentence_uids : iterable
        BNC indices belonging to one cluster. They are compared against the
        `bnc_index` column, which the csv reader yields as strings.
    word : str
        Word whose token file is read.
    base_dir : str, optional
        Directory containing one sub-directory per word. Defaults to the
        project's interim multi-prototype embedding directory.

    Returns
    -------
    list of str
        The `sentence` field of every matching row, in file order.

    Raises
    ------
    FileNotFoundError
        If no token file exists for `word`.
    """
    # A hashed lookup makes each membership test O(1) instead of a list scan;
    # fall back to the original container if its items are not hashable.
    try:
        wanted_uids = frozenset(sentence_uids)
    except TypeError:
        wanted_uids = sentence_uids

    token_path = os.path.join(base_dir, word, 'BNC_tokens.csv')
    fieldnames = ['word', 'sentence', 'tag', 'bnc_index']

    # newline='' lets the csv module do its own newline handling, as its
    # documentation recommends for file objects.
    with open(token_path, mode="r", newline="") as infile:
        reader = csv.DictReader(infile, delimiter="\t", fieldnames=fieldnames)
        return [row['sentence'] for row in reader if row['bnc_index'] in wanted_uids]
        
# pickle_file = '../data/interim/multipro_embeddings/'+word+'/analysis_results/clusters.p'
# data = pickle.load(pickle_file)




print(word)

# Keep only the clustering run that matches the prototypes loaded into `embs`:
# the layer and number of clusters come from `args` rather than being repeated
# as literals, so this table cannot drift from the embeddings in use.
clusters = read_clusters(word)
clusters = clusters[
    (clusters['k_clusters'] == args['clusters']) & (clusters['layer'] == args['layer'])
]

# For each remaining cluster, collect the BNC sentences whose uid belongs to it.
words = []
cluster_ids = []
sentences = []

for row_label, cluster in clusters.iterrows():
    cluster_sentences = get_sentences_in_cluster(cluster['sentence_uids'], word)
    words.extend([word] * len(cluster_sentences))
    cluster_ids.extend([cluster['cluster_id']] * len(cluster_sentences))
    sentences.extend(cluster_sentences)

# One row per (cluster, sentence); from_records on a dict orders the columns
# alphabetically (cluster_id, sentence, word).
cluster_data = pd.DataFrame.from_records(
    {
        "word": words,
        "cluster_id": cluster_ids,
        "sentence": sentences,
    }
)
cluster_data.to_csv("../data/processed/bnc_clusters_for_" + word + ".csv")
cluster_data.head(48)

fire


Unnamed: 0,cluster_id,sentence,word
0,0,"He emerged from the flame unscathed , transfor...",fire
1,0,With a gasp she felt his palm close firmly rou...,fire
2,0,He 'd say about four : ‘ We 'll fire the oven . ’,fire
3,0,Skogström invited Yachting World on a whirlwin...,fire
4,0,But she is conscious of being the first contac...,fire
5,0,the organ there was a you know where you put y...,fire
6,0,If the degree of moral outrage is widespread a...,fire
7,0,"Even inside , the shards flashed and spat with...",fire
8,0,as an unreeling tongue of fire,fire
9,0,"In Africa , it is argued that man was using fi...",fire
