Large bank of prompts (T of which will carry labels)

In [3]:
import pandas as pd

# Load the hotel-review dataset from the working directory
# (comma-separated, which is the pandas default delimiter).
df = pd.read_csv('./Hotel_Reviews.csv')

In [20]:
import numpy as np

# Pool the negative and positive review texts into one unlabeled bank.
bank = list(df['Negative_Review']) + list(df['Positive_Review'])
print(len(bank))
print(bank[:2])

# keep only 1000 for proof of concept
np.random.shuffle(bank)
bank = bank[:1000]

# randomly choose T (10) of them and assign labels
T = 10
# Draw all labeled positions in one call WITHOUT replacement: sampling one
# index at a time can pick the same position twice, which silently leaves
# fewer than T labeled examples in the dict.
labeled_idxs = np.random.choice(len(bank), size=min(T, len(bank)), replace=False)
# Each chosen position gets label 0 with probability 0.4, otherwise label 1.
idx2label = {
    int(idx): (0 if np.random.uniform() < 0.4 else 1)
    for idx in labeled_idxs
}

1031476
[' I am so angry that i made this post available via all possible sites i use when planing my trips so no one will make the mistake of booking this place I made my booking via booking com We stayed for 6 nights in this hotel from 11 to 17 July Upon arrival we were placed in a small room on the 2nd floor of the hotel It turned out that this was not the room we booked I had specially reserved the 2 level duplex room so that we would have a big windows and high ceilings The room itself was ok if you don t mind the broken window that can not be closed hello rain and a mini fridge that contained some sort of a bio weapon at least i guessed so by the smell of it I intimately asked to change the room and after explaining 2 times that i booked a duplex btw it costs the same as a simple double but got way more volume due to the high ceiling was offered a room but only the next day SO i had to check out the next day before 11 o clock in order to get the room i waned to Not the best way t

In [21]:
# Display the sampled {bank index: label} mapping.
idx2label

{863: 1,
 993: 0,
 865: 0,
 315: 1,
 259: 0,
 245: 1,
 685: 1,
 132: 1,
 165: 1,
 896: 0}

Embed every instance into a GloVe embedding space

In [25]:
from zeugma.embeddings import EmbeddingTransformer
import numpy as np

# Embed every sentence of the bank with the 'glove' transformer.
glove = EmbeddingTransformer('glove')
raw_embeds = glove.transform(bank)

# Scale each row to unit L2 norm. A row with zero norm divides 0 by 0 and
# becomes NaN, so those entries are reset to 0 afterwards.
row_norms = np.linalg.norm(raw_embeds, axis=-1, keepdims=True)
unit_embeds = raw_embeds / row_norms
bank_embeds = np.where(np.isnan(unit_embeds), 0, unit_embeds)
print(bank_embeds.shape)


(1000, 25)


  


Given any embedding, fetch its neighborhood (cosine sim thresholding)

In [61]:
def fetch_neighborhood(all_embeds, src_idx, min_cos_sim, verbose=False, texts=None):
    """Return the indices of all rows whose cosine similarity to one row is high enough.

    Args:
        all_embeds: 2-D array-like of embeddings, one row per instance.
        src_idx: Row index of the source instance. Negative indices count
            from the end, as with normal Python indexing.
        min_cos_sim: Inclusive lower bound on cosine similarity for a row to
            count as a neighbor.
        verbose: When true, print the source sentence and the neighbor
            sentences.
        texts: Optional sequence of sentences aligned with ``all_embeds`` and
            used only for verbose printing. Defaults to the module-level
            ``bank``.

    Returns:
        A list of row indices (ascending, source row excluded) whose cosine
        similarity to the source row is >= ``min_cos_sim``.

    Raises:
        IndexError: If ``src_idx`` is outside the range of ``all_embeds``.
    """
    embeds = np.asarray(all_embeds)
    n_rows = embeds.shape[0]
    if not -n_rows <= src_idx < n_rows:
        raise IndexError(f"src_idx {src_idx} is out of range for {n_rows} embeddings")
    # Normalise negative indices up front: a slice such as embeds[-1:0] is
    # empty, and the self-exclusion below compares against positive indices.
    src_idx = src_idx % n_rows

    if verbose:
        if texts is None:
            texts = bank  # fall back to the notebook-level sentence bank
        print(f"src sentence: {texts[src_idx]}")

    # Cosine similarity = dot product of L2-normalised rows. All-zero rows
    # keep a divisor of 1 so they stay zero and get similarity 0 instead of NaN.
    norms = np.linalg.norm(embeds, axis=1)
    safe_norms = np.where(norms == 0, 1.0, norms)
    unit_embeds = embeds / safe_norms[:, None]
    sims = unit_embeds @ unit_embeds[src_idx]

    mask = sims >= min_cos_sim
    mask[src_idx] = False  # a point is never its own neighbor
    neighborhood_idxs = list(np.flatnonzero(mask))
    if verbose: print(f"top dst sentences: {[texts[idx] for idx in neighborhood_idxs]}")
    return neighborhood_idxs

# Demo: show the neighbors of bank sentence #10 at a 0.95 similarity cutoff.
fetch_neighborhood(
    all_embeds=bank_embeds,
    src_idx=10,
    min_cos_sim=0.95,
    verbose=True,
)

src sentence:  same menu of breakfast buffet
top dst sentences: [' Friendly staff and lovely breakfast ', ' Price of food and drink ', ' Very nice staff blinds on the windows nice breakfast with good coffee Italian perrfect locarion for weekend ', ' The breakfast buffet was very good ', ' The hotels restaurant for the evening meal', ' No kettle for coffee or tea in room ', ' Cost of breakfast', ' Breakfast is poor Cold coffee stale croissants Not to mention the hot buffet ', ' Champagne for breakfast ', ' Great location good breakfast nice roof bar ']


[42, 78, 89, 126, 197, 332, 410, 542, 578, 609]

If we want to curve-fit around label=1, we combine all 1-neighborhoods and exclude points overlapping with 0-neighborhoods (or vice versa)

In [62]:
def curve_fit(all_embeds, idx2label, label_of_interest, min_cos_sim):
    """Collect neighbors of the seeds with one label, minus anything tied to the other label.

    Args:
        all_embeds: Embeddings passed through to ``fetch_neighborhood``.
        idx2label: Mapping from row index to a binary label (0 or 1) for the
            labeled seed instances.
        label_of_interest: The label (0 or 1) whose neighborhoods are kept.
        min_cos_sim: Cosine-similarity threshold passed through to
            ``fetch_neighborhood``.

    Returns:
        One list per seed carrying ``label_of_interest`` (in ``idx2label``
        iteration order), holding that seed's neighbor indices after removing
        every index that is an opposite-label seed or lies in an
        opposite-label seed's neighborhood.
    """
    # Labels are binary, so the opposite label is the complement.
    composite_label = 1 - label_of_interest
    idxs_interest = [idx for idx, label in idx2label.items() if label == label_of_interest]
    idxs_composite = [idx for idx, label in idx2label.items() if label == composite_label]

    print(f'got {len(idxs_interest)} examples with labels={label_of_interest} and {len(idxs_composite)} examples with label={composite_label}')

    interest_nbrhds = [fetch_neighborhood(all_embeds, idx, min_cos_sim) for idx in idxs_interest]
    composite_nbrhds = [fetch_neighborhood(all_embeds, idx, min_cos_sim) for idx in idxs_composite]

    # fetch_neighborhood leaves each seed out of its own neighborhood, so the
    # opposite-label seeds are added explicitly; otherwise a point known to
    # carry the other label could be returned as a candidate.
    # A set keeps the membership tests below O(1).
    excluded_idxs = set(idxs_composite)
    for nbrhd in composite_nbrhds:
        excluded_idxs.update(nbrhd)

    # return idxs of interest still grouped by cluster
    return [[idx for idx in nbrhd if idx not in excluded_idxs] for nbrhd in interest_nbrhds]
    
    

In [65]:
# Demo: neighborhoods of the label-1 seeds at a 0.92 similarity cutoff.
curve_fit(
    all_embeds=bank_embeds,
    idx2label=idx2label,
    label_of_interest=1,
    min_cos_sim=0.92,
)

got 6 examples with labels=1 and 4 examples with label=0


[[11,
  142,
  197,
  209,
  240,
  476,
  602,
  626,
  672,
  778,
  792,
  894,
  906,
  957,
  967,
  969],
 [11,
  142,
  154,
  197,
  201,
  209,
  240,
  458,
  476,
  563,
  602,
  626,
  672,
  778,
  792,
  894,
  906,
  967,
  969],
 [32, 75, 149, 456, 527, 560, 630, 648, 787, 801, 906, 949, 957],
 [],
 [134, 209, 240, 476, 602, 626, 778, 894, 967],
 []]