In [90]:
from labels import classes
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from transformers import TFAutoModelForCausalLM, AutoTokenizer

In [2]:
model_checkpoint = 'sentence-transformers/paraphrase-distilroberta-base-v1'
model = SentenceTransformer(model_checkpoint)



In [52]:
classes_text = []
for vals in classes.values():
    classes_text += [key for key in vals['subclasses'].keys()]
classes_text = np.array(classes_text)

In [53]:
classes_embeddings = model.encode(classes_text, convert_to_numpy=True)

In [54]:
print(len(classes_text), classes_embeddings.shape)

124 (124, 768)


In [55]:
def convert(sentence_embedding: np.array, class_embeddings: np.array, top_n=5) -> np.array:
    similarities = np.array(util.cos_sim(sentence_embedding, class_embeddings)).reshape(-1,)
    top_n_indices = np.argsort(similarities)[::-1][0:top_n]

    return top_n_indices

In [None]:
text = 'Asiago and Blue are looking for friends.  We could add them to our group for a mega-group'
text_embedding = model.encode(text, convert_to_numpy=True)

indices = convert(text_embedding, classes_embeddings)
classes_text[indices]

array(['{player_names}, would you like to join our group?',
       'Add {player_names} to the group',
       'Add {player_names} to the group?', 'To make more friends',
       '{player_names} want to join'], dtype='<U64')

In [75]:
df = pd.read_csv('./data/labeled_data.csv')
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

df.head()

Unnamed: 0,text,label,submessage_label
0,Ill send 3 if you send 3 back?,0,17.0
1,perhaps,7,112.0
2,perhaps not,7,118.0
3,Sending 4 back to you this round,0,17.0
4,u and me dawg lets win this thing,0,9.0


In [76]:
class_text_to_label = {}

for vals in classes.values():
    for sub_message_key, sub_message_val in vals['subclasses'].items():
        class_text_to_label[sub_message_key] = sub_message_val

In [None]:
# TOP 5
n_correct = 0
errors = []

for i in range(len(df)):
    text, label = df.loc[df.index[i], ['text', 'submessage_label']]
    text_embedding = model.encode(text, convert_to_numpy=True)
    indices = convert(text_embedding, classes_embeddings)
    predicted_classes = classes_text[indices]
    predicted_labels = [class_text_to_label[pred] for pred in predicted_classes]
    n_correct += 1 if label in predicted_labels else 0
    if label not in predicted_labels:
        errors.append((text, predicted_classes))

print(n_correct / len(df))

0.48563218390804597


In [86]:
np.random.shuffle(errors)
for text, preds in errors[:5]:
    print(f'Original text message: {text}')
    print(f'Predicted conversions: {preds}\n')

Original text message: they will prolly do one more round of attack on moz and then move to asiago
Predicted conversions: ['Who should I attack?' 'Attack {player_names}' 'Next round'
 '{player_names} are plotting an attack against {player_names}'
 '{player_names} need to attack {player_names}']

Original text message: i got 7 for you, we chillin
Predicted conversions: ['Sounds good' 'Thanks' 'I messed up' "I haven't been receiving"
 'Good game']

Original text message: ok so
Predicted conversions: ['Yes' 'All good' 'Me too' 'No' 'True']

Original text message: Yea I see. Lets all take 6 from abinadi. Maybe hold the rest for protection
Predicted conversions: ['Leave the group?' 'All good' "Don't attack {player_names}"
 'Drop {player_names} from the group?' "Let's form a secret group"]

Original text message: okay
Predicted conversions: ['Yes' 'All good' 'No' 'True' 'No problem']



In [None]:
# TOP 7
n_correct = 0

for i in range(len(df)):
    text, label = df.loc[df.index[i], ['text', 'submessage_label']]
    text_embedding = model.encode(text, convert_to_numpy=True)
    indices = convert(text_embedding, classes_embeddings, top_n=7)
    predicted_classes = classes_text[indices]
    predicted_labels = [class_text_to_label[pred] for pred in predicted_classes]
    n_correct += 1 if label in predicted_labels else 0

print(n_correct / len(df))

0.5531609195402298


In [None]:
# TOP 10
n_correct = 0

for i in range(len(df)):
    text, label = df.loc[df.index[i], ['text', 'submessage_label']]
    text_embedding = model.encode(text, convert_to_numpy=True)
    indices = convert(text_embedding, classes_embeddings, top_n=10)
    predicted_classes = classes_text[indices]
    predicted_labels = [class_text_to_label[pred] for pred in predicted_classes]
    n_correct += 1 if label in predicted_labels else 0

print(n_correct / len(df))

0.639367816091954


In [None]:
# TOP 3
n_correct = 0

for i in range(len(df)):
    text, label = df.loc[df.index[i], ['text', 'submessage_label']]
    text_embedding = model.encode(text, convert_to_numpy=True)
    indices = convert(text_embedding, classes_embeddings, top_n=3)
    predicted_classes = classes_text[indices]
    predicted_labels = [class_text_to_label[pred] for pred in predicted_classes]
    n_correct += 1 if label in predicted_labels else 0

print(n_correct / len(df))

0.3850574712643678


In [None]:
# TOP 1
n_correct = 0

for i in range(len(df)):
    text, label = df.loc[df.index[i], ['text', 'submessage_label']]
    text_embedding = model.encode(text, convert_to_numpy=True)
    indices = convert(text_embedding, classes_embeddings, top_n=1)
    predicted_classes = classes_text[indices]
    predicted_labels = [class_text_to_label[pred] for pred in predicted_classes]
    n_correct += 1 if label in predicted_labels else 0

print(n_correct / len(df))

0.24856321839080459


In [112]:
gen_model_checkpoint = 'distilbert/distilgpt2'

gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_checkpoint)
gen_tokenizer.pad_token = gen_tokenizer.eos_token
gen_model = TFAutoModelForCausalLM.from_pretrained(gen_model_checkpoint)

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [108]:
def tokenize(text):
    model_inputs = gen_tokenizer(text, padding=True, truncation=True, max_length=128, add_special_tokens=False, return_tensors='tf')
    
    return model_inputs

In [109]:
text = df.loc[df.index[0], 'text']
text_embedding = model.encode(text, convert_to_numpy=True)
indices = convert(text_embedding, classes_embeddings, top_n=1)
predicted_classes = classes_text[indices]

print(text)
print(predicted_classes)

Ill send 3 if you send 3 back?
['{player_names} how much are you sending?']


In [None]:
prompt = f'Generate a short, human-like message (no more than one or two sentences), that is similar in meaning to the following text and is in the context of a game where players exchange popularity tokens: {predicted_classes[0]}'
prompt

'Generate a short, human-like message (no more than one or two sentences), that is similar in meaning to the following text and is in the context of a game where players exchange popularity tokens: {player_names} how much are you sending?'

In [141]:
tokenized_prompt = tokenize(prompt)
output = gen_model.generate(**tokenized_prompt, max_length=tokenized_prompt['input_ids'].shape[-1] + 30, do_sample=True, top_k=50)
decoded_output = gen_tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded_output[len(prompt):])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


 {player_username} how many times a player has already received a token? {player_name} or {username_password_password} how


In [147]:
n_correct, n_samples = 0, 100

for i in range(100):
    if (i + 1) % 10 == 0:
        print(f'{i + 1}%')
    text = df.loc[df.index[i], 'text']
    text_embedding = model.encode(text, convert_to_numpy=True)
    indices = convert(text_embedding, classes_embeddings, top_n=1)
    predicted_classes = classes_text[indices]
    best_class = predicted_classes[0]
    prompt = f'Generate a short, human-like message (no more than one or two sentences), that is similar in meaning to the following text and is in the context of a game where players exchange popularity tokens: {best_class}'
    tokenized_prompt = tokenize(prompt)
    output = gen_model.generate(**tokenized_prompt, max_length=tokenized_prompt['input_ids'].shape[-1] + 30, do_sample=True, top_k=5)
    decoded_output = gen_tokenizer.decode(output[0], skip_special_tokens=True)[len(prompt):]
    gen_embedding = model.encode(decoded_output, convert_to_numpy=True)
    indices = convert(gen_embedding, classes_embeddings, top_n=5)
    predicted_classes_new = classes_text[indices]
    n_correct += 1 if best_class in predicted_classes_new else 0

n_correct / n_samples

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


10%


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


20%


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


30%


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


40%


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


50%


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


60%


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


70%


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


80%


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


90%


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


100%


0.37