Function to get examples by clustering embeddings

In [1]:
import os
import re

import pandas as pd
import torch
from transformers import BertTokenizer, BertModel

def get_bert_embeddings(texts, model, tokenizer, batch_size=None):
    """
    Compute one embedding vector per text from the model's final hidden layer.

    Each text is represented by the final-layer hidden state at sequence
    position 0 (presumably the [CLS] token for BERT-style tokenizers).

    Parameters:
    texts (list[str] or str): The text(s) to embed.
    model (torch.nn.Module): Called as ``model(**inputs, output_hidden_states=True)``;
        its result must expose ``hidden_states``. The model is moved to the
        GPU when one is available.
    tokenizer (callable): Called with the texts plus ``return_tensors='pt'``,
        padding and truncation (max 512 tokens); must return a mapping of tensors.
    batch_size (int or None): If given, the texts are encoded in chunks of this
        size to bound memory use. ``None`` (default) encodes everything in a
        single forward pass.

    Returns:
    torch.Tensor: One row per text, located on the device used for inference.

    Raises:
    ValueError: If ``batch_size`` is given and smaller than 1.
    """
    if batch_size is not None and batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer or None, got {batch_size!r}")

    # Check if GPU is available and move model to GPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # A bare string is a single text and must not be sliced into characters.
    if batch_size is None or isinstance(texts, str):
        batches = [texts]
    else:
        batches = [texts[start:start + batch_size] for start in range(0, len(texts), batch_size)]
        # Keep at least one batch so that empty input is handled by the tokenizer itself.
        batches = batches or [texts]

    # Dropout must be disabled for reproducible embeddings; the caller's
    # train/eval mode is restored afterwards.
    was_training = model.training
    model.eval()
    chunks = []
    try:
        with torch.no_grad():
            for batch in batches:
                inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=512)
                inputs = {key: value.to(device) for key, value in inputs.items()}
                outputs = model(**inputs, output_hidden_states=True)
                # Last layer, first token position of every sequence.
                chunks.append(outputs.hidden_states[-1][:, 0, :])
    finally:
        model.train(was_training)

    embeddings = chunks[0] if len(chunks) == 1 else torch.cat(chunks, dim=0)
    print(embeddings.shape)

    return embeddings

from sklearn.decomposition import PCA

def get_pca_scores(texts, model, tokenizer, n_components=2):
    """
    Embed the texts and project the embeddings onto principal components.

    Parameters:
    texts: The texts to embed; forwarded to get_bert_embeddings.
    model: The model forwarded to get_bert_embeddings.
    tokenizer: The tokenizer forwarded to get_bert_embeddings.
    n_components (int): Number of principal components to keep.

    Returns:
    numpy.ndarray: The PCA scores returned by PCA.fit_transform, one row per text.
    """
    # The embeddings are a torch tensor that may live on the GPU; scikit-learn
    # needs a host-side NumPy array.
    embedding_matrix = get_bert_embeddings(texts, model, tokenizer).cpu().numpy()

    reducer = PCA(n_components=n_components)
    return reducer.fit_transform(embedding_matrix)

from sklearn.cluster import KMeans

def cluster_pca_components(pca_scores, n_clusters=2):
    """
    Assign every row of the PCA scores to a KMeans cluster.

    Parameters:
    pca_scores (numpy.ndarray): The PCA components to cluster.
    n_clusters (int): The number of clusters to form.

    Returns:
    numpy.ndarray: The cluster labels for each point.
    """
    # The random state is fixed at 42, as in the rest of this notebook's clustering.
    return KMeans(n_clusters=n_clusters, random_state=42).fit_predict(pca_scores)

from transformers import AutoTokenizer, AutoModel


def get_examples(task: str, n_shot: int, n_clusters: int=3, n_pca_components=10, path="datasets/"):
    """
    Build few-shot example groups from the positively labelled tweets of a task.

    The tweets with ``label == 1`` are read from ``<path>/<task>.csv``, embedded
    with ``google-bert/bert-base-uncased``, reduced with PCA and clustered with
    KMeans. Every cluster is then cut into consecutive groups of ``n_shot``
    tweets; a trailing group with fewer than ``n_shot`` tweets is dropped.

    Parameters:
    task (str): Either "irony" or "sarcasm"; selects the CSV file.
    n_shot (int): Number of tweets per example group (must be >= 1).
    n_clusters (int): Number of KMeans clusters to form and iterate over.
    n_pca_components (int): Number of PCA components used before clustering.
    path (str): Directory containing the CSV files (with or without a
        trailing separator).

    Returns:
    list: For ``n_shot > 1`` a list of lists of tweets, ordered by cluster id.
        For ``n_shot == 1`` the flat list of all positive tweets is returned
        without any clustering.
        NOTE(review): the flat shape differs from the list-of-lists returned
        otherwise; callers that index ``examples[0]`` get a single string in
        that case - confirm whether this is intended.

    Raises:
    ValueError: If ``task`` is not supported or ``n_shot`` is smaller than 1.
    """
    supported_tasks = ("irony", "sarcasm")
    if task not in supported_tasks:
        raise ValueError(f"Unknown task {task!r}; expected one of {supported_tasks}")
    if n_shot < 1:
        # A non-positive group size would never advance through a cluster.
        raise ValueError(f"n_shot must be a positive integer, got {n_shot!r}")

    data = pd.read_csv(os.path.join(path, f"{task}.csv"))
    tweets = data.loc[data['label'] == 1, 'tweet'].tolist()

    if n_shot == 1:
        return tweets

    # The tokenizer and model are loaded on every call.
    base_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
    base_model = AutoModel.from_pretrained("google-bert/bert-base-uncased")
    pca_scores = get_pca_scores(tweets, base_model, base_tokenizer, n_pca_components)
    cluster_ids = cluster_pca_components(pca_scores, n_clusters=n_clusters)

    examples = []
    # Iterate over every requested cluster, not a fixed number of them.
    for cluster_id in range(n_clusters):
        members = [tweet for tweet, label in zip(tweets, cluster_ids) if label == cluster_id]
        # Only complete groups of n_shot tweets are kept.
        usable = len(members) - len(members) % n_shot
        examples.extend(members[start:start + n_shot] for start in range(0, usable, n_shot))

    return examples

In [2]:
import getpass

# Prompt for the GitHub personal access token without echoing it in the notebook
token = getpass.getpass('Enter your GitHub PAT: ')

# Owner and name of the repository that is cloned below
username = "m-ajer"
repo = "sarcasm"

# Set the git author identity; --global writes it to the user-level git config
!git config --global user.email "florijan.sandalj@gmail.com"
!git config --global user.name "florijan127"

# NOTE(review): the token is interpolated into the clone URL, so it is passed on
# the shell command line and is presumably kept as the remote URL of the clone -
# confirm, and consider a credential helper instead.
!git clone https://{username}:{token}@github.com/{username}/{repo}.git

Enter your GitHub PAT: ··········
Cloning into 'sarcasm'...
remote: Enumerating objects: 113, done.[K
remote: Counting objects: 100% (113/113), done.[K
remote: Compressing objects: 100% (80/80), done.[K
remote: Total 113 (delta 38), reused 98 (delta 27), pack-reused 0 (from 0)[K
Receiving objects: 100% (113/113), 1.56 MiB | 6.15 MiB/s, done.
Resolving deltas: 100% (38/38), done.


In [3]:
# import shutil

# shutil.move("output", f"/content/{repo}/output_data")


In [4]:
# %cd /content/{repo}

# !git add .
# !git commit -m "Test for adding changes to git from colab"
# !git push origin main  # or 'master' if that's your branch


Check if get_examples works

In [5]:
# Smoke test: build 3-shot irony example groups from the datasets folder of the cloned repo
print(get_examples("irony", 3, path="/content/sarcasm/datasets/"))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

torch.Size([175, 768])
[['was not aware that Crocs were appropriate business casual attire.', 'gaslight gatekeep girl boss', "The fact I nearly froze on my way to work this morning is definitely a sign it's not summer anymore #freezing"], ['How is it possible that Sly Stallone’s ‘Cobra’ is both the best and the worst movie ever made?', 'absolutely love waking up to the fire alarm at 7 am 😍', 'Can’t wait for those $2000 checks to go out immediately'], ['Hot dog eating contest as a form of assisted suicide', 'Ah gotta love that Friday morning burnt out feeling 🌄', 'Really happy that the weather has stayed like this for the whole weekend'], ["help! i'm being haunted by dead people! (my dissertations)", 'Efy is great', 'if i saw a capybara in person id probably throw up'], ['I love a Monday morning so glad the weekends over!', 'wearing a cute jumpsuit is all fun and games until you have to go to the bathroom', 'Good morning September... Hello tonsillitis :('], ['Anyone else hear some like 

In [6]:
!pip install replicate


Collecting replicate
  Downloading replicate-1.0.4-py3-none-any.whl.metadata (29 kB)
Downloading replicate-1.0.4-py3-none-any.whl (48 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.0/48.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: replicate
Successfully installed replicate-1.0.4


In [7]:
import getpass
import os

# `getpass` is imported as a module (not `from getpass import getpass`) so the
# name keeps referring to the module; the earlier cell that calls
# getpass.getpass(...) therefore still works when it is re-run after this one.
REPLICATE_API_TOKEN = getpass.getpass('Enter your Replicate API token: ')

# Exported through the environment; presumably this is where the `replicate`
# client looks for the token.
os.environ["REPLICATE_API_TOKEN"] = REPLICATE_API_TOKEN


··········


In [17]:


import replicate

def _run_replicate(model_ref, prompt, temperature, max_tokens=2048):
    """Run `model_ref` through replicate.run and return its output joined into one string."""
    output = replicate.run(
        model_ref,
        input={
            "prompt": prompt,
            "max_tokens": max_tokens,
            "temperature": temperature})
    # The result is iterated as string pieces, which are concatenated here.
    return "".join(output)


def llama2(prompt, temperature=0.0, input_print=True):
    """
    Query "meta/llama-2-7b-chat" on Replicate.

    Parameters:
    prompt (str): The prompt sent to the model.
    temperature (float): Sampling temperature passed to the model.
    input_print (bool): Currently unused; kept so existing calls keep working.

    Returns:
    str: The complete model output.
    """
    return _run_replicate("meta/llama-2-7b-chat", prompt, temperature)


def llama3_8b(prompt, temperature=1.0):
    """
    Query "meta/meta-llama-3-8b-instruct" on Replicate.

    Parameters:
    prompt (str): The prompt sent to the model.
    temperature (float): Sampling temperature passed to the model.

    Returns:
    str: The complete model output.
    """
    return _run_replicate("meta/meta-llama-3-8b-instruct", prompt, temperature)


def llama3_70b(prompt, temperature=1.0):
    """
    Query "meta/meta-llama-3-70b-instruct" on Replicate.

    Parameters:
    prompt (str): The prompt sent to the model.
    temperature (float): Sampling temperature passed to the model.

    Returns:
    str: The complete model output.
    """
    return _run_replicate("meta/meta-llama-3-70b-instruct", prompt, temperature)


Prompting function

In [9]:
def get_user_prompt(examples, number:int, include_definition:bool, include_style:bool):
    """
    Assemble the user prompt that asks the model for ironic tweets.

    Parameters:
    examples (iterable): Example tweets; they are listed in the prompt as
        "1. ...", "2. ...", one per line.
    number (int): How many tweets the prompt asks for.
    include_definition (bool): Whether to insert the definition of irony.
    include_style (bool): Whether to insert the Twitter style guidance.

    Returns:
    str: The complete prompt text.
    """
    example_string = "\n".join(
        f"{position}. {text}" for position, text in enumerate(examples, start=1)
    )

    # The definition ends with a newline followed by eight spaces; that
    # whitespace is part of the prompt text and is kept as is.
    definition = (
        "\nIrony is a rhetorical device where the intended meaning is opposite to the literal meaning, often used to mock, criticize, or highlight contradictions in a humorous or satirical way. Ironic tweets contradict the state of the affairs but are not obviously critical toward any particular addressee.\n        "
        if include_definition
        else ''
    )

    style = (
        "\nThe Tweets should be written in unstructured Twitter style — no formal grammar, proper capitalization, or structured sentences required. Focus on mimicking the style of the given tweet examples, such as mixing formal and informal grammar, structured and unstructured text, and occasionally using emojis and URLs. Pay attention to syntactic properties like ellipses and punctuation. Minimize the use of interjections and conversational phrases (e.g., 'love', 'nothing like', 'just', 'woke up', 'amazing', etc.). "
        if include_style
        else ''
    )

    return (
        f"Your task is to generate {number} ironic Tweets. {definition}{style} \n"
        "Separate each new instance with ordinal numbers. \n\n"
        f"Examples: \n{example_string} "
    )


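Functions that extract the numbered tweets from a model response and run the augmentation over every parameter combination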

In [25]:
from itertools import product

def get_tweets_from_output(output:str):
    """
    Extract the numbered tweets from a model response.

    Every line that starts with an ordinal such as "1.", "12." or "3)" is
    treated as one tweet; all other lines (introductions, blank lines) are
    ignored. Any ordinal is accepted, so responses with more than three
    tweets are parsed completely.

    Parameters:
    output (str): The raw text returned by the model.

    Returns:
    list[str]: The tweet texts without their ordinal, in order of appearance.
        Numbered lines without any text are skipped.
    """
    numbered_item = re.compile(r"\s*\d+[.)]\s*(.*)")

    tweets = []
    for line in output.split("\n"):
        match = numbered_item.match(line)
        if match is None:
            continue
        text = match.group(1).strip()
        if text:
            tweets.append(text)
    return tweets

def augment_data(number_list:list, include_definition_list:list, include_style_list:list, task_list:list, n_shot_list:list, model_list:list, path):
    """
    Generate tweets for every combination of the given parameter lists.

    For each combination the prompt is built from the first example group
    returned by get_examples, sent to llama3_8b, and the prompt, the raw
    output and the parsed tweets are printed. Nothing is returned.

    Parameters:
    number_list (list): Values for the number of tweets to request.
    include_definition_list (list): Values for the include_definition flag.
    include_style_list (list): Values for the include_style flag.
    task_list (list): Task names passed to get_examples.
    n_shot_list (list): Example-group sizes passed to get_examples.
    model_list (list): Model names; part of the grid but not used to pick a model yet.
    path: Dataset directory passed to get_examples.
    """
    grid = product(number_list, include_definition_list, include_style_list, task_list, n_shot_list, model_list)
    for number, include_definition, include_style, task, n_shot, model in grid:
        # Only the first example group is prompted with for now.
        first_group = get_examples(task, n_shot, path=path)[0]

        prompt = get_user_prompt(first_group, number, include_definition, include_style)
        print(prompt)

        output = llama3_8b(prompt)
        print(output)
        print(get_tweets_from_output(output))
        # TODO: loop over every example group and choose the model from `model`
        # (e.g. llama3_70b when model == "llama3") instead of always calling llama3_8b.



In [27]:
# Single-combination trial run: 3 tweets, no definition, no style guidance, irony task, 3-shot, "llama3"
augment_data([3], [False], [False], ["irony"], [3], ["llama3"], "/content/sarcasm/datasets/")

torch.Size([175, 768])
Your task is to generate 3 ironic Tweets.  
Separate each new instance with ordinal numbers. 

Examples: 
1. was not aware that Crocs were appropriate business casual attire.
2. gaslight gatekeep girl boss
3. The fact I nearly froze on my way to work this morning is definitely a sign it's not summer anymore #freezing 


Here are three ironic Tweets:

1. I just spent the entire day researching the best ways to increase my productivity and ended up watching cat videos for 8 hours instead #productivityhack

2. It's official: I've finally reached my fitness goals! Or so I thought, until I looked in the mirror and realized I've just been eating an extra slice of pizza every night for a week #fitnessmotivation

3. I'm so excited to finally have a relaxing evening in, only to discover that the entire neighborhood decided to have a loud party and my cat decided to wake me up at 3am #relaxationfail
['I just spent the entire day researching the best ways to increase my pro

In [None]:

# Quick sanity check of the Replicate connection using llama2
prompt = "The typical color of Llama is: "
output = llama2(prompt)
print(output)

# Store the reply in two files; the with-block closes each file handle, and the
# explicit encoding keeps non-ASCII characters writable on every platform.
os.makedirs("output", exist_ok=True)
for out_path in ("output/output.txt", "output/output2.txt"):
    with open(out_path, "w", encoding="utf-8") as handle:
        handle.write(output)

 Hello! I'm here to help you with your question. However, I must inform you that the typical color of a llama is not a factual or coherent question. Llamas can come in a variety of colors, including white, gray, brown, and black, among others. So, I'm afraid I cannot provide a definitive answer to this question. Is there anything else I can help you with?


357

In [None]:

# 1-shot request using the default dataset path ("datasets/", relative to the working directory).
# NOTE(review): the recorded output is a ModuleNotFoundError for 'prompting', which this
# cell does not import - presumably stale; re-run to confirm.
print(get_examples("irony", 1))

ModuleNotFoundError: No module named 'prompting'