In [1]:
import pandas as pd
import numpy as np


In [2]:
GPT2 = "gpt2"
FLAN = "flan"

In [8]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name) 

def generate_text_from_prompt(prompt):
    inputs = tokenizer(prompt, return_tensors="pt", max_length=49)
    
    outputs = model.generate(**inputs, max_length=50, num_return_sequences=1, do_sample=True, top_k=0)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

In [3]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id)

def generate_text_from_prompt(prompt):
    inputs = tokenizer(prompt, return_tensors="pt", max_length=49)
    
    outputs = model.generate(**inputs, max_length=50, num_return_sequences=1, do_sample=True, top_k=0)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

In [4]:
def get_fsl_data(task):
    return pd.read_csv("../data/syn/mid/fsl-cat-"+task+".csv")

tasks = {
    'news':{
        'loop':10
    },
    'spam':{
        'loop':100
    },
    'sentiment':{
        'loop':250
    }
}

In [9]:
LLM = FLAN

In [6]:
# df = get_fsl_data(task)
# prompt = df[df['fsl_category']==0].sample(n=1)
# prompt['text'].values[0]

In [10]:
for task in tasks:
    df = get_fsl_data(task)
    df0_n = pd.DataFrame(columns=['text'])
    df1_n = pd.DataFrame(columns=['text'])
    #cat 0
    df0 = df[df['y']==0]
    print(f"Start: {task}")
    print(f"df0 rows: {df0.shape[0]}, nun: {df0['fsl_category'].nunique()}")
    for value, count in df0['fsl_category'].value_counts().items(): 
        print(f"\nValue: {value}, Count: {count}")
        prompt = df0[df0['fsl_category']==value].sample(n=1)
        prompt = prompt['text'].values[0]
        print(f"Prompt: {prompt[:20]}")
        for _ in range(count):
            text = generate_text_from_prompt(prompt)
            df0_n.loc[len(df0_n)] = text
            if _%tasks[task]['loop']==0:
                print(_, end=" ")
        print(f"\nFinished Cat: ", value)
    df0_n['y']=0
    print("Finished df0")

    df1 = df[df['y']==0]
    print(f"Start: {task}")
    print(f"df1 rows: {df1.shape[0]}, nun: {df1['fsl_category'].nunique()}")
    for value, count in df0['fsl_category'].value_counts().items(): 
        print(f"\nValue: {value}, Count: {count}")
        prompt = df1[df1['fsl_category']==value].sample(n=1)
        prompt = prompt['text'].values[0]
        print(f"Prompt: {prompt[:20]}")
        for _ in range(count):
            text = generate_text_from_prompt(prompt)
            df1_n.loc[len(df1_n)] = text
            if _%tasks[task]['loop']==0:
                print(_, end=" ")
        print(f"\nFinished Cat: ", value)
    df1_n['y']=1
    print("Finished df1")

    df = pd.concat([df1_n, df0_n], ignore_index=True).sample(frac=1)
    df.to_csv('../data/syn/'+LLM+'/fs/auto-'+task+'-fs.csv', index=False)








Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Start: news
df0 rows: 68, nun: 3

Value: 0, Count: 52
Prompt: Debate Commission ST
0 10 20 30 40 50 
Finished Cat:  0

Value: 1, Count: 8
Prompt: KISS Bassist Gene Si
0 
Finished Cat:  1

Value: 2, Count: 8
Prompt: Charlotte Police REV
0 
Finished Cat:  2
Finished df0
Start: news
df1 rows: 68, nun: 3

Value: 0, Count: 52
Prompt: U.S. mistakenly gran
0 10 20 30 40 50 
Finished Cat:  0

Value: 1, Count: 8
Prompt: “Public School Force
0 
Finished Cat:  1

Value: 2, Count: 8
Prompt: Young Girl's Emotion
0 
Finished Cat:  2
Finished df1
Start: spam
df0 rows: 602, nun: 4

Value: 3, Count: 447
Prompt: Have a good trip. Wa
0 100 200 300 400 
Finished Cat:  3

Value: 2, Count: 69
Prompt: Correct. So how was 
0 
Finished Cat:  2

Value: 1, Count: 62
Prompt: Ahhh. Work. I vaguel
0 
Finished Cat:  1

Value: 0, Count: 24
Prompt: R we going with the 
0 
Finished Cat:  0
Finished df0
Start: spam
df1 rows: 602, nun: 4

Value: 3, Count: 447
Prompt: If you r @ home then
0 100 200 300 400 
Finished Cat: 