**Imports**

In [1]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
from sklearn.model_selection import train_test_split

**Model: GPT-2**

In [2]:
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id)

**Utils**

In [3]:
def count_rows(task):
    file = "../data/orig/processed/train/" + task + "-data-mini.csv"
    df = pd.read_csv(file)
    return int(df.shape[0]/2)

def load_data(task):
    if(task == "news"):
        df_1 = pd.read_csv("..\\data\\orig\\main\\news\\BuzzFeed_fake_news_content.csv")
        df_2 = pd.read_csv("..\\data\\orig\\main\\news\\BuzzFeed_real_news_content.csv")

        df_1['text'] = df_1['title'] + ' ' + df_1['text']
        df_2['text'] = df_2['title'] + ' ' + df_2['text']

        df_1 = df_1[['text']]
        df_2 = df_2[['text']]

        df_1['y'] = 1
        df_2['y'] = 0

        df1_train, df1_test = train_test_split(df_1, random_state=42)
        df2_train, df2_test = train_test_split(df_2, random_state=36)

        df1 = pd.concat([df1_train, df2_train], ignore_index=True)
        df1 = df1.sample(frac=1).reset_index(drop=True)
        return df1
    
    if(task == "spam"):
        df = pd.read_csv("../data/orig/main/spam/data.csv", encoding='ISO-8859-1')
        df = df[["v1", "v2"]]
        df["v1"] = df["v1"].apply(lambda x: 1 if x=="spam" else 0)
        df.rename(columns={"v1":"y","v2":"text"}, inplace=True)
        df = df.sample(frac=1).reset_index(drop=True)
        df1, df2 = train_test_split(df, test_size=0.2, random_state=65)
        df1.dropna(inplace=True)
        df1 = pd.concat([df1[df1['y']==1], df1[df1['y']==0].sample(n=602)], ignore_index=True)
        return df1
    df = pd.read_csv("../data/orig/main/sentiment/data.csv", encoding='latin-1', header=None)
    df = df[[0,5]]
    df.rename(columns={0:'y',5:'text'}, inplace=True)
    df['y'] = df['y'].apply(lambda x: 1 if x==4 else 0)
    df = df.sample(frac=1).reset_index(drop=True)
    df1, df2  = train_test_split(df, random_state=46, test_size=0.3)
    df1.dropna(inplace=True)
    n = 2500
    df1 = pd.concat([df1[df1['y']==1].sample(n=n), df1[df1['y']==0].sample(n=n)], ignore_index=True)
    return df1

zsl_tasks = [
    {
        'name':'news',
        'rows': count_rows('news'),
        'print_count':10
    },
    {
        'name':'spam',
        'rows': count_rows('spam'),
        'print_count':100
    },    
    {
        'name':'sentiment',
        'rows': count_rows('sentiment'),
        'print_count':250
    }
]

def generate_text_from_prompt(prompt):
    inputs = tokenizer(prompt, return_tensors="pt", max_length=49)
    
    outputs = model.generate(**inputs, max_length=50, num_return_sequences=1, do_sample=True, top_k=0)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# generate_text_from_prompt(prompt=d.loc[8,'text'])


**ZSP - Data Generation**

In [4]:
df_main = pd.DataFrame()
for task in zsl_tasks:
    print("1. Starting task: ", task)    
    df = load_data(task['name'])
    print("2. Loaded Data, Rows: ", df.shape[0])
    print("3. Prog: ")
    for _ in range(df.shape[0]):
        new_text = generate_text_from_prompt(prompt=df.loc[_,'text'])
        df.loc[_,'text'] = new_text
        df_main = df
        if _%task['print_count'] == 0:
            print(_, end=' ')
            df.to_csv("../data/syn/mid/auto-" + task['name'] + "-pp-data.csv", index=False)
    print("\n4. Saving df")
    df = df.sample(frac=1).reset_index(drop=True)
    df_main = df
    df.to_csv("../data/syn/gpt2/pp/auto-" + task['name'] + "-data.csv", index=False)
    print("5. Ending task: ", task['name'])
    print()

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


1. Starting task:  {'name': 'news', 'rows': 68, 'print_count': 10}
2. Loaded Data, Rows:  136
3. Prog: 
0 10 20 30 40 50 60 70 80 90 100 110 120 130 
4. Saving df
5. Ending task:  news

1. Starting task:  {'name': 'spam', 'rows': 602, 'print_count': 100}
2. Loaded Data, Rows:  1197
3. Prog: 
0 100 200 300 400 500 600 700 800 900 1000 1100 
4. Saving df
5. Ending task:  spam

1. Starting task:  {'name': 'sentiment', 'rows': 2499, 'print_count': 250}
2. Loaded Data, Rows:  5000
3. Prog: 
0 250 500 750 1000 1250 1500 1750 2000 2250 2500 2750 3000 3250 3500 3750 4000 4250 4500 4750 
4. Saving df
5. Ending task:  sentiment

