# Estrazione dei prompt


In [1]:
import nltk
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from nltk.tokenize import sent_tokenize
import matplotlib.pyplot as plt
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/eliaguerra/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
categories = ["sci.space", "rec.autos", "rec.sport.hockey"] # Categories extracted from the 20newsgroups dataset
k = 4 # Number of sentences to be extracted from 20newsgroups dataset and used as prompts
min_number_of_words = 20 # Minimum number of words in a sentence to be considered as a prompt
random_seed = 0 # Random seed for reproducibility

Il dataset viene caricando rimuovendo le parti relative a `headers`, `footers` e `quotes`. I post sono contenuti in `dataset.data`. La divisione in frasi viene fatta tramite la libreria `nltk` e il tokenizzatore `punkt` tramite `sent_tokenize`. I prompt vengono salvati con i rispettivi argomenti su un Dataframe

In [3]:
dataset = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"), categories=categories, shuffle=False) # 20newsgroups dataset loading
first_k_sentences = [' '.join(sent_tokenize(sample)[:k]) for sample in dataset.data] # Extracting the first k sentences from each sample 
df = pd.DataFrame(first_k_sentences, columns=['First k Sentences']) # Dataframe creation
df['Label'] = dataset.target
df['label_name'] = df['Label'].apply(lambda x: dataset.target_names[x]) # Adding the label names to the dataframe

Calcolo del numero di parole per ogni prompt ed eliminazione dei prompt con meno di `min_number_of_words` parole

In [4]:
def count_words(sentence):
    # Simple function to count the number of words in a sentence
    return len(sentence.split())

In [5]:
df['Number of Words'] = df['First k Sentences'].apply(count_words)
df = df[df['Number of Words'] > min_number_of_words] # Filtering sentences with less than min_number_of_words words
print("Average number of words: ", df['Number of Words'].mean())
print("Standard deviation: ", df['Number of Words'].std())
print("Maximum number of words: ", df['Number of Words'].max())
print("Minimum number of words: ", df['Number of Words'].min())

Average number of words:  71.01012066952121
Standard deviation:  161.99423057212127
Maximum number of words:  6864
Minimum number of words:  21


Calcolo del numero di prompt per ogni argomento. Al fine di bilanciare il dataset, il numero di prompt per ogni argomento viene al numero di prompt dell'argomento con cardinaliÃ  minore.

In [6]:
label_counts = df['Label'].value_counts() # Counting the number of samples for each label
print(label_counts)
class_samples = min(label_counts)

Label
2    875
1    862
0    832
Name: count, dtype: int64


Selezione randomica dei prompt per ogni argomento.

In [7]:
dfs = []
for label in [0,1,2]:
    filtered_df = df[df['Label'] == label]

    if len(filtered_df) >= class_samples:
        sampled_df = filtered_df.sample(class_samples, random_state=random_seed) # Sampling class_samples samples from each category
        dfs.append(sampled_df)
    else:
        print(f"Category {label} has less than {class_samples} samples")
result_df = pd.concat(dfs, ignore_index=True)
print(result_df['Label'].value_counts())

Label
0    832
1    832
2    832
Name: count, dtype: int64


# Creazione del dataset sintetico

In [8]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from tqdm.notebook import tqdm_notebook
import torch
import re

device = "cuda" if torch.cuda.is_available() else "cpu" # Checking if GPU is available
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-large") # Loading the GPT-2 tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2-large").to(device) # Load the pre-trained GPT-2 model
min_new_tokens = 20 # Minimum number of tokens to be generated
max_new_tokens = 300 # Maximum number of tokens to be generated

In [9]:
def process_string(text):
    """
    Removes unwanted characters.

    Parameters:
        text (str): The input string to be processed.

    Returns:
        str: The processed string with unwanted characters removed and formatted.
    """
    text = re.sub(r"[^\w\s.,!?']", ' ', text) # Replace all characters that are not words, spaces, periods, commas, exclamation marks, question marks, or apostrophes with a space
    text = re.sub('[\n\t]+', ' ', text) # Replace all newlines and tabs with a space
    text = re.sub(' +', ' ', text) # Replace all multiple spaces with a single space
    text = ''.join(c for c in text if c.isprintable()) # Remove non-printable characters
    return text.strip()
    

In [10]:
def generate_dataset(input, tokenizer, model, min_new_tok, max_new_tok, device = "cpu"):
    """
    Generates samples from the input dataset using a given model. The function skips the generation process if the length of the encoded input and max_new_tok exceeds 1024.

    Parameters:
        input (pandas.DataFrame): The input dataset. It should have columns 'First k Sentences', 'Label', and 'label_name'.
        tokenizer (PreTrainedTokenizer): The tokenizer to be used for encoding the prompts.
        model (PreTrainedModel): The model to be used for generating the samples.
        min_new_tok (int): The minimum number of new tokens to generate.
        max_new_tok (int): The maximum number of new tokens to generate.
        device (str, optional): The device to run the model on. Defaults to "cpu".

    Returns:
        tuple: Two lists of dictionaries. The first list contains the raw generated samples with the keys 'label', 'label_name', 'prompt', and 'text'. 
        The second list contains the processed generated samples with the same keys.
    """
    generated_samples_raw = []
    generated_samples = []
    for idx, input_row in tqdm_notebook(input.iterrows(), total = input.shape[0]):
        prompt = process_string(input_row['First k Sentences']) # Process the input prompt with the function previously defined
        label = input_row['Label']
        label_name = input_row['label_name']
        encoded_input = tokenizer.encode(prompt, return_tensors="pt").to(device) # Encode the input prompt

        if len(encoded_input[0]) + max_new_tok > 1024: # Skip the generation process if it exceeds 1024 tokens otherwise it will raise an error
            continue

        outputs = model.generate(encoded_input,  # Encoded prompt
                                 min_new_tokens = min_new_tok, # Min number of new tokens
                                 max_new_tokens = max_new_tok, # Max number of new tokens
                                 num_return_sequences=1, # Number of samples to generate
                                 no_repeat_ngram_size=2, # To avoid word repetition
                                 pad_token_id=50256, 
                                 eos_token_id=50256
                                 )
        data = tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True) # Decode the output
        generated_samples_raw.append({"label": label, "label_name": label_name, "prompt": prompt, "text": data})
        generated_samples.append({"label": label, "label_name": label_name, "prompt": prompt, "text": process_string(data)})
    return generated_samples_raw, generated_samples

Generazione del dataset sintetico eseguendo la funzione `generate_dataset` sul dataframe dei prompt.

In [11]:
synthetic_dataset_raw, synthetic_dataset = generate_dataset(result_df, tokenizer, model, min_new_tokens, max_new_tokens, device)

  0%|          | 0/10 [00:00<?, ?it/s]

Esempio di output non post-processato

In [12]:
synthetic_dataset_raw[0]

{'label': 1,
 'label_name': 'rec.sport.hockey',
 'prompt': 'The Selke candidate forwards main purpose on a shift is to prevent goals from being scored not to score them. When Lemieux or Gilmour play their number one purpose is to score defence is secondary especially considering the line that plays against them is probably a defensive one. That is why they are not Selke candidates. Someone posted something about this assumption being lost in translation it was a few months ago .',
 'text': "The Selke candidate forwards main purpose on a shift is to prevent goals from being scored not to score them. When Lemieux or Gilmour play their number one purpose is to score defence is secondary especially considering the line that plays against them is probably a defensive one. That is why they are not Selke candidates. Someone posted something about this assumption being lost in translation it was a few months ago. I think it is a good idea to look at the numbers and see if it holds true.\n\nThe

Esempio di output post-processato

In [13]:
synthetic_dataset[0]

{'label': 1,
 'label_name': 'rec.sport.hockey',
 'prompt': 'The Selke candidate forwards main purpose on a shift is to prevent goals from being scored not to score them. When Lemieux or Gilmour play their number one purpose is to score defence is secondary especially considering the line that plays against them is probably a defensive one. That is why they are not Selke candidates. Someone posted something about this assumption being lost in translation it was a few months ago .',
 'text': "The Selke candidate forwards main purpose on a shift is to prevent goals from being scored not to score them. When Lemieux or Gilmour play their number one purpose is to score defence is secondary especially considering the line that plays against them is probably a defensive one. That is why they are not Selke candidates. Someone posted something about this assumption being lost in translation it was a few months ago. I think it is a good idea to look at the numbers and see if it holds true. The fi

In [15]:
synthetic_dataset_raw_df = pd.DataFrame(synthetic_dataset_raw) # Convert the list of dictionaries to a dataframe
synthetic_dataset_df = pd.DataFrame(synthetic_dataset) # Convert the list of dictionaries to a dataframe
synthetic_dataset_raw_df.to_csv('synthetic_dataset_raw_test.csv', index=False) # Save the raw generated samples to a CSV file
synthetic_dataset_df.to_csv('synthetic_dataset_test.csv', index=False) # Save the processed generated samples to a CSV file