# Setup

### Import libraries

In [None]:
import sys
import torch
import time
import gc
import random
import numpy as np
import pandas as pd
import logging
from tqdm import tqdm
from IPython.display import display

# NLP and transformers
from nltk.tokenize import word_tokenize, sent_tokenize
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, AutoModel
from sentence_transformers import SentenceTransformer, util

# Sklearn
from sklearn.metrics.pairwise import cosine_similarity

# Seed every random number generator this notebook draws from, not just
# Python's `random`; otherwise two runs produce different essays and noise.
SEED = 7
random.seed(SEED)
np.random.seed(SEED)     # add_noise_to_text samples its noise from np.random
torch.manual_seed(SEED)  # sampled text generation draws from torch's RNG

# Check GPU availability
if not torch.cuda.is_available():
    print("Sorry - GPU required!")

# Set logging level for transformers (only errors are shown)
logging.getLogger('transformers').setLevel(logging.ERROR)

# Pandas display options: never truncate cell contents, rows, or line width
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)

### Import input data (topics)

In [37]:
# Read the two competition CSVs: the sample submission and the list of topics
submission_df = pd.read_csv("/kaggle/input/llms-you-cant-please-them-all/sample_submission.csv")
test_df = pd.read_csv('/kaggle/input/llms-you-cant-please-them-all/test.csv')

# Display the topics table as the cell output
test_df

Unnamed: 0,id,topic
0,1097671,Compare and contrast the importance of self-reliance and adaptability in healthcare.
1,1726150,Evaluate the effectiveness of management consulting in addressing conflicts within marketing.
2,3211968,Discuss the role of self-reliance in achieving success in software engineering.


### Load models

In [39]:
# Release objects left over from a previous run of this cell.
# Order matters: the references must be dropped and garbage-collected *before*
# emptying the CUDA cache, because empty_cache() can only return memory that
# is no longer held by a live tensor.
for obj in ['model', 'pipe', 'tokenizer', 'embed_model']:
    if obj in globals():
        del globals()[obj]
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Model configuration
model_name = '/kaggle/input/phi-3.5-mini-instruct/pytorch/default/1'

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the causal language model in bfloat16, letting `device_map="auto"`
# decide where the weights are placed
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True
)

# Load the pre-trained sentence-embedding model (kept on CPU); it is used by
# add_noise_to_text to embed individual tokens
embed_model = SentenceTransformer('/kaggle/input/all-minilm-l6-v2transformers/pytorch/default/1/all-MiniLM-L6-v2', device="cpu")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Configure pipeline

In [44]:
# Sampling settings used as the pipeline defaults
max_new_tokens = 200  # upper bound on generated tokens (callers may override per call)
temperature = 0.7     # sampling temperature; larger values give more random output
top_p = 0.7           # nucleus-sampling threshold; 1.0 would disable the filter

# Build the text-generation pipeline around the already-loaded model and tokenizer
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=True,
    temperature=temperature,
    top_p=top_p,
    max_new_tokens=max_new_tokens,
    trust_remote_code=True,
)

## Essay generation

### Define functions

In [49]:
def generate_corpus_essay(topic, max_tokens=None):
    """
    Ask the module-level text-generation pipeline (`pipe`) for a short essay.

    The topic is wrapped in a single user chat message that requests fewer
    than 180 words, and the content of the last message in the returned
    conversation is taken as the essay.

    Args:
        topic (str): Subject the essay should be about.
        max_tokens (int, optional): Per-call override for `max_new_tokens`.
                                    When falsy (None or 0) no override is passed
                                    and the pipeline's own setting applies.

    Returns:
        str: The generated essay text.
    """
    # Single-turn chat consisting of the user's request only
    chat = [{"role": "user", "content": f"Write about {topic} in less than 180 words."}]

    # Only forward a token limit when the caller actually supplied one
    overrides = {'max_new_tokens': max_tokens} if max_tokens else {}

    # The pipeline returns one result per input; the reply is the final message
    first_result = pipe(chat, **overrides)[0]
    return first_result['generated_text'][-1]['content']

def add_noise_to_text(text, epsilon=0.05, rng=None):
    """
    Perturbs a text by adding Gaussian noise to its token embeddings and mapping
    every noisy embedding back to the most similar token of the same text.

    The text is split with `word_tokenize`, each token is embedded with the
    module-level `embed_model`, zero-mean Gaussian noise is added, and for each
    position the token whose clean embedding has the highest cosine similarity
    to the noisy embedding is selected. Replacement tokens therefore always
    come from the input text itself.

    Args:
        text (str): The input text to which noise will be added.
        epsilon (float, optional): The standard deviation of the Gaussian noise to apply. Default is 0.05.
        rng (numpy.random.Generator, optional): Source of randomness for the noise.
            If None, NumPy's global random state is used, so `np.random.seed(...)`
            controls the result.

    Returns:
        str: The selected tokens joined by single spaces (original spacing and
             punctuation attachment are not restored). An empty string is
             returned when the text contains no tokens.
    """

    words = word_tokenize(text)

    # Empty or whitespace-only text has no tokens; there is nothing to embed
    # or compare, so skip the encoder and similarity computation entirely.
    if not words:
        return ""

    word_embeddings = embed_model.encode(words, convert_to_numpy=True)

    # Apply Gaussian noise (np.random and Generator both expose .normal)
    sampler = np.random if rng is None else rng
    noise = sampler.normal(0, epsilon, word_embeddings.shape)
    noisy_embeddings = word_embeddings + noise

    # Row i holds the similarity of noisy token i to every clean token;
    # argmax picks the index of the closest clean token.
    similarities = cosine_similarity(noisy_embeddings, word_embeddings)
    noisy_words = [words[i] for i in similarities.argmax(axis=1)]

    # Reconstruct the noisy text
    return " ".join(noisy_words)

## Generate essays and output to submission file

In [53]:
# For every topic: generate a baseline essay, then derive its noisy variant
baseline_essays, noisy_essays = [], []
for topic in test_df['topic']:
    baseline = generate_corpus_essay(topic, max_new_tokens)
    baseline_essays.append(baseline)
    noisy_essays.append(add_noise_to_text(baseline))

# The submission pairs each test id with the noisy essay for that topic
submission_df = test_df[["id"]].assign(essay=noisy_essays)

# Write the submission file without the index column
submission_df.to_csv("submission.csv", index=False)

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
# Dump the submitted essays as a raw array for a quick visual check
print(submission_df["essay"].values)