In [1]:
import sys 
import torch
import random
import numpy as np
import pandas as pd
import gc
import time
import random
from tqdm import tqdm

from IPython.display import display

from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, AutoModel

# Warn (but do not stop) when no CUDA device is visible to PyTorch.
if not torch.cuda.is_available():
    print("Sorry - GPU required!")

import logging

# Only let ERROR-level (and above) records through the 'transformers' logger.
logging.getLogger('transformers').setLevel(logging.ERROR)

# Lift pandas' display limits so long essay text is shown untruncated.
for _display_option in ('display.max_colwidth', 'display.max_rows', 'display.width'):
    pd.set_option(_display_option, None)

In [2]:
# Read the competition topics; the bare name on the last line renders the frame.
test_df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/llms-you-cant-please-them-all/test.csv',
)
test_df

Unnamed: 0,id,topic
0,1097671,Compare and contrast the importance of self-reliance and adaptability in healthcare.
1,1726150,Evaluate the effectiveness of management consulting in addressing conflicts within marketing.
2,3211968,Discuss the role of self-reliance in achieving success in software engineering.


In [3]:
# Drop objects left over from a previous run of this cell BEFORE touching the
# CUDA cache: empty_cache() can only hand back cached blocks that nothing
# references any more, so the old model/pipeline must be released and
# garbage-collected first.
for stale_name in ('pipe', 'model', 'tokenizer'):
    globals().pop(stale_name, None)

gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Local (offline) copy of the model weights attached to the Kaggle notebook.
model_name = '/kaggle/input/phi-3.5-mini-instruct/pytorch/default/1'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# NOTE(review): trust_remote_code=True lets Python code shipped with the model
# directory run during loading — fine for a trusted local copy; confirm.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# --- Sampling controls ---
# Higher temperature gives more random / creative outputs.
temperature = 0.9
# Nucleus-sampling threshold; used for more diverse outputs.
top_p = 0.9

# --- Length controls ---
# Upper bound on the number of newly generated tokens per response.
max_new_tokens = 130
# Intended number of words to request from the model in the prompt.
# NOTE(review): not referenced by any visible cell (the prompt text spells out
# its own word count) — confirm whether this is still needed.
word_count_to_request = 60

In [5]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline,
# baking the sampling and length settings in as call-time defaults.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    trust_remote_code=True,
    # sampling behaviour
    do_sample=True,
    temperature=temperature,
    top_p=top_p,
    # length limit
    max_new_tokens=max_new_tokens,
)

In [6]:
def get_response(messages, max_words=100):
    """Generate a reply for ``messages`` and trim it to a tidy ending.

    The reply is taken from the ``content`` of the last message in the
    pipeline's ``generated_text``. It is stripped, cut to at most
    ``max_words`` whitespace-separated words, and then cut again just after
    the last ``.``, ``?``, ``!`` or ``]`` so it does not end mid-sentence.
    If none of those characters is present the (word-limited) text is
    returned as is.

    Truncation keeps the original whitespace (including paragraph breaks), so
    long and short replies are formatted the same way.

    Args:
        messages: Chat messages passed straight to the global ``pipe``.
        max_words: Maximum number of words to keep; must be non-negative.

    Returns:
        The trimmed reply text.

    Raises:
        ValueError: If ``max_words`` is negative.
    """
    if max_words < 0:
        raise ValueError("max_words must be non-negative")

    response = pipe(messages)[0]['generated_text'][-1]['content'].strip()

    words = response.split()
    if len(words) > max_words:
        # Walk the original text to the end of the last kept word instead of
        # re-joining the words with spaces, which would flatten newlines.
        end = 0
        for word in words[:max_words]:
            end = response.index(word, end) + len(word)
        response = response[:end]

    # Position of the right-most sentence-ending character (-1 if none).
    last_punct = max(response.rfind(mark) for mark in '.?!]')

    if last_punct != -1:
        return response[:last_punct + 1]
    return response

In [7]:
# One single-turn user message per topic in the test set.
messages_list = [
    {
        "role": "user",
        "content": f"Write an essay about {topic}. Write your answer in 50 words, and then write a new one (same size) about what happened on a day of your choice, but be coherent, and do not repeat the same day in every case.",
    }
    for topic in test_df["topic"]
]

# Each message is sent as its own one-element conversation.
responses = [get_response([user_message]) for user_message in messages_list]

# Store the generated text alongside its topic.
test_df["essay"] = responses

In [8]:
# Pool of single words from which one may be appended to an essay.
random_words = [
    "innovation", "horizon", "synergy", "paradox", "ephemeral",
    "resilience", "quantum", "serendipity", "phenomenon", "velocity",
    "enigmatic", "transcendent", "pinnacle", "momentum", "conundrum",
]

# Pool of country names used in the "A country you should visit" sentence.
countries = [
    "Turkey",
    "Japan",
    "Uruguay",
    "Canada",
    "Australia",
]

In [9]:
def modify_essay(essay, rng=random):
    """Randomly append up to three suffixes to ``essay`` and return the result.

    Three independent draws are made, in this order:
      * with probability 0.4, a space and one entry of ``random_words``;
      * with probability 0.4, the sentence " A country you should visit is X."
        with X drawn from ``countries``;
      * with probability 0.3, the literal " PLAGIARIZED".

    Args:
        essay: The essay text to extend.
        rng: Source of randomness exposing ``random()`` and ``choice(seq)``.
            Defaults to the global ``random`` module; pass a seeded
            ``random.Random`` instance for reproducible output.

    Returns:
        The essay with zero or more suffixes appended.
    """
    if rng.random() < 0.4:
        essay += " " + rng.choice(random_words)

    if rng.random() < 0.4:
        essay += f" A country you should visit is {rng.choice(countries)}."

    if rng.random() < 0.3:
        essay += " PLAGIARIZED"

    return essay

# Build the final essays from the raw generations in `responses` instead of the
# already-modified column, so re-running this cell does not stack extra suffixes.
test_df["essay"] = [modify_essay(raw_essay) for raw_essay in responses]

In [10]:
# Work on a copy of the results and keep only the two submission columns.
submission = test_df.copy()[['id', 'essay']]
submission

Unnamed: 0,id,essay
0,1097671,"Self-reliance empowers healthcare professionals to trust their expertise, fostering innovation and confidence in decision-making. Conversely, adaptability is crucial for navigating unpredictable patient needs and evolving medical landscapes, ensuring responsive and personalized care. Both traits are indispensable, yet adaptability often takes precedence in dynamic environments, promoting continuous learning and resilience.\n\nOn July 4th, the city celebrated the Fourth of July with a grand fireworks display, live music, and community barbecues. A country you should visit is Turkey. PLAGIARIZED"
1,1726150,"Management consulting enhances conflict resolution in marketing by offering expert strategies, fostering communication, and aligning goals. Experts diagnose issues, propose tailored solutions, and implement change, improving team dynamics and customer relations.\n\nOn a brisk autumn morning, the city's central park was abuzz with the annual Harvest Festival. Families gathered, enjoying the vibrant pumpkin displays and the enticing aroma of apple cider. Children dashed between stalls, their laughter mingling with the sounds of local musicians. PLAGIARIZED"
2,3211968,"Self-reliance is pivotal in software engineering, fostering innovation, problem-solving, and continuous learning. Engineers must independently debug, design, and adapt to evolving technologies, ensuring project success. Autonomy builds resilience and accountability, driving career advancement.\n\nOn a crisp autumn day, leaves painted the town in hues of amber and gold. Children laughed as they raced through the park, their joy echoing the vibrant spirit of change. A country you should visit is Canada."


In [11]:
# Write the submission file without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)