In [None]:
# Install specific library versions from local wheel files under /kaggle/input.
# -U upgrades a package that is already installed; -q keeps pip's output quiet.
!pip install -Uq /kaggle/input/llm-whls/bitsandbytes-0.41.1-py3-none-any.whl
!pip install -Uq /kaggle/input/llm-whls/peft-0.4.0-py3-none-any.whl
!pip install -Uq /kaggle/input/library-off-for-llm/transformers-4.38.2-py3-none-any.whl

In [None]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch

from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# Token budgets handed to text_generate: the prompt length limit and the
# number of new tokens requested from each generate() call.
input_token_len, output_token_len = 1024, 30

# Generation hyperparameters found by search. Beam search with n_beams=4 was
# the major contributor to the score moving from the bronze bracket to the
# silver bracket.
n_beams = 4
temp = 0.6

In [None]:
# Read the test rows; later cells use the id, original_text and rewritten_text columns.
test_df = pd.read_csv(filepath_or_buffer='/kaggle/input/llm-prompt-recovery/test.csv')

In [None]:
# Directory of the fine-tuned adapter that is applied on top of the base model.
adapter_model_name = "/kaggle/input/phi2-public-data-sft-adapter/pytorch/public-data-sft/1/phi2_public_data_sft"
# Directory of the base checkpoint; the tokenizer is loaded from here as well.
base_model_name = "/kaggle/input/phi/transformers/2/1"

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Load the tokenizer stored alongside the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_name,
    trust_remote_code=True,
)

# generate() is later called with pad_token_id=tokenizer.pad_token_id, so make
# sure a pad token exists: reuse the EOS token when none is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Build the inference model in one expression:
#   1. load the base causal LM,
#   2. wrap it with the PEFT adapter stored at adapter_model_name,
#   3. merge_and_unload() so `model` is the model returned after merging.
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(base_model_name, trust_remote_code=True),
    adapter_model_name,
).merge_and_unload()

In [None]:
# Place the model on the selected device and switch it to evaluation mode.
model.to(device).eval()
print('model loaded !!')

In [None]:
def _generate_first_sentence(prompt, answer_seed, model, tokenizer, input_max_len,
                             output_len, device, temperature, num_beams):
    """Continue ``prompt`` with the model and return the first sentence of the answer.

    ``prompt`` must end with ``answer_seed`` (the part of the answer that was
    pre-written for the model). The answer is ``answer_seed`` plus the newly
    generated tokens, cut at the first ``'.'``.

    Raises:
        ValueError: if the tokenized prompt is longer than ``input_max_len``.
    """
    inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=False).to(device)
    prompt_len = len(inputs["input_ids"][0])
    if prompt_len > input_max_len:
        # Right-truncating would cut off the trailing "Output: ..." seed, and the
        # model would then just continue the source text. Fail loudly instead.
        raise ValueError(
            f"prompt has {prompt_len} tokens, more than input_max_len={input_max_len}"
        )

    outputs = model.generate(**inputs,
                             do_sample=True,
                             max_new_tokens=output_len,
                             pad_token_id=tokenizer.pad_token_id,
                             temperature=temperature,
                             num_beams=num_beams,
                             )
    # The returned sequence starts with the prompt tokens. Decode only what comes
    # after them, so an "Output:" inside the source texts cannot be mistaken for
    # the answer marker.
    continuation = tokenizer.decode(outputs[0][prompt_len:],
                                    skip_special_tokens=True,
                                    clean_up_tokenization_spaces=False)
    answer = (answer_seed + continuation).strip()
    return answer.split('.')[0].strip()


def text_generate(ori_text, rew_text, model, tokenizer, input_max_len=512, output_len=128, device='cuda',
                  temperature=None, num_beams=None):
    """Recover the rewrite prompt for one (original, rewritten) text pair.

    Generation runs in two passes:
      1. The answer is seeded with "Please improve this text by" and the first
         generated sentence is kept.
      2. That sentence is extended with ", maintaining the original meaning but
         altering the tone in " and the first sentence of the second
         continuation is returned.

    Args:
        ori_text: the original text.
        rew_text: the rewritten text.
        model: object exposing ``generate(**inputs, ...)``.
        tokenizer: callable tokenizer exposing ``decode`` and ``pad_token_id``.
        input_max_len: maximum number of prompt tokens accepted.
        output_len: ``max_new_tokens`` for each generate() call.
        device: device the tokenized inputs are moved to.
        temperature: sampling temperature; ``None`` uses the notebook-level ``temp``.
        num_beams: beam count; ``None`` uses the notebook-level ``n_beams``.

    Returns:
        The recovered prompt: a single sentence without the trailing period.

    Raises:
        ValueError: if either prompt exceeds ``input_max_len`` tokens. Callers
            are expected to fall back to a default prompt in that case.
    """
    if temperature is None:
        temperature = temp
    if num_beams is None:
        num_beams = n_beams

    instruction = (
        f"Instruct: Original Text:{ori_text}\nRewritten Text:{rew_text}\n"
        "Write a prompt that was likely given to the LLM to rewrite original text to rewritten text.\n"
        "Output: "
    )

    # Pass 1: let the model finish "Please improve this text by ...".
    first_seed = "Please improve this text by"
    first_sentence = _generate_first_sentence(
        instruction + first_seed, first_seed, model, tokenizer,
        input_max_len, output_len, device, temperature, num_beams,
    )

    # Pass 2: steer the answer towards a tone/style description and extend it.
    second_seed = first_sentence + ", maintaining the original meaning but altering the tone in "
    return _generate_first_sentence(
        instruction + second_seed, second_seed, model, tokenizer,
        input_max_len, output_len, device, temperature, num_beams,
    )

In [None]:
# Generic fallback prompt: the inference loop starts every row with this value
# and keeps it whenever text_generate raises.
mean_prompt = "Please improve this text using the writing style with maintaining the original meaning but altering the tone."

In [None]:
# Collects one recovered prompt per test row, in row order.
rewrite_prompts = list()

In [None]:
# Recover one prompt per test row. Any row whose generation fails gets
# mean_prompt, so every row still receives a submission value.
# The list is (re)built here so re-running this cell never appends duplicates.
rewrite_prompts = []

for row_idx, row in tqdm(test_df.iterrows(), total=len(test_df)):
    try:
        prompt = text_generate(row['original_text'],
                               row['rewritten_text'],
                               model,
                               tokenizer,
                               input_token_len,
                               output_token_len,
                               device,
                              )
    except Exception as exc:
        # Catch Exception rather than using a bare except so KeyboardInterrupt /
        # SystemExit can still stop the run, and report the failure instead of
        # hiding it.
        print(f"row {row_idx}: generation failed ({exc!r}); using mean_prompt")
        prompt = mean_prompt

    rewrite_prompts.append(prompt)

In [None]:
# Attach the recovered prompts to the test frame as the rewrite_prompt column.
test_df.loc[:, 'rewrite_prompt'] = rewrite_prompts

In [None]:
# Keep only the id and rewrite_prompt columns and write them to submission.csv
# without the DataFrame index.
sub_df = test_df.loc[:, ['id', 'rewrite_prompt']]
sub_df.to_csv(path_or_buf='submission.csv', index=False)