In [12]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch

from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

In [13]:
# Token budgets used at inference time: the cap on prompt tokens fed to the
# tokenizer and the number of tokens the model may generate per row.
input_token_len, output_token_len = 1024, 100

# Directory handed to the `from_pretrained` loaders as their cache location.
cache_dir = "/tmp/k7/"

In [14]:
# Rows to predict on; later cells read the `id`, `original_text` and
# `rewritten_text` columns from this frame.
test_df = pd.read_csv(filepath_or_buffer='../data/test.csv')

In [15]:
# Hub id of the base causal LM. The previous value carried a stray leading
# apostrophe inside the string ("'microsoft/phi-2"), which is neither a valid
# Hub repo id nor a local path, so loading failed on a fresh kernel.
base_model_name = "microsoft/phi-2"
# Directory containing the fine-tuned PEFT adapter weights loaded on top of the base model.
adapter_model_name = "/kaggle/input/llm-prompt-recovery-model/"

In [16]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [17]:
# Load the tokenizer that belongs to the base model.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_name,
    cache_dir=cache_dir,
    trust_remote_code=True,
)

# generate() is later given `pad_token_id=tokenizer.pad_token_id`, so make sure a
# pad token exists; reuse the end-of-sequence token when none is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Load the base causal LM and wrap it with the fine-tuned PEFT adapter in one step.
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(
        base_model_name,
        trust_remote_code=True,
        cache_dir=cache_dir,
    ),
    adapter_model_name,
    cache_dir=cache_dir,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Move the model to the selected device and put it in evaluation mode before inference.
model.to(device).eval()
print('model loaded !!')

In [None]:
def text_generate(ori_text, rew_text, model, tokenizer, input_max_len=512, output_len=20, device='cuda'):
    """Generate a guess at the instruction that turned ``ori_text`` into ``rew_text``.

    Parameters
    ----------
    ori_text : str
        Text before rewriting; interpolated into the prompt.
    rew_text : str
        Text after rewriting; interpolated into the prompt.
    model
        Object exposing ``generate(**inputs, ...)`` that returns a batch of token-id
        sequences, each consisting of the prompt ids followed by the continuation.
    tokenizer
        Callable tokenizer exposing ``pad_token_id`` and ``decode``.
    input_max_len : int
        Upper bound on prompt tokens; the tokenizer truncates longer prompts.
    output_len : int
        Maximum number of tokens to generate.
    device
        Device the input tensors are moved to before generation.

    Returns
    -------
    str
        The decoded continuation only, with surrounding whitespace stripped.
    """
    prompt = f"Instruct: Original Text:{ori_text}\nRewritten Text:{rew_text}\nWrite a prompt that was likely given to the LLM to rewrite original text to rewritten text.\nOutput:"
    # NOTE(review): assuming the tokenizer truncates on the right (the usual default),
    # an over-long prompt loses its tail, i.e. the instruction and the "Output:" cue.
    # Consider budgeting ori_text / rew_text separately if long inputs matter.
    inputs = tokenizer(prompt, max_length=input_max_len, truncation=True, return_tensors="pt", return_attention_mask=False)

    prompt_len = len(inputs.input_ids[0])
    inputs = {k: v.to(device) for k, v in inputs.items()}

    outputs = model.generate(
        **inputs,
        do_sample=False,
        # Same budget as the former max_length = prompt_len + output_len.
        max_new_tokens=output_len,
        pad_token_id=tokenizer.pad_token_id,
    )

    # Keep only the tokens that follow the prompt. Searching the decoded text for
    # "Output:" was fragile: it matched an "Output:" occurring inside the input
    # texts, and when truncation removed the cue `find` returned -1, so a slice of
    # the prompt itself was returned as the answer.
    new_tokens = outputs[0][prompt_len:]
    generated_text = tokenizer.decode(
        new_tokens,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return generated_text.strip()

In [None]:
# Generic fallback prompt used for rows where generation fails.
# NOTE(review): the text itself is wrapped in single quotes inside the string;
# they are kept as-is because they are part of the submitted value — confirm
# whether that is intended.
mean_prompt = (
    "'Rewrite the following text in the style of [author/style], "
    "while preserving the original meaning. "
    "Adapt the tone, diction, and stylistic elements to match the specified writing style, "
    "aiming to enhance clarity, elegance, and impact.'"
)

In [None]:
# Accumulator for the predicted prompts, one entry per row of test_df.
rewrite_prompts = list()

In [None]:
# Start from an empty list inside this cell so that re-running it never appends a
# second copy of the predictions (which would make the list longer than test_df
# and break the column assignment that follows).
rewrite_prompts = []

for i, row in tqdm(test_df.iterrows(), total=len(test_df)):
    try:
        prompt = text_generate(row['original_text'],
                               row['rewritten_text'],
                               model,
                               tokenizer,
                               input_token_len,
                               output_token_len,
                               device,
                              )
    except Exception as exc:
        # Best effort: one bad row must not abort the whole run, so fall back to
        # the generic prompt. Catching Exception (not a bare except) keeps
        # KeyboardInterrupt/SystemExit working, and the failure is reported
        # instead of being silently discarded.
        tqdm.write(f"row {i}: generation failed ({exc!r}); using fallback prompt")
        prompt = mean_prompt

    # An empty generation would put a blank cell in the submission; use the
    # generic prompt in that case as well.
    if not prompt:
        prompt = mean_prompt

    rewrite_prompts.append(prompt)

In [None]:
# Attach the predictions to the frame as a new column. rewrite_prompts receives
# one entry per test_df row, appended in iteration order, so it lines up with
# the rows positionally.
test_df['rewrite_prompt'] = rewrite_prompts

In [None]:
# Keep only the two columns written to the submission file, without the index.
sub_df = test_df.loc[:, ['id', 'rewrite_prompt']]
sub_df.to_csv(path_or_buf='submission.csv', index=False)