In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
from tqdm import tqdm

from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import cosine_similarity

from transformers import pipeline, AutoTokenizer,GPT2Tokenizer, GPT2Model, AutoTokenizer, AutoModelForCausalLM

from transformers import BertTokenizer, BertForMaskedLM, AdamW

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# %pip install sentence_transformers


In [None]:
### Comment it out in Kaggle

# train_df = pd.read_csv('/kaggle/input/8000-data/8000_data (1).csv')
train_df = pd.read_csv('data/8000_data.csv')
# train_df = pd.read_csv('data/generate_data.csv')

# Seed NumPy (drives the shuffle below and the later mask-length sampling)
# and torch (drives DataLoader shuffling, dropout and weight updates).
np.random.seed(42)
torch.manual_seed(42)

# Randomly shuffling the dataframe
train_df = train_df.sample(frac=1).reset_index(drop=True)

train_df = train_df.dropna()
train_df = train_df.rename(columns={'prompt': 'rewrite_prompt'})
train_df['id'] = range(len(train_df))

# Take explicit positional copies: later cells assign new columns to both
# splits, which would otherwise write into slices of the same parent frame
# and trigger SettingWithCopyWarning.
test_df = train_df.iloc[1000:2000].copy()
train_df = train_df.iloc[:1000].copy()


In [None]:
# Display the held-out split produced by the previous cell.
test_df

In [None]:
# Upper bound (in characters) kept from each text column before tokenisation.
MAX_TEXT_CHARS = 200

train_df = train_df.dropna()

# `.assign` builds new frames instead of writing columns into what may be a
# slice of another frame, so no SettingWithCopyWarning is raised and the cell
# can be re-run safely.
train_df = train_df.assign(
    original_text=train_df['original_text'].str[:MAX_TEXT_CHARS],
    rewritten_text=train_df['rewritten_text'].str[:MAX_TEXT_CHARS],
)

# The test split keeps every row; missing text becomes an empty string.
test_df = test_df.assign(
    original_text=test_df['original_text'].fillna('').str[:MAX_TEXT_CHARS],
    rewritten_text=test_df['rewritten_text'].fillna('').str[:MAX_TEXT_CHARS],
)

In [None]:
### Uncomment it

# train_df = pd.read_csv('/kaggle/input/llm-prompt-recovery/train.csv')
# test_df = pd.read_csv('/kaggle/input/llm-prompt-recovery/test.csv')

# Count whitespace-separated words in every training prompt, then turn the
# counts into relative frequencies (word count -> share of prompts).
prompt_words = train_df['rewrite_prompt'].map(lambda prompt: prompt.split())
train_df['word_count'] = prompt_words.map(len)
word_count_distribution = train_df['word_count'].value_counts(normalize=True)
train_df

In [None]:
# Use the GPU when PyTorch can see one, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
torch.cuda.empty_cache()
device

In [None]:
import re
import torch
from torch.utils.data import Dataset

class MaskedSequenceDataset(Dataset):
    """Turn (original, rewritten) text pairs into masked-LM examples.

    Each item is a single sequence of the form
    ``[CLS] original [SEP] <instruction> [MASK]... rewritten [SEP]`` where the
    run of mask tokens stands in for the rewrite prompt.

    Args:
        dataframe: Frame with ``original_text``, ``rewritten_text`` and ``id``
            columns, plus ``rewrite_prompt`` when ``distribution`` is None.
            The frame is copied; the caller's object is never modified.
        tokenizer: Tokenizer exposing ``encode``, ``__call__`` and
            ``mask_token_id``.
        mask_token: Literal text inserted for every blank.
        distribution: Optional Series mapping mask length (index) to
            probability (values). When given, the number of blanks is sampled
            from it and the labels are placeholders; when None, the number of
            blanks is the tokenised length of ``rewrite_prompt`` and the
            labels are its token ids.
    """

    def __init__(self, dataframe, tokenizer, mask_token='[MASK]', distribution=None):
        self.tokenizer = tokenizer
        self.mask_token = mask_token
        self.distribution = distribution

        # Work on a private copy so cleaning does not leak into the caller's
        # frame (and does not write into a slice of another frame).
        self.dataframe = dataframe.copy()

        text_columns = ['original_text', 'rewritten_text']
        if distribution is None:
            # The prompt column is only read (and only required) in training mode.
            text_columns.append('rewrite_prompt')
        for column in text_columns:
            self.dataframe[column] = self.dataframe[column].apply(self._strip_tags)

    @staticmethod
    def _strip_tags(text):
        """Remove ``<...>`` markup spans and surrounding whitespace."""
        return re.sub('<.*?>', '', text).strip()

    def combine_and_mask(self, original, rewrite, prompt_length):
        """Build the model input text with ``prompt_length`` blanks."""
        masks = " ".join([self.mask_token for _ in range(prompt_length)])
        masked_sequence = f"[CLS] {original} [SEP] The task is to rewrite this narrative with the given blanks: {masks} {rewrite} [SEP]"

        return masked_sequence

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(id, input_ids, labels)`` for row ``idx``.

        ``labels`` is -100 everywhere except at the first ``mask_length`` mask
        positions, which hold the prompt token ids (training mode) or the
        placeholder value 1 (distribution mode).

        Raises:
            ValueError: If the tokenised sequence contains fewer mask tokens
                than ``mask_length`` (e.g. they were truncated away).
        """
        row = self.dataframe.iloc[idx]

        if self.distribution is None:
            target_ids = self.tokenizer.encode(row['rewrite_prompt'], add_special_tokens=False)
            mask_length = len(target_ids)
            target = torch.tensor(target_ids, dtype=torch.long)
        else:
            mask_length = int(np.random.choice(self.distribution.index, p=self.distribution.values, size=1)[0])
            target = torch.ones(mask_length, dtype=torch.long)

        masked_sequence = self.combine_and_mask(row['original_text'], row['rewritten_text'], mask_length)

        # The template already spells out [CLS]/[SEP]; asking the tokenizer to
        # add them again would produce doubled special tokens.
        inputs = self.tokenizer(
            masked_sequence,
            truncation=True,
            padding='max_length',
            max_length=512,
            return_tensors="pt",
            add_special_tokens=False,
        )
        input_ids = inputs['input_ids'].squeeze(0)

        labels = torch.full_like(input_ids, fill_value=-100)

        # as_tuple=True always gives a 1-D index tensor; a bare .squeeze()
        # collapses a single hit to a 0-d tensor, on which len() fails.
        mask_indices = (input_ids == self.tokenizer.mask_token_id).nonzero(as_tuple=True)[0]

        if len(mask_indices) < mask_length:
            raise ValueError("Not enough mask tokens to fit the rewrite prompt")
        labels[mask_indices[:mask_length]] = target

        return row['id'], input_ids, labels


In [None]:
# One checkpoint name shared by the tokenizer and the masked-LM model.
BERT_CHECKPOINT = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertForMaskedLM.from_pretrained(BERT_CHECKPOINT)

# Uncomment on Kaggle to load BERT
# MODEL_DIR = "/kaggle/input/huggingface-bert/"
# tokenizer = BertTokenizer.from_pretrained(MODEL_DIR + "bert-base-uncased")
# model = BertForMaskedLM.from_pretrained(MODEL_DIR + "bert-base-uncased")



In [None]:

train_dataset = MaskedSequenceDataset(train_df, tokenizer)

# During training the number of blanks equals the number of tokenizer tokens
# in each prompt, so test-time blank counts are sampled from the distribution
# of *token* counts. A whitespace word count would systematically
# under-estimate the length the model was trained to fill.
token_count_distribution = (
    train_dataset.dataframe['rewrite_prompt']
    .apply(lambda prompt: len(tokenizer.encode(prompt, add_special_tokens=False)))
    .value_counts(normalize=True)
)
test_dataset = MaskedSequenceDataset(test_df, tokenizer, distribution=token_count_distribution)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
# Keep test order fixed so predictions line up with test_df rows.
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)



In [None]:
# Fine-tune the masked-LM so it fills the blanks with the rewrite prompt.
model.to(device)
model.train()

# optimizer = AdamW(model.parameters(), lr=5e-5)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

epochs = 10
for epoch in range(epochs):
    epoch_label = f"Epoch {epoch + 1}/{epochs}"
    total_loss = 0
    for batch in tqdm(train_loader, desc=epoch_label, unit="batch"):
        # batch is (ids, input_ids, labels); the ids are not needed for training.
        input_ids, labels = (tensor.to(device) for tensor in batch[1:])

        model.zero_grad()

        # NOTE(review): only input_ids are passed, so no attention mask tells the
        # model which positions are padding - confirm this is intended.
        loss = model(input_ids, labels=labels).loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Average loss at epoch {epoch + 1}: {avg_loss:.4f}")



In [None]:
### Comment the block out in Kaggle

def batch_cosine_similarity(x1, x2):
    """Return the matrix of cosine similarities between rows of x1 and x2.

    Both inputs are 2-D tensors with the same trailing dimension; entry
    ``[i, j]`` of the result is the cosine of ``x1[i]`` and ``x2[j]``.
    """
    to_unit = torch.nn.functional.normalize
    unit_rows_1 = to_unit(x1, p=2, dim=-1)
    unit_rows_2 = to_unit(x2, p=2, dim=-1)
    # Dot products of unit vectors are cosines.
    return unit_rows_1.mm(torch.transpose(unit_rows_2, 0, 1))

from sentence_transformers import SentenceTransformer

# Sentence encoder used to embed predicted and reference prompts for scoring.
SCS_MODEL_NAME = "sentence-t5-base"
scs_model = SentenceTransformer(SCS_MODEL_NAME)

def sharpened_cosine_similarity_batch(scs_model, output_texts, target_texts, sharpen_factor=3):
    """Score each (output, target) text pair with a sharpened cosine similarity.

    Args:
        scs_model: Encoder exposing ``encode(texts, convert_to_tensor=True)``.
        output_texts: Predicted texts.
        target_texts: Reference texts, aligned one-to-one with ``output_texts``.
        sharpen_factor: Exponent applied to every cosine similarity.

    Returns:
        A list with one tensor of shape ``(1,)`` per pair, in input order.

    Raises:
        ValueError: If the two text lists differ in length.
    """
    if len(output_texts) != len(target_texts):
        raise ValueError(
            f"output_texts and target_texts must have the same length, "
            f"got {len(output_texts)} and {len(target_texts)}"
        )

    target_embeddings = scs_model.encode(target_texts, convert_to_tensor=True).to(device)
    output_embeddings = scs_model.encode(output_texts, convert_to_tensor=True).to(device)

    # Only matching pairs are scored, so compute row-wise dot products of the
    # unit vectors instead of building the full N x N similarity matrix.
    target_unit = torch.nn.functional.normalize(target_embeddings, p=2, dim=-1)
    output_unit = torch.nn.functional.normalize(output_embeddings, p=2, dim=-1)
    pair_cosines = (target_unit * output_unit).sum(dim=-1)

    sharpened = pair_cosines ** sharpen_factor
    # Keep the historical return shape: a list of 1-element tensors.
    return [pair_score.unsqueeze(0) for pair_score in sharpened]

In [None]:
# Try the metric on one hand-written (prediction, reference) prompt pair.
output_texts = [
    "Transform this sentence into a more humorous and sarcastic version, using irony and wit.",
]
target_texts = [
    "Rewrite this text to infuse it with humor and a light-hearted tone, while still conveying the scientific facts and the relative safety due to the distant future impact date.",
]
sharpened_cosine_similarity_batch(scs_model, output_texts, target_texts, sharpen_factor=3)

In [None]:
### Comment it out in Kaggle

# predicted_sentences = []
# target_sentences = []
# id_list = []

# for batch in tqdm(test_loader):
#     ids, input_ids, labels = batch
#     input_ids = input_ids.to(device)
#     labels = labels.to(device)
#     with torch.no_grad():
#         outputs = model(input_ids)
#         predictions = outputs.logits.argmax(dim=-1)
    
#         for idx, (pred, label, id_value) in enumerate(zip(predictions, labels, ids)):
#             valid_positions = label != -100
#             valid_predictions = pred[valid_positions]
#             valid_label = label[valid_positions]
#             predicted_sentence = tokenizer.decode(valid_predictions, skip_special_tokens=True)

#             target_sentence = tokenizer.decode(valid_label, skip_special_tokens=True)
        
#             predicted_sentences.append(predicted_sentence)
#             id_list.append(id_value.item())  
#             target_sentences.append(target_sentence)
# score = sharpened_cosine_similarity_batch(scs_model, predicted_sentences, target_sentences, sharpen_factor=3)
# print(torch.mean(torch.stack(score)))
# score         

In [None]:
predicted_sentences = []
id_list = []

# The training cell leaves the model in train mode; switch to eval mode so
# dropout is disabled and predictions are deterministic.
model.eval()

for ids, input_ids, labels in tqdm(test_loader):
    input_ids = input_ids.to(device)
    labels = labels.to(device)
    with torch.no_grad():
        # Most likely token at every position.
        predictions = model(input_ids).logits.argmax(dim=-1)

    for pred, label, id_value in zip(predictions, labels, ids):
        # Labels differ from -100 only at the blank positions; keep just those.
        valid_predictions = pred[label != -100]
        predicted_sentences.append(tokenizer.decode(valid_predictions, skip_special_tokens=True))
        id_list.append(id_value.item())



In [None]:
# tensor(0.5840, device='cuda:0')


In [None]:
# Comment it out in Kaggle
# Reference prompts in test_df row order, scored pairwise against the predictions.
label_sentences = list(test_df['rewrite_prompt'])
score = sharpened_cosine_similarity_batch(
    scs_model,
    predicted_sentences,
    label_sentences,
    sharpen_factor=3,
)
print(torch.stack(score).mean())
score

In [None]:
# One row per test example: its id and the decoded prompt prediction.
submission_columns = {
    'id': id_list,
    'rewrite_prompt': predicted_sentences,
}
submission_df = pd.DataFrame(submission_columns)
# submission_df = pd.DataFrame({'id': id_list,'rewrite_prompt': predicts, 'target':targets})

submission_df

In [None]:
# Write the predictions to disk without the DataFrame index column.
submission_df.to_csv(path_or_buf='submission.csv', index=False)
