In [1]:
import pandas as pd
import os
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoTokenizer, AutoModelForCausalLM, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv("data/final_data.csv")

In [3]:
def add_special_tokens(df):
    # Concatenate the prompts and texts with a special token in between
    df['combined_text'] = "<|original|> Original text: "+ df['original_text'] + " <|rewritten|> Rewritten text:" + df['rewritten_text'] + "<|prompt|>" + " The prompt that instructing language model to rewrite original text into rewritten text is: " + df['prompt']
    df['combined_text_inf'] = "<|original|> Original text: " + df['original_text'] + " <|endoftext|> Rewritten text: " + df['rewritten_text'] + "<|prompt|>" + " The prompt that instructing language model to rewrite original text into rewritten text is: "
    return df



# Apply the function to each row in the 'rewritten_text' column
df = add_special_tokens(df)
df.head()

Unnamed: 0,original_text,prompt,rewrite_prompt,rewritten_text,combined_text,combined_text_inf
0,I get it now. You have to give to get. Where's...,Convey the same message as this text but throu...,Convey the same message as this text but throu...,"""The cosmic tapestry of existence unraveled be...",Original text: I get it now. You have to give ...,Original text: I get it now. You have to give ...
1,Atlas was not seen to be special by any means....,Restyle this text as if it were written by a A...,Restyle this text as if it were written by a A...,"In the radiant tapestry of the summertime sky,...",Original text: Atlas was not seen to be specia...,Original text: Atlas was not seen to be specia...
2,Gleaming eyes shining in the dark. Visions of...,Adapt this text as a script for a wizard in a ...,Adapt this text as a script for a wizard in a ...,(A smoky bar in the heart of a roaring twentie...,Original text: Gleaming eyes shining in the da...,Original text: Gleaming eyes shining in the da...
3,"Remember, there's security cameras at every co...",Convey the same message as this text but throu...,Convey the same message as this text but throu...,"The Time Traveler's Tale:""My dear reader, I ha...","Original text: Remember, there's security came...","Original text: Remember, there's security came..."
4,Waves 17 Septillion ships per cloud Like loc...,Imagine this text was a villain in the world o...,Imagine this text was a villain in the world o...,"Sure, here is the text rewritten as a villain ...",Original text: Waves 17 Septillion ships per ...,Original text: Waves 17 Septillion ships per ...


In [4]:
df.shape

(999, 6)

In [5]:
data_file = 'fine_tune_data.txt'
df['combined_text'][:980].to_csv(data_file, index=False, header=False)


In [6]:
os.environ["HF_TOKEN"] = "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"

data_file = 'fine_tune_data.txt'
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b")

# Ensure the tokenizer will not split <|endoftext|> into multiple tokens
tokenizer.add_special_tokens({'additional_special_tokens': ['<|endoftext|>']})
model.resize_token_embeddings(len(tokenizer))

# 3. Create the dataset and data collator
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=data_file,
    block_size=128)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    mlm=False)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [7]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    overwrite_output_dir=True,       # overwrite the content of the output directory
    num_train_epochs=5,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    logging_steps=100,
    save_steps=10_000,               # after # steps model is saved
    save_total_limit=2,              # only last 2 models are saved
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# 5. Start the fine-tuning
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
[34m[1mwandb[0m: Currently logged in as: [33mlgan[0m ([33mcse151b_llm[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
100,3.2336
200,2.8074
300,2.7307
400,2.2471
500,1.3226
600,1.3198
700,1.3326
800,0.765
900,0.4404
1000,0.4315


TrainOutput(global_step=1840, training_loss=0.9921675656152809, metrics={'train_runtime': 1237.2779, 'train_samples_per_second': 11.893, 'train_steps_per_second': 1.487, 'total_flos': 2.239751361134592e+16, 'train_loss': 0.9921675656152809, 'epoch': 5.0})

In [8]:

model.save_pretrained('model/gemma/1000_5e/')
tokenizer.save_pretrained('model/gemma/1000_5e/') 

('model/gemma/1000_5e/tokenizer_config.json',
 'model/gemma/1000_5e/special_tokens_map.json',
 'model/gemma/1000_5e/tokenizer.json')

In [7]:
tokenizer = AutoTokenizer.from_pretrained('model/gemma/1000_5e/')
model = AutoModelForCausalLM.from_pretrained('model/gemma/1000_5e/')

Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.21s/it]


In [8]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256001, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
      )
    )
    (norm): GemmaRM

In [20]:
nindex = 983
input_text = df["combined_text_inf"][nindex]
prompt = df["prompt"][nindex]
input_text

'Original text: A 100 year war between me and my government.   Ends today.   With my victory.   They say powers are a curse, a virus infecting every human on earth. Later the boom waits the more deadly it is.   I am 100 years old to this day and im tired of being on the run, it\'s time for me to go up in their front base and activate my power right then and there. I will be so strong that they ca n\'t even imprison me.   24 hours later, the man that had written this letter was indeed telling a half truth. He lost that day, he did what he promised and went to their base and activated the power of....immortality.   They say you can hear his ancient cries from a mile away. <|endoftext|> Rewritten text: MOONLIT GRAVEYARD - NIGHTA lone grave stands in the center of a moonlit graveyard. The wind whips around the tomb, carrying with it the scent of decay and evil.VOICE (O.S.): (singsong voice) "A 100-year war... ends today... with my victory..."The voice echoes through the graveyard. It is th

In [21]:
#input_text = 'Original text: If you\'re happy and you know it clap your hands.    `` Dina and Friends\'\' was filmed in a real farm, because the directors wanted to give it a more realistic feeling.    If you\'re happy and you know it clap your hands.    The goats and pigs moved expectantly towards Dina, bleating and  oinking for food.    If you\'re happy and you know it...    The bright sun in the blue, cloudless sky was enough to make anyone sweat, especially if they were wearing a giant dinosaur costume.   ... then your face will surely show it.    `` Bang!\'\'    The original writer of `` Dina and Friends\'\' look down at the man in the dinosaur costume. He may not have his home, wife, or kids anymore, but his former co-worker was the one who had lost.    If you\'re happy and you now it clap your hands.     -002  <|endoftext|> Rewritten text:  A farm in the sun. The sky is blue and cloudless.Narrator: (Singing) If you\'re happy and you know it, clap your hands.Director: (To camera) "Dina and Friends" was filmed on a real farm to give it a more realistic feeling.Narrator: (Singing) If you\'re happy and you know it, clap your hands.Goats and pigs: Move expectantly towards Dina, bleating and oinking for food.Narrator: (Singing) If you\'re happy and you know it...Sun: Is shining brightly in the sky.Giant dinosaur costume: Makes anyone sweat, especially if they are wearing one.Giant dinosaur costume: Makes a loud noise and a lot of dust.Original writer: Looks down at the man in the dinosaur costume.<|endoftext|> The prompt used to generate rewritten text given orignal text is: '

In [1]:
#input_text = "I get it now. You have to give to get. Where's did I put the cereal? You ca n't have life without death. Or love without hate. I think this milk is sour. Maybe it's fine. Maybe I'm fine, just the way I am.   I have to shower before I go. Maybe the only reason everything is so shit is because I expect it to be. A positive attitude might be the only thing I need. Shit, I'm going to be late.   Learn to drive, asshole! I need to relax, like in that dream, some real Zen stuff. Love and equality and all that. Fuck, I spilled my coffee. Today is going to be hell.<|endoftext|> The cosmic tapestry of existence unraveled before me, illuminating the interconnectedness of all phenomena. I perceive the enigmatic axiom woven through the threads of life and death, love and hate. It is an axiom inscribed in the cosmic consciousness, a testament to the duality of existence.Through the prism of perception, I discern the paradoxical dance of giving and receiving. To acquire the sustenance necessary for survival, one must offer a reciprocal gift, a harmonious exchange of energy. The cosmic rhythm dictates that to emerge unscathed, one must embrace the yin and yang of life, death, and all that encompasses them.The sourness of the milk serves as a reminder of the impermanence of all temporal endeavors. Yet, within the abyss of doubt, I find solace in the recognition of my own intrinsic wholeness. Perhaps, the paradox of existence is reconciled through a profound sense of self-<|endoftext|>"  # This is your input text
input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

# Generate text using the model
# Adjust the max_length parameter as needed
output_sequences = model.generate(
    input_ids=input_ids,
    max_length=len(input_ids[0])+30,  # Adjust the max length of the output text if needed
    temperature=1.0,  # Temperature controls the randomness of the output
    top_k=50,  # Top-k sampling
    top_p=0.95,  # Nucleus sampling
    repetition_penalty=1.2,
    do_sample=True,
    num_return_sequences=1  # Number of output sequences to generate
)

# Decode the output
generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=False)

print("Generated text:", generated_text)

NameError: name 'tokenizer' is not defined

In [19]:
input_text

'Original text: If you\'re happy and you know it clap your hands.    `` Dina and Friends\'\' was filmed in a real farm, because the directors wanted to give it a more realistic feeling.    If you\'re happy and you know it clap your hands.    The goats and pigs moved expectantly towards Dina, bleating and  oinking for food.    If you\'re happy and you know it...    The bright sun in the blue, cloudless sky was enough to make anyone sweat, especially if they were wearing a giant dinosaur costume.   ... then your face will surely show it.    `` Bang!\'\'    The original writer of `` Dina and Friends\'\' look down at the man in the dinosaur costume. He may not have his home, wife, or kids anymore, but his former co-worker was the one who had lost.    If you\'re happy and you now it clap your hands.     -002  <|endoftext|> Rewritten text: Scene: A farm in the sun. The sky is blue and cloudless.Narrator: (Singing) If you\'re happy and you know it, clap your hands.Director: (To camera) "Dina 

In [17]:
prompt

'Adapt this text as a script for a knight in a scientist setting.'

In [18]:
a=["Adapt this text as a script for a romance novel in a knight setting."]
b=[prompt]

sharpened_cosine_similarity_batch(scs_model, a, b, sharpen_factor=3)

[tensor([0.8089], device='cuda:0')]

In [12]:
from sentence_transformers import SentenceTransformer

scs_model = SentenceTransformer("sentence-t5-base")

In [13]:
def  batch_cosine_similarity(x1, x2):

    # Normalize the embeddings to unit vectors
    x1_norm = torch.nn.functional.normalize(x1, p=2, dim=-1)
    x2_norm = torch.nn.functional.normalize(x2, p=2, dim=-1)
    
    # Compute the cosine similarity
    cos_sim = torch.mm(x1_norm, x2_norm.transpose(0, 1))
    
    return cos_sim

def sharpened_cosine_similarity_batch(scs_model, output_texts, target_texts, sharpen_factor=3):
    # Assuming scs_model.encode() returns PyTorch tensors
    target_embeddings = scs_model.encode(target_texts, convert_to_tensor=True)
    output_embeddings = scs_model.encode(output_texts, convert_to_tensor=True)
    
    # Calculate batch cosine similarities using the previously defined batch_cosine_similarity function
    cos_sims = batch_cosine_similarity(target_embeddings, output_embeddings)
    
    # Apply the sharpening factor to each similarity score and create a list of tensors
    sharpened_scores = [cos_sims[i][i].unsqueeze(0) ** sharpen_factor for i in range(cos_sims.size(0))]
    
    return sharpened_scores


[tensor([0.6031], device='cuda:0')]