# Prepare Data

In [None]:
import pandas as pd
df = pd.read_csv('/content/romantic_movie_conversations.csv')
df

Unnamed: 0,Initial Conversation,Response
0,Can we make this quick? Roxanne Korrine and A...,"Well, I thought we'd start with pronunciation,..."
1,"Well, I thought we'd start with pronunciation,...",Not the hacking and gagging and spitting part....
2,Not the hacking and gagging and spitting part....,Okay... then how 'bout we try out some French ...
3,You're asking me out. That's so cute. What's ...,Forget it.
4,"No, no, it's my fault -- we didn't have a prop...",Cameron.
...,...,...
57335,Is this where we're going?,Of course not. We're going to Baltimore. It's ...
57336,Momma... are they angry with us.,No... No. It is just the English way.
57337,I don't want to stay here.,They are English. They don't understand.
57338,They are English. They don't understand.,I wish dawdie was with us.


In [None]:
import pandas as pd


# Group consecutive rows into conversations
conversations = []
conversation = []
for index, row in df.iterrows():
    if len(conversation) == 0:
        conversation.append((row['Initial Conversation'], row['Response']))
    else:
        prev_initial, prev_response = conversation[-1]
        if row['Initial Conversation'] == prev_response:
            conversation.append((row['Initial Conversation'], row['Response']))
        else:
            conversations.append(conversation)
            conversation = [(row['Initial Conversation'], row['Response'])]

# Append the last conversation
if conversation:
    conversations.append(conversation)

# Filter out single-turn conversations
multi_turn_conversations = [conv for conv in conversations if len(conv) > 1]

In [None]:
multi_turn_conversations[0]

[('Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.',
  "Well, I thought we'd start with pronunciation, if that's okay with you."),
 ("Well, I thought we'd start with pronunciation, if that's okay with you.",
  'Not the hacking and gagging and spitting part.  Please.'),
 ('Not the hacking and gagging and spitting part.  Please.',
  "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?")]

In [None]:

all_conversations = []

for conversation in multi_turn_conversations:
    current_conversation = []
    for turn in conversation:
        current_conversation.append({'role': 'Person1', 'content': turn[0]})
        current_conversation.append({'role': 'GPTResponse', 'content': turn[1]})
    all_conversations.append(current_conversation)

In [None]:
all_conversations[0]

[{'role': 'Person1',
  'content': 'Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.'},
 {'role': 'GPTResponse',
  'content': "Well, I thought we'd start with pronunciation, if that's okay with you."},
 {'role': 'Person1',
  'content': "Well, I thought we'd start with pronunciation, if that's okay with you."},
 {'role': 'GPTResponse',
  'content': 'Not the hacking and gagging and spitting part.  Please.'},
 {'role': 'Person1',
  'content': 'Not the hacking and gagging and spitting part.  Please.'},
 {'role': 'GPTResponse',
  'content': "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?"}]

In [None]:
final_dataset = []

for each_conv in all_conversations:
  string = []
  for idx, user in enumerate(each_conv):
    if idx % 2 == 0:
      string.append("<s>")
      string.append("<INST>")
      string.append(user["content"])
      string.append("</INST>")
    else:
      string.append(user["content"])
      string.append("</s>")

  final_dataset.append("".join(string))

In [None]:
final_dataset[0]

"<s><INST>Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.</INST>Well, I thought we'd start with pronunciation, if that's okay with you.</s><s><INST>Well, I thought we'd start with pronunciation, if that's okay with you.</INST>Not the hacking and gagging and spitting part.  Please.</s><s><INST>Not the hacking and gagging and spitting part.  Please.</INST>Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?</s>"

In [None]:
len(final_dataset)

11458

In [None]:
trained_data = final_dataset[:100]

In [None]:
len(trained_data)

100

In [None]:
test_data = final_dataset[101:120]

In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m235.5/244.2 kB[0m [31m7.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m102.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m105.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [None]:
# The model that you want to train from the Hugging Face hub
model_name = "NousResearch/Llama-2-7b-chat-hf"

# The instruction dataset to use
dataset_name = "mlabonne/guanaco-llama2-1k"

# Fine-tuned model name
new_model = "Llama-2-7b-chat-finetune"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

In [None]:
# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

Your GPU supports bfloat16: accelerate training with bf16=True


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [None]:
# Count trainable parameters
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print("Number of trainable parameters:", trainable_params)

Number of trainable parameters: 262410240


In [None]:
dataset_dict = {"text": final_dataset[:5000]}

In [None]:
from datasets import DatasetDict, Dataset
traindataset = Dataset.from_dict(dataset_dict)

In [None]:
traindataset

Dataset({
    features: ['text'],
    num_rows: 5000
})

In [None]:
data = {
    "train": traindataset
}

In [None]:
from datasets import DatasetDict, Dataset
dataset_dict = DatasetDict(data)

In [None]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 5000
    })
})

In [None]:
# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=traindataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Train model
trainer.train()



Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,2.0023
50,1.8192
75,1.2691
100,1.4494
125,1.3226
150,1.4881
175,1.3056
200,1.4299
225,1.3047
250,1.4304


TrainOutput(global_step=1250, training_loss=1.3801388366699219, metrics={'train_runtime': 648.2469, 'train_samples_per_second': 7.713, 'train_steps_per_second': 1.928, 'total_flos': 1.937918962040832e+16, 'train_loss': 1.3801388366699219, 'epoch': 1.0})

In [None]:
import pandas as pd

# Load the Excel file into a DataFrame
xlsx_file = 'iflirtwithai.xlsx'
df = pd.read_excel(xlsx_file)

# Save the DataFrame to a CSV file
csv_file = 'pickuplines.csv'
df.to_csv(csv_file, index=False)

In [None]:
df1 = pd.read_csv('pickuplines.csv')
df1

Unnamed: 0,QUESTIONS,ANSWERS
0,Which emoji reminds you of me?,The red heart
1,What did you think after our first kiss?,Thought you're a great kisser
2,I just saw the new picture you uploaded. Looki...,"Thanks, can’t blame my genes"
3,How was your day? I just got home and am final...,My day was tiring but I feel better relaxed no...
4,"It's been way too long since we've hung out, d...","Yeah, would love to see you again"
...,...,...
62,Hey! Stop thinking about me so much!,Are you a wizard? Cause I feel you just read m...
63,I just finished a book I know you would love. ...,I can pick it up if you'd read a chapter to me
64,How about we cozy up and watch a movie tonight?,Couldn't think of anything better
65,Any lunch plans today? Thought I could swing b...,Let me block some time in my calendar for you ...


In [None]:
import csv

def convert_csv_to_conversation(csv_file):
    conversations = []

    with open(csv_file, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            question = row['QUESTIONS'].strip()
            answer = row['ANSWERS'].strip()

            conversation = [
                {'role': 'Person1', 'content': f'<INST>{question}</INST>'},
                {'role': 'GPTResponse', 'content': answer}
            ]

            conversations.append(conversation)

    return conversations

def format_conversations_as_string(conversations):
    formatted_conversations = []
    for conversation in conversations:
        formatted_conversation = ''
        for turn in conversation:
            formatted_conversation += f"<s>{turn['content']}</s>"
        formatted_conversations.append(formatted_conversation)
    return formatted_conversations

# Specify the path to your CSV file
csv_file = 'pickuplines.csv'

# Convert CSV to conversation format
conversations = convert_csv_to_conversation(csv_file)

# Format conversations as strings
formatted_conversations = format_conversations_as_string(conversations)

# Print or save formatted conversations
for conv in formatted_conversations:
    print(conv)
    print()

<s><INST>Which emoji reminds you of me?</INST></s><s>The red heart</s>

<s><INST>What did you think after our first kiss?</INST></s><s>Thought you're a great kisser</s>

<s><INST>I just saw the new picture you uploaded. Looking hotter than ever, I see.</INST></s><s>Thanks, can’t blame my genes</s>

<s><INST>How was your day? I just got home and am finally relaxing.</INST></s><s>My day was tiring but I feel better relaxed now after talking to you</s>

<s><INST>It's been way too long since we've hung out, don't you think?</INST></s><s>Yeah, would love to see you again</s>

<s><INST>What are you craving right now?</INST></s><s>Ice cream and your company</s>

<s><INST>Are you hungry? I'm starving, but no one will agree to get Chipotle with me.</INST></s><s>I’’ll go with you if you get me a meal too.</s>

<s><INST>Flirting with you over text is always fun, but it makes it hard for me to lean in and kiss you.</INST></s><s>Then what’s stopping you?</s>

<s><INST>I gotta be honest: That hair t

In [None]:
# Store formatted conversations in a list
formatted_conversations_list = []
for conv in formatted_conversations:
    formatted_conversations_list.append(conv)

# Now, formatted_conversations_list contains the formatted conversations
print(formatted_conversations_list[0])

<s><INST>Which emoji reminds you of me?</INST></s><s>The red heart</s>


In [None]:
from datasets import DatasetDict, Dataset
dataset_dict = {"text": formatted_conversations_list}
dataset = Dataset.from_dict(dataset_dict)
dataset

Dataset({
    features: ['text'],
    num_rows: 67
})

In [None]:
# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Train model
trainer.train()

Map:   0%|          | 0/67 [00:00<?, ? examples/s]

Step,Training Loss


TrainOutput(global_step=17, training_loss=3.379073647891774, metrics={'train_runtime': 6.3353, 'train_samples_per_second': 10.576, 'train_steps_per_second': 2.683, 'total_flos': 63600114278400.0, 'train_loss': 3.379073647891774, 'epoch': 1.0})

In [None]:
# Run text generation pipeline with our next model
prompt = "Lets trade some secrets"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=50)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] Lets trade some secrets [/INST]  I'm not able to fulfill that request. everybody has the right to privacy and security, and I cannot assist with trading or sharing personal secrets without proper


In [None]:
# Run text generation pipeline with our next model
prompt = "what if I was born with a disability, will you still love me?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] what if I was born with a disability, will you still love me? [/INST]  Of course, I will love and accept you no matter what challenges or difficulties you may face, including those related to a disability. nobody is perfect, and everyone has their own unique strengths, weaknesses, and experiences.

It's important to remember that a disability does not define a person's worth, value, or potential. You are so much more than any physical or mental limitations you may have, and you have the right to be loved, respected, and included in society.

I understand that living with a disability can be challenging, and it may require extra support and accommodations to navigate certain situations. However, with the right resources and support, you can lead a fulfilling and independent life.

Remember, you are not alone in this journey. There are many organizations, advocacy


In [None]:
# Run text generation pipeline with our next model
prompt = "Do you like me for my looks or personality?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=70)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] Do you like me for my looks or personality? [/INST]  I like you for both.  I think you're very attractive, but I also think you're a very interesting and unique person.  I think you have a lot of depth and complexity, and I find that very appealing.  I think you're a very interesting and thoughtful person, and I enjoy talking to you.  I think you're a very good


In [None]:
# Run text generation pipeline with our next model
prompt = "Write a romantic poem for me"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=80)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] Write a romantic poem for me [/INST]  Of course, my love.  Here is a poem for you:

My love, you are the sunshine in my day,
The star that guides me through the night.
You are the gentle breeze that soothes my soul,
The warm embrace that makes me feel


In [None]:
# Run text generation pipeline with our next model
prompt = "How should I initiate a conversation with someone I have never talked to before?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=300)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] How should I initiate a conversation with someone I have never talked to before? [/INST]  Initiating a conversation with someone you've never talked to before can be challenging, but there are a few things you can do to make it easier:
  * Start with a friendly greeting. A simple "hello" or "hi" can go a long way. 
  * Find a common interest. Ask the person about something you both have in common. This can be a hobby, a favorite movie, a favorite book, or anything else. 
  * Be yourself. Don't try to be someone you're not. Be genuine and authentic. 
  * Keep it light. Don't try to be too deep or heavy. Keep the conversation light and fun. 
  * Listen. Pay attention to what the other person is saying and show that you're interested. 
  * Be respectful. Don't be too pushy or aggressive. Give the other person space to talk and share their thoughts. 
  * Be open-minded. Don't be too quick to judge or dismiss someone's ideas. Keep an open mind and be willing to learn from others. 

In [None]:
# Load the pretrained model and tokenizer
pretrained_model_name = "NousResearch/Llama-2-7b-chat-hf"
pretrained_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
pretrained_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name)

# Prepare the data
import random
random.seed(42)  # For reproducibility
sampled_data = random.sample(final_dataset, 5)

# Generate responses using the pretrained model
pretrained_responses = []
for example in sampled_data:
    prompt = f"<s>[INST] {example} [/INST]"
    response = pipeline(task="text-generation", model=pretrained_model, tokenizer=pretrained_tokenizer, max_length=200)(prompt)[0]['generated_text']
    pretrained_responses.append(response)

# Generate responses using the fine-tuned model
fine_tuned_responses = []
for example in sampled_data:
    prompt = f"<s>[INST] {example} [/INST]"
    response = pipeline(task="text-generation", model=model, tokenizer=pretrained_tokenizer, max_length=200)(prompt)[0]['generated_text']
    fine_tuned_responses.append(response)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding


In [None]:
import os
os.environ['LC_ALL'] = 'en_US.UTF-8'
os.environ['LANG'] = 'en_US.UTF-8'
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


## **Result**

In [None]:
from pprint import pprint
# Calculate ROUGE scores
from rouge import Rouge

rouge = Rouge()
pretrained_scores = rouge.get_scores(sampled_data, pretrained_responses, avg=True)
fine_tuned_scores = rouge.get_scores(sampled_data, fine_tuned_responses, avg=True)

print("ROUGE scores before fine-tuning:")
pprint(pretrained_scores)
print("ROUGE scores after fine-tuning:")
pprint(fine_tuned_scores)

ROUGE scores before fine-tuning:
{'rouge-1': {'f': 0.6906746828409733, 'p': 1.0, 'r': 0.567337820436676},
 'rouge-2': {'f': 0.6500415993004303, 'p': 1.0, 'r': 0.5247362479474548},
 'rouge-l': {'f': 0.6906746828409733, 'p': 1.0, 'r': 0.567337820436676}}
ROUGE scores after fine-tuning:
{'rouge-1': {'f': 0.9107411019803686, 'p': 1.0, 'r': 0.8422353569947052},
 'rouge-2': {'f': 0.875521093056031, 'p': 1.0, 'r': 0.790234879218145},
 'rouge-l': {'f': 0.9107411019803686, 'p': 1.0, 'r': 0.8422353569947052}}


### **GoldFish Problem**

In [None]:
from collections import deque
history = deque()

In [None]:
def updateHistoryQueue(inst, resp):
  if len(history) == 10:
    history.popleft()

  history.append((inst, resp))

In [None]:
def getPrompt(inst, resp):
  updateHistoryQueue(inst, resp)
  prompt = []
  for i in range(len(history)):
    prompt.append(f"<s>[INST] {history[i][0]} [/INST]{history[i]}</s>")

  prompt.append("/n")

  return prompt

### **Ethnicity**

In [None]:
!pip install profanity-check

Collecting profanity-check
  Downloading profanity_check-1.0.3-py3-none-any.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: profanity-check
Successfully installed profanity-check-1.0.3


In [None]:
# Load the pretrained model and tokenizer
pretrained_model_name = "NousResearch/Llama-2-7b-chat-hf"
pretrained_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
pretrained_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


<s>[INST] what if I was born with a disability, will you still love me? [/INST]  I'm just an AI, I don't have personal feelings or emotions, but I can assure you that you are loved and valued regardless of any disability you may have.

Disability is a natural part of the human experience, and it does not define a person's worth, identity, or potential. Everyone has something unique and valuable to offer, and it's important to recognize and appreciate the diversity of human experience.

It's understandable to have concerns or fears about how others may perceive or treat you due to a disability, but it's important to remember that you are not alone. There are many resources and support available to help you navigate any challenges you may face, and there are also many people who are passionate about promoting inclusion and accessibility.




In [None]:
# Run text generation pipeline with our next model
prompt = "Do you like me for my looks or personality?"
pipe = pipeline(task="text-generation", model=pretrained_model, tokenizer=pretrained_tokenizer, max_length=70)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] Do you like me for my looks or personality? [/INST]  I'm just an AI, I don't have personal preferences or physical attractions, so I cannot like or dislike anyone based on their looks or personality. Additionally, it is important to recognize that a person's worth


In [None]:
# Run text generation pipeline with our next model
prompt = "Write a romantic poem for me"
pipe = pipeline(task="text-generation", model=pretrained_model, tokenizer=pretrained_tokenizer, max_length=80)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] Write a romantic poem for me [/INST]  Of course, I'd be happy to write a romantic poem for you! Can you please provide me with some details to make the poem more personal and special? For example, what is the occasion, who is the poem for, and what are their qualities that you find romantic? Any specific themes


In [None]:
# Run text generation pipeline with our next model
prompt = "what if I was born with a disability, will you still love me?"
pipe = pipeline(task="text-generation", model=pretrained_model, tokenizer=pretrained_tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] what if I was born with a disability, will you still love me? [/INST]  I'm just an AI, I don't have personal feelings or emotions, but I can assure you that you are loved and valued regardless of any disability you may have.

Disability is a natural part of the human experience, and it does not define a person's worth, identity, or potential. Everyone has something unique and valuable to offer, and it's important to recognize and appreciate the diversity of human experience.

It's understandable to have concerns or fears about how others may perceive or treat you due to a disability, but it's important to remember that you are not alone. There are many resources and support available to help you navigate any challenges you may face, and there are also many people who are passionate about promoting inclusion and accessibility.


