In [39]:
import os
# Secrets come from the process environment; only when one is missing is it prompted for
# interactively, so no token is ever written into the notebook source or its saved outputs.
from getpass import getpass

for _secret_name in ("HUGGING_FACE_HUB_TOKEN", "QDRANT_API_KEY"):
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass(f"{_secret_name}: ")

In [2]:
from dataclasses import dataclass, field
from typing import Optional

import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig
from tqdm import tqdm
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, GenerationConfig

from trl import SFTTrainer,  DataCollatorForCompletionOnlyLM
import json
import pandas as pd

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread



  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Turn on tqdm's pandas integration. NOTE(review): no visible cell calls a progress_* method,
# so this may be a leftover — confirm before removing.
tqdm.pandas()

In [4]:
# Static configuration for the fine-tuning run (nothing is parsed from the command line).
class ScriptArgs:
    """
    Settings for fine-tuning a causal LM with SFTTrainer.

    Every option is a plain class attribute: edit the value here and read it
    elsewhere in the notebook as ``ScriptArgs.<name>``.
    """

    # --- model loading ---
    model_name = 'meta-llama/Llama-2-7b-hf'
    trust_remote_code = True
    use_auth_token = True  # use the HF auth token to access the model
    load_in_4bit = True
    load_in_8bit = False  # must not be enabled together with load_in_4bit

    # --- data ---
    dataset_text_field = "text"  # name of the dataset column holding the training text
    seq_length = 512  # passed to SFTTrainer as max_seq_length

    # --- LoRA adapters ---
    use_peft = True
    peft_lora_r = 64  # the r parameter of the LoRA adapters
    peft_lora_alpha = 16  # the alpha parameter of the LoRA adapters

    # --- optimisation ---
    learning_rate = 1.41e-5
    batch_size = 16  # per-device train batch size (64 was tried earlier)
    gradient_accumulation_steps = 4  # (16 was tried earlier)
    num_train_epochs = 3  # the number of training epochs
    max_steps = -1  # the number of training steps

    # --- logging & checkpoints ---
    log_with = "wandb"
    logging_steps = 1  # the number of logging steps
    output_dir = "output"  # the output directory
    save_steps = 100  # number of update steps between two checkpoint saves
    save_total_limit = 10  # limits the total number of checkpoints

    # --- Hugging Face Hub ---
    push_to_hub = False  # push the model to the HF Hub
    hub_model_id = None  # the name of the model on the HF Hub

In [5]:
import accelerate

In [6]:
# Step 1: Load the model
# The two quantization switches are mutually exclusive; fail before touching the Hub.
if ScriptArgs.load_in_8bit and ScriptArgs.load_in_4bit:
    raise ValueError("You can't load the model in 8 bits and 4 bits at the same time")

if ScriptArgs.load_in_8bit or ScriptArgs.load_in_4bit:
    # Quantized load: {"": 0} maps the entire model onto GPU 0.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=ScriptArgs.load_in_4bit,
        load_in_8bit=ScriptArgs.load_in_8bit,
    )
    device_map, torch_dtype = {"": 0}, torch.bfloat16
else:
    # Unquantized load: leave placement and dtype to the library defaults.
    quantization_config = device_map = torch_dtype = None

model = AutoModelForCausalLM.from_pretrained(
    ScriptArgs.model_name,
    torch_dtype=torch_dtype,
    device_map=device_map,
    quantization_config=quantization_config,
    use_auth_token=ScriptArgs.use_auth_token,
    trust_remote_code=ScriptArgs.trust_remote_code,
)

# Tokenizer matching the base checkpoint; it is reused later for dataset formatting and chat.
tokenizer = AutoTokenizer.from_pretrained(ScriptArgs.model_name)

# Gradient checkpointing is used by default but is not compatible with caching.
model.config.use_cache = False

Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.95s/it]


In [7]:
import pandas as pd
import re

def _format_conversation(convo, system_prompt, bos_token, eos_token):
    """Render one conversation as a single training string.

    Args:
        convo: list of ``{"role": ..., "message": ...}`` dicts. Turns are treated as
            alternating by position: even indexes are wrapped in ``[INST] ... [/INST]``,
            odd indexes are terminated with ``eos_token``.
        system_prompt: text placed between ``<<SYS>>`` and ``<</SYS>>``.
        bos_token: token that opens the text and every later ``[INST]`` turn.
        eos_token: token appended to every odd-indexed turn.

    Returns:
        The concatenated text for this conversation.
    """
    text = f"{bos_token}<<SYS>>{system_prompt}<</SYS>>"
    for idx, msg in enumerate(convo):
        # Each message is flattened to one line so "### role:" markers only start a turn.
        line = "### " + msg['role'].lower() + ": " + msg['message'].replace("\n", " ").strip()
        if idx % 2 == 1:
            line += eos_token
        elif idx == 0:
            line = " [INST] " + line + " [/INST] "
        else:
            line = "\n" + bos_token + "[INST] " + line + " [/INST] "
        text += line
    return text


# Earlier data source: "./[Therapy] Web Content - out.csv"
df = pd.read_csv("./f (1).csv")

# NOTE(review): not referenced by any visible cell; kept in case something else relies on it.
convo_re = re.compile(r"^(.{0,50})\:", flags=re.M,)

rows = []

for entry in df.itertuples():
    convo = json.loads(entry.conversation)
    if not convo:
        continue

    # Positions decide who is speaking, so a conversation that opens with the assistant
    # gets a synthetic user greeting prepended to keep the user on even indexes.
    if convo[0]["role"] == "assistant":
        convo.insert(0, {'role': 'user', 'message': "Hello"})

    # pandas loads blank CSV cells as NaN, and NaN is truthy, so `title or ""` alone would
    # put the literal text "nan" into the system prompt.
    title = entry.title
    system_prompt = "" if pd.isna(title) else (title or "")

    rows.append(
        _format_conversation(convo, system_prompt, tokenizer.bos_token, tokenizer.eos_token)
    )

In [8]:
# Fixed seed so the 70/30 train/test split is the same on every Restart & Run All; without it
# the pushed dataset and the evaluation split change on each run.
SPLIT_SEED = 42

dataset = Dataset.from_pandas(
    pd.DataFrame(rows, columns=["text"])
).train_test_split(test_size=0.3, seed=SPLIT_SEED)

In [9]:
# Upload the train/test splits to the Hugging Face Hub dataset repo "devxpy/therapychat".
# NOTE(review): this overwrites remote data on every run and presumably needs a Hub token
# with write access — confirm before re-running.
dataset.push_to_hub("devxpy/therapychat")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 33.87ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:01<00:00,  1.07s/it]
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:00<00:00,  8.96it/s]
Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 75.09ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:00<00:00,  1.65it/s]
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:00<00:00,  7.02it/s]
Downloading metadata: 100%|██████████| 559/559 [00:00<00:00, 3.27MB/s]


In [10]:
# tokenizer.add_special_tokens({'pad_token': '[INST]'})
# tokenizer.add_special_tokens({'pad_token': '<PAD>'})
# tokenizer.add_special_tokens({"pad_token":"<pad>"})
# tokenizer.padding_side = 'left'

In [11]:
# Collator keyed on the "### user:" / "### assistant:" markers used in the training text.
# NOTE: it is built here but not handed to the trainer below.
collator = DataCollatorForCompletionOnlyLM(
    tokenizer=tokenizer,
    response_template="### assistant:",
    instruction_template="### user:",
)

# Step 4: Define the LoraConfig (None disables PEFT entirely)
peft_config = (
    LoraConfig(
        task_type="CAUSAL_LM",
        bias="none",
        lora_alpha=ScriptArgs.peft_lora_alpha,
        r=ScriptArgs.peft_lora_r,
    )
    if ScriptArgs.use_peft
    else None
)

# Step 3: Define the training arguments — every value is taken from ScriptArgs
training_args = TrainingArguments(
    output_dir=ScriptArgs.output_dir,
    # optimisation
    learning_rate=ScriptArgs.learning_rate,
    per_device_train_batch_size=ScriptArgs.batch_size,
    gradient_accumulation_steps=ScriptArgs.gradient_accumulation_steps,
    num_train_epochs=ScriptArgs.num_train_epochs,
    max_steps=ScriptArgs.max_steps,
    # logging and checkpoints
    report_to=ScriptArgs.log_with,
    logging_steps=ScriptArgs.logging_steps,
    save_steps=ScriptArgs.save_steps,
    save_total_limit=ScriptArgs.save_total_limit,
    # hub
    push_to_hub=ScriptArgs.push_to_hub,
    hub_model_id=ScriptArgs.hub_model_id,
)

# Step 5: Define the Trainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    peft_config=peft_config,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    dataset_text_field=ScriptArgs.dataset_text_field,
    max_seq_length=ScriptArgs.seq_length,
)

Using pad_token, but it is not set yet.
Map: 100%|██████████| 516/516 [00:00<00:00, 896.22 examples/s]
Map: 100%|██████████| 222/222 [00:00<00:00, 783.13 examples/s]


In [12]:
# Ask PyTorch to release its cached, currently unused CUDA memory before training starts.
torch.cuda.empty_cache()

In [13]:
# Run fine-tuning with the trainer configured above; metrics are reported to the backend
# named in ScriptArgs.log_with ("wandb").
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mdevxpy[0m ([33mgooeyai[0m). Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,2.9826
2,3.0135
3,3.0258
4,2.9638
5,2.9668
6,2.9661
7,3.0146
8,2.9436
9,2.979
10,2.9142


TrainOutput(global_step=24, training_loss=2.9553772608439126, metrics={'train_runtime': 1794.7336, 'train_samples_per_second': 0.863, 'train_steps_per_second': 0.013, 'total_flos': 1.580598615343104e+16, 'train_loss': 2.9553772608439126, 'epoch': 2.91})

In [14]:
# Step 6: Save the model
# Writes into ScriptArgs.output_dir; the directory listing in the next cell shows the
# adapter and tokenizer files that end up there.
trainer.save_model(ScriptArgs.output_dir)

In [20]:
# Show what was saved. The path is hard-coded; keep it in sync with ScriptArgs.output_dir.
!ls output 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
adapter_config.json  special_tokens_map.json  tokenizer.model
adapter_model.bin    tokenizer_config.json    training_args.bin
README.md	     tokenizer.json


In [16]:
from peft import PeftModel
from transformers import AutoTokenizer


# Load the saved adapter from the directory trainer.save_model() wrote to, rather than a
# second hard-coded "./output/" path that silently goes stale if ScriptArgs.output_dir changes.
# NOTE(review): `model` was already passed to SFTTrainer together with a peft_config — confirm
# it does not already carry adapter layers before wrapping it again here.
ftmodel = PeftModel.from_pretrained(model, ScriptArgs.output_dir, torch_dtype=torch.float16)

In [19]:
# Upload the fine-tuned model to the Hugging Face Hub repo "devxpy/therapybot"; the recorded
# output shows an adapter_model.bin upload.
ftmodel.push_to_hub("devxpy/therapybot")

adapter_model.bin: 100%|██████████| 134M/134M [00:03<00:00, 44.0MB/s] 


CommitInfo(commit_url='https://huggingface.co/devxpy/therapybot/commit/6f5d5729ba425d05b8d8fecca05fbc520d7d1da8', commit_message='Upload model', commit_description='', oid='6f5d5729ba425d05b8d8fecca05fbc520d7d1da8', pr_url=None, pr_revision=None, pr_num=None)

In [97]:
#print(correct_prompt)

<<SYS>>
Transcript of patient with Dr. Brown
<</SYS>>
[INST] ### user: Hi! [/INST] ### assistant: Hello! How are you?
[INST] ### user: I am feeling tired [/INST] ### assistant: Why?
[INST] ### user: I have worked too much [/INST] ### assistant: What do you do?
[INST] ### user: I do programming [/INST] ### assistant: What kind of programming
[INST] ### user: I program in python to create ai and its not working, suggest me some medicines [/INST]


In [47]:
import json

import requests_html
import torch
from qdrant_client import models, QdrantClient
from qdrant_client.models import VectorParams
from sentence_transformers import SentenceTransformer

# Name of the Qdrant collection that get_playlists() searches.
COLLECTION_NAME = "spotify-genres"


def _pick_device():
    """Return the torch device string to embed on: "mps", then "cuda", else "cpu".

    Uses runtime availability checks instead of the deprecated ``torch.has_mps`` flag, and
    falls back to the CPU instead of assuming CUDA exists whenever MPS does not.
    """
    if torch.backends.mps.is_available():
        return "mps"
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"


# Sentence embedding model used to turn a free-text query into a search vector.
encoder = SentenceTransformer(
    "thenlper/gte-large", device=_pick_device()
)  # thenlper/gte-base


QDRANT_URL = "https://303757f4-9127-4df0-a9e8-9669f997f742.us-east4-0.gcp.cloud.qdrant.io:6333"

# Created on first use and then shared, so repeated lookups (one per chat turn) do not
# construct a fresh client every time.
_qdrant_client = None


def get_playlists(q, limit=9):
    """Return playlist URLs whose stored vectors are closest to the text ``q``.

    Args:
        q: free-text query; it is embedded with the module-level ``encoder``.
        limit: maximum number of hits to request from Qdrant.

    Returns:
        A list with the ``"playlist_url"`` payload field of each hit, in the order
        Qdrant returned them.

    Raises:
        KeyError: if ``QDRANT_API_KEY`` is not set in the environment, or a hit's payload
            has no ``"playlist_url"`` entry.
    """
    global _qdrant_client
    if _qdrant_client is None:
        _qdrant_client = QdrantClient(
            QDRANT_URL,
            api_key=os.environ['QDRANT_API_KEY'],
        )

    hits = _qdrant_client.search(
        collection_name=COLLECTION_NAME,
        query_vector=encoder.encode(q).tolist(),
        limit=limit,
    )
    return [hit.payload["playlist_url"] for hit in hits]

# Smoke test: look up playlists for a sample mood (uses the default limit of 9).
get_playlists("sad and angry")

['https://embed.spotify.com/?uri=spotify:playlist:3Fv3vd2eqb9rVvIJcJxEoU',
 'https://embed.spotify.com/?uri=spotify:playlist:2uM2IvAqc6HZmEiRjJBF6Q',
 'https://embed.spotify.com/?uri=spotify:playlist:54j0DPxOen62HcPCc2Pmoe',
 'https://embed.spotify.com/?uri=spotify:playlist:1uN3iCLmAXg6rmS86SJM7g',
 'https://embed.spotify.com/?uri=spotify:playlist:703bHAXaUiMA90jlQyemQb',
 'https://embed.spotify.com/?uri=spotify:playlist:4uLg82FvdD0blplTBTmR59',
 'https://embed.spotify.com/?uri=spotify:playlist:4SJlsd4gR9A8DTzRoMrFPl',
 'https://embed.spotify.com/?uri=spotify:playlist:4pJ4ZEzj93WnpU4CUgVxH1',
 'https://embed.spotify.com/?uri=spotify:playlist:0QrUMIfELgl7gi1zYcCgq4']

In [105]:
# Running transcript fed back to the model on every turn. It starts with the system block
# and one seed exchange, and grows by one user turn plus one reply per loop iteration.
prompt = """
<<SYS>>\nTranscript of patient with Dr. Brown\n<</SYS>>
[INST] ### user: Hi! [/INST] ### assistant: Hello! How are you?
""".strip()

while True:
    msg = input("User: ").strip()
    if not msg:
        # An empty line ends the chat.
        break

    prompt += f"\n[INST] ### user: {msg} [/INST]"

    # Put the prompt ids on the same device as the model instead of leaving them on the CPU.
    inputs = tokenizer.encode(prompt.strip(), return_tensors="pt").to(ftmodel.device)

    streamer = TextIteratorStreamer(tokenizer,
                                    timeout=10.,
                                    skip_prompt=True,
                                    skip_special_tokens=False)
    generate_kwargs = dict(
        input_ids=inputs,
        streamer=streamer,
        max_new_tokens=128,
        do_sample=True,
        temperature=0.8,
        num_beams=1,
    )

    # generate() blocks until done, so it runs in a worker thread while this thread
    # prints the pieces as the streamer yields them.
    t = Thread(target=ftmodel.generate, kwargs=generate_kwargs)
    t.start()

    res = ""
    try:
        for text in streamer:
            # A bracket means the model has started writing the next "[INST]" turn itself;
            # stop showing output there.
            if "[" in text or "]" in text:
                break
            res += text
            print(text, end="", flush=True)
    finally:
        # Leaving the loop early does not stop generate(); wait for it so the next turn
        # never runs a second generation on the same model concurrently.
        t.join()

    print("\nPlaylists: ", ", ".join(get_playlists(msg, 3)))

    # Keep the same "[/INST] ### assistant:" spacing as the seed turn; appending the reply
    # directly produced "[/INST]### assistant:" in the stored transcript.
    reply = res.strip()
    if reply:
        prompt += " " + reply
print("---")
print(prompt)

User:  I am very bored


### assistant: Bored? You can always watch the news. 
Playlists:  https://embed.spotify.com/?uri=spotify:playlist:5PI1ISPDdvY2InHQ4ltty1, https://embed.spotify.com/?uri=spotify:playlist:6zeeSXGL4cJQ8pM7tsv42f, https://embed.spotify.com/?uri=spotify:playlist:1pkHHdvmo3uvqjXXcumTja


User:  I feel really happy and excited now!


### assistant: You can always go to the cinema.

Playlists:  https://embed.spotify.com/?uri=spotify:playlist:17TwTPITOWP0Ep9joASCrm, https://embed.spotify.com/?uri=spotify:playlist:4aId92f7OaneHjOx0emyjz, https://embed.spotify.com/?uri=spotify:playlist:0nrHQMazFvwjMpg5esflnT


User:  The cinema makes me really sad and gloomy


### assistant: So do you want to go home?

Playlists:  https://embed.spotify.com/?uri=spotify:playlist:6mDBTlpxmqfCQXQalGJOxe, https://embed.spotify.com/?uri=spotify:playlist:1l4otzwhS8LnLsWKducSxa, https://embed.spotify.com/?uri=spotify:playlist:7oNPNXqyCxty9nHMltU1pp


User:  Yeah, i just wanna rest in my bed


### assistant: Ok, then i am gonna leave.

Playlists:  https://embed.spotify.com/?uri=spotify:playlist:2pSeH1YQCSICs2knjs7e5o, https://embed.spotify.com/?uri=spotify:playlist:2vLez030U00PBsuE2ug926, https://embed.spotify.com/?uri=spotify:playlist:2owRoUVlHqTdJF3FmF5uwZ


User:  Okie bye! See you later


### assistant: Bye, have a nice evening!

Playlists:  https://embed.spotify.com/?uri=spotify:playlist:5PI1ISPDdvY2InHQ4ltty1, https://embed.spotify.com/?uri=spotify:playlist:3OJiIetHH3ef9mwP8e3gH5, https://embed.spotify.com/?uri=spotify:playlist:1pkHHdvmo3uvqjXXcumTja


User:  


---
<<SYS>>
Transcript of patient with Dr. Brown
<</SYS>>
[INST] ### user: Hi! [/INST] ### assistant: Hello! How are you?
[INST] ### user: I am very bored [/INST]### assistant: Bored? You can always watch the news.
[INST] ### user: I feel really happy and excited now! [/INST]### assistant: You can always go to the cinema.
[INST] ### user: The cinema makes me really sad and gloomy [/INST]### assistant: So do you want to go home?
[INST] ### user: Yeah, i just wanna rest in my bed [/INST]### assistant: Ok, then i am gonna leave.
[INST] ### user: Okie bye! See you later [/INST]### assistant: Bye, have a nice evening!
