This notebook is for demonstration purpose only. It does not claim to be especially elegant or efficient.

# Reading and processing the data

In [1]:
# You need to re-boot the instance after installing
!pip install whatstk
!pip install einops



In [2]:
import pandas as pd
from whatstk import WhatsAppChat
from datetime import timedelta

In [4]:
filepath = "/content/drive/MyDrive/Chat de WhatsApp con +57 301 2386822.txt" # you need to copy your export
chat = WhatsAppChat.from_source(filepath=filepath)
df = chat.df

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Filter GIFs, stickers and images; works only for German
df = df[df["message"]!="\u200eGIF weggelassen"]
df = df[df["message"]!="\u200eSticker weggelassen\n\u200e"]
df = df[df["message"]!="\u200eBild weggelassen"]

In [6]:
# Calculate time passed since previous message
df["date_previous"] = df["date"].shift(periods=1)
df["time_delta"] = (df["date"]-df["date_previous"]).dt.total_seconds()

# Concat message and author
df["chat_message"] = df["username"] + ": " + df["message"]

# Remove first line, its just a WhatApp test line
df = df[1:]

In [7]:
# Convert messages into conversations (a conversation has multiple messages); ugly programming, but works for small data
# Step 1: Concat each message with the previous conversation
query = []
answer = []
conversation = ""
session_ix = 0
sessions_ixs = []

for ix, row in df.iterrows():
  if row["time_delta"]<3600: # This defines on how close messages should be to be in the same conversation
    session_ix = session_ix + 1
    sessions_ixs.append(session_ix)
    if conversation == "":
      conversation = row["chat_message"]
      query.append(conversation)
      answer.append("")
    else:
      conversation = conversation + "| " + row["chat_message"]
      query.append(conversation)
      answer.append(row["chat_message"])
  else:
    session_ix=0
    conversation = ""


df_model = pd.DataFrame({"query": query[:-1], "answer": answer[1:], "session_ix": sessions_ixs[:-1]})

# Step 2: Filter only for the last message of the conversation (therefore for the full conversation.
df_model["model_helper_idx"] = df_model["session_ix"] - df_model["session_ix"].shift(-1)
df_model = df_model[df_model["model_helper_idx"]>-1]

# This way is a bit clumsy, but I did some test with the intermediate conversation steps.

# Fine-tune model

In [8]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for accelerate (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━

In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from datasets import Dataset
import transformers

In [10]:
# Import base model
model_id = "malteos/bloom-6b4-clp-german"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})


`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



tokenizer_config.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/850k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/508k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.21M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/427 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/28.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/32 [00:00<?, ?it/s]

model_00001-of-00032.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

model_00002-of-00032.safetensors:   0%|          | 0.00/403M [00:00<?, ?B/s]

model_00003-of-00032.safetensors:   0%|          | 0.00/403M [00:00<?, ?B/s]

model_00004-of-00032.safetensors:   0%|          | 0.00/403M [00:00<?, ?B/s]

model_00005-of-00032.safetensors:   0%|          | 0.00/403M [00:00<?, ?B/s]

model_00006-of-00032.safetensors:   0%|          | 0.00/403M [00:00<?, ?B/s]

model_00007-of-00032.safetensors:   0%|          | 0.00/403M [00:00<?, ?B/s]

model_00008-of-00032.safetensors:   0%|          | 0.00/403M [00:00<?, ?B/s]

model_00009-of-00032.safetensors:   0%|          | 0.00/403M [00:00<?, ?B/s]

model_00010-of-00032.safetensors:   0%|          | 0.00/403M [00:00<?, ?B/s]

model_00011-of-00032.safetensors:   0%|          | 0.00/403M [00:00<?, ?B/s]

model_00012-of-00032.safetensors:   0%|          | 0.00/403M [00:00<?, ?B/s]

model_00013-of-00032.safetensors:   0%|          | 0.00/403M [00:00<?, ?B/s]

model_00014-of-00032.safetensors:   0%|          | 0.00/403M [00:00<?, ?B/s]

model_00015-of-00032.safetensors:   0%|          | 0.00/403M [00:00<?, ?B/s]

model_00016-of-00032.safetensors:   0%|          | 0.00/403M [00:00<?, ?B/s]

model_00017-of-00032.safetensors:   0%|          | 0.00/403M [00:00<?, ?B/s]

model_00018-of-00032.safetensors:   0%|          | 0.00/403M [00:00<?, ?B/s]

model_00019-of-00032.safetensors:   0%|          | 0.00/403M [00:00<?, ?B/s]

model_00020-of-00032.safetensors:   0%|          | 0.00/403M [00:00<?, ?B/s]

model_00021-of-00032.safetensors:   0%|          | 0.00/403M [00:00<?, ?B/s]

model_00022-of-00032.safetensors:   0%|          | 0.00/403M [00:00<?, ?B/s]

model_00023-of-00032.safetensors:   0%|          | 0.00/403M [00:00<?, ?B/s]

model_00024-of-00032.safetensors:   0%|          | 0.00/403M [00:00<?, ?B/s]

model_00025-of-00032.safetensors:   0%|          | 0.00/403M [00:00<?, ?B/s]

model_00026-of-00032.safetensors:   0%|          | 0.00/403M [00:00<?, ?B/s]

model_00027-of-00032.safetensors:   0%|          | 0.00/403M [00:00<?, ?B/s]

model_00028-of-00032.safetensors:   0%|          | 0.00/403M [00:00<?, ?B/s]

model_00029-of-00032.safetensors:   0%|          | 0.00/403M [00:00<?, ?B/s]

model_00030-of-00032.safetensors:   0%|          | 0.00/403M [00:00<?, ?B/s]

model_00031-of-00032.safetensors:   0%|          | 0.00/403M [00:00<?, ?B/s]

model_00032-of-00032.safetensors:   0%|          | 0.00/16.6k [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/32 [00:00<?, ?it/s]

In [11]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [1]:
# Apply PEFT (Adapter, LoRA and others)
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

NameError: name 'model' is not defined

In [13]:
# Create dataset from pandas Dataframe
dataset = Dataset.from_pandas(df_model)
tokenizer.pad_token = tokenizer.eos_token
ds = dataset.map(lambda samples: tokenizer(samples["query"], truncation=True, padding=True, max_length=512), batched=True)

Map:   0%|          | 0/107 [00:00<?, ? examples/s]

In [14]:
# Callback to print intermediate model outputs
class GenerationCallback(transformers.TrainerCallback):
    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step % 5 == 0:
          input_text = "Daniel: Hey, wie gehts?, Lisa:"
          input_ids = self.tokenizer.encode(input_text, return_tensors="pt")
          generated_ids = self.trainer.model.generate(input_ids=input_ids.to('cuda'))
          generated_text = self.tokenizer.decode(generated_ids[0], skip_special_tokens=True)
          print("Generated Text:", generated_text)

# Create an instance of your custom callback
callback = GenerationCallback()

In [16]:
# Train, make sure to adjust hyperparams
trainer = transformers.Trainer(
    model=model,
    train_dataset=ds,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=8,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=120,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

callback.trainer = trainer
callback.tokenizer = tokenizer
trainer.add_callback(callback)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

max_steps is given, it will override any value given in num_train_epochs

torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.



Step,Training Loss
1,2.3022
2,2.594
3,2.3043
4,2.4888


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.

Using the model-agnostic default `max_length` (=20) to control the generation length. We recommend setting `max_new_tokens` to control the maximum length of the generation.


None of the inputs have requires_grad=True. Gradients will be None



IndexError: tuple index out of range

# Make predictions

In [None]:
def answer(input_text, model, tokenizer):
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    generated_ids = model.generate(input_ids=input_ids.to('cuda'), max_new_tokens=20, min_new_tokens=10, pad_token_id=tokenizer.eos_token_id)
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    # Process or use the generated text as needed
    return generated_text

In [None]:
def chat(role, nonrole, model, tokenizer):
    for i in range(1, 100):
        query = input()
        if i==1:
            query = f"{role}: {query}| {nonrole}:"
        if i>1:
            query = response + f"| {role}: {query}| {nonrole}:"
        response = "| ".join(answer(query, model, tokenizer).split("| ")[:i*2])
        print(response)

In [None]:
chat("Daniel", "Lisa", model, tokenizer)