# Textual Avatar - Train LLM on your own!

This notebook intend to export your dialogs into encrpyted instruction based format and fine-tune LLama-2 on it.
It is also provided gradio demo and Telegram bot with inferencing model.
For fine tune LLama-2 you need access to capable hardware, I use Nvidia RTX-3090 for one night.

## Content

In this project, we'll follow these main steps:


### 1. [Export Meta Messenger data](#Export-Meta-Messenger-data) or 
### 2. [Export Telegram data](#Exporting-from-Telegram) or
### 3. (Optionally) [Export other data in instruction-based format](#Add-more-data)
### 4. [Preprocess and encrypt dataset file](#Preprocess-and-encrypt-dataset-file)
### 5. [Download LLM and setup machine which support fine-tune LLM](#Setup-transformers,-PEFT-and-LLM)
### 6. [Load dataset and fine-tune LLM for several hours](#Training-script,-simplified-from-HF-tutorial)
### 7. [Run chat with yourself](#Compare-with-previous-replies)
### 8. [Create chat bot](#Telegram-chat-bot)


## Privacy and Security

I provide encryption of data file, all private files could be deleted after preprocessing.

However, exercise caution and only run this notebook on trusted systems.





In [15]:
import glob
import json
import os
import pickle
import csv

import pathlib
from getpass import getpass

'''
Only export these messages
'''


def filter_message(msg: str) -> bool:
    return len(msg) > 1 and not msg.startswith("http")


TEXTUAL_AVATAR = "<TEXTUAL_AVATAR>"

'''
Instruction for LLM, using previous conversation
'''


def convert_sample_to_instruction(sample):
    previous_conversation = ""
    for message in sample['last_messages']:
        previous_conversation = previous_conversation + \
            message["author"]+":"+message["text"]+"\n"
    previous_conversation = previous_conversation[-4096:]

    instruction = f"""You are {TEXTUAL_AVATAR}, a sophisticated AI designed to engage in text conversations.
    Your goal is to provide relevant responses based on the given context.
    Imagine you have been having a conversation with {sample['counterpart']}
    Your task is to mimic a text reply to last message as {TEXTUAL_AVATAR}
    """
    input = f"""previous conversation was:\n{previous_conversation} last message is {sample['input']}"""
    response = sample['text']
    return {"instruction": instruction, "input": input, "response": response}


MAX_PREVIOUS_MESSAGES = 12
KEEP_CONVERSATION_SEC = 60*60*24

with open('textual_avatar.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=[
                            "instruction", "input", "response"])
    writer.writeheader()


# Export Meta Messenger data

To download messages from Facebook:
1. Click on Setting->Your Facebook Information -> Download your information https://www.facebook.com/dyi/
2. Choose your messages, for period **All Time**, format JSON
3. Wait (1 hour) until data collected, download it and unzip
4. Uncomment line 5 and 6 in next cell with  ```YOUR_PATH``` to unzipped folder

In [None]:
exported_fb_file_paths = glob.glob(
    f"{os.environ['HOME']}/Downloads/messages/inbox/*/message*.json")

# or enter here your location for FB archive with jsons
# YOUR_PATH=
# exported_fb_file_paths=glob.glob(YOUR_PATH+"/messages/inbox/*/message*.json")


encoding = 'latin1'
participants = {}
for json_file in exported_fb_file_paths:
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)
        if len(data['messages']) > 2 and len(data['participants']) == 2:
            for participant in data["participants"]:
                name = participant["name"].encode(encoding).decode()
                if name not in participants.keys():
                    participants[name] = set()
                participants[name].add(json_file)

max_length = 0
you = ""
for key, value in participants.items():
    if len(value) > max_length:
        you = key
        max_length = len(value)


chats = {}
your_messages_idx = {}
for json_file in exported_fb_file_paths:
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)
        chat = []
        yours = []
        for participant in data["participants"]:
            name = participant["name"].encode(encoding).decode()
            if len(name) < 1:
                name = "Facebook user"
            if name != you:
                participant_name = name
        if len(data['messages']) > 2 and len(data['participants']) == 2:
            for i, message in enumerate(data['messages']):
                if 'content' in message.keys():
                    author = message['sender_name'].encode(encoding).decode()
                    if author == you:
                        yours.append(i)
                        author = "TEXTUAL_AVATAR"
                    text = message['content'].encode(encoding).decode()
                    if filter_message(text):
                        new_message = {"text": text,
                                       "tstamp": message['timestamp_ms']//1000,
                                       "author": author}
                        chat.append(new_message)
            chats[participant_name] = chat
            your_messages_idx[participant_name] = yours


print(f"exporting {len(chats)} chats")

with open('text_avatar.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=[
                            "instruction", "input", "response"])
    for counterpart, chat in chats.items():
        for your_reply in your_messages_idx[counterpart]:
            last_message_to_use = max(your_reply-MAX_PREVIOUS_MESSAGES, 0)
            your_reply_tstamp = chat[your_reply]["tstamp"]

            for message_idx in reversed(range(last_message_to_use, your_reply)):
                if (your_reply_tstamp-chat[message_idx]["tstamp"]) > KEEP_CONVERSATION_SEC:
                    last_message_to_use = message_idx+1
                    break
            sample = {"counterpart": counterpart,
                      "last_messages": chat[last_message_to_use:your_reply-1],
                      "input": chat[your_reply-1]["text"],
                      "text": chat[your_reply]["text"]}
            writer.writerow(convert_sample_to_instruction(
                sample))  # adding a row


# Exporting from Telegram

To collect messages from Telegram on Mac I did the following:

1. Download the official Telegram client on Mac from here https://macos.telegram.org  (Mac App Store version not worked for me)
2. Click Setting->Advanced->Export Telegram data on the bottom. Current script work with personal chat and machine friendly JSON
3. Confirm this action in chat on other device.
4. Wait until download process finish. Edit line 4 below ```exported_tg_file_path``` with your path to result.json


In [75]:
#exported_tg_file_path = glob.glob(
#    f"{os.environ['HOME']}/Downloads/Telegram Desktop/DataExport_*/result.json")[0]
# or enter here your location for
exported_tg_file_path ="result.json"
with open(exported_tg_file_path) as exported_tg_file:
    exported = json.load(exported_tg_file)
chats = [chat for chat in exported["chats"]
         ["list"] if chat['type'] == 'personal_chat']
print(f"found {len(chats)} chats")

participants = {}
participant_names = {}
you = ""  # tricky way to find your id assume you appear in most chats

for chat in chats:
    current_chat = {}
    for message in chat['messages']:
        if message['type'] == "message":
            if 'from_id' in message.keys():
                if message['from_id'] not in participants.keys():
                    participants[message['from_id']] = set()
                participants[message['from_id']].add(chat['id'])
                participant_names[message['from_id']] = message['from']


max_length = 0
you = ""
for key, value in participants.items():
    if len(value) > max_length:
        you = key
        max_length = len(value)

exported_chats = {}
your_messages_idx = {}

for conversation in chats:
    idx = 0
    chat = []
    yours = []
    for message in conversation['messages']:  # pick counterpart user name
        if message['type'] == "message":
            if message["from_id"] != you:
                participant_name = message["from"]
                break

    for message in conversation['messages']:
        if message['type'] == "message" and "from_id" in message.keys() and filter_message(str(message["text"])):
            text = ""
            if isinstance(message["text"], list):
                for text_item in message["text"]:
                    if isinstance(text_item, str):
                        text = text+"\n"+text_item
            else:
                text = message["text"]
            if message["from_id"] == you:
                yours.append(idx)
                author = TEXTUAL_AVATAR
            else:
                author = participant_name

            new_message = {"text": text,
                           "tstamp": int(message["date_unixtime"]),
                           "author": author}
            chat.append(new_message)
            idx += 1

    exported_chats[participant_name] = chat
    your_messages_idx[participant_name] = yours


with open('textual_avatar.csv', 'a', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=[
                            "instruction", "input", "response"])
    for counterpart, chat in exported_chats.items():
        for your_reply in your_messages_idx[counterpart]:
            last_message_to_use = max(your_reply-MAX_PREVIOUS_MESSAGES, 0)
            your_reply_tstamp = chat[your_reply]["tstamp"]

            for message_idx in range(last_message_to_use, your_reply):
                if (your_reply_tstamp-chat[message_idx]["tstamp"]) > KEEP_CONVERSATION_SEC:
                    last_message_to_use = message_idx+1
                    break
            sample = {"counterpart": counterpart,
                      "last_messages": chat[last_message_to_use:your_reply-1],
                      "input": chat[your_reply-1]["text"],
                      "text": chat[your_reply]["text"]}
            writer.writerow(convert_sample_to_instruction(
                sample))  # adding a row


found 236 chats


# Add more data
### Algorithm

Download data and introduce variable path to it . Include path in final cleanup script.

Detect your id, create two dictionaries with keys as person in discussion and values as messages list and indices of your replies.

Iterate over saved chats resulting dictionary with ```counterpart```,```last_messages``` history list, previous ```input``` and you reply ```text```

Call common ```convert_sample_to_instruction``` and add row to instruction based ```textual_avatar.csv```.


# Preprocess and encrypt dataset file

The quality of result is highly depend on quality of training data.

Your probably need to filter too long and too short answers.

On other hand you may add some synthetic data using judgement from [RLHF](https://huggingface.co/blog/rlhf) or [RLAF](https://arxiv.org/pdf/2212.08073.pdf)

# Encryption 

It is highly recommended to encrypt the file with your personal data.
The code below store unencrypted data only in memory.



In [None]:
!pip install pandas --quiet
import pandas as pd
df = pd.read_csv('textual_avatar.csv')


In [None]:
## Encryption
!pip install fernet --quiet

from cryptography.fernet import Fernet
import csv

# Generate a random encryption key
encryption_key = Fernet.generate_key()
cipher_suite = Fernet(encryption_key)
input_file = 'textual_avatar.csv'

with open(input_file, 'rb') as file:
    file_data = file.read()
    encrypted_data = cipher_suite.encrypt(file_data)

with open(input_file, 'wb') as encrypted_file:
    encrypted_file.write(encrypted_data)
print("Encryption complete. Encryption key:", encryption_key.decode())



### Collecting data is ending.
### Next part you can run on another computer copying ```textual_avatar.csv` 

In [None]:
# Decryption, that could be run on other machine 
!pip install fernet tqdm pandas --quiet
import pandas as pd
from cryptography.fernet import Fernet
import csv
import getpass
import io

# Ask the user for the encryption key
encryption_key_input = getpass.getpass(prompt = 'Enter the encryption key: ')
encryption_key = encryption_key_input.encode()
cipher_suite = Fernet(encryption_key)

input_file = 'textual_avatar.csv'

with open(input_file, 'rb') as encrypted_file:
    encrypted_data = encrypted_file.read()
    decrypted_data = cipher_suite.decrypt(encrypted_data)
        
inmemorycsv=io.BytesIO(decrypted_data)

df=pd.read_csv(inmemorycsv)

In [20]:
print("before",len(df))
df=df[df.response.str.len()<1000]
df=df[df.response.str.len()>2]
print("fitered",len(df))

before 31205
fitered 17368


# Setup transformers, PEFT and LLM

Here is the example of fine-tuning on LLAMA-2 model on your CUDA-enabled machine.
For using LLAMA-2 please visit the https://ai.meta.com/resources/models-and-libraries/llama-downloads/ and accept License. 
You may also use extracted dataset for fine-tune with other models and services.


In [None]:
!pip install "transformers==4.31.0" "wandb" "scipy" "datasets==2.13.0" "peft==0.4.0" "accelerate==0.21.0" "bitsandbytes==0.40.2" "trl==0.4.7" "ninja" "packaging" "safetensors>=0.3.1" "cryptography" --upgrade

In [3]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
import torch
print("PyTorch", torch.cuda.is_available() and torch.__version__)
model_id = "NousResearch/Llama-2-7b-hf"  # non-gated
# from huggingface_hub import login
# login()
# model_id = "meta-llama/Llama-2-7b-hf" # gated


# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=bnb_config, use_cache=False, device_map="auto")
model.config.pretraining_tp = 1


tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


# LoRA config based on QLoRA paper
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

# prepare model for training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)


[2023-08-25 18:58:32,177] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
PyTorch 2.0.1


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# Training script, simplified from HF tutorial
I put bigger learning rate and loss curve become more smoother.
It takes a time to train, on my RTX-3090 it was 8 hours. 

In [21]:
import datasets
datasets.disable_caching() # don't save on disk
datasets.config.IN_MEMORY_MAX_SIZE = 1024**3

train_dataset = datasets.Dataset.from_pandas(df,preserve_index=False)


#Alpaca format
def format_instruction(sample):
    return f"""### Instruction:
{sample['instruction']}

### Input:
{sample['input']}

### Response:
{sample['response']}
"""

In [22]:
from trl import SFTTrainer

args = TrainingArguments(
    output_dir="llama-7-int4-textualavatar",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=1e-3,
    bf16=True,
    tf32=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    disable_tqdm=True  # disable tqdm since with packing values are in correct
)


max_seq_length = 2048  # max sequence length for model and packing of the dataset
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    formatting_func=format_instruction,
    args=args,
)

trainer.train()
trainer.save_model()

# Later you can load model and merge LoRA weights to 16 bit model in this script

# Compare with previous replies
 Run the model to chat with your previous replies and observe its responses.


In [None]:
from datasets import load_dataset
from random import randrange


# Load dataset and get a sample
sample = train_dataset[randrange(len(train_dataset))]

prompt = format_instruction(sample)
print(f"Prompt:\n{prompt}\n")
print(f"Ground truth:\n{sample['response']}")

input_ids = tokenizer(prompt.split("### Response:")[0], return_tensors="pt", truncation=True).input_ids.to(model.device)

with torch.inference_mode():
    outputs = model.generate(input_ids=input_ids, max_new_tokens=1000, do_sample=True, top_p=0.95,temperature=0.8)
    decode = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]
print(f"Generated out :\n{decode.split('### Response:')[1]}")


# Gradio demo
Change counterpart to the person name you want to chat.

In [72]:
!pip install  gradio --quiet
import gradio as gr

def predict(input_text, history, counterpart=""):
    sample = {'counterpart': counterpart}
    sample['last_messages'] = []
    for prev_message_pair in history[-MAX_PREVIOUS_MESSAGES//2:]:
        sample['last_messages'].append(
            {"author": counterpart, "text": prev_message_pair[0]})
        sample['last_messages'].append(
            {"author": TEXTUAL_AVATAR, "text": prev_message_pair[1]})
    sample['input'] = input_text
    sample['text'] = ""
    prompt = format_instruction(convert_sample_to_instruction(sample))
    input_ids = tokenizer(prompt.split("### Response:")[
                          0], return_tensors="pt", truncation=True).input_ids.to(model.device)
    with torch.inference_mode():
        outputs = model.generate(
            input_ids=input_ids, max_new_tokens=1000, do_sample=True, top_p=0.95, temperature=0.8)
        decode = tokenizer.batch_decode(
            outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]
    return decode.split('### Response:')[1]

gr.ChatInterface(predict).launch()


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Running on local URL:  http://127.0.0.1:7872

To create a public link, set `share=True` in `launch()`.




# Telegram chat bot

Ask @BotFather in Telegram to create a new bot, and export API_KEY as ```TELEGRAM_BOT_KEY``` system variable

In [None]:
!pip install nest_asyncio python-telegram-bot --quiet
import nest_asyncio
nest_asyncio.apply()

import telegram
from telegram.ext import *


import os

history={}

async def hello(update: telegram.Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    print.update.chat
    await update.message.reply_text(f'Hi {update.effective_user.first_name}! I am textual avatar bot ')

async def chat(update: telegram.Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    userid = f"user{update.message.from_user.id}"
    if participant_names and userid in participant_names.keys(): # search in previous conversations
        counterpart = participant_names[userid]
    else:
        counterpart = "Telegram user"
    if counterpart not in history.keys():
        history[counterpart]=[]
    input_text = update.message.text
    output = predict(input_text, history[counterpart], counterpart )
    history[counterpart].append([input_text,output])
    await update.message.reply_text(output)


TELEGRAM_BOT_KEY = os.environ["TELEGRAM_BOT_KEY"]
app = ApplicationBuilder().token(TELEGRAM_BOT_KEY).build()
app.add_handler(CommandHandler('start', hello))
app.add_handler(MessageHandler(telegram.ext.filters.TEXT, chat))

app.run_polling()