In [1]:
import re
import csv
import codecs
import pandas as pd
from datasets import Dataset
import json
from huggingface_hub import login
from transformers import AutoTokenizer
from dotenv import load_dotenv
import os

In [2]:
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
if (HF_TOKEN == None):
    raise ValueError("HF_TOKEN is not set")
login(token=HF_TOKEN)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [3]:
# function to convert special \<hex> charactersinto the corresponding Latin-1 characters
def decode_latin1_escapes(text):
    def repl(match):
        hex_str = match.group(1)
        return bytes.fromhex(hex_str).decode('latin-1')
    
    return re.sub(r"\\'([0-9a-fA-F]{2})", repl, text)

# reads whatsapp chat specified at input_path, removes metadata and noise (e.g. "audio omitted", "document omitted", "image omitted") and saves the cleaned messages in a csv file
# you can specify to only include messages from francesco by setting only_francesco to True

# LOGIC FOR PROCESSING DATA CAN BE FURTHER IMPROVED E.G. SAVE ONLY MESSAGES THAT ARE NEAR IN TIME
def process_data(input_path, output_path, only_francesco=False):

    # capture metadata, sender and message
    pattern = re.compile(
        r'^.*?\[\d{2}/\d{2}/\d{2},\s*\d{2}:\d{2}:\d{2}\]\s*(.*?):\s*(.*)$'
    )
    
    data = []
    
    with open(input_path, 'r', encoding='utf-8') as infile:
        lines = infile.readlines()
    
    for line in lines:
        line = line.strip()
        if not line:
            continue
        
        match = pattern.match(line)
        if match:
            sender = match.group(1).strip()
            message = match.group(2).strip()
            
            if only_francesco and sender.lower() != "francesco brigante".lower():
                continue
            
            # skip messages with omitted content (e.g. audio, document, image, video, sticker, contact)
            if re.search(r'\b(?:audio|document|image|video|sticker|contact omitted)\b', message, re.IGNORECASE):
                continue
            
            # skip initial chat message
            if "Messages and calls are end-to-end encrypted" in message:
                continue
            
            # removes unfiltered special characters such as \uc0\u8206
            message = re.sub(r'\\[a-z]+\d*', '', message)

            # decode Latin-1 escaped characters like \'ec which becomes ì
            message = decode_latin1_escapes(message)
            
            # removes remaining backslashes
            message = message.replace("\\", "")

            # normalizes spaces
            message = re.sub(r'\s+', ' ', message).strip().rstrip('}')
            
            print(message)
            
            if message:
                data.append([sender, message])
    
    with open(output_path, 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Sender', 'Message'])
        writer.writerows(data)
    
    print(f"File saved successfully in: {output_path}")


### CSV to json

In [4]:
# process_conversation takes a csv file and returns a json with the conversation
# the format is hugging face standard i.e. a list of dictionaries with role and content
# role can be system, for the initial prompt, assistant (francesco) or user (other people)
def process_conversation(csv_file, chat_id):
    
    df = pd.read_csv(csv_file)
    
    system_prompt = """You are Francesco Brigante. Respond naturally as him in Italian, maintaining his characteristic communication style. Keep responses concise and contextual.""" 
    
    messages = [
        {
            "role": "system",
            "content": system_prompt
        }
    ]
    
    for _, row in df.iterrows():
        if row['Sender'] != 'Francesco Brigante':
            #content = f"{row['Sender']}: {row['Message']}" to save in format user_specific_name: message
            content = row['Message']    #to save in format user: message
            messages.append({
                "role": "user",
                "content": content
            })
        else:
            messages.append({
                "role": "assistant",
                "content": row['Message']
            })
    
    return {
        "chat_id": chat_id,  
        "messages": messages
    }

# function to create a json file merging all the chats specified at csv_files
def create_dataset(csv_files):
    all_conversations = []
    chat_id = 1
    
    for csv_file in csv_files:
        conversation = process_conversation(csv_file, chat_id)
        all_conversations.append(conversation)
        chat_id += 1
    
    # Save to JSON file
    with open('dataset.json', 'w', encoding='utf-8') as f:
        json.dump(all_conversations, f, ensure_ascii=False, indent=2)
    
    return all_conversations

In [6]:
model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
print(tokenizer.chat_template)

{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['functio

In [8]:
chat = [
    {
        "role": "system",
        "content": "a"
    },
    {
        "role": "user",
        "content": "b"
    },
    {
        "role": "assistant",
        "content": "c"
    }
]

tok = tokenizer.apply_chat_template(chat, tokenize=True)
print(tok)

[151646, 64, 151644, 65, 151645, 66, 151643]


In [29]:
token = tokenizer.bos_token
token_id = tokenizer.convert_tokens_to_ids(token)
print(f"Token ID: {token_id}")


id = 151646
token = tokenizer.convert_ids_to_tokens(id)
print(f"Token for ID {id}: {token}")

Token ID: 151646
Token for ID 151646: <｜begin▁of▁sentence｜>


In [5]:
csv_files = [
    "/Users/francesco/Desktop/cloning/data/genni.csv",
    "/Users/francesco/Desktop/cloning/data/benny.csv",
    "/Users/francesco/Desktop/cloning/data/mamma.csv",
    ##aggiungere altri
]

create_dataset(csv_files)

[{'chat_id': 1,
  'messages': [{'role': 'system',
    'content': 'You are Francesco Brigante. Respond naturally as him in Italian, maintaining his characteristic communication style. Keep responses concise and contextual.'},
   {'role': 'user', 'content': 'So Genni'},
   {'role': 'assistant', 'content': 'Ue gennà'},
   {'role': 'user', 'content': 'Fammi sapere per il regalo'},
   {'role': 'assistant',
    'content': 'Si sto chiedendo agli altri invitati se partecipano'},
   {'role': 'assistant', 'content': 'Volevo prendergli un profumo'},
   {'role': 'user', 'content': 'Va bene'},
   {'role': 'assistant', 'content': 'Viene 8 genni'},
   {'role': 'user', 'content': 'Okok appena ci vediamo'},
   {'role': 'user', 'content': 'Saldo il debito'},
   {'role': 'assistant', 'content': 'Sisi'},
   {'role': 'user', 'content': 'Bello'},
   {'role': 'user',
    'content': 'Mi mandi qualche informazione riguardo la palestra'},
   {'role': 'user', 'content': 'Tipo posizione o altro'},
   {'role': 'us

### json to dataset

In [30]:
# Define special tokens
BOS_TOKEN = tokenizer.bos_token if tokenizer.bos_token else '<begin_of_sentence>'  # beginning of sentence token, used to give the instructions

# NOTE: we're using deepseek's special tokens for user and assistant roles, which have ｜ instead of |, don't confuse (｜|)
USER_TOKEN_START = '<｜User｜>' 
ASSISTANT_TOKEN_START = '<｜Assistant｜>'

#NOTE: same thing here for eos token, which is <｜end▁of▁sentence｜>, using also ▁ instead of _
EOS_TOKEN = tokenizer.eos_token if tokenizer.eos_token else "<|end_of_sentence|>"
END_TURN_TOKEN = "<|turn_end|>"

# creates a formatted prompt using the special tokens
# the prompt starts with the system message if provided, then the context window and finally the current user message
def create_formatted_prompt(messages, system_prompt, current_user_msg_idx, n_context_messages=5):
    
    prompt_parts = []

    if system_prompt:
        prompt_parts.append(f"{BOS_TOKEN}{system_prompt}{END_TURN_TOKEN}")

    # determine the start index for the context window between 0 (no system message), 1 (system message) and current_user_msg_idx - n_context_messages
    context_first_message_idx = 0
    if messages and messages[0]["role"] == "system" and messages[0]["content"] == system_prompt:
        context_first_message_idx = 1
    context_start_idx = max(context_first_message_idx, current_user_msg_idx - n_context_messages)

    # collect messages to process (context + current user message)
    messages_to_process = messages[context_start_idx : current_user_msg_idx + 1]

    # add messages to the prompt
    # there's a logic to group consecutive messages with the same role using only '\n' as separator instead of re-using the role token
    current_role = None
    current_contents = []
    for msg in messages_to_process:
        role = msg["role"]
        content = msg["content"]
        
        if role != current_role:
            # add the previous group if exists
            if current_role is not None:
                role_token = USER_TOKEN_START if current_role == "user" else ASSISTANT_TOKEN_START
                # prompt_parts.append(role_token + '\n'.join(current_contents) + END_TURN_TOKEN)
                prompt_parts.append(role_token + '\n'.join(str(c) for c in current_contents) + END_TURN_TOKEN)
                
            # start new group
            current_role = role
            current_contents = [content]
        else:
            # continue current group
            current_contents.append(content)
    
    # add the last group
    if current_role is not None:
        
        role_token = USER_TOKEN_START if current_role == "user" else ASSISTANT_TOKEN_START
        # prompt_parts.append(role_token + '\n'.join(current_contents) + END_TOKEN)
        prompt_parts.append(role_token + '\n'.join(str(c) for c in current_contents) + END_TURN_TOKEN)

    return "\n".join(prompt_parts) + f"\n{ASSISTANT_TOKEN_START}"

In [31]:
# creates a list from the dataset
def create_dataset_list(json_file_path, n_context_messages=5):

    # load the JSON file
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    dataset_list = []

    for conversation in data:
        messages = conversation['messages']
        system_prompt = ""  #default to empty

        if not messages:
            print(f"Warning: Conversation {conversation.get('chat_id', 'N/A')} is empty. Skipping.")
            continue

        # extract system message if it's the first one
        if messages[0]["role"] == "system":
            system_prompt = messages[0]["content"]
            start_message_index = 1
        else:
            start_message_index = 0
            print(f"Warning: Conversation {conversation.get('chat_id', 'N/A')} does not start with a system message. System prompt will be empty.")

        # iterate through messages to find and save ONLY user -> assistant pairs, saving the last n_context_messages messages as context
        # the new format is a list of dictionaries with prompt and response
        for i in range(start_message_index, len(messages) - 1):
            current_message = messages[i]
            next_message = messages[i+1]

            if current_message['role'] == 'user' and next_message['role'] == 'assistant':
                
                prompt = create_formatted_prompt(
                    messages,
                    system_prompt,
                    current_user_msg_idx=i,
                    n_context_messages=n_context_messages
                )

                response = f"{next_message['content']}{EOS_TOKEN}"

                dataset_list.append({
                    'prompt': prompt,
                    'response': response
                })

    return dataset_list

In [32]:
json_file_path = 'dataset.json'
dataset_list = create_dataset_list(json_file_path, n_context_messages=5)

# print some examples
print(f"Generated {len(dataset_list)} training examples.\n")
for i, example in enumerate(dataset_list[1000:1003]): # print up to 3 examples
    print(f"--- Example {i+1} ---")
    print("Prompt:")
    print(example['prompt'])
    print("\nResponse:")
    print(example['response'])
    print("---------------------\n")

if not dataset_list:
    print("ERROR: No training examples were generated.")

Generated 10900 training examples.

--- Example 1 ---
Prompt:
<｜begin▁of▁sentence｜>You are Francesco Brigante. Respond naturally as him in Italian, maintaining his characteristic communication style. Keep responses concise and contextual.<|turn_end|>
<｜Assistant｜>Dicono che hanno cambiato la chiave del portone e che l’hanno messa nella cassetta della posta
Suca vado a dormire nella villa di cugginm<|turn_end|>
<｜User｜>😔😔<|turn_end|>
<｜Assistant｜>Godo assai<|turn_end|>
<｜User｜>Ce l hai fatta?
Ci vediamo domani allora<|turn_end|>
<｜Assistant｜>

Response:
Si amo<｜end▁of▁sentence｜>
---------------------

--- Example 2 ---
Prompt:
<｜begin▁of▁sentence｜>You are Francesco Brigante. Respond naturally as him in Italian, maintaining his characteristic communication style. Keep responses concise and contextual.<|turn_end|>
<｜User｜>Ce l hai fatta?
Ci vediamo domani allora<|turn_end|>
<｜Assistant｜>Si amo
Ho guidato da potenza fino a roma<|turn_end|>
<｜User｜>Animale
Pure io una colta da Roma a napoli

In [50]:
# create a Hugging Face dataset
dataset = Dataset.from_list(dataset_list)

# creating dataset splits
train_test = dataset.train_test_split(test_size=0.2, seed=42)   #80% train, 20% test
test_valid = train_test['test'].train_test_split(test_size=0.5, seed=42)  #10% test, 10% valid

train_dataset = train_test['train']
val_dataset = test_valid['train']
test_dataset = test_valid['test']

In [64]:
model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
tokenizer = AutoTokenizer.from_pretrained(model_id)




#### ADDED
#tokenizer.eos_token = "<|end|>"
tokenizer.pad_token = "<|pad|>"
# special_tokens_dict = {'eos_token': '<|end|>', 'pad_token': '<|pad|>'}
special_tokens_dict = {'pad_token': '<|pad|>'}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print(f"Added {num_added_toks} special tokens.")

print("Tokenizer's EOS token:", tokenizer.eos_token)
print("Tokenizer's pad token:", tokenizer.pad_token)

Added 1 special tokens.
Tokenizer's EOS token: <｜end▁of▁sentence｜>
Tokenizer's pad token: <|pad|>


In [65]:
def tokenize_function(examples):
    #combine prompt and response
    texts = [
        p + tokenizer.eos_token + r + tokenizer.eos_token
        for p, r in zip(examples["prompt"], examples["response"])
    ]
    
    # static padding
    # out = tokenizer(
    #     texts,
    #     truncation=True,
    #     max_length=200,        # total length = prompt+response
    #     padding="max_length",  # make every example exactly `max_length`
    # )
    
    # dynamic padding
    out = tokenizer(
        texts,
        truncation=True,
        max_length = 256,
        padding=False
    )
    
    # labels are the same as input_ids, they will be automatically shifted
    out["labels"] = out["input_ids"].copy()
    
    return out


# if you used static padding, you can use the following data collator:

# from transformers import DataCollatorForSeq2Seq
# data_collator = DataCollatorForSeq2Seq(
#     tokenizer=tokenizer,
#     padding=True,
#     max_length=None
# )



# if you used dynamic padding, you can use the following data collator:

# from transformers import DataCollatorForSeq2Seq
# data_collator = DataCollatorForSeq2Seq(
#     tokenizer=tokenizer,
#     padding=True,
#     max_length=256
# )

In [66]:
# apply tokenization
tokenized_train = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names
)

tokenized_val = val_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=val_dataset.column_names
)

tokenized_test = test_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=test_dataset.column_names
)

# save tokenized datasets
tokenized_train.save_to_disk('datasets/tokenized_train')
tokenized_val.save_to_disk('datasets/tokenized_val')
tokenized_test.save_to_disk('datasets/tokenized_test')

Map:   0%|          | 0/6922 [00:00<?, ? examples/s]

Map:   0%|          | 0/865 [00:00<?, ? examples/s]

Map:   0%|          | 0/866 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6922 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/865 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/866 [00:00<?, ? examples/s]

In [67]:
# check to see if the tokenized datasets are loaded correctly
from datasets import load_from_disk

# load each dataset from disk
tokenized_train = load_from_disk('datasets/tokenized_train')
tokenized_val = load_from_disk('datasets/tokenized_val')
tokenized_test = load_from_disk('datasets/tokenized_test')

# print
print(f"Training examples: {len(tokenized_train)}")
print(f"Validation examples: {len(tokenized_val)}")
print(f"Test examples: {len(tokenized_test)}")

# one example
print("\nOne training example:")
print(tokenized_train[1000])

Training examples: 6922
Validation examples: 865
Test examples: 866

One training example:
{'input_ids': [151646, 27, 91, 8948, 91, 29, 2610, 525, 88599, 37789, 4942, 13, 39533, 17712, 438, 1435, 304, 14811, 11, 20337, 806, 28583, 10535, 1707, 13, 13655, 14507, 63594, 323, 65151, 15757, 91, 408, 91, 397, 27, 91, 872, 91, 29, 59808, 10148, 14029, 144538, 198, 53, 2143, 264, 512, 89, 59194, 75414, 91, 408, 91, 397, 27, 91, 77091, 91, 29, 2016, 72, 220, 16, 20, 12, 16, 24, 384, 46927, 27539, 95941, 198, 32, 17020, 17020, 17020, 17020, 17020, 198, 13817, 29668, 14973, 145379, 145379, 145379, 145379, 27, 91, 408, 91, 397, 27, 91, 872, 91, 29, 26843, 281, 5054, 68616, 9497, 883, 1528, 650, 10924, 983, 25761, 78, 27, 91, 408, 91, 397, 27, 91, 77091, 91, 29, 151643, 32, 1250, 2136, 3016, 28419, 28458, 64, 27, 91, 408, 91, 29, 151643], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1