In [None]:
import re
import csv
import codecs
import pandas as pd
from datasets import Dataset
import json
from huggingface_hub import login
from transformers import AutoTokenizer
from dotenv import load_dotenv
import os

In [None]:
# Load variables from a local .env file (if present) and log in to the
# Hugging Face Hub with the token stored in the HF_TOKEN environment variable.
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    # os.getenv returns None when the variable is missing; an empty string is
    # just as unusable as a token, so both cases are rejected here.
    raise ValueError("HF_TOKEN is not set")
login(token=HF_TOKEN)

In [None]:
# Checkpoint whose tokenizer is loaded here; `tokenizer` is the module-level
# object the later cells use for special tokens and for tokenization.
model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
tokenizer = AutoTokenizer.from_pretrained(model_id)

### WhatsApp txt into filtered CSV

In [None]:
# function to convert \'hh escapes (backslash, apostrophe, two hex digits) into the corresponding Latin-1 characters
def decode_latin1_escapes(text):
    r"""Replace every ``\'hh`` escape in ``text`` with the Latin-1 character for byte ``hh``.

    Args:
        text: String that may contain ``\'hh`` sequences, where ``hh`` are two
            hexadecimal digits (upper or lower case).

    Returns:
        The string with each escape replaced by its single decoded character;
        anything that does not match the escape pattern is left untouched.
    """
    escape_pattern = re.compile(r"\\'([0-9a-fA-F]{2})")
    return escape_pattern.sub(
        lambda found: bytes.fromhex(found.group(1)).decode('latin-1'),
        text,
    )

# reads whatsapp chat specified at input_path, removes metadata and noise (e.g. "audio omitted", "document omitted", "image omitted") and saves the cleaned messages in a csv file

# logic for processing data can be further improved e.g. store messages using timing information
def process_data(input_path, output_path, only_sender_name=None):

    # Regex to capture timestamp, sender, and message
    # group 1: Timestamp (e.g., [DD/MM/YY, HH:MM:SS])
    # group 2: Sender name
    # group 3: Message content
    pattern = re.compile(
        r'^.*?(\[\d{2}/\d{2}/\d{2},\s*\d{2}:\d{2}:\d{2}\])\s*(.*?):\s*(.*)$'
    )

    data = [] # store processed [timestamp, sender, message] tuples

    try:
        with open(input_path, 'r', encoding='utf-8') as infile:
            lines = infile.readlines()
    except FileNotFoundError:
        print(f"Error: Input file not found at {input_path}")
        return
    except Exception as e:
        print(f"An error occurred while reading the input file: {e}")
        return

    for line in lines:
        line = line.strip()

        if not line:
            continue

        match = pattern.match(line)
        if match:
            # groups
            timestamp = match.group(1).strip()
            sender = match.group(2).strip()
            message = match.group(3).strip()

            # filter by sender if only_sender_name is specified
            if only_sender_name and sender.lower() != only_sender_name.lower():
                continue

            # skip messages with omitted content by Francesco
            omitted_match = re.search(r'\b(audio|document|image|video|sticker|contact) omitted\b', message, re.IGNORECASE)
            if omitted_match:
                if sender.lower() == "francesco brigante".lower():
                    continue # skip these messages for Francesco
                else:
                    # for other senders, replace with a generic placeholder
                    omitted_type = omitted_match.group(1).lower()
                    if omitted_type == "audio":
                        message = "*manda un audio*"
                    elif omitted_type == "document":
                        message = "*manda un documento*"
                    elif omitted_type == "image":
                        message = "*manda un'immagine*"
                    elif omitted_type == "video":
                        message = "*manda un video*"
                    elif omitted_type == "sticker":
                        message = "*manda uno sticker*"
                    elif omitted_type == "contact":
                        message = "*manda un contatto*"
                    else:
                        # fallback for unexpected omitted types
                        message = "*manda un allegato*"

            if "Messages and calls are end-to-end encrypted" in message:
                continue

            if "Voice call." in message:
                continue

            # removes unfiltered special characters and decode
            message = re.sub(r'\\[a-z]+\d*', '', message)
            message = decode_latin1_escapes(message)

            # Removes remaining backslashes (e.g., from original escapes that weren't Latin-1)
            message = message.replace("\\", "")

            # Normalizes multiple spaces to a single space and removes artifacts
            message = re.sub(r'\s+', ' ', message).strip().rstrip('}')

            if message:
                data.append([timestamp, sender, message])

    try:
        with open(output_path, 'w', encoding='utf-8', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['Timestamp', 'Sender', 'Message'])
            writer.writerows(data)

        print(f"File saved successfully in: {output_path}")
    except IOError as e:
        print(f"Error: Could not write to output file {output_path}. {e}")
    except Exception as e:
        print(f"An unexpected error occurred while saving the file: {e}")


In [None]:
# Clean every exported chat: chat/<name>.txt is converted into data/<name>.csv.
chat_names = ["ludovica", "mamma", "genni", "benny", "cammisa"]

for chat_name in chat_names:
    process_data(f"chat/{chat_name}.txt", f"data/{chat_name}.csv")

File saved successfully in: data/ludovica.csv
File saved successfully in: data/mamma.csv
File saved successfully in: data/genni.csv
File saved successfully in: data/benny.csv
File saved successfully in: data/cammisa.csv


### CSV to JSON

In [None]:
# Given a CSV file and a chat_id, returns a dictionary with chat_id and messages.
# Each message is a dictionary with role, content and timestamp.
def process_conversation(csv_file, chat_id, assistant_name='Francesco Brigante'):
    """Convert one cleaned chat CSV into a conversation dictionary.

    Args:
        csv_file: Path of a CSV with ``Timestamp``, ``Sender`` and ``Message`` columns.
        chat_id: Identifier stored in the returned dictionary.
        assistant_name: Sender whose messages get the ``"assistant"`` role
            (exact match); every other sender becomes ``"user"``.

    Returns:
        ``{"chat_id": chat_id, "messages": [...]}`` where each message is a
        dictionary with ``role``, ``content`` and ``timestamp`` keys, in file order.
    """
    # Read every cell as plain text. With pandas' defaults, messages such as
    # "NA", "null" or "nan" are turned into float NaN, and a column holding only
    # such values and numbers becomes numeric, so the content would stop being a string.
    df = pd.read_csv(csv_file, dtype=str, keep_default_na=False)

    messages = []

    for timestamp, sender, content in zip(df['Timestamp'], df['Sender'], df['Message']):
        # To keep the sender's name in the text, user content could instead be
        # saved as f"{sender}: {content}".
        messages.append({
            "role": "assistant" if sender == assistant_name else "user",
            "content": content,
            "timestamp": timestamp
        })

    return {
        "chat_id": chat_id,
        "messages": messages
    }

# function to create a JSON file merging all the chats specified in csv_files
def create_dataset(csv_files, output_path='dataset.json'):
    """Merge several chat CSV files into one JSON file of conversations.

    Args:
        csv_files: Iterable of CSV paths; each is converted with
            ``process_conversation`` and numbered 1, 2, 3, ... in iteration order.
        output_path: Path of the JSON file to write (UTF-8, non-ASCII characters
            kept as-is, indented by two spaces).

    Returns:
        The list of conversation dictionaries that was written to the file.
    """
    all_conversations = [
        process_conversation(csv_file, chat_id)
        for chat_id, csv_file in enumerate(csv_files, start=1)
    ]

    # Save to JSON file
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(all_conversations, f, ensure_ascii=False, indent=2)

    return all_conversations

In [None]:
# Chats merged into the JSON dataset; chat ids follow the order of this list.
# NOTE(review): data/benny.csv is generated by the cleaning cell but is not listed here — confirm this is intentional.
csv_files = [
    "data/ludovica.csv",
    "data/genni.csv",
    "data/mamma.csv",
    "data/cammisa.csv"
]

# Keep the result in a variable: as the last expression of the cell, the full
# list of conversations would otherwise be rendered as the cell output.
all_conversations = create_dataset(csv_files)

### JSON to dataset

In [None]:
from datetime import datetime, timedelta

# special tokens
# The tokenizer's own BOS/EOS tokens are preferred. The fallbacks are used only when the
# tokenizer does not define them, and are spelled the DeepSeek way (full-width ｜ and ▁).
BOS_TOKEN = tokenizer.bos_token if tokenizer.bos_token else '<｜begin▁of▁sentence｜>'

# NOTE: we're using DeepSeek's special tokens for the user and assistant roles, which are written
# with the full-width bar ｜ instead of the ASCII |. Do not mix them up: this cost me 2 hours of debugging.
USER_TOKEN_START = '<｜User｜>'
ASSISTANT_TOKEN_START = '<｜Assistant｜>'

# NOTE: same for the EOS token, which is <｜end▁of▁sentence｜> and also uses ▁ instead of _
EOS_TOKEN = tokenizer.eos_token if tokenizer.eos_token else "<｜end▁of▁sentence｜>"
# NOTE(review): unlike the tokens above this marker is written with ASCII bars; whether the
# tokenizer treats it as a single token cannot be told from this notebook — confirm.
END_TURN_TOKEN = "<|turn_end|>"

system_prompt = """You are Francesco Brigante, a 22 years old Italian Computer Science student in Rome.
Respond naturally as him in Italian, maintaining his characteristic communication style.
Keep responses concise and contextual.

Continue this conversation with the User, who is a friend of Francesco:"""


# maximum distance in time between two messages for them to belong to the same exchange
TIME_GAP_MINUTES = 30
TIME_GAP_SECONDS = TIME_GAP_MINUTES * 60

# helper function to turn a "[DD/MM/YY, HH:MM:SS]" string into a datetime object
def parse_timestamp(timestamp_str):
    """Parse a chat timestamp such as ``[31/12/23, 18:05:00]``.

    Square brackets at either end are removed first, then surrounding
    whitespace, and the rest is parsed as ``DD/MM/YY, HH:MM:SS``.

    Raises:
        ValueError: if the remaining text does not match that format.
    """
    without_brackets = timestamp_str.strip('[]')
    return datetime.strptime(without_brackets.strip(), '%d/%m/%y, %H:%M:%S')



# builds the prompt text with the special tokens:
# system message (if any), then the context window, then the current user message
def create_formatted_prompt(messages, current_user_msg_idx, max_context_messages=5):
    """Build the prompt that precedes the assistant reply to one message.

    Args:
        messages: Conversation as a list of dictionaries with ``role``,
            ``content`` and ``timestamp`` keys, in chronological order.
        current_user_msg_idx: Index in ``messages`` of the message to answer.
        max_context_messages: Maximum number of earlier messages to include.

    Returns:
        The prompt string: the system message (when ``system_prompt`` is set),
        one line per turn, and a final assistant token on its own line.
        Consecutive messages with the same role share one role token and are
        separated by newlines.
    """

    def render_turn(role, contents):
        # role token + grouped messages joined by '\n' + end-of-turn marker
        marker = USER_TOKEN_START if role == "user" else ASSISTANT_TOKEN_START
        return marker + '\n'.join(str(c) for c in contents) + END_TURN_TOKEN

    pieces = [f"{BOS_TOKEN}{system_prompt}"] if system_prompt else []

    target = messages[current_user_msg_idx]
    target_ts = parse_timestamp(target['timestamp'])
    window = timedelta(seconds=TIME_GAP_SECONDS)

    # walk backwards from the message right before the target, newest first;
    # stop at the first message that is too old or once the context is full
    context_newest_first = []
    idx = current_user_msg_idx - 1
    while idx >= 0:
        candidate = messages[idx]
        age = target_ts - parse_timestamp(candidate['timestamp'])
        if age > window or len(context_newest_first) >= max_context_messages:
            break
        context_newest_first.append(candidate)
        idx -= 1

    # chronological order again, with the target message last
    ordered = context_newest_first[::-1] + [target]

    # merge consecutive messages of the same role into a single turn
    active_role = None
    buffered = []
    for msg in ordered:
        role = msg["role"]
        content = msg["content"]

        if role == active_role:
            buffered.append(content)
            continue

        # role changed: emit the finished turn (if any) and start a new one
        if active_role is not None:
            pieces.append(render_turn(active_role, buffered))
        active_role = role
        buffered = [content]

    # emit the turn that was still open when the loop ended
    if active_role is not None:
        pieces.append(render_turn(active_role, buffered))

    return "\n".join(pieces) + f"\n{ASSISTANT_TOKEN_START}"

In [None]:
# creates the list of prompt/response training examples from the JSON dataset
def create_dataset_list(json_file_path, max_context_messages=5):
    """Turn the merged conversations into prompt/response training examples.

    An example is created for every user message that is immediately followed
    by an assistant message no more than ``TIME_GAP_SECONDS`` later. Assistant
    messages that directly follow that reply are appended to the response
    (separated by newlines) as long as they arrive within a quarter of
    ``TIME_GAP_SECONDS`` of the first reply.

    Args:
        json_file_path: Path of the JSON file holding the list of conversations.
        max_context_messages: Maximum number of earlier messages passed to
            ``create_formatted_prompt`` as context.

    Returns:
        A list of ``{'prompt': str, 'response': str}`` dictionaries; every
        response ends with ``EOS_TOKEN``.
    """

    # load the JSON file
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # loop-invariant thresholds
    max_reply_delay = timedelta(seconds=TIME_GAP_SECONDS)
    max_followup_delay = timedelta(seconds=TIME_GAP_SECONDS / 4)  # time gap divided by 4 for assistant responses

    dataset_list = []

    for conversation in data:
        messages = conversation['messages']

        if not messages:
            print(f"Warning: Conversation {conversation.get('chat_id', 'N/A')} is empty. Skipping.")
            continue

        # look at every pair of adjacent messages and keep ONLY user -> assistant pairs
        for i in range(len(messages) - 1):
            current_message = messages[i]
            next_message = messages[i + 1]  # the immediate assistant response, if the roles match

            if current_message['role'] != 'user' or next_message['role'] != 'assistant':
                continue

            current_msg_ts = parse_timestamp(current_message['timestamp'])
            response_base_timestamp = parse_timestamp(next_message['timestamp'])
            if response_base_timestamp - current_msg_ts > max_reply_delay:
                continue

            # prompt = system message + time-based context + the user message itself
            prompt = create_formatted_prompt(
                messages,
                current_user_msg_idx=i,
                max_context_messages=max_context_messages
            )

            response_parts = [next_message['content']]

            # group the assistant messages that directly follow the first reply
            for j in range(i + 2, len(messages)):
                subsequent_msg = messages[j]

                if subsequent_msg['role'] != 'assistant':
                    break

                subsequent_msg_ts = parse_timestamp(subsequent_msg['timestamp'])
                if subsequent_msg_ts - response_base_timestamp > max_followup_delay:
                    break

                response_parts.append(subsequent_msg['content'])

            # str() mirrors the prompt builder, so content that was not loaded
            # as a string does not make the join raise TypeError
            response = '\n'.join(str(part) for part in response_parts) + EOS_TOKEN

            dataset_list.append({
                'prompt': prompt,
                'response': response
            })

    return dataset_list

In [None]:
json_file_path = 'dataset.json'
dataset_list = create_dataset_list(json_file_path, max_context_messages=5)

# print some examples
import random

if dataset_list:
    # random.sample raises ValueError when asked for more items than exist,
    # so the sample size is capped for very small datasets
    random_indices = random.sample(range(len(dataset_list)), min(3, len(dataset_list)))

    print(f"Generated {len(dataset_list)} training examples.\n")
    print("--- Randomly Selected Examples ---")
    for i, idx in enumerate(random_indices):
        example = dataset_list[idx]
        print(f"--- Example {i+1} (Index: {idx}) ---")
        print("Prompt:")
        print(example['prompt'])
        print("\nResponse:")
        print(example['response'])
        print("---------------------\n")
else:
    print("ERROR: No training examples were generated. The dataset_list is empty.")

Generated 14390 training examples.

--- Randomly Selected Examples ---
--- Example 1 (Index: 7984) ---
Prompt:
<｜begin▁of▁sentence｜>You are Francesco Brigante, a 22 years old Italian Computer Science student in Rome. 
Respond naturally as him in Italian, maintaining his characteristic communication style. 
Keep responses concise and contextual.

Continue this conversation with the User, who is a friend of Francesco:
<｜User｜>*manda un audio*
*manda un audio*<|turn_end|>
<｜Assistant｜>Difficile cumba
Sto ridendo troppo
Mado<|turn_end|>
<｜User｜>Perché<|turn_end|>
<｜Assistant｜>

Response:
Per quello che dici<｜end▁of▁sentence｜>
---------------------

--- Example 2 (Index: 2343) ---
Prompt:
<｜begin▁of▁sentence｜>You are Francesco Brigante, a 22 years old Italian Computer Science student in Rome. 
Respond naturally as him in Italian, maintaining his characteristic communication style. 
Keep responses concise and contextual.

Continue this conversation with the User, who is a friend of Francesco

In [None]:
# create a Hugging Face dataset from the list of prompt/response dictionaries
dataset = Dataset.from_list(dataset_list)

# creating dataset splits (fixed seed so the splits are the same on every run)
train_test = dataset.train_test_split(test_size=0.2, seed=42)   # 80% train, 20% held out
test_valid = train_test['test'].train_test_split(test_size=0.5, seed=42)  # held-out part halved: 10% + 10% of the whole dataset

train_dataset = train_test['train']
# the 'train' half of the second split is used as validation, the 'test' half as test
val_dataset = test_valid['train']
test_dataset = test_valid['test']

### Tokenization

In [None]:
def tokenize_function(batch, tokenizer=tokenizer, max_length=256):
    """Tokenize a batch of prompt/response pairs and build the training labels.

    Each prompt is concatenated with its response and tokenized as one
    sequence, padded or truncated to ``max_length``. Labels equal the input
    ids, except that everything up to and including the last assistant token,
    and every padding position, is set to -100.

    Args:
        batch: Mapping with ``"prompt"`` and ``"response"`` lists of strings.
        tokenizer: Tokenizer to use (defaults to the notebook-level tokenizer).
        max_length: Length every sequence is padded/truncated to.

    Returns:
        Dictionary with ``input_ids``, ``attention_mask`` and ``labels``, one
        list per example.

    NOTE(review): truncation removes tokens from one end of prompt+response
    (presumably the end, the usual tokenizer default — confirm). When that
    removes the assistant token, the example is reported and fully masked.
    """
    prompts   = batch["prompt"]
    responses = batch["response"]

    tokenized_conversation = tokenizer(
        [p + r for p, r in zip(prompts, responses)],
        max_length=max_length,
        padding='max_length',
        truncation=True,
        add_special_tokens=False
    )

    input_ids = tokenized_conversation['input_ids']                 # full tokenized conversation
    attention_mask = tokenized_conversation['attention_mask']       # real tokens are 1, padding tokens are 0

    # looked up once instead of once per token
    assistant_token_id = tokenizer.convert_tokens_to_ids(ASSISTANT_TOKEN_START)

    labels = []                                                     # input_ids with prompt and padding masked out
    for ids, mask in zip(input_ids, attention_mask):

        # find the LAST assistant token: the response starts right after it
        response_start_idx = None
        for pos in range(len(ids) - 1, -1, -1):
            if ids[pos] == assistant_token_id:
                response_start_idx = pos + 1                        # exclude assistant token
                break

        if response_start_idx is None:
            print("[❌] Assistant token not found in input_ids.")
            label = [-100] * len(ids)                               # ignore everything
        else:
            # keep only response tokens that are real tokens: padding placed
            # after the response must not contribute to the loss either
            label = [
                token if pos >= response_start_idx and keep else -100
                for pos, (token, keep) in enumerate(zip(ids, mask))
            ]

        labels.append(label)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [None]:
# apply tokenization to each split in turn (train, validation, test),
# replacing the original text columns with the tokenized ones
tokenized_train, tokenized_val, tokenized_test = (
    split.map(
        tokenize_function,
        batched=True,
        remove_columns=split.column_names
    )
    for split in (train_dataset, val_dataset, test_dataset)
)

# save tokenized datasets
for tokenized_split, target_dir in (
    (tokenized_train, 'datasets/tokenized_train'),
    (tokenized_val, 'datasets/tokenized_val'),
    (tokenized_test, 'datasets/tokenized_test'),
):
    tokenized_split.save_to_disk(target_dir)

Map:   0%|          | 0/11512 [00:00<?, ? examples/s]

[❌] Assistant token not found in input_ids.
[❌] Assistant token not found in input_ids.
[❌] Assistant token not found in input_ids.
[❌] Assistant token not found in input_ids.
[❌] Assistant token not found in input_ids.
[❌] Assistant token not found in input_ids.
[❌] Assistant token not found in input_ids.
[❌] Assistant token not found in input_ids.
[❌] Assistant token not found in input_ids.
[❌] Assistant token not found in input_ids.


Map:   0%|          | 0/1439 [00:00<?, ? examples/s]

Map:   0%|          | 0/1439 [00:00<?, ? examples/s]

[❌] Assistant token not found in input_ids.
[❌] Assistant token not found in input_ids.


Saving the dataset (0/1 shards):   0%|          | 0/11512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1439 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1439 [00:00<?, ? examples/s]

### Check for correctness

In [None]:
# reload the tokenized datasets from disk to check that they were saved correctly
from datasets import load_from_disk

# load each dataset from disk (train, validation, test)
tokenized_train, tokenized_val, tokenized_test = map(
    load_from_disk,
    ('datasets/tokenized_train', 'datasets/tokenized_val', 'datasets/tokenized_test'),
)

# report the size of each split, one per line
print(
    f"Training examples: {len(tokenized_train)}",
    f"Validation examples: {len(tokenized_val)}",
    f"Test examples: {len(tokenized_test)}",
    sep="\n",
)

Training examples: 11512
Validation examples: 1439
Test examples: 1439


In [None]:
import random

# prints one example of the tokenized dataset, chosen at random unless an index is given
def print_tokenized_example(tokenized_dataset, tokenizer, index=None):
    """Print ids, decoded text, attention mask and labels of one example.

    Args:
        tokenized_dataset: Indexable collection of examples with ``input_ids``,
            ``labels`` and ``attention_mask`` entries.
        tokenizer: Object whose ``decode`` turns a list of ids back into text.
        index: Position of the example to show; a random one is picked when None.

    Returns:
        None; everything is written with ``print``.
    """
    if index is None:
        index = random.randint(0, len(tokenized_dataset) - 1)
    record = tokenized_dataset[index]

    token_ids = record['input_ids']
    label_ids = record['labels']
    mask = record['attention_mask']

    # full sequence, special tokens included
    full_text = tokenizer.decode(token_ids, skip_special_tokens=False)

    # only the positions that are not masked with -100
    visible_labels = list(filter(lambda token: token != -100, label_ids))
    label_text = tokenizer.decode(visible_labels, skip_special_tokens=False)

    print(f"--- Example index: {index} ---")
    sections = (
        ("== INPUT IDS ==", token_ids),
        ("\n== INPUT TEXT ==", full_text),
        ("\n== ATTENTION MASK ==", mask),
        ("\n== LABELS (masked prompt) ==", label_ids),
        ("\n== LABEL TEXT ==", label_text),
    )
    for heading, payload in sections:
        print(heading)
        print(payload)

# show one training example; no index is passed, so it is picked at random
print_tokenized_example(tokenized_train, tokenizer)

--- Example index: 5582 ---
== INPUT IDS ==
[151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151

In [None]:
# function to check the correctness of a tokenized example by verifying two properties:
# 1. The attention mask matches the padding tokens in input_ids
# 2. Labels correctly mask the prompt tokens, i.e. only the response part is not -100
def verify_tokenized_example(example, tokenizer, assistant_token=ASSISTANT_TOKEN_START, eos_token_id=151643, verbose=True):
    """Run sanity checks on one tokenized example.

    Args:
        example: Mapping with ``input_ids``, ``labels`` and ``attention_mask`` lists.
        tokenizer: Tokenizer providing ``pad_token_id`` and ``convert_tokens_to_ids``.
        assistant_token: Token that marks the start of the assistant response.
        eos_token_id: Id of the EOS token; also used as the padding id when the
            tokenizer defines no pad token.
        verbose: When False, the success messages are not printed (failures
            are always printed).

    Returns:
        True when every check passes, False at the first failed check.
    """
    input_ids = example['input_ids']
    labels = example['labels']
    attention_mask = example['attention_mask']

    # sanity check
    if len(input_ids) != len(labels) or len(input_ids) != len(attention_mask):
        print("[❌] Mismatch in tensor lengths.")
        return False

    # 1 check padding (0s in attention mask == padding token IDs)
    # `is not None` instead of `or`: a pad id of 0 is valid and must not be replaced by the EOS id
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else eos_token_id
    # when padding reuses the EOS id, one occurrence is the real EOS closing the
    # response and is not padding; with a distinct pad id nothing is subtracted
    eos_occurrences = 1 if pad_id == eos_token_id else 0
    num_padding_tokens = max(0, input_ids.count(pad_id) - eos_occurrences)
    num_attention_mask_zeros = attention_mask.count(0)

    if num_padding_tokens != num_attention_mask_zeros:
        print(f"[❌] Padding mismatch: found {num_padding_tokens} pad IDs but {num_attention_mask_zeros} zeros in attention_mask.")
        return False
    elif verbose:
        print(f"[✅] Attention mask matches padding: {num_padding_tokens}.")

    # 2 visit in reverse, counting unmasked labels until the last masked assistant token is reached;
    #   unmasked labels located before that token are therefore not counted
    assistant_token_id = tokenizer.convert_tokens_to_ids(assistant_token)   # looked up once, not per position
    response_token_count = 0
    for i in reversed(range(len(labels))):
        if labels[i] != -100:
            response_token_count += 1
        elif input_ids[i] == assistant_token_id:
            break

    actual_response_tokens = sum(1 for l in labels if l != -100)

    # the two counts differ only if some label before the assistant token is unmasked
    if actual_response_tokens != response_token_count:
        print(f"[❌] Label mismatch: expected {response_token_count} response tokens, got {actual_response_tokens}.")
        return False
    elif verbose:
        print(f"[✅] Labels correctly mask prompt tokens, {response_token_count} response tokens detected.")

    return True

# check the training examples one by one; all() stops at the first failing example
if not all(verify_tokenized_example(example, tokenizer) for example in tokenized_train):
    print("Error in tokenized example!")

[✅] Attention mask matches padding: 148.
[✅] Labels correctly mask prompt tokens, 12 response tokens detected.
[✅] Attention mask matches padding: 169.
[✅] Labels correctly mask prompt tokens, 6 response tokens detected.
[✅] Attention mask matches padding: 176.
[✅] Labels correctly mask prompt tokens, 7 response tokens detected.
[✅] Attention mask matches padding: 139.
[✅] Labels correctly mask prompt tokens, 3 response tokens detected.
[✅] Attention mask matches padding: 172.
[✅] Labels correctly mask prompt tokens, 9 response tokens detected.
[✅] Attention mask matches padding: 141.
[✅] Labels correctly mask prompt tokens, 5 response tokens detected.
[✅] Attention mask matches padding: 145.
[✅] Labels correctly mask prompt tokens, 11 response tokens detected.
[✅] Attention mask matches padding: 123.
[✅] Labels correctly mask prompt tokens, 25 response tokens detected.
[✅] Attention mask matches padding: 140.
[✅] Labels correctly mask prompt tokens, 7 response tokens detected.
[✅] Att

In [None]:
# counts the number of examples with full attention, i.e. no padding tokens
# useful to set correct batch size for training
def count_full_attention(dataset):
    """Report how many examples have an attention mask made only of 1s.

    Args:
        dataset: Sized iterable of examples, each with an ``attention_mask`` list.

    Returns:
        The percentage (0-100) of examples without padding; 0.0 for an empty
        dataset. The same figure is also printed.
    """
    total = len(dataset)
    full_attention_count = sum(
        1
        for example in dataset
        if all(token == 1 for token in example['attention_mask'])
    )

    # an empty dataset would otherwise raise ZeroDivisionError
    percentage = (full_attention_count / total) * 100 if total else 0.0

    print(f"✅ Percentage of examples with full attention (no padding): {full_attention_count} / {total} = {percentage:.2f}%")
    return percentage

# share of training examples that fill the whole sequence length (the returned percentage is also shown as the cell value)
count_full_attention(tokenized_train)

✅ Percentage of examples with full attention (no padding): 92 / 11512 = 0.80%


0.7991660875608061