In [30]:
import os
import math
import random
from pathlib import Path
import os
import torch
USE_CUDA = True

# Paths - edit if needed
DATA_DIR = Path('/Desktop/Dataset')
USER_A_CSV = DATA_DIR / 'userA_chats.csv'
USER_B_CSV = DATA_DIR / 'userB_chats.csv'
# optional local xlsx if you downloaded it manually
CONV_XLSX = Path('conversationfile.xlsx')

OUTPUT_DIR = Path('./chatrec_output')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Model choice (local HuggingFace name; must be available offline in cache)
# Options: 'gpt2', 'distilgpt2', 'gpt2-medium' (choose small if CPU-only)
MODEL_NAME = 'distilgpt2'

# Data handling
MAX_CONTEXT_TOKENS = 512   # tokens to take from A's history (keep smaller for speed)
MAX_REPLY_TOKENS = 128     # max tokens to generate for reply
MAX_TOTAL_TOKENS = 512 + 128  # must be <= model config max_position_embeddings

# Training hyperparameters
NUM_EPOCHS = 3
BATCH_SIZE = 4  # per device
GRAD_ACCUM_STEPS = 4
LEARNING_RATE = 5e-5
WEIGHT_DECAY = 0.01
WARMUP_STEPS = 100
SEED = 42

# Generation parameters (for inference)
GEN_MAX_LENGTH = 128
GEN_TEMPERATURE = 0.8
GEN_TOP_K = 50
GEN_TOP_P = 0.9
NUM_RETURN_SEQUENCES = 1

# Misc
USE_CUDA = True
DEVICE = 'cuda' if (USE_CUDA and (os.environ.get('CUDA_VISIBLE_DEVICES') or torch.cuda.is_available())) else 'cpu'
print('Device:', DEVICE)

Device: cpu


In [31]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
)
from sklearn.model_selection import train_test_split
import joblib

# nltk BLEU
import nltk
nltk.download('punkt')
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# simple rouge-l implementation provided below (no external package required)

# reproducibility
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\heyja\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


<torch._C.Generator at 0x1b61a956890>

In [32]:
from typing import List

# ROUGE-L (LCS based)
def _lcs(a: List[str], b: List[str]) -> int:
    # classic DP for longest common subsequence
    n, m = len(a), len(b)
    dp = [[0] * (m + 1) for _ in range(n + 1)]
    for i in range(n - 1, -1, -1):
        for j in range(m - 1, -1, -1):
            if a[i] == b[j]:
                dp[i][j] = 1 + dp[i + 1][j + 1]
            else:
                dp[i][j] = max(dp[i + 1][j], dp[i][j + 1])
    return dp[0][0]


def rouge_l_score(reference: str, hypothesis: str) -> float:
    r_tokens = nltk.word_tokenize(reference.lower())
    h_tokens = nltk.word_tokenize(hypothesis.lower())
    if len(r_tokens) == 0 or len(h_tokens) == 0:
        return 0.0
    lcs = _lcs(r_tokens, h_tokens)
    prec = lcs / len(h_tokens)
    rec = lcs / len(r_tokens)
    if prec + rec == 0:
        return 0.0
    beta = 1.2
    f_score = ((1 + beta*2) * prec * rec) / (rec + beta*2 * prec + 1e-12)
    return f_score


def clean_text(s: str) -> str:
    if not isinstance(s, str):
        return ''
    return ' '.join(s.strip().split())

In [33]:
def read_chat_file(path: Path, user_label: str) -> pd.DataFrame:
    path = Path(path)
    if not path.exists():
        print(f"Warning: {path} not found")
        return pd.DataFrame()
    df = pd.read_csv(path)
    # heuristics to find message column
    msg_cols = [c for c in df.columns if c.lower() in ('message', 'text', 'msg', 'content')]
    if len(msg_cols) == 0:
        # fallback: try last column
        msg_col = df.columns[-1]
    else:
        msg_col = msg_cols[0]
    df = df.rename(columns={msg_col: 'message'})
    # add user label
    df['sender'] = user_label
    # ensure timestamp if present
    ts_cols = [c for c in df.columns if 'time' in c.lower() or 'timestamp' in c.lower()]
    if len(ts_cols) > 0:
        df['timestamp'] = pd.to_datetime(df[ts_cols[0]], errors='coerce')
    else:
        # create artificial timestamp using index (keep ordering)
        df['timestamp'] = pd.NaT
    # conversation id if exists
    if 'conversation_id' not in df.columns and 'conv_id' in df.columns:
        df = df.rename(columns={'conv_id': 'conversation_id'})
    return df[['conversation_id'] if 'conversation_id' in df.columns else [] + ['timestamp', 'sender', 'message']]

# read both
userA = read_chat_file("C:/Users/heyja/Downloads/userA_chats.csv", 'A')
userB = read_chat_file('C:/Users/heyja/Downloads/userB_chats.csv', 'B')

print('userA rows:', len(userA), 'userB rows:', len(userB))

# if both files empty and xlsx exists, try reading it (user may have downloaded the sheet locally)
if len(userA) == 0 and len(userB) == 0 and CONV_XLSX.exists():
    try:
        conv = pd.read_excel('C:/Users/heyja/Downloads/conversationfile.xlsx')
        print('Read conversationfile.xlsx shape', conv.shape)
        # try to detect sender column
        if 'sender' in conv.columns and 'message' in conv.columns:
            conversations = conv[['conversation_id'] if 'conversation_id' in conv.columns else [] + ['timestamp', 'sender', 'message']]
        else:
            # assume alternating two-person: tag alternately
            conv = conv.reset_index(drop=True)
            conv['sender'] = ['A' if i % 2 == 0 else 'B' for i in range(len(conv))]
            conv['message'] = conv.iloc[:,0].astype(str)
            conversations = conv[['timestamp', 'sender', 'message']]
    except Exception as e:
        print('Failed to read xlsx:', e)
        conversations = pd.DataFrame()
else:
    # combine and sort
    combined = pd.concat([userA, userB], ignore_index=True, sort=False)
    # if timestamp present use it; else keep original ordering
    if combined['timestamp'].notna().any():
        combined = combined.sort_values('timestamp').reset_index(drop=True)
    else:
        combined = combined.reset_index(drop=True)
    conversations = combined[['timestamp', 'sender', 'message']]

print('Constructed conversations dataframe shape:', conversations.shape)
conversations.head(6)

userA rows: 11 userB rows: 11
Constructed conversations dataframe shape: (22, 3)


Unnamed: 0,timestamp,sender,message
0,2025-10-07 10:15:12,B,"""Hey, did you see the client's feedback on the..."
1,2025-10-07 10:15:45,A,"""Just saw it. They want a lot of changes to th..."
2,2025-10-07 10:16:05,B,"""Yeah, that's what I was thinking. It's a big ..."
3,2025-10-07 10:16:38,A,"""I'll start on the revisions. Can you update t..."
4,2025-10-07 10:17:01,B,"""Will do. I'll block out the rest of the week ..."
5,2025-10-07 10:20:19,B,"""Any plans for Saturday?"""


In [34]:
def build_examples_from_timeline(df: pd.DataFrame, max_history_msgs=10):
    df = df.copy().reset_index(drop=True)
    examples = []
    # iterate over timeline, find pattern B -> A
    for i in range(1, len(df)):
        prev = df.loc[i-1]
        cur = df.loc[i]
        if prev['sender'] == 'B' and cur['sender'] == 'A':
            # collect previous A messages before position i-1 (history)
            history_msgs = []
            j = i-2
            while j >= 0 and len(history_msgs) < max_history_msgs:
                if df.loc[j]['sender'] == 'A':
                    history_msgs.append(clean_text(str(df.loc[j]['message'])))
                j -= 1
            history_msgs = list(reversed(history_msgs))
            examples.append({
                'a_history': ' \n '.join(history_msgs),
                'b_message': clean_text(str(prev['message'])),
                'a_reply': clean_text(str(cur['message'])),
            })
    return pd.DataFrame(examples)

examples = build_examples_from_timeline(conversations, max_history_msgs=8)
print('Examples built:', len(examples))
examples.sample(3)

# if zero examples, try a fallback: pair alternate messages
if len(examples) == 0 and len(conversations) > 1:
    fallback = []
    for i in range(0, len(conversations) - 1, 2):
        a_msg = conversations.loc[i]['message']
        b_msg = conversations.loc[i+1]['message']
        fallback.append({'a_history': '', 'b_message': str(b_msg), 'a_reply': str(a_msg)})
    examples = pd.DataFrame(fallback)
    print('Fallback pairing created examples:', len(examples))

# split train/val
train_df, val_df = train_test_split(examples, test_size=0.1, random_state=SEED) if len(examples) > 10 else (examples, examples)
print('Train examples:', len(train_df), 'Val examples:', len(val_df))

Examples built: 10
Train examples: 10 Val examples: 10


In [35]:
print('Loading tokenizer and model:', MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# ensure tokenizer has a pad token (gpt2 doesn't by default)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '<|pad|>'})

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
# resize embeddings if we added pad token
model.resize_token_embeddings(len(tokenizer))
model.to(DEVICE)

print('Model max length (config):', getattr(model.config, 'n_positions', 'unknown'))

Loading tokenizer and model: distilgpt2


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Model max length (config): 1024


In [36]:
PREFIX_TEMPLATE = "A_HISTORY: {a_history} \n B: {b_message} \n A:"

class ChatReplyDataset(Dataset):
    def _init_(self, df: pd.DataFrame, tokenizer: AutoTokenizer, max_context_tokens=MAX_CONTEXT_TOKENS, max_reply_tokens=MAX_REPLY_TOKENS):
        self.tokenizer = tokenizer
        self.examples = []
        self.max_context_tokens = max_context_tokens
        self.max_reply_tokens = max_reply_tokens
        for _, row in df.iterrows():
            a_hist = row.get('a_history', '') or ''
            b_msg = row.get('b_message', '') or ''
            a_reply = row.get('a_reply', '') or ''
            prompt = PREFIX_TEMPLATE.format(a_history=a_hist, b_message=b_msg)
            # full text = prompt + ' ' + reply
            full_text = prompt + ' ' + a_reply
            # tokenize without truncation first (we will manually enforce)
            enc = tokenizer(full_text, return_tensors='pt', truncation=False)
            input_ids = enc['input_ids'][0]
            # find where reply starts (by tokenizing prompt)
            prompt_enc = tokenizer(prompt, return_tensors='pt', truncation=False)
            prompt_len = prompt_enc['input_ids'].size(1)
            # enforce length constraints from the right: keep last max_context_tokens of prompt tokens
            # and the reply truncated to max_reply_tokens
            # If prompt too long, truncate prompt tokens from the left (keep recent context)
            # We'll compute token segments:
            reply_ids = input_ids[prompt_len:]
            prompt_ids = input_ids[:prompt_len]
            # truncate reply
            if reply_ids.size(0) > max_reply_tokens:
                reply_ids = reply_ids[:max_reply_tokens]
            # truncate prompt to keep recent tokens
            if prompt_ids.size(0) > max_context_tokens:
                prompt_ids = prompt_ids[-max_context_tokens:]
            # compose final sequence
            final_ids = torch.cat([prompt_ids, reply_ids], dim=0)
            # labels: -100 for prompt tokens, actual ids for reply tokens
            labels = final_ids.clone()
            labels[:prompt_ids.size(0)] = -100
            # pad to model max length if necessary
            max_len = self.tokenizer.model_max_length
            if final_ids.size(0) > max_len:
                final_ids = final_ids[-max_len:]
                labels = labels[-max_len:]
            self.examples.append({'input_ids': final_ids, 'labels': labels})

    def _len_(self):
        return len(self.examples)

    def _getitem_(self, idx):
        item = self.examples[idx]
        return {k: v.clone().detach() for k, v in item.items()}

# collate function to pad batch
from transformers import DataCollatorForLanguageModeling

def collate_fn(batch):
    input_ids = [b['input_ids'] for b in batch]
    labels = [b['labels'] for b in batch]
    input_ids_padded = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    labels_padded = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100)
    attention_mask = (input_ids_padded != tokenizer.pad_token_id).long()
    return {'input_ids': input_ids_padded, 'attention_mask': attention_mask, 'labels': labels_padded}

# build dataset objects
train_dataset = ChatReplyDataset(train_df, tokenizer) if len(train_df) > 0 else None
eval_dataset = ChatReplyDataset(val_df, tokenizer) if len(val_df) > 0 else None
print('Train dataset size:', len(train_dataset) if train_dataset else 0)

TypeError: ChatReplyDataset() takes no arguments

In [None]:
if train_dataset is not None and len(train_dataset) > 0:
    training_args = TrainingArguments(
        output_dir=str(OUTPUT_DIR / 'model'),
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRAD_ACCUM_STEPS,
        evaluation_strategy='epoch' if eval_dataset is not None and len(eval_dataset) > 0 else 'no',
        save_strategy='epoch',
        logging_strategy='steps',
        logging_steps=50,
        save_total_limit=2,
        learning_rate=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
        warmup_steps=WARMUP_STEPS,
        fp16=(DEVICE == 'cuda'),
        remove_unused_columns=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset if eval_dataset is not None and len(eval_dataset) > 0 else None,
        data_collator=collate_fn,
    )

    # quick sanity check run (optional):
    print('Starting training...')
    trainer.train()
    print('Training finished')

    # save
    model_save_path = OUTPUT_DIR / 'model' / 'final'
    model.save_pretrained(str(model_save_path))
    tokenizer.save_pretrained(str(model_save_path))
    print('Model and tokenizer saved to', model_save_path)
else:
    print('No training data found; skipping training')

In [None]:
def generate_reply(model, tokenizer, a_history: str, b_message: str, gen_kwargs=None):
    if gen_kwargs is None:
        gen_kwargs = {}
    prompt = PREFIX_TEMPLATE.format(a_history=a_history, b_message=b_message) + ' '
    # ensure prompt fits model
    input_ids = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=tokenizer.model_max_length - 1).input_ids.to(DEVICE)
    # generate
    outputs = model.generate(
        input_ids=input_ids,
        max_length=min(tokenizer.model_max_length, input_ids.shape[1] + GEN_MAX_LENGTH),
        do_sample=True,
        temperature=GEN_TEMPERATURE,
        top_k=GEN_TOP_K,
        top_p=GEN_TOP_P,
        num_return_sequences=NUM_RETURN_SEQUENCES,
        pad_token_id=tokenizer.pad_token_id,
        **gen_kwargs
    )
    # decode the completed sequence and return only the part after the prompt
    generated = []
    for out in outputs:
        out_text = tokenizer.decode(out[input_ids.shape[1]:], skip_special_tokens=True)
        generated.append(out_text.strip())
    return generated

# Evaluate on validation set (if exists)
if eval_dataset is not None and len(eval_dataset) > 0:
    refs = []
    hyps = []
    bleu_scores = []
    rouge_scores = []
    for idx, row in val_df.reset_index(drop=True).iterrows():
        a_hist = row['a_history']
        b_msg = row['b_message']
        ref = row['a_reply']
        gen = generate_reply(model, tokenizer, a_hist, b_msg)[0]
        refs.append(ref)
        hyps.append(gen)
        # BLEU (sentence-level small smoothing)
        try:
            bleu = sentence_bleu([nltk.word_tokenize(ref.lower())], nltk.word_tokenize(gen.lower()), smoothing_function=SmoothingFunction().method1)
        except Exception:
            bleu = 0.0
        bleu_scores.append(bleu)
        rouge_scores.append(rouge_l_score(ref, gen))
    avg_bleu = float(np.mean(bleu_scores))
    avg_rouge_l = float(np.mean(rouge_scores))
    print(f'Validation BLEU: {avg_bleu:.4f}  ROUGE-L: {avg_rouge_l:.4f}')
    # compute perplexity via trainer.evaluate if possible
    try:
        eval_metrics = trainer.evaluate(eval_dataset=eval_dataset)
        eval_loss = eval_metrics.get('eval_loss')
        perplexity = math.exp(eval_loss) if eval_loss is not None and eval_loss < 100 else float('inf')
        print('Eval loss:', eval_loss, 'Perplexity:', perplexity)
    except Exception as e:
        print('Could not compute perplexity via trainer:', e)

    # save a small csv with results
    res_df = pd.DataFrame({'reference': refs, 'generated': hyps, 'bleu': bleu_scores, 'rouge_l': rouge_scores})
    res_df.to_csv(OUTPUT_DIR / 'val_generation_results.csv', index=False)
    print('Saved generation results to', OUTPUT_DIR / 'val_generation_results.csv')
else:
    print('No eval dataset - skipping generation evaluation')

In [None]:
class ChatRecPredictor:
    def _init_(self, model_dir: str):
        self.model_dir = model_dir
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True)
        self.model = AutoModelForCausalLM.from_pretrained(model_dir)
        self.device = DEVICE
        self.model.to(self.device)

    def predict(self, a_history: str, b_message: str, num_return_sequences=1):
        prompt = PREFIX_TEMPLATE.format(a_history=a_history, b_message=b_message) + ' '
        input_ids = self.tokenizer(prompt, return_tensors='pt', truncation=True, max_length=self.tokenizer.model_max_length - 1).input_ids.to(self.device)
        outputs = self.model.generate(
            input_ids=input_ids,
            max_length=min(self.tokenizer.model_max_length, input_ids.shape[1] + GEN_MAX_LENGTH),
            do_sample=True,
            temperature=GEN_TEMPERATURE,
            top_k=GEN_TOP_K,
            top_p=GEN_TOP_P,
            num_return_sequences=num_return_sequences,
            pad_token_id=self.tokenizer.pad_token_id
        )
        results = []
        for out in outputs:
            out_text = self.tokenizer.decode(out[input_ids.shape[1]:], skip_special_tokens=True)
            results.append(out_text.strip())
        return results

# if we saved model above, create predictor and joblib dump
model_dir = str((OUTPUT_DIR / 'model' / 'final'))
if os.path.exists(model_dir):
    predictor = ChatRecPredictor(model_dir)
    joblib.dump(predictor, OUTPUT_DIR / 'Model.joblib')
    print('Saved Model.joblib to', OUTPUT_DIR / 'Model.joblib')
else:
    # if model not saved (training skipped), save a small metadata joblib instructing user how to load
    meta = {'note': 'No trained model found. Run training cells first and save model to ./chatrec_output/model/final'}
    joblib.dump(meta, OUTPUT_DIR / 'Model.joblib')
    print('Saved placeholder Model.joblib to', OUTPUT_DIR / 'Model.joblib')

In [None]:
from pprint import pprint

if (OUTPUT_DIR / 'Model.joblib').exists():
    loaded = joblib.load(OUTPUT_DIR / 'Model.joblib')
    if isinstance(loaded, ChatRecPredictor) or hasattr(loaded, 'predict'):
        print('Loaded predictor from joblib. Testing with a sample:')
        sample_a_hist = 'I was thinking of going for a run in the morning. Sleep was short yesterday.'
        sample_b = 'Do you want to join the soccer match tonight?'
        preds = loaded.predict(sample_a_hist, sample_b, num_return_sequences=2)
        pprint(preds)
    else:
        print('Model.joblib is a placeholder; train and save model first')
else:
    print('No Model.joblib found')