In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer

In [9]:
# Load your dataset with the correct delimiter
df = pd.read_csv('chit_chat.csv', delimiter=';')

# Ensure that the columns are named correctly
df.columns = ['input', 'response']

# Split the dataset into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Convert DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

In [10]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")

# Set padding token to be the same as the end-of-sequence token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def preprocess_function(examples):
    inputs = examples['input']
    targets = examples['response']
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply tokenization
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/6 [00:00<?, ? examples/s]



Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [11]:
# Fine-Tuning the model
# =================================
# Load the DialoGPT model
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=4,   # batch size for training
    per_device_eval_batch_size=4,    # batch size for evaluation
    warmup_steps=200,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    learning_rate=3e-5,              # set a lower learning rate
    evaluation_strategy="epoch",     # evaluate at the end of each epoch
)

# Initialize Trainer
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=tokenized_train_dataset,  # training dataset
    eval_dataset=tokenized_val_dataset     # evaluation dataset
)

# Start training
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,3.765067
2,No log,3.765204
3,No log,3.764621
4,No log,3.762946
5,8.696100,3.75895


TrainOutput(global_step=10, training_loss=8.696096801757813, metrics={'train_runtime': 55.7341, 'train_samples_per_second': 0.538, 'train_steps_per_second': 0.179, 'total_flos': 6965255208960.0, 'train_loss': 8.696096801757813, 'epoch': 5.0})

In [12]:
model.save_pretrained('LLM_MODELS/fine-tuned-dialoGPT')
tokenizer.save_pretrained('LLM_MODELS/fine-tuned-dialoGPT')

('LLM_MODELS/fine-tuned-dialoGPT/tokenizer_config.json',
 'LLM_MODELS/fine-tuned-dialoGPT/special_tokens_map.json',
 'LLM_MODELS/fine-tuned-dialoGPT/vocab.json',
 'LLM_MODELS/fine-tuned-dialoGPT/merges.txt',
 'LLM_MODELS/fine-tuned-dialoGPT/added_tokens.json',
 'LLM_MODELS/fine-tuned-dialoGPT/tokenizer.json')

In [16]:
# Load the fine-tuned model and tokenizer
model = AutoModelForCausalLM.from_pretrained('LLM_MODELS/fine-tuned-dialoGPT')
tokenizer = AutoTokenizer.from_pretrained('LLM_MODELS/fine-tuned-dialoGPT')

In [2]:
tokenizer = AutoTokenizer.from_pretrained("LLM_MODELS/fine-tuned-dialoGPT/") # saved Tokenizer
model = AutoModelForCausalLM.from_pretrained("LLM_MODELS//fine-tuned-dialoGPT/") # saved Model

def generate_chit_chat_response(query):
    # Encoding the input
    input_ids = tokenizer.encode(query + tokenizer.eos_token, return_tensors='pt')
    # Generate the response
    response_ids = model.generate(input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)
    # Decode the generated response
    response = tokenizer.decode(response_ids[:, input_ids.shape[-1]:][0], skip_special_tokens=True)
    return response

# Test the model 
print(generate_chit_chat_response("Hello"))
print(generate_chit_chat_response("how are you?"))
print(generate_chit_chat_response("can you help me?"))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Hello! :D
I'm good, how are you?
message me your info and list
