In [None]:
%pip install transformers
%pip install torch
%pip install datasets
%pip install pandas
%pip install numpy
%pip install scikit-learn

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [None]:
# Read the question/answer pairs from the CSV in the working directory.
psych_df = pd.read_csv('Psych_data.csv')

# Present each question as a client utterance followed by a therapist cue,
# so the recorded answer reads as the therapist's reply to that cue.
client_prompts = 'Client: ' + psych_df['question'] + ' \nTherapist: '
therapist_replies = psych_df['answer']

formatted_psych_df = pd.DataFrame(
    {
        'input_text': client_prompts,
        'target_text': therapist_replies,
    }
)


In [None]:
from datasets import Dataset

# Upper bound on tokens per example (prompt + answer + EOS); longer examples
# are truncated to this length.
MAX_LENGTH = 2048
# Fixed seed so the train/validation split is the same after a kernel restart.
SPLIT_SEED = 42

# Create the tokeniser. Padding requires a pad token; if this tokeniser does
# not define one, fall back to the end-of-sequence token.
tokenizer = AutoTokenizer.from_pretrained('microsoft/phi-2')
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# phi-2 is a decoder-only (causal) language model, so each training example is
# a single sequence: the prompt, then the answer, then EOS so the model learns
# where a reply ends. (Passing the answers as a second positional argument
# would encode them as a sentence *pair* and would not create any labels.)
training_texts = [
    prompt + answer + tokenizer.eos_token
    for prompt, answer in zip(
        formatted_psych_df['input_text'].tolist(),
        formatted_psych_df['target_text'].tolist(),
    )
]

# Tokenise everything in one call. 'longest' pads every example to the longest
# one in this call (capped at MAX_LENGTH by truncation), so all rows share one
# length without always paying for MAX_LENGTH tokens.
tokenized_data = tokenizer(
    training_texts,
    max_length=MAX_LENGTH,
    padding='longest',
    truncation=True,
)

# For causal-LM training the labels are the input ids themselves. Padded
# positions are set to -100 so the loss ignores them. The mask is taken from
# attention_mask rather than the pad token id, because the pad token may be the
# same id as the real EOS that terminates each example.
labels = [
    [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
    for ids, mask in zip(tokenized_data['input_ids'], tokenized_data['attention_mask'])
]

# Convert to Hugging Face Dataset format
dataset = Dataset.from_dict({
    'input_ids': tokenized_data['input_ids'],
    'attention_mask': tokenized_data['attention_mask'],
    'labels': labels,
})

# Split the dataset into train (80%) and validation (20%) sets
train_test_split = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

In [None]:
import torch
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments

# Load the model.
# - phi-2 is a decoder-only model, so it must be loaded with the causal-LM
#   auto class; the seq2seq auto class does not support it.
# - Weights are loaded in float32 because `fp16=True` below enables mixed
#   precision with gradient scaling, which cannot unscale gradients of
#   half-precision parameters.
# - The flash_attn / flash_rotary / fused_dense switches are omitted: they are
#   not arguments of the built-in Phi model class and rely on an extra package
#   this notebook does not install.
# NOTE(review): float32 weights plus optimizer state for a model of this size
# need a large GPU; reduce the batch size / sequence length or use a
# parameter-efficient method if memory is insufficient.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    torch_dtype=torch.float32,
    device_map="cuda",
    trust_remote_code=True,
)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=8,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    # Evaluate and checkpoint on the same schedule; load_best_model_at_end
    # requires the two strategies to match.
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps=100,
    save_total_limit=3,
    fp16=True,
)

# Wire the model, arguments and the two dataset splits into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_test_split['train'],
    eval_dataset=train_test_split['test']
)

In [None]:
# Run the fine-tuning loop configured in the Trainer.
trainer.train()

# Persist the model and its tokenizer (the tokenizer is part of the trained
# artefact), each into its own directory under output/.
for artefact, destination in (
    (model, 'output/model'),
    (tokenizer, 'output/tokenizer'),
):
    artefact.save_pretrained(destination)