In [None]:
!pip install -U datasets huggingface_hub fsspec


In [None]:
from datasets import load_dataset

# Load the "dair-ai/emotion" dataset through `datasets.load_dataset`.
dataset = load_dataset("dair-ai/emotion")
# Print the first training record as a quick sanity check of the schema.
print(dataset['train'][0])


In [None]:
# Class names in label-id order, read from the training split's feature metadata.
label_names = dataset["train"].features["label"].names
# id -> name lookup: positions in `label_names` are the integer class ids.
label_mapping = dict(enumerate(label_names))
label_mapping

In [None]:
# Pull the raw text column and the integer label column out of each split.
train_text, train_labels = dataset['train']['text'], dataset['train']['label']
val_text, val_labels = dataset['validation']['text'], dataset['validation']['label']
test_text, test_labels = dataset['test']['text'], dataset['test']['label']


In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Preprocessing settings shared by the Tokenizer / pad_sequences calls in this notebook.
vocab_size = 20000  # passed as `num_words` to the Tokenizer and as the Embedding vocabulary size
max_length = 100  # `maxlen` given to pad_sequences
trunc_type='post'  # `truncating` argument for pad_sequences
padding_type='post'  # `padding` argument for pad_sequences

In [None]:
# Fit the vocabulary on the training texts only; "<OOV>" is the out-of-vocabulary token.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=vocab_size)
tokenizer.fit_on_texts(train_text)
word_index = tokenizer.word_index


def _to_padded(texts):
  """Return `(sequences, padded)` for `texts` using the fitted tokenizer and shared padding settings."""
  sequences = tokenizer.texts_to_sequences(texts)
  padded = pad_sequences(
      sequences,
      maxlen=max_length,
      padding=padding_type,
      truncating=trunc_type,
  )
  return sequences, padded


# Same encoding pipeline for every split.
training_sequences, training_padded = _to_padded(train_text)
val_sequences, val_padded = _to_padded(val_text)
testing_sequences, testing_padded = _to_padded(test_text)

In [None]:
import numpy as np
# Materialise inputs and labels of every split as NumPy arrays for fit / evaluate.
train_padded, train_labels = np.array(training_padded), np.array(train_labels)
val_padded, val_labels = np.array(val_padded), np.array(val_labels)
test_padded, test_labels = np.array(testing_padded), np.array(test_labels)

In [None]:
class EmotionClassifier(tf.keras.Model):
  """Averaged-embedding text classifier.

  Token ids are embedded, averaged with GlobalAveragePooling1D, passed through
  one ReLU Dense layer followed by dropout, and mapped to a softmax over
  `output_dim` classes.

  Args:
    vocab_size: Input dimension of the embedding layer.
    embedding_dim: Output dimension of the embedding layer.
    hidden_dim: Number of units in the hidden Dense layer.
    dropout_rate: Rate passed to the Dropout layer.
    output_dim: Number of units (classes) in the softmax output layer.
  """

  def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout_rate, output_dim):
    super().__init__()
    layers = tf.keras.layers
    # Layers are created in the order they are applied in `call`.
    self.embedding = layers.Embedding(vocab_size, embedding_dim)
    self.pooling = layers.GlobalAveragePooling1D()
    self.layer1 = layers.Dense(hidden_dim, activation='relu')
    self.dropout = layers.Dropout(dropout_rate)
    self.output_layer = layers.Dense(output_dim, activation='softmax')

  def call(self, x, training=False):
    """Run the forward pass.

    Args:
      x: Batch of token-id sequences fed to the embedding layer.
      training: Forwarded to the dropout layer.

    Returns:
      The softmax output of the final Dense layer.
    """
    pooled = self.pooling(self.embedding(x))
    hidden = self.dropout(self.layer1(pooled), training=training)
    return self.output_layer(hidden)


In [None]:
# Classifier with 128-d embeddings, a 32-unit hidden layer and one output per emotion label.
model = EmotionClassifier(
    vocab_size=vocab_size,
    embedding_dim=128,
    hidden_dim=32,
    dropout_rate = 0.3,
    output_dim=len(label_mapping)
)


# Labels are integer class ids, hence the sparse categorical cross-entropy loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early stopping must watch the *validation* loss: validation data is passed to
# fit() below, and monitoring the training loss would keep training (and restore
# weights) based on a metric that keeps improving while the model overfits.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    min_delta=0.001,
    restore_best_weights=True
)
history = model.fit(
    train_padded,
    train_labels,
    epochs=40,
    validation_data=(val_padded, val_labels),
    callbacks=[callback],
    verbose=2,
)

In [None]:
# Score the trained classifier on the test split; evaluate() yields (loss, accuracy)
# because the model was compiled with a single 'accuracy' metric.
test_metrics = model.evaluate(test_padded, test_labels, verbose=2)
test_loss, test_accuracy = test_metrics

print(f"\n Test Accuracy: {test_accuracy * 100:.2f}%")
print(f" Test Loss: {test_loss:.4f}")

In [None]:
# Persist the trained classifier to "emotion_classifier.h5".
# NOTE(review): `model` is a subclassed tf.keras.Model; some Keras versions refuse to
# write subclassed models in HDF5 format. Confirm this save succeeds (and the file can
# be reloaded) in the target environment, or switch to the `.keras` format / save_weights.
model.save("emotion_classifier.h5")
# Write the fitted tokenizer's JSON representation so the same vocabulary can be restored later.
with open("tokenizer.json", "w") as f:
    f.write(tokenizer.to_json())

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
model_GPT = AutoModelForCausalLM.from_pretrained(model_id).to("cuda")
tokenizer_GPT = AutoTokenizer.from_pretrained(model_id)

tokenizer_GPT.pad_token = tokenizer_GPT.eos_token

!pip install transformers datasets peft accelerate


In [None]:
import pandas as pd  # `pd` is used below but is not imported by any earlier cell

# Therapist prompt/response pairs used for fine-tuning.
extra_df = pd.read_csv("/content/therapist_prompts_100_total.csv")
# Normalise header names before the columns are accessed by name, so stray
# whitespace in the CSV header cannot cause a KeyError below.
extra_df.columns = extra_df.columns.str.strip()
extra_df.head()

# Column values as plain Python lists, in row order.
prompts = extra_df['prompt'].tolist()
therapist = extra_df['response'].tolist()


In [None]:
# Class id -> emotion name lookup used by get_emotion.
# NOTE(review): hard-coded; presumably mirrors the label order of the "dair-ai/emotion"
# dataset (compare with `label_mapping`) — confirm the two agree.
label_map = {0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}


In [None]:
def get_emotion(text):
  """Predict the emotion label for one piece of text.

  The text is encoded with the fitted `tokenizer`, padded with the shared
  padding settings, scored by `model`, and the arg-max class id is looked up
  in `label_map`.

  Args:
    text: The raw input string.

  Returns:
    The emotion name from `label_map`, or None when `text` is not a str.
  """
  if isinstance(text, str):
    token_ids = tokenizer.texts_to_sequences([text])
    model_input = pad_sequences(
        token_ids, maxlen=max_length, padding=padding_type, truncating=trunc_type)
    probabilities = model.predict(model_input)
    return label_map[probabilities.argmax()]
  return None

# `questions`, `responses` and `emotions` were never defined in this notebook, so bind
# them here: questions/responses are the CSV columns loaded into `prompts`/`therapist`,
# and each emotion is the classifier's prediction for the corresponding question.
questions = prompts
responses = therapist
emotions = [get_emotion(q) for q in questions]

# One emotion-tagged prompt/response record per CSV row.
formatted_data = [
    {"prompt": f"User feels {e}. They said {q}\nTherapist: ", "response": r}
    for q, r, e in zip(questions, responses, emotions)
]

# Normalise header names (no effect if they contain no surrounding whitespace).
extra_df.columns = extra_df.columns.str.strip()


In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
from peft import get_peft_model, LoraConfig, TaskType

In [None]:
from datasets import Dataset
from peft import get_peft_model, LoraConfig, TaskType

# One {"prompt", "response"} record per pair, in the order of the source lists.
LoRA_data = [
    {"prompt": prompt_text, "response": reply_text}
    for prompt_text, reply_text in zip(prompts, therapist)
]

# Wrap the records in a Hugging Face Dataset (this rebinds the notebook-level name `dataset`).
dataset = Dataset.from_list(LoRA_data)

In [None]:
# LoRA settings for causal-LM fine-tuning: adapters on the "q_proj" and "v_proj" modules.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the base model with the adapters and report how many parameters are trainable.
# Note: this rebinds `model_GPT` to the wrapped model.
model_GPT = get_peft_model(model_GPT, lora_config)
model_GPT.print_trainable_parameters()

In [None]:
def tokenize(example):
    """Tokenize one prompt/response record for causal-LM training.

    Args:
        example: Mapping with string values under "prompt" and "response".

    Returns:
        The tokenizer output for the concatenated text (truncated / padded to
        256 tokens) with an added "labels" entry that is a copy of "input_ids".
    """
    encoded = tokenizer_GPT(
        example["prompt"] + example["response"],
        max_length=256,
        padding="max_length",
        truncation=True,
    )
    # NOTE(review): labels mirror input_ids verbatim, padded positions included —
    # confirm the data collator used for training masks padding in the loss.
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

# Apply `tokenize` to every record and drop the raw text columns, leaving only
# the fields produced by `tokenize`.
tokenized_dataset = dataset.map(tokenize, remove_columns=["prompt", "response"])

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Settings for the LoRA fine-tuning run; checkpoints are written under
# ./lora_therapist_model according to save_strategy="epoch".
training_args = TrainingArguments(
    output_dir="./lora_therapist_model",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    learning_rate=1e-4,
    fp16=True,
    logging_steps=5,
    save_strategy="epoch",
    report_to="none",
)

# mlm=False selects causal (not masked) language-model collation.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer_GPT)

trainer = Trainer(
    args=training_args,
    model=model_GPT,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

trainer.train()

In [None]:
def generate_response(user_input):
    """Generate a therapist-style reply to `user_input`.

    The emotion predicted by `get_emotion` is embedded in the prompt, a
    continuation is sampled from `model_GPT`, and only that continuation is
    returned.

    Args:
        user_input: The user's message.

    Returns:
        The decoded continuation with special tokens removed and surrounding
        whitespace stripped. The prompt itself is not echoed back.
    """
    emotion = get_emotion(user_input)
    prompt = f"User (feeling {emotion}): {user_input}\nTherapist:"

    inputs = tokenizer_GPT(prompt, return_tensors="pt", return_attention_mask=True).to(model_GPT.device)

    # The model may still be in train mode after fine-tuning; switch to eval so
    # dropout (including the LoRA dropout) is disabled while generating.
    model_GPT.eval()

    output = model_GPT.generate(
        **inputs,
        max_new_tokens=90,
        do_sample=True,
        top_p=0.9,
        temperature=0.8,
        pad_token_id=tokenizer_GPT.eos_token_id,
        eos_token_id=tokenizer_GPT.eos_token_id,
        repetition_penalty=1.2
    )

    # The generated sequence starts with the prompt tokens; slice them off so the
    # caller receives the reply only.
    prompt_length = inputs["input_ids"].shape[-1]
    reply_ids = output[0][prompt_length:]
    return tokenizer_GPT.decode(reply_ids, skip_special_tokens=True).strip()

# Smoke test: run the full pipeline (emotion classifier + fine-tuned LLM) on one message.
response = generate_response("I feel completely lost and scared.")
print(response)

In [None]:
# Save the PEFT-wrapped model and its tokenizer side by side in ./lora_therapist_adapters.
# NOTE(review): for a PEFT-wrapped model this presumably writes only the LoRA adapter
# weights, not the base model — confirm before relying on this directory for reloading.
model_GPT.save_pretrained("./lora_therapist_adapters")
tokenizer_GPT.save_pretrained("./lora_therapist_adapters")
