# Career Q&A Chatbot - Model Training

In [1]:
# Install dependencies
!pip install transformers tensorflow pandas scikit-learn sacrebleu gradio torch

Collecting torch
  Using cached torch-2.6.0-cp311-cp311-win_amd64.whl.metadata (28 kB)
Collecting networkx (from torch)
  Using cached networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting sympy==1.13.1 (from torch)
  Using cached sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy==1.13.1->torch)
  Using cached mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Using cached torch-2.6.0-cp311-cp311-win_amd64.whl (204.2 MB)
Downloading sympy-1.13.1-py3-none-any.whl (6.2 MB)
   ---------------------------------------- 0.0/6.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/6.2 MB ? eta -:--:--
   - -------------------------------------- 0.3/6.2 MB ? eta -:--:--
   - -------------------------------------- 0.3/6.2 MB ? eta -:--:--
   ------ --------------------------------- 1.0/6.2 MB 2.0 MB/s eta 0:00:03
   ---------- ----------------------------- 1.6/6.2 MB 2.3 MB/s eta 0:00:03
   ----------- ---------------------------- 1.8/6.2 M


[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import tensorflow as tf
from transformers import T5Tokenizer, TFT5ForConditionalGeneration
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load dataset
df = pd.read_csv("Data/CareerDataset.csv")
print("Dataset columns:", df.columns)
print("\nSample data:\n", df.head())

Dataset columns: Index(['role', 'question', 'answer'], dtype='object')

Sample data:
              role                                           question  \
0  Data Scientist                     What does a Data Scientist do?   
1  Data Scientist  What are the main responsibilities of a Data S...   
2  Data Scientist  What is the job description for a Data Scientist?   
3  Data Scientist  What skills are required to become a Data Scie...   
4  Data Scientist  What are the essential skills for a successful...   

                                              answer  
0  A Data Scientist extracts meaningful insights ...  
1  Responsibilities include data cleaning, analyz...  
2  A Data Scientist is responsible for collecting...  
3  Skills required include expertise in Python or...  
4  A successful Data Scientist needs strong analy...  


In [4]:
df = df[["question", "answer"]]

# Add prefix for T5 task conditioning
df["input_text"] = "answer career question: " + df["question"]
df["target_text"] = df["answer"]

# Split data
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

In [5]:
# Tokenization function (fixed for tuples)
def tokenize_data(input_text_tensor, target_text_tensor):
    input_text = input_text_tensor.numpy().decode("utf-8")
    target_text = target_text_tensor.numpy().decode("utf-8")
    
    inputs = tokenizer(
        input_text,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="tf"
    )
    targets = tokenizer(
        target_text,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="tf"
    )
    return (
        inputs["input_ids"].squeeze(), 
        inputs["attention_mask"].squeeze(), 
        targets["input_ids"].squeeze()
    )

# Map wrapper for tuples
def map_wrapper(input_text, target_text):
    return tf.py_function(
        tokenize_data,
        [input_text, target_text],
        (tf.int32, tf.int32, tf.int32)
    )

# Convert tuples to dict
def to_dict(input_ids, attention_mask, labels):
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

# Dataset pipelines
train_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (train_df["input_text"], train_df["target_text"])
    )
    .map(map_wrapper)
    .map(to_dict)
    .shuffle(1000)
    .batch(8)
    .prefetch(tf.data.AUTOTUNE)
)

val_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (val_df["input_text"], val_df["target_text"])
    )
    .map(map_wrapper)
    .map(to_dict)
    .batch(8)
    .prefetch(tf.data.AUTOTUNE)
)

In [6]:
def map_wrapper(input_text, target_text):
    # Use tf.py_function with output signature as a TUPLE of types
    return tf.py_function(
        tokenize_data,
        [input_text, target_text],
        (tf.int32, tf.int32, tf.int32)  # Output types for input_ids, attention_mask, labels
    )

# Convert tuples to dictionaries in the dataset pipeline
def to_dict(input_ids, attention_mask, labels):
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

# Create datasets
# Create datasets with TUPLE structure (input_text, target_text)
train_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (train_df["input_text"], train_df["target_text"])
    )
    .map(map_wrapper)
    .batch(8)
    .prefetch(tf.data.AUTOTUNE)
)

val_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (val_df["input_text"], val_df["target_text"])
    )
    .map(map_wrapper)
    .batch(8)
    .prefetch(tf.data.AUTOTUNE)
)


In [7]:
from transformers import T5Tokenizer, TFT5ForConditionalGeneration

# Load model and tokenizer (fixed)
model = TFT5ForConditionalGeneration.from_pretrained("t5-small", from_pt=True)
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
model.compile(optimizer=optimizer)

# Train
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=3
)

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


Epoch 1/3


OperatorNotAllowedInGraphError: in user code:

    File "c:\Users\Hp\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\engine\training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\Hp\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\engine\training.py", line 1268, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\Hp\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\engine\training.py", line 1249, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\Hp\AppData\Local\Programs\Python\Python311\Lib\site-packages\transformers\modeling_tf_utils.py", line 1658, in train_step
        y_pred = self(x, training=True)
    File "c:\Users\Hp\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None

    OperatorNotAllowedInGraphError: Exception encountered when calling layer 'tft5_for_conditional_generation' (type TFT5ForConditionalGeneration).
    
    in user code:
    
        File "c:\Users\Hp\AppData\Local\Programs\Python\Python311\Lib\site-packages\transformers\modeling_tf_utils.py", line 1298, in run_call_with_unpacked_inputs  *
            return func(self, **unpacked_inputs)
        File "c:\Users\Hp\AppData\Local\Programs\Python\Python311\Lib\site-packages\transformers\models\t5\modeling_tf_t5.py", line 1340, in call  *
            encoder_outputs = self.encoder(
        File "c:\Users\Hp\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
    
        OperatorNotAllowedInGraphError: Exception encountered when calling layer 'encoder' (type TFT5MainLayer).
        
        in user code:
        
            File "c:\Users\Hp\AppData\Local\Programs\Python\Python311\Lib\site-packages\transformers\modeling_tf_utils.py", line 1298, in run_call_with_unpacked_inputs  *
                return func(self, **unpacked_inputs)
            File "c:\Users\Hp\AppData\Local\Programs\Python\Python311\Lib\site-packages\transformers\models\t5\modeling_tf_t5.py", line 693, in call  *
                batch_size, seq_length = input_shape
        
            OperatorNotAllowedInGraphError: Iterating over a symbolic `tf.Tensor` is not allowed: AutoGraph did convert this function. This might indicate you are trying to use an unsupported feature.
        
        
        Call arguments received by layer 'encoder' (type TFT5MainLayer):
          • input_ids=tf.Tensor(shape=<unknown>, dtype=int32)
          • attention_mask=None
          • encoder_hidden_states=None
          • encoder_attention_mask=None
          • inputs_embeds=None
          • head_mask=None
          • encoder_head_mask=None
          • past_key_values=None
          • use_cache=None
          • output_attentions=False
          • output_hidden_states=False
          • return_dict=True
          • training=True
    
    
    Call arguments received by layer 'tft5_for_conditional_generation' (type TFT5ForConditionalGeneration):
      • input_ids={'input_ids': 'tf.Tensor(shape=<unknown>, dtype=int32)', 'labels': 'tf.Tensor(shape=<unknown>, dtype=int32)'}
      • attention_mask=None
      • decoder_input_ids=None
      • decoder_attention_mask=None
      • head_mask=None
      • decoder_head_mask=None
      • encoder_outputs=None
      • past_key_values=None
      • inputs_embeds=None
      • decoder_inputs_embeds=None
      • labels=None
      • use_cache=None
      • output_attentions=None
      • output_hidden_states=None
      • return_dict=None
      • training=True
