#	Notebook Outline: Building a Financial Chatbot with Transformers
## 📂 1. Setup and Imports
Description: Load libraries and set up the environment.
Content:


In [17]:
# Install necessary libraries
%pip install transformers datasets torch tensorflow

# Import required libraries
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report


Note: you may need to restart the kernel to use updated packages.


## 🔍 2. Load and Explore the Dataset
Description: Load the CSV file, inspect the data, and handle missing values.
Content:


In [18]:
# Load the dataset
file_path = "Data/Financial-QA-10k.csv"
df = pd.read_csv(file_path)

# Preview the dataset. A notebook only renders a cell's final expression
# automatically, so the preview has to go through display() to be shown.
display(df.head())

# Check for missing values (rendered as the cell's result)
df.isnull().sum()


question    2
answer      2
context     1
ticker      0
filing      0
dtype: int64

## 🧹 3. Data Preprocessing
Description: Select the relevant columns, combine each question with its context, and split the data into training and validation sets.
Content:


In [19]:
from sklearn.model_selection import train_test_split

# Keep only the columns the model needs (ticker and filing are unused for now).
# Rows with a missing question, answer, or context are dropped: concatenating a
# string with NaN yields NaN, which would put non-string inputs and labels into
# the split. .copy() gives an independent frame so the column assignment below
# does not trigger pandas' SettingWithCopyWarning.
df = df[['question', 'answer', 'context']].dropna().copy()

# Combine question and context for richer input
df['input_text'] = df['question'] + " " + df['context']

# Split the dataset: 80% training / 20% validation, seeded for reproducibility
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['input_text'].values, df['answer'].values, test_size=0.2, random_state=42
)


## 🧠 4. Tokenization
Description: Convert text into model-ready tokens using Hugging Face's tokenizer.
Content:


In [20]:
from transformers import AutoTokenizer

# Upper bound (in tokens) on every encoded sequence; longer inputs are truncated.
MAX_LENGTH = 512

# Choose a pre-trained model (like DistilBERT)
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the train/validation texts produced by the dataset split rather than
# replacing them with hard-coded sample sentences, so the encodings stay aligned
# with train_labels / val_labels.
# str() guards against non-string entries (e.g. NaN from rows with missing
# fields), which the tokenizer rejects.
train_texts = [str(text) for text in train_texts]
val_texts = [str(text) for text in val_texts]

# Tokenize the input
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=MAX_LENGTH)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=MAX_LENGTH)

## 🔧 5. Model Setup and Training
Description: Load the pre-trained model and fine-tune it using your dataset.
Content:


In [21]:
pip install --upgrade tensorflow keras

Note: you may need to restart the kernel to use updated packages.


In [22]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import tensorflow as tf

# Toy corpus for a quick end-to-end run: two sentences per split, with integer
# class labels 0 and 1.
# NOTE(review): these assignments replace any train/validation variables
# created by earlier cells — confirm that this is intended.
train_texts = [
    "This is a sample training text.",
    "Another training text.",
]
val_texts = [
    "This is a sample validation text.",
    "Another validation text.",
]
train_labels = [0, 1]
val_labels = [0, 1]

# Pre-trained checkpoint (DistilBERT) and its matching tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Encode both splits with identical settings: truncate to at most 512 tokens
# and pad each batch to a common length.
train_encodings, val_encodings = (
    tokenizer(list(texts), truncation=True, padding=True, max_length=512)
    for texts in (train_texts, val_texts)
)

# Convert encodings to TensorFlow datasets
def gen_dataset(encodings, labels):
    """Bundle tokenizer output and labels into a ``tf.data.Dataset``.

    Args:
        encodings: Mapping-like object exposing ``.items()`` whose values can
            be converted with ``tf.constant`` (e.g. the tokenizer's output).
        labels: Sequence of labels, converted with ``tf.constant``.

    Returns:
        A dataset built with ``from_tensor_slices`` from the pair
        ``(features_dict, labels)``.
    """
    features = {}
    for name, values in encodings.items():
        features[name] = tf.constant(values)
    targets = tf.constant(labels)
    return tf.data.Dataset.from_tensor_slices((features, targets))

# Wrap the encoded splits and their labels as tf.data datasets
train_dataset = gen_dataset(encodings=train_encodings, labels=train_labels)
val_dataset = gen_dataset(encodings=val_encodings, labels=val_labels)

# Pre-trained checkpoint with a sequence-classification head for two labels
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Optimizer and loss are built separately, then handed to compile().
# from_logits=True tells the loss it receives raw scores, not probabilities.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

# Fine-tune for three epochs in batches of two, validating after each epoch
model.fit(
    train_dataset.batch(2),
    validation_data=val_dataset.batch(2),
    epochs=3,
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Epoch 1/3


AttributeError: in user code:

    File "c:\Users\mbuto\.conda\envs\heart\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\mbuto\.conda\envs\heart\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        
    File "c:\Users\mbuto\.conda\envs\heart\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        
    File "c:\Users\mbuto\.conda\envs\heart\lib\site-packages\transformers\modeling_tf_utils.py", line 1630, in train_step
        x, y, sample_weight = keras.utils.unpack_x_y_sample_weight(data)

    AttributeError: module 'keras.utils' has no attribute 'unpack_x_y_sample_weight'


## 📈 6. Model Evaluation
Description: Measure model performance using accuracy, F1-score, and more.
Content:

In [None]:
# Run the validation split through the model in batches of 16 and keep the logits
val_batches = val_dataset.batch(16)
model_output = model.predict(val_batches)
predictions = model_output.logits

# The predicted class for each example is the index of its largest logit
pred_labels = np.argmax(predictions, axis=1)

# Per-class precision / recall / F1 plus overall accuracy
report = classification_report(val_labels, pred_labels)
print(report)


## 🗨️ 7. Chatbot Interaction
Description: Build an interactive function where users can ask financial questions.
Content:

In [None]:
def chat_with_bot():
    """Run an interactive console loop that answers user questions.

    Each non-empty line typed by the user is tokenized with the notebook's
    global ``tokenizer`` and passed through the global ``model``; the index of
    the highest logit is then used to look up an entry among the unique values
    of ``df['answer']``. The loop ends when the user types ``exit`` (any case,
    surrounding whitespace ignored) or when the input stream is closed or
    interrupted.
    """
    print("Welcome to the Financial Chatbot! Type 'exit' to end the chat.")

    # The candidate answers do not change during the chat, so compute them once
    # instead of re-scanning the whole column on every turn.
    answers = df['answer'].unique()

    while True:
        try:
            user_input = input("User: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Input closed or the cell was interrupted: end the chat cleanly
            # instead of surfacing a traceback.
            print()
            break

        if user_input.lower() == "exit":
            break
        if not user_input:
            # Nothing to answer; prompt again rather than querying the model
            continue

        # Tokenize and predict
        inputs = tokenizer(user_input, return_tensors="tf", truncation=True, padding=True, max_length=512)
        outputs = model(inputs)
        # A single input yields a single row of logits; take its best class
        predicted_label = int(np.argmax(outputs.logits.numpy(), axis=-1)[0])

        # Guard the lookup so a label without a matching answer cannot raise
        if predicted_label >= len(answers):
            print("Chatbot: Sorry, I don't have an answer for that.")
            continue

        # Fetch the predicted answer
        predicted_answer = answers[predicted_label]
        print(f"Chatbot: {predicted_answer}")

chat_with_bot()
