In [3]:
!pip install pandas

Collecting pandas
  Downloading pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl (11.3 MB)
[K     |████████████████████████████████| 11.3 MB 6.1 MB/s eta 0:00:01
Collecting pytz>=2020.1
  Using cached pytz-2025.2-py2.py3-none-any.whl (509 kB)
Collecting numpy>=1.22.4
  Using cached numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl (5.3 MB)
Collecting tzdata>=2022.7
  Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: tzdata, pytz, numpy, pandas
Successfully installed numpy-2.0.2 pandas-2.2.3 pytz-2025.2 tzdata-2025.2


In [4]:
import pandas as pd

In [5]:
# Load the dataset
df = pd.read_csv('Input_Data/Customer_Service_Training_Dataset.csv')

In [9]:
# Drop the 'tags' column
df.drop(columns=['tags'], inplace=True)

# Remove any duplicate rows to avoid training bias
df.drop_duplicates(inplace=True)

# Remove missing values
df.dropna(inplace=True)

KeyError: "['tags'] not found in axis"

In [11]:

df['category'].value_counts()

category
ACCOUNT             1412
ORDER                966
REFUND               718
INVOICE              526
PAYMENT              509
FEEDBACK             500
CONTACT              478
SHIPPING_ADDRESS     474
DELIVERY             472
CANCELLATION_FEE     246
NEWSLETTER           238
Name: count, dtype: int64

In [9]:
pip install transformers

Collecting transformers
  Downloading transformers-4.50.2-py3-none-any.whl.metadata (39 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl.metadata (3.8 kB)
Downloading transformers-4.50.2-py3-none-any.whl (10.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl (418 kB)
Downloading tokenizers-0.21.1-cp39-abi3-macosx_11_0_arm64.whl (2.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: safetensors, tokenizers, transformers
Successfully installed safetensors-0.5.3 tokenizers-0.21.1 transformers-4.50.2
Note: you may need to rest

In [15]:
from transformers import BertTokenizer
from sklearn.preprocessing import LabelEncoder

In [16]:
# For finetuning with a BERT model, we need to convert the 'utterance' column to tokens and then convert 'intent' and 'category' to categorical encodings
# For now, these are tokenised to work with the "bert-base-cased", but this can easily be changed in the future if we change models.

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
df['tokenized'] = df['utterance'].apply(lambda x: tokenizer.tokenize(x))

intent_encoder = LabelEncoder()
df['intent_label'] = intent_encoder.fit_transform(df['intent'])

category_encoder = LabelEncoder()
df['category_label'] = category_encoder.fit_transform(df['category'])

In [27]:
# Now we encode the user messages
max_length = 128

def encode_text(text):
    encoded = tokenizer(text, padding='max_length', truncation=True, max_length=max_length, return_tensors="pt")
    return encoded['input_ids'][0], encoded['attention_mask'][0]

df[['input_ids', 'attention_mask']] = df['utterance'].apply(lambda x: encode_text(x)).apply(pd.Series)

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6539 entries, 0 to 6538
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   utterance       6539 non-null   object
 1   intent          6539 non-null   int8  
 2   category        6539 non-null   int8  
 3   tokenized       6539 non-null   object
 4   intent_label    6539 non-null   int64 
 5   category_label  6539 non-null   int64 
 6   input_ids       6539 non-null   object
 7   attention_mask  6539 non-null   object
dtypes: int64(2), int8(2), object(4)
memory usage: 319.4+ KB


In [28]:
# We can save the final DataFrame to a new CSV file, and then use it later for training
df.to_csv('Cleaned_Data/Customer_Service_Training_Dataset_Final.csv', index=False)