In [1]:
import pandas as pd
import requests
import re
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer,GPT2LMHeadModel, Trainer, TrainingArguments

In [2]:

url = "https://datasets-server.huggingface.co/rows?dataset=Kaludi%2FCustomer-Support-Responses&config=default&split=train&offset=0&length=100"
response = requests.get(url)
data = response.json()

df = pd.DataFrame([row['row'] for row in data['rows']])

In [3]:
df

Unnamed: 0,query,response
0,My order hasn't arrived yet.,We apologize for the inconvenience. Can you pl...
1,I received a damaged product.,We apologize for the inconvenience. Can you pl...
2,I need to return an item.,Certainly. Please provide your order number an...
3,I want to change my shipping address.,No problem. Can you please provide your order ...
4,I have a question about my bill.,We'd be happy to help. Can you please provide ...
...,...,...
69,How do I schedule a consultation or appointment?,We'd be happy to help. Can you please provide ...
70,Can I get a copy of my receipt?,Certainly. Can you please provide your order n...
71,Can I use a competitor's coupon at your store?,"In some cases, we may accept competitor coupon..."
72,Do you have a recycling program?,"Yes, we do have a recycling program. Can you p..."


In [4]:
def clean_text(text):
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    text = text.lower()
    return text

df['query_clean'] = df['query'].apply(clean_text)
df['response_clean'] = df['response'].apply(clean_text)

In [6]:
df.head()

Unnamed: 0,query,response,query_clean,response_clean
0,My order hasn't arrived yet.,We apologize for the inconvenience. Can you pl...,my order hasnt arrived yet,we apologize for the inconvenience can you ple...
1,I received a damaged product.,We apologize for the inconvenience. Can you pl...,i received a damaged product,we apologize for the inconvenience can you ple...
2,I need to return an item.,Certainly. Please provide your order number an...,i need to return an item,certainly please provide your order number and...
3,I want to change my shipping address.,No problem. Can you please provide your order ...,i want to change my shipping address,no problem can you please provide your order n...
4,I have a question about my bill.,We'd be happy to help. Can you please provide ...,i have a question about my bill,wed be happy to help can you please provide yo...


In [5]:
# Tokenization
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
df['query_clean'] = df['query_clean'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
df['response_clean'] = df['response_clean'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))

In [6]:
df

Unnamed: 0,query,response,query_clean,response_clean
0,My order hasn't arrived yet.,We apologize for the inconvenience. Can you pl...,"[1820, 1502, 468, 429, 5284, 1865]","[732, 16521, 329, 262, 31818, 460, 345, 3387, ..."
1,I received a damaged product.,We apologize for the inconvenience. Can you pl...,"[72, 2722, 257, 9694, 1720]","[732, 16521, 329, 262, 31818, 460, 345, 3387, ..."
2,I need to return an item.,Certainly. Please provide your order number an...,"[72, 761, 284, 1441, 281, 2378]","[39239, 306, 3387, 2148, 534, 1502, 1271, 290,..."
3,I want to change my shipping address.,No problem. Can you please provide your order ...,"[72, 765, 284, 1487, 616, 8440, 2209]","[3919, 1917, 460, 345, 3387, 2148, 534, 1502, ..."
4,I have a question about my bill.,We'd be happy to help. Can you please provide ...,"[72, 423, 257, 1808, 546, 616, 2855]","[19103, 307, 3772, 284, 1037, 460, 345, 3387, ..."
...,...,...,...,...
69,How do I schedule a consultation or appointment?,We'd be happy to help. Can you please provide ...,"[4919, 466, 1312, 7269, 257, 18103, 393, 12557]","[19103, 307, 3772, 284, 1037, 460, 345, 3387, ..."
70,Can I get a copy of my receipt?,Certainly. Can you please provide your order n...,"[5171, 1312, 651, 257, 4866, 286, 616, 14507]","[39239, 306, 460, 345, 3387, 2148, 534, 1502, ..."
71,Can I use a competitor's coupon at your store?,"In some cases, we may accept competitor coupon...","[5171, 1312, 779, 257, 13861, 34990, 379, 534,...","[259, 617, 2663, 356, 743, 2453, 20319, 45972,..."
72,Do you have a recycling program?,"Yes, we do have a recycling program. Can you p...","[4598, 345, 423, 257, 25914, 1430]","[8505, 356, 466, 423, 257, 25914, 1430, 460, 3..."


In [7]:
df.drop(['query','response'],axis=1,inplace=True)

In [8]:
train_data, val_data = train_test_split(df, test_size=0.1, random_state=42)

In [9]:
train_data

Unnamed: 0,query_clean,response_clean
12,"[72, 387, 1151, 2722, 257, 2882, 284, 616, 305...","[732, 16521, 329, 262, 5711, 460, 345, 3387, 2..."
55,"[271, 612, 257, 5175, 598, 329, 534, 3650]","[8505, 356, 466, 423, 257, 5175, 598, 460, 345..."
65,"[5171, 1312, 1295, 257, 2183, 1502]","[19103, 307, 3772, 284, 3342, 345, 460, 345, 3..."
31,"[4919, 466, 1312, 32793, 12522, 422, 534, 13129]","[22474, 7926, 284, 766, 345, 467, 460, 345, 33..."
9,"[72, 18548, 1064, 262, 2378, 545, 2045, 329]","[22474, 994, 284, 1037, 460, 345, 3387, 2148, ..."
...,...,...
20,"[5171, 1312, 751, 281, 2378, 284, 281, 4683, 1...","[4053, 466, 674, 1266, 284, 1037, 460, 345, 33..."
60,"[4919, 466, 1312, 1487, 616, 3053, 15387]","[732, 460, 1037, 351, 326, 460, 345, 3387, 214..."
71,"[5171, 1312, 779, 257, 13861, 34990, 379, 534,...","[259, 617, 2663, 356, 743, 2453, 20319, 45972,..."
14,"[10919, 318, 262, 3722, 286, 616, 18215, 1624]","[19103, 307, 3772, 284, 2198, 329, 345, 460, 3..."


In [10]:
val_data

Unnamed: 0,query_clean,response_clean
4,"[72, 423, 257, 1808, 546, 616, 2855]","[19103, 307, 3772, 284, 1037, 460, 345, 3387, ..."
63,"[5171, 1312, 651, 257, 9014, 636, 329, 616, 1720]","[39239, 306, 460, 345, 3387, 2148, 262, 1720, ..."
18,"[5171, 1312, 1295, 257, 11963, 1502]","[39239, 306, 460, 345, 3387, 2148, 262, 1720, ..."
0,"[1820, 1502, 468, 429, 5284, 1865]","[732, 16521, 329, 262, 31818, 460, 345, 3387, ..."
28,"[72, 18548, 2604, 656, 616, 1848]","[732, 16521, 329, 262, 31818, 460, 345, 3387, ..."
73,"[4919, 466, 1312, 989, 257, 2626, 393, 9909, 6...","[22474, 7926, 284, 3285, 326, 460, 345, 3387, ..."
10,"[320, 1719, 5876, 11524, 257, 30879, 2438]","[732, 16521, 329, 262, 31818, 460, 345, 3387, ..."
34,"[533, 612, 597, 1459, 20699, 393, 4200]","[19103, 307, 3772, 284, 4175, 345, 286, 597, 1..."


In [11]:
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [12]:
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
)

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
)

In [15]:
trainer.train()

  0%|          | 0/51 [00:00<?, ?it/s]

KeyError: 6