In [1]:
import pandas as pd
from datasets import load_dataset

In [2]:
# Load the "banking77" dataset through `datasets.load_dataset`.
dataset = load_dataset("banking77")
# Bare name as the last expression: the notebook displays the returned object.
dataset

Using custom data configuration default
Reusing dataset banking77 (C:\Users\Giebels\.cache\huggingface\datasets\banking77\default\1.1.0\aec0289529599d4572d76ab00c8944cb84f88410ad0c9e7da26189d31f62a55b)
100%|██████████| 2/2 [00:00<00:00, 86.97it/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10003
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3080
    })
})

In [3]:
# Display the loaded dataset object (splits, feature names and row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10003
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3080
    })
})

In [4]:
# Inspect which class `load_dataset` returned.
type(dataset)

datasets.dataset_dict.DatasetDict

In [5]:
# List the column names of every split.
dataset.column_names

{'train': ['text', 'label'], 'test': ['text', 'label']}

In [6]:
# Show the table object backing each split (schema followed by the column values).
# NOTE(review): this prints a very large repr; consider inspecting a few rows instead.
dataset.data

{'train': MemoryMappedTable
 text: string
 label: int64
 ----
 text: [["I am still waiting on my card?","What can I do if my card still hasn't arrived after 2 weeks?","I have been waiting over a week. Is the card still coming?","Can I track my card while it is in the process of delivery?","How do I know if I will get my card, or if it is lost?","When did you send me my new card?","Do you have info about the card on delivery?","What do I do if I still have not received my new card?","Does the package with my card have tracking?","I ordered my card but it still isn't here",...,"Are your cards available in the EU?","I'm in the EU. Can I get one of your cards?","Is this card available to United States Residents?","can I use it all over the world?","Do you work in Greece?","What countries do your accounts support?","I live in the US and I would like to get a card. How do I go about getting one?","I just moved to the US how do I get a card?","You provide support in what countries?","What cou

In [7]:
# Pull the backing table of each split out of the dataset dictionary.
train, test = (dataset.data[split_name] for split_name in ("train", "test"))

In [8]:
# Read the label -> intent-name lookup table from the project's data folder.
df_labels = pd.read_excel("../data/intent_labels.xlsx")
# Preview the first rows to check the columns (`label`, `intent`).
df_labels.head()

Unnamed: 0,label,intent
0,0,activate_my_card
1,1,age_limit
2,2,apple_pay_or_google_pay
3,3,atm_support
4,4,automatic_top_up


In [9]:
# Attach the intent name to every utterance by joining on the numeric `label`.
# `how="left"` keeps every utterance even if its label is missing from the lookup.
# `validate="many_to_one"` makes pandas raise a MergeError when a label occurs more
# than once in `df_labels`, instead of silently duplicating utterances.
train_df = train.to_pandas().merge(
    df_labels, on=["label"], how="left", validate="many_to_one"
)
test_df = test.to_pandas().merge(
    df_labels, on=["label"], how="left", validate="many_to_one"
)

In [10]:
def average_per_intent(df_):
    """Summarise how many utterances each intent has.

    The rows of ``df_`` are grouped by the ``intent`` column and the non-null
    ``text`` entries of every group are counted.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame with at least the columns ``intent`` and ``text``.

    Returns
    -------
    str
        ``"Min: <min>, Max: <max>, Average: <mean>"`` computed over the
        per-intent counts.
    """
    per_intent = df_.groupby("intent")["text"].count()
    lowest, highest, mean = per_intent.min(), per_intent.max(), per_intent.mean()
    return f"Min: {lowest}, Max: {highest}, Average: {mean}"

In [11]:
# Clean data

# Everything that is neither a word character nor whitespace counts as punctuation.
# Raw string: in a plain literal `\w` and `\s` are invalid escape sequences, which
# newer Python versions warn about. The pattern value itself is unchanged.
# Note that `_` is matched by `\w`, so underscores are NOT removed by this pattern.
punctuation_pattern = r'[^\w\s]'

# Strip punctuation from intent names
train_df["intent"] = train_df["intent"].str.replace(punctuation_pattern, '', regex=True)
test_df["intent"] = test_df["intent"].str.replace(punctuation_pattern, '', regex=True)

# Strip punctuation and whitespace from utterances
# (the final strip("_") removes leading/trailing underscores the pattern leaves behind)
train_df["text"] = train_df["text"].str.replace(punctuation_pattern, '', regex=True).str.strip().str.strip("_")
test_df["text"] = test_df["text"].str.replace(punctuation_pattern, '', regex=True).str.strip().str.strip("_")

# Drop duplicates
train_df = train_df.drop_duplicates(subset=["text", "intent"]).reset_index(drop=True)
test_df = test_df.drop_duplicates(subset=["text", "intent"]).reset_index(drop=True)

# Remove utterances longer than 200 characters for large training set
# (the index is intentionally not reset here, so row labels stay unique and stable)
train_df = train_df.loc[train_df["text"].str.len().le(200), :]

# Save base train/test sets
train_df.to_excel("../data/train.xlsx", index=False)
test_df.to_excel("../data/test.xlsx", index=False)

print(train_df.shape)
print(test_df.shape)

print("Training data utterances per intent")
average_per_intent(train_df)

(9855, 3)
(3079, 3)
Training data utterances per intent


'Min: 35, Max: 186, Average: 127.98701298701299'

In [12]:
# Remove long utterances for better representative train data.
# Rows with more than 120 characters are not discarded: they are moved to `test_df1`.
train_df1 = train_df.loc[train_df["text"].str.len().le(120), :]
test_df1 = train_df.drop(train_df1.index, axis=0)
print(train_df1.shape)
print(test_df1.shape)

# Undersample training data: keep 30% of the utterances of every intent.
# A fixed `random_state` makes the split (and the files saved from it) reproducible
# across kernel restarts. `GroupBy.sample` keeps the original row labels, which the
# `drop` below relies on to build the complementary hold-out frame.
train_df2 = train_df1.groupby('intent').sample(frac=0.3, random_state=42)
test_df2 = train_df1.drop(train_df2.index, axis=0)
print(train_df2.shape)
print(test_df2.shape)



(9199, 3)
(656, 3)
(2756, 3)
(6443, 3)


In [13]:
# Stack every hold-out frame into a single evaluation set:
# the unsampled short utterances, the long utterances and the original test split.
held_out_frames = [test_df2, test_df1, test_df]
test_df_all = pd.concat(held_out_frames, axis=0)
print(test_df_all.shape)
average_per_intent(test_df_all)


(10178, 3)


'Min: 65, Max: 175, Average: 132.1818181818182'

In [14]:
# Persist the small training set and its matching large test set (row index omitted).
for _frame, _path in (
    (train_df2, "../data/train_small.xlsx"),
    (test_df_all, "../data/test_large.xlsx"),
):
    _frame.to_excel(_path, index=False)

In [18]:
# Generate tiny training set
# Undersample training data: keep 5% of the utterances of every intent.
# A fixed `random_state` makes the split (and the files saved from it) reproducible
# across kernel restarts. `GroupBy.sample` keeps the original row labels, which the
# `drop` below relies on to build the complementary hold-out frame.
train_df3 = train_df1.groupby('intent').sample(frac=0.05, random_state=42)
test_df3 = train_df1.drop(train_df3.index, axis=0)
print(train_df3.shape)
print(test_df3.shape)

average_per_intent(train_df3)

(461, 3)
(8738, 3)


'Min: 2, Max: 8, Average: 5.987012987012987'

In [19]:
# Stack every hold-out frame into a single evaluation set:
# the rows left out of the tiny sample, the long utterances and the original test split.
held_out_frames_large = [test_df3, test_df1, test_df]
test_df_all_large = pd.concat(held_out_frames_large, axis=0)
print(test_df_all_large.shape)
average_per_intent(test_df_all_large)

(12473, 3)


'Min: 73, Max: 218, Average: 161.98701298701297'

In [20]:
# Persist the tiny training set and its matching massive test set (row index omitted).
for _frame, _path in (
    (train_df3, "../data/train_tiny.xlsx"),
    (test_df_all_large, "../data/test_massive.xlsx"),
):
    _frame.to_excel(_path, index=False)