In [7]:
#!pip install datasets
#!pip install transformers
#!pip install evaluate
#!pip install torch

In [8]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
import evaluate
from datasets import load_dataset
from transformers import Trainer

In [9]:
# Check GPU
import torch
torch.cuda.is_available()
#torch.cuda.current_device()
#torch.cuda.get_device_name(0)

False

## System setting

In [10]:
# seed number
random_state = 42

# model to be used, from hugging face
model_name = "bert-base-cased"

## Data Pre-processing

In [11]:
#pandas loading file
raw_data_df = pd.read_csv('./all_years_10000_per_class.csv')

In [12]:
# The class should be in balance
#raw_data_df.groupby(['selected_chart_code']).size().plot(kind='bar')

In [13]:
#raw_data_df.head(5)

In [14]:
#building target labels
chart_code_to_target_label = pd.DataFrame(sorted(raw_data_df['selected_chart_code'].unique()))
# rename this column from 0 to a presentive name
chart_code_to_target_label.columns = ['selected_chart_code']
# use index as the label
chart_code_to_target_label['label'] = chart_code_to_target_label.index
# adding label to raw data
processed_data_df = pd.merge(raw_data_df, chart_code_to_target_label, how = "inner", on=['selected_chart_code'])

In [15]:
# viz chart code to target label relationship
# chart_code_to_target_label

In [16]:
# viz processed data
# processed_data_df.head()

### Training, validation and test data split

In [17]:
# 80% training and 20% test with stratify sampling enable to ensure balance
X_train, X_test, y_train, y_test = train_test_split(
    processed_data_df.loc[:, processed_data_df.columns != "label"],
    processed_data_df['label'], 
    test_size = 0.2, 
    stratify=processed_data_df['label'],
    random_state=random_state
    )

# As train_test_split doesn't support the split of validation, we will further split it from the training set
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    stratify=y_train,
    random_state=random_state
    )

In [18]:
# All classes should in balance
# X_train.groupby(['selected_chart_code']).size().plot(kind='bar')

# All classes should in balance
# X_test.groupby(['selected_chart_code']).size().plot(kind='bar')

# All classes should in balance
# X_val.groupby(['selected_chart_code']).size().plot(kind='bar')

## Feature Engineering

### Input for tokenisation

In [19]:
#Initiate
train_set = pd.DataFrame()
test_set = pd.DataFrame()
val_set = pd.DataFrame()
cols = ['transactiondate','etamount','transdescription']

In [20]:
# Concate a few columns to form a new text descritpion as the training input
train_set['text'] = X_train[cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
train_set['label'] = y_train

In [21]:
# Same feature engineering for test
test_set['text'] = X_test[cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
test_set['label'] = y_test

In [22]:
# Same feature engineering for validateion se
val_set['text'] = X_val[cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
val_set['label'] = y_val

### Converting to Huggingface dataset format

In [23]:
# Need to convert to Huggingface Dataset format which is reuqired for later tonisation process
train_hf_dataset = Dataset.from_pandas(train_set)
test_hf_dataset = Dataset.from_pandas(test_set)
val_hf_dataset = Dataset.from_pandas(val_set)

### Tokenisation

In [None]:
tokeniser = AutoTokenizer.from_pretrained(model_name)

def tokenise_function(examples):
    return tokeniser(examples["text"], padding= "max_length",truncation=True, return_tensors = "pt")

# tokenising each dataset
tokenised_train_set = train_hf_dataset.map(tokenise_function, batched=True)
tokenised_test_set = test_hf_dataset.map(tokenise_function, batched=True)
tokenised_val_set = val_hf_dataset.map(tokenise_function, batched=True)

  0%|          | 0/555 [00:00<?, ?ba/s]

  0%|          | 0/154 [00:00<?, ?ba/s]

## Define Trainer

### Metrics

In [1]:
# defining metrics

def compute_metrics (eval_pred):
    metric = evaluate.load("f1")
    logits, labels = eval_pred
    preds = np.argmax(logits, axis = -1)
    return metric.compute(predictions=preds, references = labels, average = "micro")

### Trainer

In [2]:
# define model
num_labels = len(train_set['label'].unique())
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# training arguments
training_args = TrainingArguments(
    output_dir = "test_trainer", 
    evaluation_strategy = "steps",
    logging_steps = 3000

)

# trainer
trainer =  Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenised_train_set,
    eval_dataset=tokenised_val_set,
    compute_metrics = compute_metrics,
)

NameError: name 'train_set' is not defined

### Training

In [None]:
trainer.train()