In [None]:
#Set the path to the assignment folder in GDrive
from google.colab import drive

try:
  drive.mount('/content/gdrive')
except:
  print("Load the drive manually from the left panel first. Then run this again,")
  pathBase="/content/drive/MyDrive/"

Mounted at /content/gdrive


# 1.preparation

In [None]:
#Import necessary Python libraries

import numpy as np

import pandas as pd
import torch

import torch.nn as nn

import torch.nn.functional as F

import torch.optim as optim

from nltk.corpus import stopwords

from collections import Counter

import string

import re

import seaborn as sns

from tqdm import tqdm

import matplotlib.pyplot as plt

from torch.utils.data import TensorDataset, DataLoader

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.preprocessing import LabelEncoder


In [None]:
dataset = pd.read_csv("/content/gdrive/MyDrive/5000TravelQuestionsDataset.csv",encoding='latin-1')
dataset

Unnamed: 0,Question,Coarse-grain class
0,What are the special things we (husband and me...,TTD
1,What are the companies which organize shark fe...,TTD
2,Is it safe for female traveller to go alone to...,TGU
3,What are the best places around Cape Town for ...,TTD
4,What are the best places to stay for a family ...,ACM
...,...,...
4995,What is the best area to be based for sightsee...,TTD
4996,What are the good value traditional bars and r...,FOD
4997,What are the hotels near Alicante bus station?,ACM
4998,Where to stay in La Gomera to mountain biking?,TTD


# 2 RoBERTa


### 2.1 clean data

In [None]:
# eliminate any leading or trailing whitespaces
dataset['Question'] = dataset['Question'].str.strip()
dataset['Coarse-grain class'] = dataset['Coarse-grain class'].str.strip()

unique_label = dataset['Coarse-grain class'].unique()
unique_label

array(['TTD', 'TGU', 'ACM', 'TRS', 'WTH', 'FOD', 'ENT'], dtype=object)

In [None]:
# Using LabelEncoder to convert categorical labels to numerical values
label_encoder = LabelEncoder()
dataset['label_encoded'] = label_encoder.fit_transform(dataset['Coarse-grain class'])

# Retrieve the corresponding relationship between label and label_encoded
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# Display the label to label_encoded mapping
print(label_mapping)

{'ACM': 0, 'ENT': 1, 'FOD': 2, 'TGU': 3, 'TRS': 4, 'TTD': 5, 'WTH': 6}


In [None]:
dataset = dataset[['Question', 'label_encoded']]
dataset.head()

Unnamed: 0,Question,label_encoded
0,What are the special things we (husband and me...,5
1,What are the companies which organize shark fe...,5
2,Is it safe for female traveller to go alone to...,3
3,What are the best places around Cape Town for ...,5
4,What are the best places to stay for a family ...,0


### 2.2 split data

In [None]:
# Split dataset into random train and test subsets
import numpy as np
from sklearn.model_selection import train_test_split

# Split the dataset into training, testing, and validation subsets
# 4000 samples for training, 700 for testing, and 300 for validation
train_data, test_data = train_test_split(dataset, test_size=700, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=300, random_state=42)

# Print the shapes of the resulting subsets to verify the split
print("Training Data Shape:", train_data.shape)
print("Testing Data Shape:", test_data.shape)
print("Validation Data Shape:", val_data.shape)

Training Data Shape: (4000, 2)
Testing Data Shape: (700, 2)
Validation Data Shape: (300, 2)


In [None]:
# Removing the index column
train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)
train_data.head()

Unnamed: 0,Question,label_encoded
0,Is there anywhere to store luggage at Gapyeong...,3
1,When does Disneyland Paris Half Marathon held?,5
2,What is your take on the Redrox Villa?,0
3,How can I get to Osanri prayer mountain from y...,4
4,Is there a provider in Samoa you would recomme...,3


### 2.3 prepare and tokenize data

In [None]:
! pip install -U accelerate # a HF library that enables the same PyTorch code to be run across any distributed configuration
! pip install -U transformers
! pip install datasets


Collecting accelerate
  Downloading accelerate-0.24.0-py3-none-any.whl (260 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.0/261.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub (from accelerate)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: huggingface-hub, accelerate
Successfully installed accelerate-0.24.0 huggingface-hub-0.18.0
Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m81.

In [None]:
#reuse Distillbert clean data and split data processes
# prepare the datasets

from datasets import Dataset

train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)
test_dataset = Dataset.from_pandas(test_data)

print(train_dataset)
print(val_dataset)
print(test_dataset)

Dataset({
    features: ['Question', 'label_encoded'],
    num_rows: 4000
})
Dataset({
    features: ['Question', 'label_encoded'],
    num_rows: 300
})
Dataset({
    features: ['Question', 'label_encoded'],
    num_rows: 700
})


In [None]:
#load a RoBERTa tokenizer to preprocess the text field
from transformers import AutoTokenizer
tokenizer_roberta = AutoTokenizer.from_pretrained("roberta-base")

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
#Create a preprocessing function to tokenize text and truncate sequences

def preprocess_function(examples):
    return tokenizer_roberta(examples["Question"], truncation=True, padding="max_length")


In [None]:
#apply the preprocessing function over the sample dataset

tokenized_train_dataset1 = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset1 = val_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset1 = test_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

In [None]:
# Reorder the columns to match the required sequence of Trainer
tokenized_train_dataset1 =tokenized_train_dataset1.map(lambda example: {'input_ids': example['input_ids'],
                                            'attention_mask': example['attention_mask'],
                                            'label_encoded': example['label_encoded'],
                                            'Question': example['Question']},
                            batched=True)

tokenized_val_dataset1 =tokenized_val_dataset1.map(lambda example: {'input_ids': example['input_ids'],
                                            'attention_mask': example['attention_mask'],
                                            'label_encoded': example['label_encoded'],
                                            'Question': example['Question']},
                            batched=True)

# Set the format of the dataset for the PyTorch Trainer with the correct column sequence
tokenized_train_dataset1.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'label_encoded']
)

tokenized_val_dataset1.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'label_encoded']
)


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [None]:
#rename columns
tokenized_train_dataset1 = tokenized_train_dataset1.rename_column("label_encoded", "labels").rename_column("Question", "text")
tokenized_val_dataset1 = tokenized_val_dataset1.rename_column("label_encoded", "labels").rename_column("Question", "text")
tokenized_test_dataset1 = tokenized_test_dataset1.rename_column("label_encoded", "labels").rename_column("Question", "text")

In [None]:
tokenized_train_dataset1.features

{'text': Value(dtype='string', id=None),
 'labels': Value(dtype='int64', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [None]:
#Now create a batch of examples using DataCollatorWithPadding.

from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer_roberta)


In [None]:
# create a smaller train subset to fine-tune
small_train_dataset1 = tokenized_train_dataset1.shuffle(seed=42).select(range(100))
small_eval_dataset1 = tokenized_val_dataset1.shuffle(seed=42).select(range(100))
small_test_dataset1 = tokenized_test_dataset1.shuffle(seed=42).select(range(100))

### 2.4 Evaluate

In [None]:
#load evaluate metric

!pip install evaluate
import evaluate

accuracy = evaluate.load("accuracy")



In [None]:
#Then create a function that passes your predictions and labels to compute to calculate the accuracy

import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)


In [None]:
# maps of the expected ids to their labels
id2label = {0: "ACM", 1: "ENT", 2: "FOD", 3: "TGU", 4: "TRS", 5: "TTD", 6: "WTH"}
label2id = {"ACM": 0, "ENT": 1, "FOD": 2, "TGU": 3, "TRS": 4, "TTD": 5, "WTH": 6}


### 2.5 Fine-tuning model

In [None]:
#Load RoBERTa model

from transformers import RobertaForSequenceClassification, TrainingArguments, Trainer

model_roberta = RobertaForSequenceClassification.from_pretrained(
    "roberta-base", num_labels=7, id2label=id2label, label2id=label2id
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 2.5.1 Using Entire Dataset

#### 2.5.1.1 feature-based FT

In [None]:
model_roberta

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [None]:
for param in model_roberta.parameters():
  param.requires_grad = False

for param in model_roberta.classifier.parameters():
  param.requires_grad = True


In [None]:
#Define your training hyperparameters in TrainingArguments

from transformers import TrainingArguments

training_args_roberta1 = TrainingArguments(
    output_dir="roberta1",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

#Pass the training arguments to Trainer along with the model, dataset, tokenizer, data collator, and compute_metrics function

trainer_roberta1 = Trainer(
    model=model_roberta,
    args=training_args_roberta1,
    train_dataset=tokenized_train_dataset1,
    eval_dataset=tokenized_val_dataset1,
    tokenizer=tokenizer_roberta,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

#finetune your model
trainer_roberta1.train()


In [None]:
#testing

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Get predictions from the Trainer
predictions_fb = trainer_roberta1.predict(tokenized_test_dataset1)

# Extract predicted labels and true labels
predicted_labels = predictions_fb.predictions.argmax(-1)
true_labels = tokenized_test_dataset1['labels']

# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, predicted_labels)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predicted_labels, average='weighted')

# Print the results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

#### 2.5.1.2 global FT

In [None]:
#Define your training hyperparameters in TrainingArguments

from transformers import TrainingArguments

training_args_roberta = TrainingArguments(
    output_dir="roberta1",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

#Pass the training arguments to Trainer along with the model, dataset, tokenizer, data collator, and compute_metrics function

trainer_roberta = Trainer(
    model=model_roberta,
    args=training_args_roberta,
    train_dataset=tokenized_train_dataset1,
    eval_dataset=tokenized_val_dataset1,
    tokenizer=tokenizer_roberta,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

#finetune your model
trainer_roberta.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.431886,0.873333
2,0.594000,0.359224,0.893333


TrainOutput(global_step=500, training_loss=0.5940443115234375, metrics={'train_runtime': 14294.1332, 'train_samples_per_second': 0.56, 'train_steps_per_second': 0.035, 'total_flos': 2104982937600000.0, 'train_loss': 0.5940443115234375, 'epoch': 2.0})

In [None]:
#testing

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Get predictions from the Trainer
predictions_gl = trainer_roberta.predict(tokenized_test_dataset1)

# Extract predicted labels and true labels
predicted_labels = predictions_gl.predictions.argmax(-1)
true_labels = tokenized_test_dataset1['labels']

# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, predicted_labels)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predicted_labels, average='weighted')

# Print the results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.8671428571428571
Precision: 0.8684365910756521
Recall: 0.8671428571428571
F1 Score: 0.867166292021723


### 2.5.2 Using Small Dataset

#### 2.5.2.1 feature-based FT

In [None]:
model_roberta

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [None]:
for param in model_roberta.parameters():
  param.requires_grad = False

for param in model_roberta.classifier.parameters():
  param.requires_grad = True


In [None]:
#Define your training hyperparameters in TrainingArguments

from transformers import TrainingArguments

training_args_roberta2 = TrainingArguments(
    output_dir="roberta2",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

#Pass the training arguments to Trainer along with the model, dataset, tokenizer, data collator, and compute_metrics function

trainer_roberta2 = Trainer(
    model=model_roberta,
    args=training_args_roberta2,
    train_dataset=small_train_dataset1,
    eval_dataset=small_eval_dataset1,
    tokenizer=tokenizer_roberta,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

#finetune your model
trainer_roberta2.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.85028,0.22
2,No log,1.848027,0.22


TrainOutput(global_step=14, training_loss=1.8580517087663924, metrics={'train_runtime': 206.1048, 'train_samples_per_second': 0.97, 'train_steps_per_second': 0.068, 'total_flos': 52624573440000.0, 'train_loss': 1.8580517087663924, 'epoch': 2.0})

In [None]:
#testing

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Get predictions from the Trainer
predictions_fbs = trainer_roberta2.predict(small_test_dataset1)

# Extract predicted labels and true labels
predicted_labels = predictions_fbs.predictions.argmax(-1)
true_labels = small_test_dataset1['labels']

# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, predicted_labels)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predicted_labels, average='weighted')

# Print the results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.15
Precision: 0.0225
Recall: 0.15
F1 Score: 0.0391304347826087


  _warn_prf(average, modifier, msg_start, len(result))


#### 2.5.2.2 global FT

In [None]:
#Define your training hyperparameters in TrainingArguments

from transformers import TrainingArguments

training_args_roberta3 = TrainingArguments(
    output_dir="roberta1",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

#Pass the training arguments to Trainer along with the model, dataset, tokenizer, data collator, and compute_metrics function

trainer_roberta3 = Trainer(
    model=model_roberta,
    args=training_args_roberta3,
    train_dataset=small_train_dataset1,
    eval_dataset=small_eval_dataset1,
    tokenizer=tokenizer_roberta,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

#finetune your model
trainer_roberta3.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.879838,0.28
2,No log,1.869007,0.28


TrainOutput(global_step=14, training_loss=1.9097349984305245, metrics={'train_runtime': 467.0124, 'train_samples_per_second': 0.428, 'train_steps_per_second': 0.03, 'total_flos': 52624573440000.0, 'train_loss': 1.9097349984305245, 'epoch': 2.0})

In [None]:
#testing

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Get predictions from the Trainer
predictions_gls = trainer_roberta3.predict(small_test_dataset1)

# Extract predicted labels and true labels
predicted_labels = predictions_gls.predictions.argmax(-1)
true_labels = small_test_dataset1['labels']

# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, predicted_labels)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predicted_labels, average='weighted')

# Print the results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.24
Precision: 0.1112773190869354
Recall: 0.24
F1 Score: 0.1427825005183496


  _warn_prf(average, modifier, msg_start, len(result))


### 2.6 adjust hyperparameter


## 2.6.1 adjust learning rate: 2e-5 to 3e-5

In [None]:
#Define your training hyperparameters in TrainingArguments

from transformers import TrainingArguments

training_args_lr = TrainingArguments(
    output_dir="roberta_lr",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

#Pass the training arguments to Trainer along with the model, dataset, tokenizer, data collator, and compute_metrics function

trainer_lr = Trainer(
    model=model_roberta,
    args=training_args_lr,
    train_dataset=small_train_dataset1,
    eval_dataset=small_eval_dataset1,
    tokenizer=tokenizer_roberta,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

#finetune your model
trainer_lr.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.887662,0.25
2,No log,1.869549,0.25


TrainOutput(global_step=14, training_loss=1.9344829831804549, metrics={'train_runtime': 515.2271, 'train_samples_per_second': 0.388, 'train_steps_per_second': 0.027, 'total_flos': 52624573440000.0, 'train_loss': 1.9344829831804549, 'epoch': 2.0})

In [None]:
#testing

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Get predictions from the Trainer
predictions_lr = trainer_lr.predict(small_test_dataset1)

# Extract predicted labels and true labels
predicted_labels_lr = predictions_lr.predictions.argmax(-1)
true_labels_lr = small_test_dataset1['labels']

# Calculate evaluation metrics
accuracy_lr = accuracy_score(true_labels_lr, predicted_labels_lr)
precision_lr, recall_lr, f1_lr, _ = precision_recall_fscore_support(true_labels_lr, predicted_labels_lr, average='weighted')

# Print the results
print(f"Accuracy: {accuracy_lr}")
print(f"Precision: {precision_lr}")
print(f"Recall: {recall_lr}")
print(f"F1 Score: {f1_lr}")

Accuracy: 0.24
Precision: 0.0576
Recall: 0.24
F1 Score: 0.0929032258064516


  _warn_prf(average, modifier, msg_start, len(result))


## 2.6.2 adjust batch size: 16 to 32

In [None]:
#Define your training hyperparameters in TrainingArguments

from transformers import TrainingArguments

training_args_bs = TrainingArguments(
    output_dir="roberta_bs",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

#Pass the training arguments to Trainer along with the model, dataset, tokenizer, data collator, and compute_metrics function

trainer_bs = Trainer(
    model=model_roberta,
    args=training_args_bs,
    train_dataset=small_train_dataset1,
    eval_dataset=small_eval_dataset1,
    tokenizer=tokenizer_roberta,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

#finetune your model
trainer_bs.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.830701,0.33
2,No log,1.811069,0.35


TrainOutput(global_step=8, training_loss=1.8716765642166138, metrics={'train_runtime': 520.4838, 'train_samples_per_second': 0.384, 'train_steps_per_second': 0.015, 'total_flos': 52624573440000.0, 'train_loss': 1.8716765642166138, 'epoch': 2.0})

In [None]:
#testing

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Get predictions from the Trainer
predictions_bs = trainer_bs.predict(small_test_dataset1)

# Extract predicted labels and true labels
predicted_labels_bs = predictions_bs.predictions.argmax(-1)
true_labels_bs = small_test_dataset1['labels']

# Calculate evaluation metrics
accuracy_bs = accuracy_score(true_labels_bs, predicted_labels_bs)
precision_bs, recall_bs, f1_bs, _ = precision_recall_fscore_support(true_labels_bs, predicted_labels_bs, average='weighted')

# Print the results
print(f"Accuracy: {accuracy_bs}")
print(f"Precision: {precision_bs}")
print(f"Recall: {recall_bs}")
print(f"F1 Score: {f1_bs}")

Accuracy: 0.25
Precision: 0.10807692307692308
Recall: 0.25
F1 Score: 0.14915422885572138


  _warn_prf(average, modifier, msg_start, len(result))


## 2.6.3 adjust epochs: 2 to 3

In [None]:
#Define your training hyperparameters in TrainingArguments

from transformers import TrainingArguments

training_args_ep = TrainingArguments(
    output_dir="roberta_ep",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

#Pass the training arguments to Trainer along with the model, dataset, tokenizer, data collator, and compute_metrics function

trainer_ep = Trainer(
    model=model_roberta,
    args=training_args_ep,
    train_dataset=small_train_dataset1,
    eval_dataset=small_eval_dataset1,
    tokenizer=tokenizer_roberta,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

#finetune your model
trainer_ep.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.66249,0.3
2,No log,1.588599,0.42
3,No log,1.553415,0.44


TrainOutput(global_step=21, training_loss=1.6701209658668155, metrics={'train_runtime': 705.7591, 'train_samples_per_second': 0.425, 'train_steps_per_second': 0.03, 'total_flos': 78936860160000.0, 'train_loss': 1.6701209658668155, 'epoch': 3.0})

In [None]:
#testing

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Get predictions from the Trainer
predictions_ep = trainer_ep.predict(small_test_dataset1)

# Extract predicted labels and true labels
predicted_labels_ep = predictions_ep.predictions.argmax(-1)
true_labels_ep = small_test_dataset1['labels']

# Calculate evaluation metrics
accuracy_ep = accuracy_score(true_labels_ep, predicted_labels_ep)
precision_ep, recall_ep, f1_ep, _ = precision_recall_fscore_support(true_labels_ep, predicted_labels_ep, average='weighted')

# Print the results
print(f"Accuracy: {accuracy_ep}")
print(f"Precision: {precision_ep}")
print(f"Recall: {recall_ep}")
print(f"F1 Score: {f1_ep}")

Accuracy: 0.35
Precision: 0.381948051948052
Recall: 0.35
F1 Score: 0.2386647029945999


  _warn_prf(average, modifier, msg_start, len(result))


# Appendix:

# 1 DistilBERT


In [None]:
! pip install -U accelerate # a HF library that enables the same PyTorch code to be run across any distributed configuration
! pip install -U transformers
! pip install datasets


Collecting accelerate
  Downloading accelerate-0.24.0-py3-none-any.whl (260 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.0/261.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub (from accelerate)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: huggingface-hub, accelerate
Successfully installed accelerate-0.24.0 huggingface-hub-0.18.0
Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m81.

## 1.1 clean data

In [None]:
# eliminate any leading or trailing whitespaces
dataset['Question'] = dataset['Question'].str.strip()
dataset['Coarse-grain class'] = dataset['Coarse-grain class'].str.strip()

unique_label = dataset['Coarse-grain class'].unique()
unique_label

array(['TTD', 'TGU', 'ACM', 'TRS', 'WTH', 'FOD', 'ENT'], dtype=object)

In [None]:
# Using LabelEncoder to convert categorical labels to numerical values
label_encoder = LabelEncoder()
dataset['label_encoded'] = label_encoder.fit_transform(dataset['Coarse-grain class'])

# Retrieve the corresponding relationship between label and label_encoded
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# Display the label to label_encoded mapping
print(label_mapping)

{'ACM': 0, 'ENT': 1, 'FOD': 2, 'TGU': 3, 'TRS': 4, 'TTD': 5, 'WTH': 6}


In [None]:
dataset = dataset[['Question', 'label_encoded']]
dataset.head()

Unnamed: 0,Question,label_encoded
0,What are the special things we (husband and me...,5
1,What are the companies which organize shark fe...,5
2,Is it safe for female traveller to go alone to...,3
3,What are the best places around Cape Town for ...,5
4,What are the best places to stay for a family ...,0


## 1.2 split train, validation and test

In [None]:
# Split dataset into random train and test subsets
import numpy as np
from sklearn.model_selection import train_test_split

# Split the dataset into training, testing, and validation subsets
# 4000 samples for training, 700 for testing, and 300 for validation
train_data, test_data = train_test_split(dataset, test_size=700, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=300, random_state=42)

# Print the shapes of the resulting subsets to verify the split
print("Training Data Shape:", train_data.shape)
print("Testing Data Shape:", test_data.shape)
print("Validation Data Shape:", val_data.shape)

Training Data Shape: (4000, 2)
Testing Data Shape: (700, 2)
Validation Data Shape: (300, 2)


In [None]:
# Removing the index column
train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)
train_data.head()

Unnamed: 0,Question,label_encoded
0,Is there anywhere to store luggage at Gapyeong...,3
1,When does Disneyland Paris Half Marathon held?,5
2,What is your take on the Redrox Villa?,0
3,How can I get to Osanri prayer mountain from y...,4
4,Is there a provider in Samoa you would recomme...,3


## 1.3 prepare and tokenize data

In [None]:
# prepare the datasets

from datasets import Dataset

train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)
test_dataset = Dataset.from_pandas(test_data)

print(train_dataset)
print(val_dataset)
print(test_dataset)

Dataset({
    features: ['Question', 'label_encoded'],
    num_rows: 4000
})
Dataset({
    features: ['Question', 'label_encoded'],
    num_rows: 300
})
Dataset({
    features: ['Question', 'label_encoded'],
    num_rows: 700
})


In [None]:
#load a DistilBERT tokenizer to preprocess the text field
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
#Create a preprocessing function to tokenize text and truncate sequences to be no longer than DistilBERT’s maximum input length

def preprocess_function(examples):
    return tokenizer(examples["Question"], truncation=True, padding="max_length")


In [None]:
#apply the preprocessing function over the sample dataset

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

In [None]:
# Reorder the columns to match the required sequence of Trainer
tokenized_train_dataset =tokenized_train_dataset.map(lambda example: {'input_ids': example['input_ids'],
                                            'attention_mask': example['attention_mask'],
                                            'label_encoded': example['label_encoded'],
                                            'Question': example['Question']},
                            batched=True)

tokenized_val_dataset =tokenized_val_dataset.map(lambda example: {'input_ids': example['input_ids'],
                                            'attention_mask': example['attention_mask'],
                                            'label_encoded': example['label_encoded'],
                                            'Question': example['Question']},
                            batched=True)

# Set the format of the dataset for the PyTorch Trainer with the correct column sequence
tokenized_train_dataset.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'label_encoded']
)

tokenized_val_dataset.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'label_encoded']
)


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [None]:
#rename columns
tokenized_train_dataset = tokenized_train_dataset.rename_column("label_encoded", "labels").rename_column("Question", "text")
tokenized_val_dataset = tokenized_val_dataset.rename_column("label_encoded", "labels").rename_column("Question", "text")
tokenized_test_dataset = tokenized_test_dataset.rename_column("label_encoded", "labels").rename_column("Question", "text")

In [None]:
tokenized_train_dataset.features

{'text': Value(dtype='string', id=None),
 'labels': Value(dtype='int64', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [None]:
#Now create a batch of examples using DataCollatorWithPadding.

from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [None]:
# create a smaller train subset to fine-tune
small_train_dataset = tokenized_train_dataset.shuffle(seed=42).select(range(100))
small_eval_dataset = tokenized_val_dataset.shuffle(seed=42).select(range(100))
small_test_dataset = tokenized_test_dataset.shuffle(seed=42).select(range(100))

## 1.4 Evaluate

In [None]:
#load evaluate metric

!pip install evaluate
import evaluate

accuracy = evaluate.load("accuracy")



In [None]:
#Then create a function that passes your predictions and labels to compute to calculate the accuracy

import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)


In [None]:
# maps of the expected ids to their labels
id2label = {0: "ACM", 1: "ENT", 2: "FOD", 3: "TGU", 4: "TRS", 5: "TTD", 6: "WTH"}
label2id = {"ACM": 0, "ENT": 1, "FOD": 2, "TGU": 3, "TRS": 4, "TTD": 5, "WTH": 6}


## 1.5 Train model

In [None]:
#Load DistilBERT model

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=7, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 1.5.1 entire data-global FT

In [None]:
#Define your training hyperparameters in TrainingArguments

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="DistillBert1",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

In [None]:
#Distillbert global fine-tuning

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

#finetune your model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.484942,0.856667


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.484942,0.856667
2,0.708600,0.403124,0.88


TrainOutput(global_step=500, training_loss=0.7085625, metrics={'train_runtime': 28139.8983, 'train_samples_per_second': 0.284, 'train_steps_per_second': 0.018, 'total_flos': 1059833683968000.0, 'train_loss': 0.7085625, 'epoch': 2.0})

In [None]:
#testing on final test data

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Get predictions from the Trainer
predictions = trainer.predict(tokenized_test_dataset)

# Extract predicted labels and true labels
predicted_labels = predictions.predictions.argmax(-1)
true_labels = tokenized_test_dataset['labels']

# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, predicted_labels)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predicted_labels, average='weighted')

# Print the results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.8542857142857143
Precision: 0.8554317549793158
Recall: 0.8542857142857143
F1 Score: 0.8527731837542832


### 1.5.2 small dataset-feature-based FT

In [None]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
#feature-based fine-tuning

for param in model.parameters():
  param.requires_grad = False

for param in model.pre_classifier.parameters():
  param.requires_grad = True

for param in model.classifier.parameters():
  param.requires_grad = True

In [None]:
#Define your training hyperparameters in TrainingArguments

from transformers import TrainingArguments

training_args1 = TrainingArguments(
    output_dir="DistillBert1",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

# fine-tuning

trainer1 = Trainer(
    model=model,
    args=training_args1,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

#finetune your model
trainer1.train()


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.916796,0.13
2,No log,1.911339,0.13


TrainOutput(global_step=14, training_loss=1.9449427468436105, metrics={'train_runtime': 104.826, 'train_samples_per_second': 1.908, 'train_steps_per_second': 0.134, 'total_flos': 26495842099200.0, 'train_loss': 1.9449427468436105, 'epoch': 2.0})

In [None]:
#testing on small test data

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Get predictions from the Trainer
predictions = trainer1.predict(small_test_dataset)

# Extract predicted labels and true labels
predicted_labels = predictions.predictions.argmax(-1)
true_labels = small_test_dataset['labels']

# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, predicted_labels)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predicted_labels, average='weighted')

# Print the results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.18
Precision: 0.11535714285714285
Recall: 0.18
F1 Score: 0.09317231814051365


  _warn_prf(average, modifier, msg_start, len(result))


### 1.5.3 small dataset-global FT

In [None]:
#global fine-tuning on small data

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

#finetune your model
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.913174,0.16
2,No log,1.908293,0.18


TrainOutput(global_step=14, training_loss=1.9230961118425642, metrics={'train_runtime': 568.3608, 'train_samples_per_second': 0.352, 'train_steps_per_second': 0.025, 'total_flos': 26495842099200.0, 'train_loss': 1.9230961118425642, 'epoch': 2.0})

In [None]:
#testing on small test data

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Get predictions from the Trainer
predictions = trainer.predict(small_test_dataset)

# Extract predicted labels and true labels
predicted_labels = predictions.predictions.argmax(-1)
true_labels = small_test_dataset['labels']

# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, predicted_labels)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predicted_labels, average='weighted')

# Print the results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.2
Precision: 0.09760416666666666
Recall: 0.2
F1 Score: 0.0799254658385093


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#testing on small test data

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Get predictions from the Trainer
predictions = trainer.predict(small_test_dataset)

# Extract predicted labels and true labels
predicted_labels = predictions.predictions.argmax(-1)
true_labels = small_test_dataset['labels']

# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, predicted_labels)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predicted_labels, average='weighted')

# Print the results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# 2 LR-raw data

## 2.1 prepare data

In [None]:
# load the data
raw_data = dataset['Question']
raw_labels = dataset['Coarse-grain class']

# eliminate any leading or trailing whitespaces
raw_data = raw_data.str.strip()
raw_labels = raw_labels.str.strip()

In [None]:
# encode these categorical values into numerical labels
labels_encoded = raw_labels.factorize()[0]

In [None]:
# eliminate stopwords
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

filtered_text_data = []
for sentence in raw_data:
    words = sentence.split()
    filtered_sentence = [word for word in words if word.lower() not in stop_words]
    filtered_text_data.append(" ".join(filtered_sentence))

for sentence in filtered_text_data:
    print(sentence)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


special things (husband me) 5 day stay Cape Town?
companies organize shark feeding events scuba divers?
safe female traveller go alone Cape Town?
best places around Cape Town safari?
best places stay family stay away nightlife?
train services travels cape town Oudtshoorn?
best places spend 2 weeks relaxing honeymoon South Africa?
use travellers cheques credit cards Cape Town?
warm enough swim early September Cape Town?
best beaches shelling CapeTown?
want find central location Cape Town stay family ease access food necessities?
differences Shamwari Kwandwe game reserves best?
find adventure activity camps South Africa?
select apartment best views Lawhill apartments?
okay use Afrique Boutique Hotel shuttle service from/to airport pick taxi?
direct flights Port Elizabeth Victoria falls?
anyone suggest route go Cape Town Cape Point?
anyone suggest itinerary Port Elizabeth 2 weeks self driving road trip?
resort would recommend around Cape Town couple 5 year old towards end October?
anyone 

In [None]:
raw_data = filtered_text_data

In [None]:
# encode question to numerical values
vocab = {}
index = 0

text_data = raw_data

indexed_text_data = []
for text_sequence in text_data:
    word_indices = []

    words = text_sequence.split()
    for word in words:
        if word not in vocab:
            vocab[word] = index
            index += 1
        word_indices.append(vocab[word])
    indexed_text_data.append(word_indices)

print(indexed_text_data)


[[0, 1, 2, 3, 4, 5, 6, 7, 8], [9, 10, 11, 12, 13, 14, 15], [16, 17, 18, 19, 20, 7, 8], [21, 22, 23, 7, 24, 25], [21, 22, 6, 26, 6, 27, 28], [29, 30, 31, 32, 33, 34], [21, 22, 35, 36, 37, 38, 39, 40, 41], [42, 43, 44, 45, 46, 7, 8], [47, 48, 49, 50, 51, 7, 8], [21, 52, 53, 54], [55, 56, 57, 58, 7, 24, 6, 26, 59, 60, 61, 62], [63, 64, 65, 66, 67, 68], [56, 69, 70, 71, 40, 41], [72, 73, 21, 74, 75, 76], [77, 42, 78, 79, 80, 81, 82, 83, 84, 85, 86], [87, 88, 89, 90, 91, 92], [93, 94, 95, 19, 7, 24, 7, 96], [93, 94, 97, 89, 90, 36, 37, 98, 99, 100, 101], [102, 103, 104, 23, 7, 24, 105, 4, 106, 107, 108, 109, 110], [93, 104, 111, 112, 74, 6, 113, 36, 114, 115, 116, 117], [118, 119, 120, 121, 122, 123, 124, 125], [126, 127, 128, 129, 130, 88, 131], [132, 133, 45, 46, 134, 40, 135, 132, 136, 137], [138, 139, 42, 19, 7, 24, 140], [94, 141, 142, 22, 143, 144, 145, 146, 147, 148, 149, 150, 151, 5, 6, 152, 153], [154, 155, 156, 157, 158], [21, 22, 35, 39, 159], [93, 94, 160, 97, 161, 5, 6, 159], [

## 2.2 split data

In [None]:
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence

# # Split the dataset into training, testing, and validation subsets
# 4000 samples for training, 700 for testing, and 300 for validation
padded_text_data = pad_sequence([torch.tensor(indices) for indices in indexed_text_data], batch_first=True, padding_value=0)

train_data, tmp_data, train_labels, tmp_labels = train_test_split(padded_text_data, labels_encoded, train_size=4000, random_state=23
)
test_data, val_data, test_labels, val_labels = train_test_split(
    tmp_data, tmp_labels, test_size=300, random_state=23
)

# Print the shapes of the resulting subsets to verify the split
print("Training Data Shape:", train_data.shape)
print("Testing Data Shape:", test_data.shape)
print("Validation Data Shape:", val_data.shape)

Training Data Shape: torch.Size([4000, 21])
Testing Data Shape: torch.Size([700, 21])
Validation Data Shape: torch.Size([300, 21])


## 2.3 LR embedding layer with Raw data

In [None]:
# Step 2: Create data loaders
# Step 3: Define the logistic regression model with an embedding layer
# Step 4: Define the loss function and optimizer
# Step 5: Train the model
# Step 6: Evaluate the model on the test set

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.model_selection import train_test_split
import time
from sklearn.metrics import classification_report

torch.manual_seed(23)

# Step 2: Create data loaders
batch_size = 16
train_dataset = TensorDataset(torch.tensor(train_data), torch.tensor(train_labels))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TensorDataset(torch.tensor(test_data), torch.tensor(test_labels))
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

# Step 3: Define the logistic regression model with an embedding layer
class LogisticRegression(nn.Module):
    def __init__(self, input_size, num_classes, embedding_dim):
        super(LogisticRegression, self).__init__()
        self.embed = nn.Embedding(input_size, embedding_dim)
        self.fc = nn.Linear(embedding_dim, num_classes)

    def forward(self, x):
        x = self.embed(x)
        x = x.mean(1)
        x = self.fc(x)
        return x


input_size = len(vocab)
num_classes = 7
embedding_dim = 100
model_LR_embedding = LogisticRegression(input_size, num_classes, embedding_dim)

# Step 4: Define the loss function and optimizer
optimizer = optim.Adam(model_LR_embedding.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()


# define train function
def train(model, iterator, optimizer, criterion):
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch[0].long())
        loss = criterion(predictions, batch[1])
        loss.backward()
        optimizer.step()

# define evaluate function
def evaluate(model, iterator, criterion):
    model.eval()
    total_loss = 0
    total_correct = 0
    with torch.no_grad():

        for batch in iterator:
            predictions = model(batch[0].long())
            loss = criterion(predictions, batch[1])
            total_loss += loss.item()
            total_correct += (predictions.argmax(1) == batch[1]).sum().item()
            predictions_array.extend(predictions.argmax(1).tolist())
            used_labels_array.extend(batch[1].tolist())

    return total_loss / len(iterator), total_correct / len(iterator.dataset)

# Step 5: Train the model
predictions_array = []
used_labels_array = []

start_time = time.time()
N_EPOCHS = 10
for epoch in range(N_EPOCHS):
    train(model_LR_embedding, train_loader, optimizer, criterion)

# Step 6: Evaluate the model on the test set
    test_loss, test_acc = evaluate(model_LR_embedding, test_loader, criterion)
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTest Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

end_time = time.time()
total_distilbert_time = end_time - start_time

report = classification_report(predictions_array, used_labels_array)
print('******************Report************************')
print(f"Total training time: {total_distilbert_time} seconds")
print(report)


  train_dataset = TensorDataset(torch.tensor(train_data), torch.tensor(train_labels))
  test_dataset = TensorDataset(torch.tensor(test_data), torch.tensor(test_labels))


Epoch: 01
	Test Loss: 1.114 | Test Acc: 64.00%
Epoch: 02
	Test Loss: 0.768 | Test Acc: 76.29%
Epoch: 03
	Test Loss: 0.742 | Test Acc: 77.29%
Epoch: 04
	Test Loss: 0.700 | Test Acc: 78.00%
Epoch: 05
	Test Loss: 0.733 | Test Acc: 78.43%
Epoch: 06
	Test Loss: 0.750 | Test Acc: 78.57%
Epoch: 07
	Test Loss: 0.773 | Test Acc: 78.29%
Epoch: 08
	Test Loss: 0.805 | Test Acc: 77.71%
Epoch: 09
	Test Loss: 0.812 | Test Acc: 78.14%
Epoch: 10
	Test Loss: 0.828 | Test Acc: 78.14%
******************Report************************
Total training time: 13.253870248794556 seconds
              precision    recall  f1-score   support

           0       0.78      0.64      0.71      1813
           1       0.74      0.74      0.74      1818
           2       0.79      0.84      0.81       945
           3       0.85      0.82      0.83      1469
           4       0.73      0.90      0.81       187
           5       0.67      0.90      0.77       528
           6       0.61      0.81      0.69       240


#3 LR with FastText


## 3.1 clean data

In [None]:
# eliminate any leading or trailing whitespaces
dataset['Question'] = dataset['Question'].str.strip()
dataset['Coarse-grain class'] = dataset['Coarse-grain class'].str.strip()

unique_label = dataset['Coarse-grain class'].unique()
unique_label

array(['TTD', 'TGU', 'ACM', 'TRS', 'WTH', 'FOD', 'ENT'], dtype=object)

In [None]:
# Using LabelEncoder to convert categorical labels to numerical values
label_encoder = LabelEncoder()
dataset['label_encoded'] = label_encoder.fit_transform(dataset['Coarse-grain class'])

# Retrieve the corresponding relationship between label and label_encoded
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# Display the label to label_encoded mapping
print(label_mapping)

{'ACM': 0, 'ENT': 1, 'FOD': 2, 'TGU': 3, 'TRS': 4, 'TTD': 5, 'WTH': 6}


In [None]:
dataset = dataset[['Question', 'label_encoded']]
dataset.head()

Unnamed: 0,Question,label_encoded
0,What are the special things we (husband and me...,5
1,What are the companies which organize shark fe...,5
2,Is it safe for female traveller to go alone to...,3
3,What are the best places around Cape Town for ...,5
4,What are the best places to stay for a family ...,0


## 3.2 split data

In [None]:
# Split data into training, validation, and test sets
train_data, temp_data = train_test_split(dataset, train_size=4000, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=700, random_state=42)

# Extract text and labels for each set
train_texts, train_labels = train_data['Question'].tolist(), train_data['label_encoded'].tolist()
val_texts, val_labels = val_data['Question'].tolist(), val_data['label_encoded'].tolist()
test_texts, test_labels = test_data['Question'].tolist(), test_data['label_encoded'].tolist()

# Printing to verify the splits
print("Training Data Shape:", train_data.shape)
print("Testing Data Shape:", test_data.shape)
print("Validation Data Shape:", val_data.shape)

Training Data Shape: (4000, 2)
Testing Data Shape: (700, 2)
Validation Data Shape: (300, 2)


## 3.3 implement FastText

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.model_selection import train_test_split
import time
from sklearn.metrics import classification_report
from gensim.models import FastText
import os

torch.manual_seed(23)

#Build vocabulary for the FastText model
all_data = train_texts + test_texts  # Combine train and test data
all_data_tokenized = [text.split() for text in all_data]  # Tokenize the data

# Save the tokenized data to a text file
with open("tokenized_data.txt", "w") as file:
    for sentence in all_data_tokenized:
        file.write(' '.join(sentence) + '\n')



In [None]:
# Build vocabulary for the FastText model
corpus_iterable = [sentence for sentence in all_data_tokenized]  # Use the tokenized data directly
fasttext_model = FastText(vector_size=100, window=5, min_count=5, workers=4, sg=1)
fasttext_model.build_vocab(corpus_iterable=corpus_iterable)
fasttext_model.train(corpus_iterable=corpus_iterable, total_examples=len(all_data_tokenized), epochs=10)

# Delete the temporary file
os.remove("tokenized_data.txt")

In [None]:
from gensim.models import FastText
import numpy as np

embedding_dim = 100
train_embeddings = np.array([np.mean([fasttext_model.wv[word] for word in sentence.split() if word in fasttext_model.wv] or [np.zeros(embedding_dim)], axis=0) for sentence in train_texts])
test_embeddings = np.array([np.mean([fasttext_model.wv[word] for word in sentence.split() if word in fasttext_model.wv] or [np.zeros(embedding_dim)], axis=0) for sentence in test_texts])

# Now convert the embeddings to PyTorch tensors and create the data loaders
batch_size = 16
train_dataset = TensorDataset(torch.tensor(train_embeddings), torch.tensor(train_labels))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TensorDataset(torch.tensor(test_embeddings), torch.tensor(test_labels))
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)


In [None]:
# Logistic regression model with FastText embeddings
class LogisticRegression(nn.Module):
    def __init__(self, input_size, num_classes):
        super(LogisticRegression, self).__init__()
        self.fc = nn.Linear(input_size, num_classes)

    def forward(self, x):
        return self.fc(x)

# Training function
def train(model, iterator, optimizer, criterion):
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch[0].float())
        loss = criterion(predictions, batch[1])
        loss.backward()
        optimizer.step()

# Evaluation function
def evaluate(model, iterator, criterion):
    model.eval()
    with torch.no_grad():
        total_loss = 0
        all_predictions = []
        all_labels = []
        for batch in iterator:
            predictions = model(batch[0].float())
            loss = criterion(predictions, batch[1])
            total_loss += loss.item()
            all_predictions.extend(torch.argmax(predictions, 1))
            all_labels.extend(batch[1])

        return total_loss / len(iterator), classification_report(all_labels, all_predictions)

# Initialize the model, criterion, and optimizer
input_size = fasttext_model.vector_size
num_classes = len(set(train_labels))
model = LogisticRegression(input_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training and evaluation loop
epochs = 10
for epoch in range(epochs):
    train(model, train_loader, optimizer, criterion)
    test_loss, report = evaluate(model, test_loader, criterion)
    print(f'Epoch {epoch + 1}/{epochs}, Test Loss: {test_loss}')
    print('Classification Report:\n', report)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/10, Test Loss: 1.392391557043249
Classification Report:
               precision    recall  f1-score   support

           0       0.44      0.34      0.39       105
           1       0.00      0.00      0.00        30
           2       0.00      0.00      0.00        73
           3       0.36      0.70      0.48       179
           4       0.73      0.45      0.56       119
           5       0.41      0.45      0.42       168
           6       1.00      0.62      0.76        26

    accuracy                           0.44       700
   macro avg       0.42      0.37      0.37       700
weighted avg       0.42      0.44      0.41       700



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/10, Test Loss: 1.3556429472836582
Classification Report:
               precision    recall  f1-score   support

           0       0.31      0.67      0.42       105
           1       0.00      0.00      0.00        30
           2       0.67      0.05      0.10        73
           3       0.42      0.54      0.47       179
           4       0.64      0.65      0.64       119
           5       0.58      0.35      0.43       168
           6       0.85      0.65      0.74        26

    accuracy                           0.46       700
   macro avg       0.50      0.41      0.40       700
weighted avg       0.50      0.46      0.44       700



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 3/10, Test Loss: 1.310755205425349
Classification Report:
               precision    recall  f1-score   support

           0       0.52      0.48      0.50       105
           1       0.00      0.00      0.00        30
           2       0.40      0.38      0.39        73
           3       0.53      0.31      0.39       179
           4       0.55      0.73      0.63       119
           5       0.42      0.62      0.50       168
           6       0.86      0.69      0.77        26

    accuracy                           0.49       700
   macro avg       0.47      0.46      0.45       700
weighted avg       0.48      0.49      0.47       700



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 4/10, Test Loss: 1.277159502560442
Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.34      0.42       105
           1       0.00      0.00      0.00        30
           2       0.34      0.42      0.38        73
           3       0.44      0.57      0.49       179
           4       0.68      0.62      0.65       119
           5       0.47      0.48      0.48       168
           6       0.72      0.69      0.71        26

    accuracy                           0.49       700
   macro avg       0.45      0.45      0.45       700
weighted avg       0.48      0.49      0.48       700



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 5/10, Test Loss: 1.2615465250882236
Classification Report:
               precision    recall  f1-score   support

           0       0.44      0.61      0.51       105
           1       0.00      0.00      0.00        30
           2       0.50      0.34      0.41        73
           3       0.46      0.55      0.50       179
           4       0.67      0.66      0.66       119
           5       0.50      0.45      0.48       168
           6       0.75      0.69      0.72        26

    accuracy                           0.51       700
   macro avg       0.48      0.47      0.47       700
weighted avg       0.50      0.51      0.50       700



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 6/10, Test Loss: 1.255484702912244
Classification Report:
               precision    recall  f1-score   support

           0       0.52      0.61      0.56       105
           1       0.00      0.00      0.00        30
           2       0.44      0.34      0.38        73
           3       0.49      0.44      0.46       179
           4       0.63      0.69      0.66       119
           5       0.44      0.54      0.49       168
           6       0.76      0.73      0.75        26

    accuracy                           0.51       700
   macro avg       0.47      0.48      0.47       700
weighted avg       0.49      0.51      0.50       700



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 7/10, Test Loss: 1.262424798174338
Classification Report:
               precision    recall  f1-score   support

           0       0.44      0.62      0.51       105
           1       0.00      0.00      0.00        30
           2       0.45      0.36      0.40        73
           3       0.46      0.50      0.48       179
           4       0.57      0.72      0.64       119
           5       0.54      0.40      0.46       168
           6       0.71      0.77      0.74        26

    accuracy                           0.50       700
   macro avg       0.45      0.48      0.46       700
weighted avg       0.49      0.50      0.49       700

Epoch 8/10, Test Loss: 1.2373356846245853
Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.41      0.50       105
           1       0.33      0.03      0.06        30
           2       0.35      0.47      0.40        73
           3       0.46      0.59      0.52       179
 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
