In [1]:
import pandas as pd
import numpy as np
import evaluate
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, BertForSequenceClassification
from peft import LoraConfig, TaskType, get_peft_model

In [2]:
df = pd.read_csv("../data/Britannia.csv")
negative_reviews = df.loc[df["Negative_Review"] != "No Negative", "Negative_Review"]
positive_reviews = df.loc[df["Positive_Review"] != "No Positive", "Positive_Review"]
review_df = pd.DataFrame(
    {
        "Review": pd.concat([negative_reviews, positive_reviews], ignore_index=True),
        "Sentiment": ["Negative"] * len(negative_reviews)
        + ["Positive"] * len(positive_reviews),
    }
)
review_df.describe()

Unnamed: 0,Review,Sentiment
count,8361,8361
unique,7689,2
top,Location,Negative
freq,151,4262


In [3]:
y = [1 if sentiment == "Positive" else 0 for sentiment in review_df["Sentiment"].values]
X_train, X_test, y_train, y_test = train_test_split(
    review_df["Review"],
    y,
    test_size=0.2,
    random_state=42,
    stratify=review_df["Sentiment"],
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42, stratify=y_train
)
print(X_train.shape, X_val.shape, X_test.shape)

(5016,) (1672,) (1673,)


In [4]:
train_df = pd.DataFrame({"text": X_train, "label": y_train})
validation_df = pd.DataFrame({"text": X_val, "label": y_val})
test_df = pd.DataFrame({"text": X_test, "label": y_test})

train_dataset = Dataset.from_pandas(train_df)
validation_dataset = Dataset.from_pandas(validation_df)
test_dataset = Dataset.from_pandas(test_df)

dataset = DatasetDict(
    {"train": train_dataset, "validation": validation_dataset, "test": test_dataset}
)

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 5016
    })
    validation: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 1672
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 1673
    })
})


In [5]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-uncased")


def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/5016 [00:00<?, ? examples/s]

Map:   0%|          | 0/1672 [00:00<?, ? examples/s]

Map:   0%|          | 0/1673 [00:00<?, ? examples/s]

In [6]:
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, r=1, lora_alpha=1, lora_dropout=0.1
)

model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-uncased", num_labels=2
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 38,402 || all params: 167,396,356 || trainable%: 0.0229


In [7]:
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="../model/deep_learning",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    eval_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.693474,0.493421
2,No log,0.683853,0.602273
3,No log,0.582361,0.694378
4,0.671500,0.543325,0.720694
5,0.671500,0.513235,0.744617
6,0.671500,0.487441,0.757177
7,0.531600,0.471441,0.763158


In [None]:
trainer.evaluate(tokenized_datasets["test"])