In [None]:
# filename: train_mbart_classification.py

import pandas as pd
import torch
from transformers import MBartForSequenceClassification, MBartTokenizer, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Read the labelled corpus (Colab path; point this at your own CSV when running elsewhere)
df = pd.read_csv('/content/fixed_data.csv')

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
train_part, test_part = train_test_split(df, test_size=0.2, random_state=42)

# Give each split a fresh 0..n-1 index so positional lookups line up with row order
train_df = train_part.reset_index(drop=True)
test_df = test_part.reset_index(drop=True)

# Report the number of rows in the train and test splits, one per line
print(len(train_df), len(test_df), sep='\n')

3016
754


In [None]:
!pip install transformers pandas scikit-learn torch




In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Indexable sequence of input strings.
        labels: Indexable sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors='pt')`` that returns a
            mapping containing ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every example is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples (the length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``
            (the label as a ``torch.long`` scalar tensor).
        """
        text = self.texts[idx]
        label = self.labels[idx]

        encoded = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        # The tokenizer output for a single text is expected to carry a leading
        # batch axis of size 1. Drop exactly that axis: a bare squeeze() would also
        # collapse the sequence axis whenever it happens to have length 1.
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Load mBART tokenizer and model
# NOTE(review): loading this checkpoint through MBartTokenizer logs a class-mismatch
# warning (the checkpoint declares 'MBart50Tokenizer'). Switching classes changes the
# tokenization, so it has to be done together with the inference cells that reload
# the saved tokenizer — TODO confirm and migrate in one step.
tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-50', src_lang='vi_VN')

# The classification head is not part of the checkpoint and is randomly initialised
# at load time, so seed torch first to make that initialisation repeatable.
torch.manual_seed(42)
model = MBartForSequenceClassification.from_pretrained('facebook/mbart-large-50', num_labels=2)

# Create training and test datasets from the 'content' (text) and 'label' columns
train_dataset = CustomDataset(train_df['content'].tolist(), train_df['label'].tolist(), tokenizer)
test_dataset = CustomDataset(test_df['content'].tolist(), test_df['label'].tolist(), tokenizer)

# Define metrics
def compute_metrics(pred):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` and ``predictions``. ``predictions`` may
            be the logits array itself or a tuple/list whose first element is the
            logits.

    Returns:
        dict with keys ``'accuracy'`` and ``'f1'``.
    """
    labels = pred.label_ids
    logits = pred.predictions
    # When the model returns extra outputs alongside the logits, predictions arrives
    # as a tuple with the logits first. Indexing a bare array with [0] would instead
    # select the first example's row, so only unwrap real tuples/lists.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    preds = logits.argmax(-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1
    }

# Training configuration: evaluate and checkpoint once per epoch so that the best
# checkpoint (by accuracy) can be restored when training finishes.
# NOTE(review): newer transformers releases name `evaluation_strategy` as
# `eval_strategy` — confirm against the installed version.
# NOTE(review): `report_to` is not set; the recorded run prompted for a wandb API
# key, which blocks unattended execution — confirm whether wandb logging is wanted.
training_args = TrainingArguments(
    # where checkpoints and logs go
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=50,
    # optimisation schedule
    num_train_epochs=5,
    warmup_steps=100,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # per-epoch evaluation / checkpointing and best-model selection
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)

# NOTE: the held-out test split doubles as the per-epoch evaluation set, so the
# "best" checkpoint is selected on the same data that is reported at the end.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune
trainer.train()

# Persist the fine-tuned weights and the tokenizer for the inference cells
model.save_pretrained('./mbart_classification_model')
tokenizer.save_pretrained('./mbart_classification_tokenizer')

# Final metrics on the test split; as the last expression they become the cell output
trainer.evaluate()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.


pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

Some weights of MBartForSequenceClassification were not initialized from the model checkpoint at facebook/mbart-large-50 and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3571,0.475122,0.866048,0.36478
2,0.3938,0.325671,0.904509,0.689655
3,0.2732,0.345011,0.909814,0.714286


Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


In [None]:
from transformers import MBartForSequenceClassification, MBartTokenizer, pipeline

# Directories written by the training cell
model_path, tokenizer_path = './mbart_classification_model', './mbart_classification_tokenizer'

# Restore the fine-tuned classifier and its tokenizer from disk
model = MBartForSequenceClassification.from_pretrained(model_path)
tokenizer = MBartTokenizer.from_pretrained(tokenizer_path)

# Ready-made text-classification pipeline built on the restored objects.
# NOTE(review): the visible cells call `model` directly and never use `classifier`
# — confirm whether it is still needed.
classifier = pipeline(task="text-classification", model=model, tokenizer=tokenizer)

# Make predictions on test_df with truncation and padding
def predict_with_truncation(text):
    """Classify one text with the module-level ``model`` and ``tokenizer``.

    The text is padded/truncated to 512 tokens and the highest-scoring class is
    taken. Texts predicted as class 1 are echoed to stdout.

    Args:
        text: The input string to classify.

    Returns:
        ``'LABEL_0'`` or ``'LABEL_1'``.
    """
    # Local import keeps this function usable when only the inference cells are run
    import torch

    # Tokenize with truncation and padding to the same fixed length used in training
    inputs = tokenizer(text, padding='max_length', truncation=True, max_length=512, return_tensors='pt')
    # Inference only: disable autograd so no graph is built for each call
    with torch.no_grad():
        prediction = model(**inputs).logits.argmax(-1).item()
    # Echo every text predicted as class 1 (the true label is not available here)
    if prediction == 1:
        print(text)

    # Map prediction to label (assuming your labels are 0 and 1)
    label_mapping = {0: 'LABEL_0', 1: 'LABEL_1'}
    return label_mapping[prediction]

# Classify every test row; predictions come back as 'LABEL_0' / 'LABEL_1' strings
test_df['predicted_label'] = test_df['content'].apply(predict_with_truncation)

# Map the string predictions back to the integer ids used in the 'label' column
label_mapping = {'LABEL_0': 0, 'LABEL_1': 1}
test_df['predicted_label_numeric'] = test_df['predicted_label'].map(label_mapping)

# Score the predictions against the ground-truth labels
from sklearn.metrics import accuracy_score, f1_score

y_true = test_df['label']
y_pred = test_df['predicted_label_numeric']
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

In [None]:
# Bundle the saved model and tokenizer directories into a single archive, models.zip
!zip -r models.zip mbart_classification_model mbart_classification_tokenizer

In [None]:
# Download the models.zip archive through Colab's `files` helper (Colab-only)
from google.colab import files
files.download('models.zip')

In [None]:
# prompt: test on a specific text

# Assuming you have already trained and saved your model and tokenizer
# and you have the 'test_df' DataFrame with 'content' and 'label' columns

# Directories written by the training cell
model_path, tokenizer_path = './mbart_classification_model', './mbart_classification_tokenizer'

# Restore the fine-tuned classifier and its tokenizer from disk
model = MBartForSequenceClassification.from_pretrained(model_path)
tokenizer = MBartTokenizer.from_pretrained(tokenizer_path)

def predict_with_truncation(text):
    """Classify one text with the module-level ``model`` and ``tokenizer``.

    The text is padded/truncated to 512 tokens and the highest-scoring class is
    returned as a label string.

    Args:
        text: The input string to classify.

    Returns:
        ``'LABEL_0'`` or ``'LABEL_1'``.
    """
    # Local import keeps this function usable when only the inference cells are run
    import torch

    inputs = tokenizer(text, padding='max_length', truncation=True, max_length=512, return_tensors='pt')
    # Inference only: disable autograd so no graph is built for the forward pass
    with torch.no_grad():
        prediction = model(**inputs).logits.argmax(-1).item()
    label_mapping = {0: 'LABEL_0', 1: 'LABEL_1'}
    return label_mapping[prediction]


# Example text to test on (a single sample sentence; presumably Vietnamese, matching
# the tokenizer's src_lang used in training)
text_to_test = "cộng sản có những triêu trò chống lại nhân quyền, đàn áp quyền con người"

# Classify the sample and print the predicted label string ('LABEL_0' or 'LABEL_1')
predicted_label = predict_with_truncation(text_to_test)
print(f"Predicted label for the text: {predicted_label}")