In [1]:
!pip install datasets evaluate accelerate -U



In [2]:
import pandas as pd
import numpy as np
import torch
import zipfile
import evaluate
import json
from datasets import Dataset, load_metric
from transformers import BartTokenizerFast
from transformers import BartForSequenceClassification, Trainer, TrainingArguments, EvalPrediction, AutoConfig

In [3]:
# Labelled HDFS splits; both CSVs provide the `text` and `label` columns
# that the later cells read.
train_csv_path = "/content/train_hdfs_labeled.csv"
eval_csv_path = "/content/eval_hdfs_labeled.csv"

train_df = pd.read_csv(train_csv_path)
eval_df = pd.read_csv(eval_csv_path)

In [4]:
# Class balance of the training split: number of rows per value of `label`.
train_df["label"].value_counts()

label
0    5000
1    1000
Name: count, dtype: int64

In [5]:
# Class balance of the evaluation split: number of rows per value of `label`.
eval_df["label"].value_counts()

label
0    1000
1    1000
Name: count, dtype: int64

In [6]:
# Wrap each pandas frame in a `datasets.Dataset` so it can be tokenized with `.map`.
train_ds, eval_ds = (Dataset.from_pandas(frame) for frame in (train_df, eval_df))

In [7]:
# Unpack the tokenizer archive into the same absolute directory that
# `BartTokenizerFast.from_pretrained("/content/tokenizer")` reads from, so the
# notebook no longer relies on the working directory being /content.
with zipfile.ZipFile("/content/hdfs.zip", "r") as zip_ref:
    zip_ref.extractall("/content/tokenizer")

In [8]:
# Load the tokenizer files extracted from hdfs.zip.
# (The original line assigned the same name twice: `tokenizer = tokenizer = ...`.)
tokenizer = BartTokenizerFast.from_pretrained("/content/tokenizer")

In [9]:
def preprocess_token(dataset):
    """Tokenize the ``text`` entry of a batch and carry its labels through.

    Uses the notebook-level ``tokenizer`` with truncation enabled and
    ``'max_length'`` padding.

    Args:
        dataset: Mapping with ``"text"`` and ``"label"`` entries, as passed by
            ``Dataset.map`` in this notebook.

    Returns:
        The tokenizer output with an additional ``"label"`` entry copied from
        the input.
    """
    batch_encoding = tokenizer(dataset["text"], padding='max_length', truncation=True)
    batch_encoding['label'] = dataset["label"]
    return batch_encoding

In [10]:
# Tokenize both splits and drop the raw `text` column.
# `preprocess_token` pads every row to the tokenizer's max length, so the encoded
# result does not depend on how rows are grouped. The previous `batch_size = 1`
# forced one tokenizer call per row; using the default batch size lets the fast
# tokenizer process many rows per call.
train_dataset = train_ds.map(preprocess_token, batched=True, remove_columns=["text"])
eval_dataset = eval_ds.map(preprocess_token, batched=True, remove_columns=["text"])

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [11]:
# Return only the model inputs and the label, as torch tensors, when rows are indexed.
torch_columns = ["input_ids", "attention_mask", "label"]
for tokenized_split in (train_dataset, eval_dataset):
    tokenized_split.set_format("torch", columns=list(torch_columns))

In [12]:
# Spot-check one tokenized evaluation row (label, input_ids, attention_mask).
eval_dataset[11]

{'label': tensor(0),
 'input_ids': tensor([  0, 284,  29,  ...,   1,   1,   1]),
 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0])}

In [13]:
# Loaded metric objects, keyed by metric name, so `evaluate.load` runs once per
# metric instead of on every evaluation pass.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it on first use."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(p: EvalPrediction):
    """Compute accuracy and macro-averaged F1 for a batch of predictions.

    Args:
        p: Prediction container exposing ``predictions`` and ``label_ids``.
            ``predictions`` may be a tuple, in which case its first element is
            taken as the class scores.

    Returns:
        dict with keys ``"accuracy"`` and ``"f1"``.
    """
    scores = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    # Predicted class = index of the highest score along axis 1.
    preds = np.argmax(scores, axis=1)
    accuracy = _get_metric("accuracy").compute(predictions=preds, references=p.label_ids)
    f1 = _get_metric("f1").compute(predictions=preds, references=p.label_ids, average='macro')
    return {"accuracy": accuracy["accuracy"], "f1": f1["f1"]}

In [14]:
# Class-id <-> class-name mappings handed to the model config.
# The inverse map is derived from `id2label` so the two cannot drift apart.
id2label = {0: "Normal", 1: "Anomaly"}
label2id = {name: idx for idx, name in id2label.items()}

In [15]:
# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    output_dir="models/",                # checkpoints are written under this directory
    # --- optimisation ---
    num_train_epochs=1,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,                    # scheduler warm-up length, in steps
    # --- batching / memory ---
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,       # 4 micro-batches of size 1 per optimizer step
    gradient_checkpointing=True,
    # --- evaluation & checkpointing (both once per epoch) ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)



In [16]:
# Pretrained BART checkpoint with a 2-class sequence-classification head
# (labels named via id2label / label2id).
model = BartForSequenceClassification.from_pretrained(
    "facebook/bart-base",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

# The tokenizer is loaded from a separate archive, not from this checkpoint.
# If it defines more token ids than the input embedding table has rows, embedding
# lookups would index out of range, so grow the table. This is a no-op otherwise.
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
def _keep_logits_only(logits, labels):
    """Reduce the model outputs to the class scores before they are accumulated.

    `compute_metrics` only ever uses the first element when the predictions are
    a tuple, so any further outputs are dropped here instead of being kept in
    memory for the whole evaluation set.

    Args:
        logits: Model outputs for one evaluation step (a tensor or a tuple).
        labels: Labels for the same step (unused).

    Returns:
        The first element of `logits` if it is a tuple, else `logits` unchanged.
    """
    return logits[0] if isinstance(logits, tuple) else logits


trainer = Trainer(
    model=model,                                       # model to fine-tune
    args=training_args,                                # configuration defined above
    train_dataset=train_dataset,                       # tokenized training split
    eval_dataset=eval_dataset,                         # tokenized evaluation split
    tokenizer=tokenizer,                               # tokenizer used for preprocessing
    compute_metrics=compute_metrics,                   # accuracy / macro-F1
    preprocess_logits_for_metrics=_keep_logits_only,   # keep only class scores during eval
)

In [None]:
# Run fine-tuning; `training_args` requests evaluation and a checkpoint save once per epoch.
trainer.train()



Epoch,Training Loss,Validation Loss


In [None]:
# Evaluate the trainer's model on the `eval_dataset` it was constructed with.
trainer.evaluate()

In [None]:
import os
import zipfile

In [None]:
def get_all_file_paths(directory):
    """Collect the path of every file under ``directory``, recursively.

    Args:
        directory: Root directory to walk.

    Returns:
        list[str]: One entry per file, each formed by joining the containing
        directory reported by ``os.walk`` with the file name, in walk order.
    """
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(directory)
        for name in names
    ]

In [None]:
# Checkpoint directory whose contents are to be archived (hard-coded path).
directory = '/content/models/checkpoint-1500'

# Gather the path of every file under that directory.
file_paths = get_all_file_paths(directory)

In [None]:
# Show which files were collected before any archive is written.
print('Following files will be zipped:')
for file_name in file_paths:
    print(file_name)

In [None]:
# ZipFile = zipfile.ZipFile
# with ZipFile('/content/drive/MyDrive/Workspace/model.zip','w') as zip:
#   for file in file_paths:
#     zip.write(file)