In [None]:
!sudo apt install git-lfs --upgrade

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.3.4-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'sudo apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 20 not upgraded.


In [None]:
!git init .
!git lfs install

Initialized empty Git repository in /content/.git/
Updated git hooks.
Git LFS initialized.


In [None]:
%%capture
# NOTE(review): versions are unpinned, so a fresh run installs whatever is
# current. The recorded outputs in this notebook come from transformers 4.22.2
# (see the model config dump further down); consider pinning the packages so
# that "Restart & Run All" stays reproducible.
!pip install datasets transformers wandb

In [None]:
import wandb
from datasets import load_dataset
from transformers import (
    AutoFeatureExtractor,
    AutoModelForImageClassification,
    Trainer,
    TrainingArguments,
)
from sklearn.model_selection import StratifiedShuffleSplit
from torchvision.transforms import (
    CenterCrop,
    RandomErasing,
    RandomAutocontrast,
    Compose,
    Normalize,
    RandomHorizontalFlip,
    RandomResizedCrop,
    Resize,
    RandomAdjustSharpness,
    ToTensor,
)
import torch
import datasets
import numpy as np
from datasets import load_metric

In [None]:
%env WANDB_PROJECT=snorkel_training_data
%env WANDB_ENTITY=imagein

env: WANDB_PROJECT=snorkel_training_data
env: WANDB_ENTITY=imagein


In [None]:
wandb.login()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 

··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
from huggingface_hub import notebook_login

In [None]:
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token


In [None]:
!git config --global credential.helper store

In [None]:
# Hub identifiers used throughout the notebook.
# Pretrained image-classification checkpoint that gets fine-tuned.
MODEL_CHECKPOINT = "facebook/convnext-base-224"
# Dataset used for evaluation; its `choice` column holds the labels.
TEST_DATASET_CHECKPOINT = "ImageIN/ImageIn_annotations"
# Dataset used for training; labels come from its `snorkel_label` column.
TRAIN_DATASET_CHECKPOINT = "ImageIN/unlabelled_IA_with_snorkel_labels"

In [None]:
# Request fp16 training only when a CUDA device is available; the flag is
# passed to TrainingArguments(fp16=...).
FP_16 = torch.cuda.is_available()

In [None]:
# Training data is fetched with the stored Hub token; the evaluation data is
# fetched without one. Both repositories only use their "train" split.
train_dataset = load_dataset(
    TRAIN_DATASET_CHECKPOINT,
    split="train",
    use_auth_token=True,
)
test_dataset = load_dataset(TEST_DATASET_CHECKPOINT, split="train")



## Remove training examples that also appear in the test data

In [None]:
# Collect the test set's `image` values into a set so the membership check in
# the de-duplication filter is a constant-time lookup.
# NOTE(review): presumably `image` is a hashable identifier (URL/path), not the
# decoded picture — confirm.
test_images = set(test_dataset['image'])

In [None]:
# Drop every training row whose `image` value also occurs in the test set.
# `input_columns` hands only that one column to the predicate instead of the
# full example (which also carries the `loaded_image` picture), so the filter
# does not have to build a complete row for each of the training examples.
train_dataset = train_dataset.filter(
    lambda image: image not in test_images,
    input_columns="image",
)



In [None]:
from toolz import frequencies

In [None]:
# Count how many training examples carry each `snorkel_label` value; the
# result is displayed and later used to size the class-balancing sample.
freqs = frequencies(train_dataset['snorkel_label'])
freqs

{0: 16163, 1: 50657}

In [None]:
# Split the training data by weak label: 1 -> not illustrated, 0 -> illustrated.
# Only the label column is passed to the predicate (`input_columns`), so the
# filter never needs the rest of each example, including the image itself.
not_illustrated = train_dataset.filter(
    lambda label: label == 1,
    input_columns="snorkel_label",
)
illustrated = train_dataset.filter(
    lambda label: label == 0,
    input_columns="snorkel_label",
)



In [None]:
import random

In [None]:
# Undersample the larger class (label 1, "not illustrated") down to the number
# of label-0 examples so both classes are equally represented.
# A dedicated, seeded generator keeps the chosen subset identical across
# "Restart & Run All"; 42 is the same seed value given to TrainingArguments.
# The min() guards against asking for more rows than exist, which would make
# random.sample raise ValueError (e.g. if the class balance ever flips).
max_sample = min(freqs[0], len(not_illustrated))
sample = random.Random(42).sample(range(len(not_illustrated)), max_sample)
not_illustrated = not_illustrated.select(sample)

In [None]:
from datasets import concatenate_datasets

In [None]:
# Recombine the undersampled "not illustrated" rows with all "illustrated"
# rows. Rows are appended class by class, so the result is ordered by label.
train_dataset = concatenate_datasets([not_illustrated, illustrated])

In [None]:
train_dataset

Dataset({
    features: ['image', 'manifest_url', 'license', 'label', 'attribution', 'loaded_image', 'detr_preds_count', 'manuscript_count', 'mean_rgb', 'illustration_classifier', 'snorkel_label_model_probs', 'snorkel_label'],
    num_rows: 32326
})

In [None]:
train_dataset.column_names

['image',
 'manifest_url',
 'license',
 'label',
 'attribution',
 'loaded_image',
 'detr_preds_count',
 'manuscript_count',
 'mean_rgb',
 'illustration_classifier',
 'snorkel_label_model_probs',
 'snorkel_label']

In [None]:
# Keep only the picture and its weak label; every other column is dropped.
train_dataset = train_dataset.remove_columns(
    [
        column
        for column in train_dataset.column_names
        if column not in ("loaded_image", "snorkel_label")
    ]
)

In [None]:
# Keep only the picture and the annotated `choice`; every other column is dropped.
test_dataset = test_dataset.remove_columns(
    [
        column
        for column in test_dataset.column_names
        if column not in ("loaded_image", "choice")
    ]
)

In [None]:
# Give both datasets the same label column name, `labels`, which is the key
# that `collate_fn` reads when building a batch.
train_ds = train_dataset.rename_column("snorkel_label", "labels")
valid_ds = test_dataset.rename_column("choice", "labels")

In [None]:
valid_ds.unique('labels')

['not-illustrated', 'illustrated']

In [None]:
# The annotated labels are strings ('not-illustrated' / 'illustrated'). Encode
# them as a ClassLabel whose name order puts 'illustrated' at index 0 and
# 'not-illustrated' at index 1, i.e. the same integer coding the training
# labels use (0 -> illustrated, 1 -> not illustrated in the balancing cells).
valid_ds = valid_ds.cast_column('labels', datasets.ClassLabel(names=['illustrated','not-illustrated']))



In [None]:
#ds = train_ds.train_test_split(0.3)

In [None]:
#train_ds, valid_ds = ds['train'], ds['test']

In [None]:
def prepare_transforms(model_checkpoint, train_ds, valid_ds, test_ds=None):
    """Attach on-the-fly image preprocessing to the training and validation sets.

    The feature extractor for ``model_checkpoint`` supplies the target image
    size and the normalisation statistics. Training images additionally get
    random augmentation (sharpness, autocontrast, erasing); validation images
    are only resized, converted to tensors and normalised.

    Args:
        model_checkpoint: Name or path passed to
            ``AutoFeatureExtractor.from_pretrained``.
        train_ds: Dataset with a ``loaded_image`` column of PIL images; receives
            the augmenting transform.
        valid_ds: Dataset with a ``loaded_image`` column of PIL images; receives
            the deterministic transform.
        test_ds: Accepted for interface compatibility; currently unused.

    Returns:
        ``(train_ds, valid_ds)`` — the same objects that were passed in, each
        with a transform registered through ``set_transform`` that adds a
        ``pixel_values`` entry to the examples it returns.

    Raises:
        ValueError: If the feature extractor reports ``size`` as a dict with
            neither ``height``/``width`` nor ``shortest_edge``.
    """
    feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint)

    # Older transformers releases expose `size` as an int (224 in the recorded
    # preprocessor config); newer image processors expose a dict such as
    # {"shortest_edge": 224} or {"height": 224, "width": 224}. Normalise all of
    # these to the (height, width) tuple handed to `Resize`.
    size = feature_extractor.size
    if isinstance(size, dict):
        if "height" in size and "width" in size:
            target_size = (size["height"], size["width"])
        elif "shortest_edge" in size:
            target_size = (size["shortest_edge"], size["shortest_edge"])
        else:
            raise ValueError(f"Unsupported feature extractor size: {size!r}")
    else:
        target_size = (size, size)

    normalize = Normalize(
        mean=feature_extractor.image_mean, std=feature_extractor.image_std
    )

    # Training pipeline: PIL-level augmentation first, then tensor conversion
    # and normalisation, and finally random erasing on the normalised tensor.
    _train_transforms = Compose(
        [
            Resize(target_size),
            RandomAdjustSharpness(0.1),
            RandomAutocontrast(),
            ToTensor(),
            normalize,
            RandomErasing(),
        ]
    )

    # Validation pipeline: no random operations.
    _val_transforms = Compose(
        [
            Resize(target_size),
            ToTensor(),
            normalize,
        ]
    )

    def train_transforms(examples):
        """Add augmented ``pixel_values`` for a batch of training examples."""
        examples["pixel_values"] = [
            _train_transforms(image.convert("RGB"))
            for image in examples["loaded_image"]
        ]
        return examples

    def val_transforms(examples):
        """Add deterministic ``pixel_values`` for a batch of validation examples."""
        examples["pixel_values"] = [
            _val_transforms(image.convert("RGB")) for image in examples["loaded_image"]
        ]
        return examples

    # set_transform registers the callable on the dataset object itself, so
    # the caller's datasets are modified in place and returned for convenience.
    train_ds.set_transform(train_transforms)
    valid_ds.set_transform(val_transforms)

    return train_ds, valid_ds

In [None]:
# Register the preprocessing transforms. prepare_transforms calls
# set_transform on the datasets it is given and returns those same objects.
train_ds, valid_ds = prepare_transforms(MODEL_CHECKPOINT, train_ds, valid_ds)

loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--facebook--convnext-base-224/snapshots/eda2970bc74154a2af92300316deecd49f72bea8/preprocessor_config.json
Feature extractor ConvNextFeatureExtractor {
  "crop_pct": 0.875,
  "do_normalize": true,
  "do_resize": true,
  "feature_extractor_type": "ConvNextFeatureExtractor",
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  "image_std": [
    0.229,
    0.224,
    0.225
  ],
  "resample": 3,
  "size": 224
}



In [None]:
train_ds, valid_ds

(Dataset({
     features: ['loaded_image', 'labels'],
     num_rows: 32326
 }), Dataset({
     features: ['labels', 'loaded_image'],
     num_rows: 1896
 }))

In [None]:
valid_ds[0]

{'labels': 1,
 'loaded_image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1757x3005 at 0x7F2108337750>,
 'pixel_values': tensor([[[1.4269, 1.4440, 1.4269,  ..., 1.5810, 1.5810, 1.5982],
          [1.4269, 1.4440, 1.3927,  ..., 1.5810, 1.5639, 1.5125],
          [1.3927, 1.4440, 1.4440,  ..., 1.5810, 1.5639, 1.5639],
          ...,
          [1.3242, 1.3584, 1.4098,  ..., 1.6324, 1.6324, 1.6153],
          [1.3755, 1.3755, 1.3927,  ..., 1.6324, 1.6324, 1.6153],
          [1.3584, 1.3584, 1.3755,  ..., 1.6667, 1.6667, 1.6495]],
 
         [[1.2206, 1.2206, 1.2206,  ..., 1.3431, 1.3256, 1.3782],
          [1.2031, 1.2206, 1.1856,  ..., 1.3606, 1.3081, 1.2731],
          [1.1681, 1.2206, 1.2206,  ..., 1.3606, 1.3431, 1.3256],
          ...,
          [1.1681, 1.2206, 1.2556,  ..., 1.4832, 1.5007, 1.4832],
          [1.1331, 1.2031, 1.2556,  ..., 1.4657, 1.4832, 1.4657],
          [1.1331, 1.1506, 1.2206,  ..., 1.4657, 1.4657, 1.4482]],
 
         [[0.6182, 0.6182, 0.6182,  ...,

In [None]:
def collate_fn(examples):
    """Combine transformed examples into one batch.

    Args:
        examples: Sequence of dicts, each holding a ``pixel_values`` tensor and
            a ``labels`` value.

    Returns:
        Dict with ``pixel_values`` (the per-example tensors stacked along a new
        leading dimension) and ``labels`` (a tensor built from the label values).
    """
    pixel_batch = torch.stack(tuple(sample["pixel_values"] for sample in examples))
    label_batch = torch.tensor(list(sample["labels"] for sample in examples))
    return dict(pixel_values=pixel_batch, labels=label_batch)

In [None]:
# Build the id <-> name mappings stored in the model config from the class
# names of the training set's `labels` feature.
id2label = {
    index: name for index, name in enumerate(train_ds.features["labels"].names)
}
label2id = {name: index for index, name in id2label.items()}

In [None]:
# Load the pretrained checkpoint with a classification head sized for this
# task's labels.
model = AutoModelForImageClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    # NOTE(review): presumably needed because the checkpoint's original head
    # has a different number of classes than `num_labels` — confirm.
    ignore_mismatched_sizes=True,
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--convnext-base-224/snapshots/eda2970bc74154a2af92300316deecd49f72bea8/config.json
Model config ConvNextConfig {
  "_name_or_path": "facebook/convnext-base-224",
  "architectures": [
    "ConvNextForImageClassification"
  ],
  "depths": [
    3,
    3,
    27,
    3
  ],
  "drop_path_rate": 0.0,
  "hidden_act": "gelu",
  "hidden_sizes": [
    128,
    256,
    512,
    1024
  ],
  "id2label": {
    "0": "illustrated",
    "1": "not-illustrated"
  },
  "image_size": 224,
  "initializer_range": 0.02,
  "label2id": {
    "illustrated": 0,
    "not-illustrated": 1
  },
  "layer_norm_eps": 1e-12,
  "layer_scale_init_value": 1e-06,
  "model_type": "convnext",
  "num_channels": 3,
  "num_stages": 4,
  "patch_size": 4,
  "torch_dtype": "float32",
  "transformers_version": "4.22.2"
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--facebook--convnext-base-2

In [None]:
# Short names used to build the Hub repository id. Take the last path
# component rather than index 1, which raises IndexError for ids that have no
# "org/" prefix (e.g. "bert-base-uncased"); for "org/name" ids both agree.
model_name = MODEL_CHECKPOINT.split("/")[-1]
dataset_name = TRAIN_DATASET_CHECKPOINT.split("/")[-1]

In [None]:
# Training configuration. Evaluation and checkpointing both run once per epoch
# and the checkpoint with the best "f1" (the key returned by compute_metrics)
# is reloaded when training ends.
# NOTE(review): the Trainer's eval_dataset is built from the annotated test
# dataset, so selecting the best checkpoint on it means the reported scores
# are not from data held out of model selection — confirm this is intended.
args = TrainingArguments(
    "imagein",  # output directory; the training log shows it is a clone of the Hub repo
    save_strategy="epoch",
    evaluation_strategy="epoch",
    hub_model_id=f"ImageIN/{model_name}_finetuned_on_{dataset_name}",
    overwrite_output_dir=True,
    push_to_hub=True,
    label_smoothing_factor=0.2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.1,
    disable_tqdm=False,
    fp16=FP_16,  # True only when CUDA is available (see FP_16)
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_dir="logs",
    # The transforms read the `loaded_image` column, so the Trainer must not
    # strip columns that are not model inputs.
    remove_unused_columns=False,
    save_total_limit=10,
    seed=42, 
    report_to="wandb",
)

PyTorch: setting up devices


In [None]:
# Loaded again at notebook level (prepare_transforms only keeps a local copy)
# so the extractor can be handed to the Trainer.
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_CHECKPOINT)

loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--facebook--convnext-base-224/snapshots/eda2970bc74154a2af92300316deecd49f72bea8/preprocessor_config.json
Feature extractor ConvNextFeatureExtractor {
  "crop_pct": 0.875,
  "do_normalize": true,
  "do_resize": true,
  "feature_extractor_type": "ConvNextFeatureExtractor",
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  "image_std": [
    0.229,
    0.224,
    0.225
  ],
  "resample": 3,
  "size": 224
}



In [None]:
def compute_metrics(eval_pred):
    """Score evaluation predictions with macro precision/recall/F1 and accuracy.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the argmax
            of ``logits`` over the last axis.

    Returns:
        Dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``.
    """
    # Load every metric up front, in the same order in which it is reported.
    metrics = {
        name: load_metric(name) for name in ("precision", "recall", "f1", "accuracy")
    }

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    scores = {}
    # Precision, recall and F1 are computed with average="macro".
    for name in ("precision", "recall", "f1"):
        scores[name] = metrics[name].compute(
            predictions=predictions, references=labels, average="macro"
        )[name]
    # Accuracy takes no averaging argument.
    scores["accuracy"] = metrics["accuracy"].compute(
        predictions=predictions, references=labels
    )["accuracy"]
    return scores

In [None]:
# Wire the model, configuration, data and metrics together.
trainer = Trainer(
    model,
    args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,  # built from the annotated test dataset
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    # Passing the extractor here gets it saved with each checkpoint (the
    # training log shows "Feature extractor saved in ...").
    tokenizer=feature_extractor,
)

/content/imagein is already a clone of https://huggingface.co/ImageIN/convnext-base-224_finetuned_on_unlabelled_IA_with_snorkel_labels. Make sure you pull the latest changes with `repo.git_pull()`.
Using cuda_amp half precision backend


In [None]:
trainer.train()

***** Running training *****
  Num examples = 32326
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 20210
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.3611,0.346744,0.984269,0.972874,0.978409,0.984177
2,0.3524,0.345275,0.98534,0.978962,0.9821,0.986814
3,0.3466,0.343839,0.985404,0.984701,0.985052,0.988924
4,0.3433,0.343425,0.984995,0.980759,0.982854,0.987342
5,0.3404,0.34588,0.98534,0.978962,0.9821,0.986814
6,0.3384,0.345342,0.98534,0.978962,0.9821,0.986814
7,0.3382,0.343724,0.98643,0.982181,0.984283,0.988397
8,0.3358,0.344064,0.985727,0.982905,0.984306,0.988397
9,0.3349,0.344801,0.985727,0.982905,0.984306,0.988397
10,0.3325,0.344316,0.98643,0.982181,0.984283,0.988397


***** Running Evaluation *****
  Num examples = 1896
  Batch size = 16
Saving model checkpoint to imagein/checkpoint-2021
Configuration saved in imagein/checkpoint-2021/config.json
Model weights saved in imagein/checkpoint-2021/pytorch_model.bin
Feature extractor saved in imagein/checkpoint-2021/preprocessor_config.json
Feature extractor saved in imagein/preprocessor_config.json
Several commits (2) will be pushed upstream.
***** Running Evaluation *****
  Num examples = 1896
  Batch size = 16
Saving model checkpoint to imagein/checkpoint-4042
Configuration saved in imagein/checkpoint-4042/config.json
Model weights saved in imagein/checkpoint-4042/pytorch_model.bin
Feature extractor saved in imagein/checkpoint-4042/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 1896
  Batch size = 16
Saving model checkpoint to imagein/checkpoint-6063
Configuration saved in imagein/checkpoint-6063/config.json
Model weights saved in imagein/checkpoint-6063/pytorch_model.bin
Featu

TrainOutput(global_step=20210, training_loss=0.34263711520671136, metrics={'train_runtime': 10557.4405, 'train_samples_per_second': 30.619, 'train_steps_per_second': 1.914, 'total_flos': 2.5566335888419308e+19, 'train_loss': 0.34263711520671136, 'epoch': 10.0})

In [None]:
trainer.push_to_hub("training finished")

Saving model checkpoint to imagein
Configuration saved in imagein/config.json
Model weights saved in imagein/pytorch_model.bin
Feature extractor saved in imagein/preprocessor_config.json
Several commits (3) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.34k/334M [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/ImageIN/convnext-base-224_finetuned_on_unlabelled_IA_with_snorkel_labels
   b34bec0..1d7bc72  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/ImageIN/convnext-base-224_finetuned_on_unlabelled_IA_with_snorkel_labels
   b34bec0..1d7bc72  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Image Classification', 'type': 'image-classification'}, 'metrics': [{'name': 'Precision', 'type': 'precision', 'value': 0.986429696015502}, {'name': 'Recall', 'type': 'recall', 'value': 0.9821813379753294}, {'name': 'F1', 'type': 'f1', 'value': 0.984282848439815}, {'name': 'Accuracy', 'type': 'accuracy', 'value': 0.9883966244725738}]}
To https://huggingface.co/ImageIN/convnext-base-224_finetuned_on_unlabelled_IA_with_snorkel_labels
   1d7bc72..56d2df5  main -> main

   1d7bc72..56d2df5  m

'https://huggingface.co/ImageIN/convnext-base-224_finetuned_on_unlabelled_IA_with_snorkel_labels/commit/1d7bc725280a215cef5a3e2146ed02e1a8b95b57'

In [None]:
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁▃▃▃▃▃▃▅▃▅▃▅▂▆▇▅▆▆▃▇▅▆▅▁▆▇▇█▇█▇▇▇▆▇▇▇▇▆▅
eval/f1,▁▃▃▃▃▄▄▅▃▅▃▅▂▆▇▅▆▆▃▇▅▆▅▂▆▇▇█▇█▇▇▇▆▇▇▇▇▆▅
eval/loss,█▂▂▂▂▁▁▂▃▂▃▂▄▂▂▂▃▃▃▂▃▃▂▆▂▂▃▁▂▁▂▂▂▂▂▂▂▂▂▂
eval/precision,▆▆▆▆▆▄▄▇▆▇▆▅▆▇▇▅▇▇▆█▇▇▄▁▅▆▆█▆▇██▆▅▆▆██▆▅
eval/recall,▁▄▄▄▄▅▅▅▄▅▄▆▃▅▇▆▅▅▄▆▅▅▇▆▇██▇██▆▆█▇██▆▆▇▆
eval/runtime,▃█▁▂▂▁▂▂▁▃▁▃▄▂▂▂▂▃▃▂▂▄▃▃▃▃▂▃▃▃▄▂▂▃▃▃▃▂▃▂
eval/samples_per_second,▆▁█▇▇█▇▇█▆█▆▅▇▇▇▆▅▆▇▇▅▆▆▆▆▆▆▆▆▅▇▇▆▆▆▆▇▆▇
eval/steps_per_second,▆▁█▇▇█▇▇█▆█▆▅▇▇▇▆▅▆▇▇▅▆▆▆▆▇▆▆▆▅▇▇▆▆▆▆▇▆▇
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████

0,1
eval/accuracy,0.98243
eval/f1,0.97654
eval/loss,0.07492
eval/precision,0.97217
eval/recall,0.98113
eval/runtime,77.1187
eval/samples_per_second,7.378
eval/steps_per_second,0.467
train/epoch,50.0
train/global_step,4150.0
