In [2]:
import requests
import math
# import matplotlib.pyplot as plt
import shutil
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from getpass import getpass
from PIL import Image, UnidentifiedImageError
from requests.exceptions import HTTPError
from io import BytesIO
from pathlib import Path
import torch
import pytorch_lightning as pl
from huggingface_hub import HfApi, HfFolder, Repository
from torch.utils.data import DataLoader
from torchmetrics import Accuracy
from torchvision.datasets import ImageFolder
from transformers import ViTFeatureExtractor, ViTForImageClassification

In [3]:
# Root folder of the test set; its immediate sub-directories are the categories.
DATA_DIR = Path('book_cover_temp')

# Category names = immediate sub-directories of DATA_DIR.
# They are sorted so that a name's list position matches the class index that
# `ImageFolder` assigns (it orders class folders by sorted name), regardless of
# the order in which the OS happens to list directories.
# The fallback tuple makes a missing DATA_DIR produce an empty list; the
# previous fallback put `None` in the directory slot, which broke every later
# `len(CATEGORIES)` / `enumerate(CATEGORIES)`.
CATEGORIES = sorted(next(os.walk(DATA_DIR), (None, [], None))[1])
print(CATEGORIES)

['Art-Photography', 'Biography', 'Business-Finance-Law', 'Childrens-Books', 'Computing', 'Crafts-Hobbies', 'Crime-Thriller', 'Dictionaries-Languages', 'Entertainment', 'Food-Drink', 'Graphic-Novels-Anime-Manga', 'Health', 'History-Archaeology', 'Home-Garden', 'Humour', 'Medical', 'Mind-Body-Spirit', 'Natural-History', 'Personal-Development', 'Poetry-Drama', 'Reference', 'Religion', 'Romance', 'Science-Fiction-Fantasy-Horror', 'Science-Geography', 'Society-Social-Sciences', 'Sport', 'Stationery', 'Teaching-Resources-Education', 'Technology-Engineering', 'Teen-Young-Adult', 'Transport', 'Travel-Holiday-Guides']


In [3]:
# Image dataset rooted at DATA_DIR (presumably one class per sub-folder — see torchvision docs).
test_ds = ImageFolder(root=DATA_DIR)

In [4]:
# For some reason the kernel keeps crashing when trying to plot test dataset samples,
# so the sampling code was moved to another notebook.

# plt.subplot()

: 

: 

In [11]:
# def display_book_samples(DATA_DIR):
#     CATEGORIES = next(os.walk(DATA_DIR), (None, None, []))[1]
#     num_examples_per_class = 3
#     f, axes = plt.subplots(nrows=len(CATEGORIES), ncols=num_examples_per_class)

#     axes = axes.ravel()
#     for classes in os.listdir(DATA_DIR):
#         d = os.path.join(DATA_DIR, classes)
#         if os.path.isdir(d):
#             print(d)
#         # for image_idx, image_path in enumerate(sorted(folder.glob('*'))):
#         #     if image_path.suffix in ds.extensions:
#         #         image = Image.open(image_path)
#         #         plt.subplot(len(ds.classes), num_examples_per_class, i)
#         #         ax = plt.gca()
#         #         ax.set_title(
#         #             class_name,
#         #             size='xx-large',
#         #             pad=5,
#         #             loc='left',
#         #             y=0,
#         #             backgroundcolor='white'
#         #         )
#         #         ax.axis('off')
#         #         plt.imshow(image)
#         #         i += 1

#         #         if image_idx + 1 == num_examples_per_class:
#         #             break

In [12]:
# display_book_samples(DATA_DIR)

: 

: 

In [5]:
# Label maps passed to the ViT model below; ids are stored as strings in both directions.
label2id = {name: str(idx) for idx, name in enumerate(CATEGORIES)}
id2label = {str(idx): name for idx, name in enumerate(CATEGORIES)}

In [6]:
class ImageClassificationCollator:
    """Collate ``(image, label)`` samples into a single batch.

    The wrapped feature extractor is called on the list of images with
    ``return_tensors='pt'``; the labels are then stored in its result under
    the ``'labels'`` key as a ``torch.long`` tensor.
    """

    def __init__(self, feature_extractor):
        self.feature_extractor = feature_extractor

    def __call__(self, batch):
        # Split the samples into parallel lists of images and targets.
        images = []
        targets = []
        for sample in batch:
            images.append(sample[0])
            targets.append(sample[1])

        encodings = self.feature_extractor(images, return_tensors='pt')
        encodings['labels'] = torch.tensor(targets, dtype=torch.long)
        return encodings

In [7]:
# The feature extractor and the model are both loaded from the local
# 'fine_tuned_models' folder.
feature_extractor = ViTFeatureExtractor.from_pretrained('fine_tuned_models')
model = ViTForImageClassification.from_pretrained(
    'fine_tuned_models',
    id2label=id2label,
    label2id=label2id,
    num_labels=len(label2id),
)

# Evaluation loader: batches of 8 in dataset order (no shuffling), collated
# through the feature extractor.
collator = ImageClassificationCollator(feature_extractor)
test_loader = DataLoader(
    test_ds,
    batch_size=8,
    shuffle=False,
    pin_memory=True,
    collate_fn=collator,
)

In [8]:
class Classifier(pl.LightningModule):
    """LightningModule that wraps a classification model for testing.

    Only ``test_step`` is active; the training and validation steps are kept
    as commented-out reference code.
    """

    def __init__(self, model, lr: float = 2e-5, **kwargs):
        super().__init__()
        # Record `lr` and the names of any extra keyword arguments as hyperparameters.
        self.save_hyperparameters('lr', *kwargs)
        self.model = model
        # Calling the module forwards straight to the wrapped model.
        self.forward = self.model.forward
        self.val_acc = Accuracy()

    # def training_step(self, batch, batch_idx):
    #     outputs = self(**batch)
    #     self.log("train_loss", outputs.loss)
    #     return outputs.loss

    # def validation_step(self, batch, batch_idx):
    #     outputs = self(**batch)
    #     self.log("val_loss", outputs.loss)
    #     acc = self.val_acc(outputs.logits.argmax(1), batch['labels'])
    #     self.log("val_acc", acc, prog_bar=True)
    #     return outputs.loss

    def test_step(self, batch, batch_idx):
        """Run one test batch, log ``test_loss`` and ``test_acc``, return the loss."""
        result = self(**batch)
        self.log("test_loss", result.loss)
        batch_accuracy = self.val_acc(result.logits.argmax(1), batch['labels'])
        self.log("test_acc", batch_accuracy, prog_bar=True)
        return result.loss

    def configure_optimizers(self):
        """Adam over all parameters with the learning rate stored in ``hparams``."""
        learning_rate = self.hparams.lr
        return torch.optim.Adam(self.parameters(), lr=learning_rate)

In [17]:
pl.seed_everything(42)
classifier = Classifier(model, lr=2e-5)
# `Trainer(gpus=...)` is deprecated since Lightning 1.7 (see the warning this
# cell used to print). Hardware is selected through `accelerator`/`devices`
# instead; "auto" lets Lightning pick the available accelerator rather than
# hard-requiring a GPU.
trainer = pl.Trainer(accelerator='auto', devices=1)
trainer.test(classifier, dataloaders=test_loader)

Global seed set to 42
  f"Setting `Trainer(gpus={gpus!r})` is deprecated in v1.7 and will be removed"
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_acc            0.19685038924217224
        test_loss            3.119323253631592
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 3.119323253631592, 'test_acc': 0.19685038924217224}]

In [17]:
def validation(test_loader, model, device_):
    """Evaluate ``model`` on every batch of ``test_loader`` without gradients.

    Args:
        test_loader: Sized iterable of batches. Each batch is a mapping that
            is unpacked into ``model(**batch)`` and holds a ``'labels'`` tensor.
        model: Module whose output yields ``(loss, logits)`` as its first two
            items. It is switched to eval mode.
        device_: Device the model and every batch are moved to before the
            forward pass. Pass ``None`` to leave both where they already are.

    Returns:
        ``(true_labels, predictions_labels, avg_epoch_loss)``: two flat lists
        of label ids and the mean per-batch loss (``nan`` for an empty loader).
    """
    predictions_labels = []
    true_labels = []
    total_loss = 0

    model.eval()
    if device_ is not None:
        # Keep model and data on the same device; previously `device_` was
        # ignored and CPU batches were fed to a model living on the GPU.
        model.to(device_)

    for test_batch in tqdm(test_loader, total=len(test_loader)):
        true_labels += test_batch['labels'].cpu().numpy().flatten().tolist()

        if device_ is not None:
            # Move every tensor-like entry; leave anything without `.to` untouched.
            test_batch = {
                key: value.to(device_) if hasattr(value, 'to') else value
                for key, value in test_batch.items()
            }

        with torch.no_grad():
            outputs = model(**test_batch)
            loss, logits = outputs[:2]
            logits = logits.detach().cpu().numpy()
            total_loss += loss.item()
            predictions_labels += logits.argmax(axis=-1).flatten().tolist()

    n_batches = len(test_loader)
    # An empty loader has no meaningful average; report nan instead of dividing by zero.
    avg_epoch_loss = total_loss / n_batches if n_batches else float('nan')

    return true_labels, predictions_labels, avg_epoch_loss

In [49]:
# `device` is otherwise only defined much further down, so this cell failed on
# a fresh top-to-bottom run. Use the GPU when available, else the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
true_labels, predictions_labels, avg_epoch_loss = validation(test_loader, model, device)

  0%|          | 0/32 [00:00<?, ?it/s]

In [54]:
# Mean per-batch loss returned by validation().
print(avg_epoch_loss)

3.1099701076745987


In [56]:
# Category names, in the order their ids were assigned.
print(label2id.keys())

dict_keys(['Art-Photography', 'Biography', 'Business-Finance-Law', 'Childrens-Books', 'Computing', 'Crafts-Hobbies', 'Crime-Thriller', 'Dictionaries-Languages', 'Entertainment', 'Food-Drink', 'Graphic-Novels-Anime-Manga', 'Health', 'History-Archaeology', 'Home-Garden', 'Humour', 'Medical', 'Mind-Body-Spirit', 'Natural-History', 'Personal-Development', 'Poetry-Drama', 'Reference', 'Religion', 'Romance', 'Science-Fiction-Fantasy-Horror', 'Science-Geography', 'Society-Social-Sciences', 'Sport', 'Stationery', 'Teaching-Resources-Education', 'Technology-Engineering', 'Teen-Young-Adult', 'Transport', 'Travel-Holiday-Guides'])


In [50]:
from ml_things import plot_confusion_matrix
from sklearn.metrics import classification_report

In [None]:
# Positions of the samples whose predicted label equals the true label.
indices = [
    position
    for position, (expected, predicted) in enumerate(zip(true_labels, predictions_labels))
    if expected == predicted
]

In [75]:
prediction_output = 'prediction_output'

# `np.savetxt` does not create missing directories; make sure the target
# folder exists so the cell also works on a fresh checkout.
os.makedirs(prediction_output, exist_ok=True)
np.savetxt(os.path.join(prediction_output, "vit_prediction_output.txt"), np.array(indices))

50 of the book covers were classified correctly by the ViT model. The next step is to predict the category from each book's description with the NLP model.

In [70]:
# Number of samples whose predicted label matched the true label.
print(len(indices))

50


In [4]:
test_description = pd.read_csv(os.path.join(DATA_DIR, 'temp_data.csv'))

# Rows without a description fall back to the title. The result is assigned
# back explicitly: `fillna(..., inplace=True)` on a selected column is chained
# assignment and is not guaranteed to update the parent frame (it stops doing
# so under pandas copy-on-write).
test_description['volumeInfo.description'] = (
    test_description['volumeInfo.description'].fillna(test_description['volumeInfo.title'])
)

# Two-column table for the text classifier: description -> 'text', search term -> 'label'.
test_description_sorted = pd.DataFrame({
    'text': test_description['volumeInfo.description'],
    'label': test_description['search_term'],
})
test_description_sorted

Unnamed: 0,text,label
0,Photographs are an integral part of our daily ...,Art-Photography
1,Susan Sontag's On Photography is a seminal and...,Art-Photography
2,Contemporary Photography and Theory offers an ...,Art-Photography
3,Art and Photography is the first book of its k...,Art-Photography
4,"The most comprehensive, up-to-date resource fo...",Art-Photography
...,...,...
259,"Dear Traveler, Welcome to the WanderStories™ g...",Travel-Holiday-Guides
260,"Action-packed, thrill-filled holidays begin ri...",Travel-Holiday-Guides
261,"Over 2,500 courses covered in detail. Hotels r...",Travel-Holiday-Guides
262,Insight Guides Great Breaks Guernsey Travel ma...,Travel-Holiday-Guides


In [5]:
# Total number of missing cells across both columns.
test_description_sorted.isna().sum().sum()

0

In [6]:
# Write the text/label table into DATA_DIR so it can be loaded by path later.
sorted_csv_path = os.path.join(DATA_DIR, "test_data_sorted.csv")
test_description_sorted.to_csv(sorted_csv_path)

In [7]:
from transformers import (set_seed,
                          TrainingArguments,
                          Trainer,
                          GPT2Config,
                          GPT2Tokenizer,
                          AdamW, 
                          get_linear_schedule_with_warmup,
                          GPT2ForSequenceClassification)

In [10]:
set_seed(123)

epochs = 4
batch_size = 8
# Passed to the tokenizer as `max_length` (longer texts are truncated).
max_length = 200
n_labels = len(CATEGORIES)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the paths with os.path.join instead of hard-coded '\\' separators:
# on Windows the resulting strings are unchanged, and the notebook now also
# finds the folders on Linux/macOS.
model_path = os.path.join('fine_tuned_models_gpt2', 'model')
tokenizer_path = os.path.join('fine_tuned_models_gpt2', 'tokenizer')

In [11]:
print('Loading configuraiton...')
model_config = GPT2Config.from_pretrained(model_path, num_labels=n_labels)

print('Loading tokenizer...')
tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
# Reuse the EOS token (id 50256) as the padding token and pad on the left.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

print('Loading model...')
model = GPT2ForSequenceClassification.from_pretrained(model_path, config=model_config)
# Keep the embedding matrix the same size as the tokenizer's vocabulary.
model.resize_token_embeddings(len(tokenizer))
# Tell the model which token id is padding (same id as EOS).
model.config.pad_token_id = model.config.eos_token_id

# Move the model to the configured device.
model.to(device)
print(f'Model loaded to `{device}`')

Loading configuraiton...
Loading tokenizer...
Loading model...
Model loaded to `cuda`


In [12]:
from torch.utils.data import Dataset, DataLoader
class BookDescriptionDataset(Dataset):
    """Dataset over a CSV file that has ``text`` and ``label`` columns.

    Raises:
        ValueError: if ``path`` does not exist.
    """

    def __init__(self, path, use_tokenizer):
        # `use_tokenizer` is accepted but not used by this class.
        if not os.path.exists(path):
            raise ValueError('Invalid path variable.')

        frame = pd.read_csv(path)
        self.texts = frame['text'].values
        self.labels = frame['label'].values
        self.n_examples = len(self.labels)

    def __len__(self):
        return self.n_examples

    def __getitem__(self, item):
        # One sample: the raw text and its (still un-encoded) label.
        return dict(text=self.texts[item], label=self.labels[item])
        

In [13]:
class Gpt2ClassificationCollator:
    """Turn a list of ``{'text', 'label'}`` samples into one tokenized batch.

    Args:
        use_tokenizer: Callable tokenizer; its ``model_max_length`` is used
            when ``max_sequence_len`` is not given.
        labels_encoder: Mapping from raw label to integer id.
        max_sequence_len: Optional ``max_length`` forwarded to the tokenizer.
    """

    def __init__(self, use_tokenizer, labels_encoder, max_sequence_len=None):
        self.use_tokenizer = use_tokenizer
        # Fall back to the tokenizer's own limit when no length is given.
        if max_sequence_len is None:
            max_sequence_len = use_tokenizer.model_max_length
        self.max_sequence_len = max_sequence_len
        self.labels_encoder = labels_encoder

    def __call__(self, sequences):
        texts = []
        encoded_labels = []
        for sequence in sequences:
            texts.append(sequence['text'])
            # Translate the raw label into its integer id.
            encoded_labels.append(self.labels_encoder[sequence['label']])

        # Tokenize the whole batch at once, with padding and truncation enabled.
        inputs = self.use_tokenizer(
            text=texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.max_sequence_len,
        )
        # Attach the encoded labels as a tensor under the 'labels' key.
        inputs.update({'labels': torch.tensor(encoded_labels)})

        return inputs

In [14]:
# Map each category name to its position in CATEGORIES.
labels_ids = {name: idx for idx, name in enumerate(CATEGORIES)}
gpt2_classificaiton_collator = Gpt2ClassificationCollator(
    use_tokenizer=tokenizer,
    labels_encoder=labels_ids,
    max_sequence_len=max_length,
)

In [18]:
print('Dealing with Test...')
# Build the dataset from the CSV written earlier into DATA_DIR.
test_csv_path = os.path.join(DATA_DIR, "test_data_sorted.csv")
test_dataset = BookDescriptionDataset(path=test_csv_path, use_tokenizer=tokenizer)
print('Created `test_dataset` with %d examples!' % len(test_dataset))

# Wrap it in a dataloader: fixed order, batches tokenized by the GPT-2 collator.
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=gpt2_classificaiton_collator,
)
print('Created `eval_dataloader` with %d batches!' % len(test_dataloader))

Dealing with Test...
Created `test_dataset` with 264 examples!
Created `eval_dataloader` with 33 batches!


In [1]:
# Evaluate on `device`, the same device the GPT-2 model was moved to, instead
# of a hard-coded 'cuda:0' that breaks on machines without a GPU.
true_labels, predictions_labels, avg_epoch_loss = validation(test_dataloader, model, device)

NameError: name 'validation' is not defined

In [23]:
# True label ids collected by validation(), in dataloader order.
print(true_labels)

NameError: name 'true_labels' is not defined