In [1]:
import os
import numpy as np
import pandas as pd
import torch
import math
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from ml_things import plot_dict, plot_confusion_matrix, fix_text
from sklearn.metrics import classification_report, accuracy_score

In [2]:
from transformers import (set_seed,
                          TrainingArguments,
                          Trainer,
                          GPT2Config,
                          GPT2Tokenizer,
                          AdamW, 
                          get_linear_schedule_with_warmup,
                          GPT2ForSequenceClassification)

  import imp


In [3]:
# Fix the seed through transformers' `set_seed` so repeated runs are comparable.
set_seed(123)

# Number of epochs; not referenced by the evaluation cells shown in this notebook.
epochs = 4

# Number of examples per batch handed to the DataLoader.
batch_size = 8

# Upper bound on the tokenized sequence length; longer texts are truncated by
# the collator.
max_length = 200

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Locations of the fine-tuned model and tokenizer. Built with os.path.join so
# the separator matches the host OS instead of being hard-coded to Windows'
# backslash (on Windows the resulting strings are unchanged).
model_path = os.path.join('fine_tuned_models_gpt2', 'model')

tokenizer_path = os.path.join('fine_tuned_models_gpt2', 'tokenizer')

In [4]:
DATA_DIR = "book-covers"

# Every immediate sub-directory of DATA_DIR is one book category.
# os.walk yields nothing when DATA_DIR does not exist, so the fallback tuple
# must carry an empty *directory* list at index 1 (the previous fallback had
# None there, which crashed on len()). Sorting makes the label ids independent
# of the order in which the filesystem happens to list the directories.
CATEGORIES = sorted(next(os.walk(DATA_DIR), (None, [], []))[1])

# Dictionary of labels and their id - this will be used to convert
# string labels to number ids.
labels_ids = {category: label_id for label_id, category in enumerate(CATEGORIES)}

# How many labels are we using in training.
# This is used to decide size of classification head.
n_labels = len(labels_ids)
print(labels_ids)
print(n_labels)

{'Art-Photography': 0, 'Biography': 1, 'Business-Finance-Law': 2, 'Childrens-Books': 3, 'Computing': 4, 'Crafts-Hobbies': 5, 'Crime-Thriller': 6, 'Dictionaries-Languages': 7, 'Entertainment': 8, 'Food-Drink': 9, 'Graphic-Novels-Anime-Manga': 10, 'Health': 11, 'History-Archaeology': 12, 'Home-Garden': 13, 'Humour': 14, 'Medical': 15, 'Mind-Body-Spirit': 16, 'Natural-History': 17, 'Personal-Development': 18, 'Poetry-Drama': 19, 'Reference': 20, 'Religion': 21, 'Romance': 22, 'Science-Fiction-Fantasy-Horror': 23, 'Science-Geography': 24, 'Society-Social-Sciences': 25, 'Sport': 26, 'Stationery': 27, 'Teaching-Resources-Education': 28, 'Technology-Engineering': 29, 'Teen-Young-Adult': 30, 'Transport': 31, 'Travel-Holiday-Guides': 32}
33


In [5]:
class BookDescriptionDataset(Dataset):
    """Dataset over a CSV file of book descriptions.

    The CSV is read once at construction time and must provide a 'text' and a
    'label' column. Items are returned untokenized.
    """

    def __init__(self, path, use_tokenizer):
        """Load the CSV found at `path`.

        Args:
            path: Location of the CSV file.
            use_tokenizer: Accepted for interface compatibility; this class
                does not use it.

        Raises:
            ValueError: If `path` does not exist.
        """
        path_is_missing = not os.path.exists(path)
        if path_is_missing:
            raise ValueError('Invalid path variable.')

        frame = pd.read_csv(path)
        self.texts, self.labels = frame['text'].values, frame['label'].values
        self.n_examples = len(self.labels)

    def __len__(self):
        """Return the number of rows read from the CSV."""
        return self.n_examples

    def __getitem__(self, item):
        """Return the text and label stored at position `item` as a dict."""
        sample = dict(text=self.texts[item], label=self.labels[item])
        return sample
        

In [6]:
class Gpt2ClassificationCollator(object):
    """Collate function that turns raw samples into a model-ready batch.

    Each sample is a dict with a 'text' and a 'label' entry. Texts are
    tokenized together (padded and truncated), labels are mapped to ids through
    `labels_encoder` and attached to the tokenizer output under 'labels'.
    """

    def __init__(self, use_tokenizer, labels_encoder, max_sequence_len=None):
        """Store the tokenizer, the label mapping and the length limit.

        Args:
            use_tokenizer: Callable tokenizer applied to the batch texts.
            labels_encoder: Mapping from label value to integer id.
            max_sequence_len: Truncation length; when None the tokenizer's
                `model_max_length` is used.
        """
        self.use_tokenizer = use_tokenizer
        if max_sequence_len is None:
            max_sequence_len = use_tokenizer.model_max_length
        self.max_sequence_len = max_sequence_len
        self.labels_encoder = labels_encoder

    def __call__(self, sequences):
        """Build one batch from a list of samples.

        Args:
            sequences: List of dicts, each with 'text' and 'label'.

        Returns:
            The tokenizer output, updated with a 'labels' tensor of label ids.

        Raises:
            KeyError: If a label is missing from `labels_encoder`.
        """
        batch_texts = [sample['text'] for sample in sequences]
        label_ids = [self.labels_encoder[sample['label']] for sample in sequences]

        # Tokenize the whole batch at once so every row is padded to a common length.
        inputs = self.use_tokenizer(
            text=batch_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.max_sequence_len,
        )
        inputs.update({'labels': torch.tensor(label_ids)})
        return inputs

In [7]:
def validation(dataloader, device_):
    """Run one evaluation pass of the global `model` over `dataloader`.

    Args:
        dataloader: Sized iterable of batches. Each batch is a mapping of
            tensors that contains a 'labels' entry; the whole mapping is passed
            to the model as keyword arguments, so the model computes the loss
            as well as the logits.
        device_: Device the batch tensors are moved to before the forward pass.

    Returns:
        A tuple `(true_labels, predictions_labels, avg_epoch_loss)`:
        the flattened true label ids, the arg-max predicted label ids (both as
        Python lists, in dataloader order) and the loss averaged over batches.
        `avg_epoch_loss` is NaN when the dataloader has no batches.
    """
    # The model lives in the notebook's global namespace.
    global model

    # Tracking variables.
    predictions_labels = []
    true_labels = []
    total_loss = 0
    num_batches = len(dataloader)

    # Evaluation mode: dropout layers behave differently than during training.
    model.eval()

    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for batch in tqdm(dataloader, total=num_batches):
            # Record the reference labels; go through the CPU so this also
            # works when the dataloader already yields tensors on another device.
            true_labels += batch['labels'].detach().cpu().numpy().flatten().tolist()

            # Move the batch to the target device as integer tensors.
            batch = {k: v.type(torch.long).to(device_) for k, v in batch.items()}

            # Labels are part of the batch, so the first two outputs are the
            # loss and the logits.
            outputs = model(**batch)
            loss, logits = outputs[:2]

            # `loss` holds a single value; `.item()` extracts it as a Python number.
            total_loss += loss.item()

            # Predicted class = index of the largest logit in each row.
            logits = logits.detach().cpu().numpy()
            predictions_labels += logits.argmax(axis=-1).flatten().tolist()

    # Average validation loss per batch; avoid dividing by zero when there
    # were no batches at all.
    avg_epoch_loss = total_loss / num_batches if num_batches else float('nan')

    # Return all true labels and predictions for future evaluations.
    return true_labels, predictions_labels, avg_epoch_loss

In [8]:
# Load the configuration stored with the fine-tuned model and set the number of
# output labels to the number of categories.
print('Loading configuraiton...')
model_config = GPT2Config.from_pretrained(pretrained_model_name_or_path=model_path, num_labels=n_labels)

# Load the tokenizer that was saved next to the fine-tuned model.
print('Loading tokenizer...')
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path=tokenizer_path)
# Pad on the left side of each sequence.
tokenizer.padding_side = "left"
# Reuse the EOS token as the padding token (PAD Token = EOS Token = 50256).
tokenizer.pad_token = tokenizer.eos_token


# Load the fine-tuned classification model with the configuration from above.
print('Loading model...')
model = GPT2ForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_path, config=model_config)

# Resize the model's token embeddings to the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))

# Make the model's padding token id agree with the tokenizer (PAD = EOS).
model.config.pad_token_id = model.config.eos_token_id

# Move the model to the device selected in the configuration cell.
model.to(device)
print('Model loaded to `%s`'%device)

Loading configuraiton...
Loading tokenizer...
Loading model...
Model loaded to `cuda`


In [9]:
# Create data collator to encode text and labels into numbers.
gpt2_classificaiton_collator = Gpt2ClassificationCollator(use_tokenizer=tokenizer, 
                                                          labels_encoder=labels_ids, 
                                                          max_sequence_len=max_length)

print('Dealing with Test...')
# Create pytorch dataset.
test_dataset =  BookDescriptionDataset(path=os.path.join('book_cover_temp\\test_data_sorted.csv'), 
                               use_tokenizer=tokenizer)
print('Created `test_dataset` with %d examples!'%len(test_dataset))

# Move pytorch dataset into dataloader.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=gpt2_classificaiton_collator)
print('Created `eval_dataloader` with %d batches!'%len(test_dataloader))

Dealing with Test...
Created `test_dataset` with 264 examples!
Created `eval_dataloader` with 33 batches!


In [10]:
# Evaluate the model on the test dataloader: collects the true label ids, the
# predicted label ids and the loss averaged over batches.
true_labels, predictions_labels, avg_epoch_loss = validation(test_dataloader, device)

  0%|          | 0/33 [00:00<?, ?it/s]

In [11]:
# Positions (0-based, in dataloader order) of the test examples whose predicted
# label equals the true label. enumerate() replaces the hand-maintained counter.
indices = [
    position
    for position, (true_label, predicted_label) in enumerate(zip(true_labels, predictions_labels))
    if true_label == predicted_label
]

In [14]:
# Persist the positions of the correctly classified examples, one per line.
# - The directory is created when missing so the write cannot fail on a fresh checkout.
# - The path is joined per-OS instead of using a hard-coded backslash.
# - The indices are written as integers (fmt="%d") rather than in numpy's
#   default float notation; np.loadtxt reads either form.
output_dir = "prediction_output"
os.makedirs(output_dir, exist_ok=True)
np.savetxt(os.path.join(output_dir, "gpt2_prediction_output.txt"),
           np.array(indices, dtype=int),
           fmt="%d")

Compute the intersection of the prediction results from both models (GPT-2 on the book descriptions and ViT on the book covers), and save the resulting indices for the book recommendation website.

In [2]:
import numpy as np
# Load the saved index files. Forward slashes are used because they are
# accepted on Windows as well as on POSIX systems, unlike a hard-coded backslash.
# np.loadtxt returns float arrays by default.
# The GPT-2 file holds the positions of correctly classified descriptions.
description_index = np.loadtxt("prediction_output/gpt2_prediction_output.txt")
# NOTE(review): presumably the equivalent positions from the ViT cover model;
# that file is not produced in this notebook - confirm its origin.
book_cover_index = np.loadtxt("prediction_output/vit_prediction_output.txt")

In [3]:
# Show the (float-valued) positions loaded from gpt2_prediction_output.txt.
description_index

array([  0.,   1.,   6.,  23.,  24.,  25.,  26.,  33.,  35.,  36.,  37.,
        39.,  44.,  46.,  47.,  49.,  50.,  53.,  54.,  56.,  58.,  59.,
        61.,  62.,  63.,  68.,  72.,  76.,  78.,  79.,  81.,  83.,  92.,
        98., 100., 102., 103., 104., 106., 108., 110., 114., 118., 120.,
       124., 126., 129., 130., 132., 135., 136., 141., 142., 149., 150.,
       152., 153., 155., 156., 167., 168., 170., 172., 174., 176., 181.,
       182., 184., 188., 190., 213., 218., 220., 221., 222., 226., 230.,
       231., 232., 233., 234., 235., 238., 239., 240., 241., 242., 246.,
       247., 250., 251., 252., 254., 256., 257., 258., 259., 262.])

In [4]:
# Show the (float-valued) positions loaded from vit_prediction_output.txt.
book_cover_index

array([ 12.,  32.,  34.,  40.,  43.,  48.,  54.,  56.,  70.,  71.,  72.,
        76.,  79.,  80.,  92., 110., 115., 121., 125., 134., 135., 136.,
       143., 145., 148., 158., 164., 171., 174., 178., 180., 182., 184.,
       185., 189., 210., 212., 213., 215., 218., 238., 241., 243., 244.,
       245., 246., 247., 249., 251., 253.])

In [5]:
# Values present in both index arrays, converted to a Python list. The entries
# are still floats at this point because both arrays were loaded as floats.
intersection = np.intersect1d(description_index, book_cover_index).tolist()

In [6]:
# Convert every shared index to a plain int so it can be used for positional
# row selection.
intersection = list(map(int, intersection))

In [7]:
# Show the shared indices after the conversion to int.
intersection

[54,
 56,
 72,
 76,
 79,
 92,
 110,
 135,
 136,
 174,
 182,
 184,
 213,
 218,
 238,
 241,
 246,
 247,
 251]

In [14]:
import pandas as pd
# Load the table with the book metadata. A forward slash is used because it is
# accepted on Windows as well as on POSIX systems, unlike a hard-coded backslash.
test_data = pd.read_csv('book_cover_temp/temp_data.csv')

In [15]:
# Select the rows at the shared positions (positional, not label-based, lookup).
# NOTE(review): the GPT-2 positions were computed on test_data_sorted.csv but
# are applied here to temp_data.csv - confirm both files share the same row order.
selected_book_info = test_data.iloc[intersection]

In [16]:
# Keep only the columns needed downstream and give the nested source names
# short ones; 'search_term' is kept under its original name.
display_columns = {
    'volumeInfo.title': 'title',
    'volumeInfo.description': 'description',
    'volumeInfo.imageLinks.thumbnail': 'thumbnail',
}
selected_book_info = (
    selected_book_info[[*display_columns, 'search_term']]
    .rename(columns=display_columns)
)
# Disabled: wrap the thumbnail URL in an HTML image tag.
# selected_book_info['thumbnail'] = '<img src=\"' + selected_book_info['thumbnail'] + '\">'

In [17]:
# Replace missing values (e.g. a book without a description) with a single space.
selected_book_info = selected_book_info.fillna(" ")

In [18]:
# Display the final table of selected books.
selected_book_info

Unnamed: 0,title,description,thumbnail,search_term
54,"Crime, Histoire & Sociétés",,http://books.google.com/books/content?id=xwttA...,Crime-Thriller
56,Blue Moon,"Reacher is on a Greyhound bus, minding his own...",http://books.google.com/books/content?id=V1zaD...,Crime-Thriller
72,Drink Progressively,DRINK PROGRESSIVELY offers readers an easy and...,http://books.google.com/books/content?id=0Zo2M...,Food-Drink
76,Food and Drink in American History,This three-volume encyclopedia on the history ...,http://books.google.com/books/content?id=o7gxB...,Food-Drink
79,We Shall Eat and Drink Again,,http://books.google.com/books/content?id=kAMMA...,Food-Drink
92,Archaeology For Dummies,An objective guide to this fascinating science...,http://books.google.com/books/content?id=GRz0y...,History-Archaeology
110,The Sweet Life,"An anthology of Laura Stoddart's exquisite, ti...",http://books.google.com/books/content?id=lvhlT...,Home-Garden
135,Mind and the New Physics,,http://books.google.com/books/content?id=9VkdA...,Mind-Body-Spirit
136,The Necessity of the Mind,,http://books.google.com/books/content?id=b4pcA...,Mind-Body-Spirit
174,Origen's Contra Celsum,Presented here for the first time in years is ...,http://books.google.com/books/content?id=8Tz7D...,Religion


In [19]:
# Export the selected books as CSV (the DataFrame index is written as the
# first, unnamed column). Forward slashes are used because they are accepted
# on Windows as well as on POSIX systems, unlike hard-coded backslashes.
selected_book_info.to_csv('book_cover_recommendation_page/documents/selected_books.csv')

In [15]:
# Write the shared indices to a text file, one integer per line and without a
# trailing newline.
index_lines = [str(int(index)) for index in intersection]
with open("book_recommendation.txt", "w") as out_file:
    out_file.write("\n".join(index_lines))