Collection of methods used to pull down the Open Library data dump and compute cover-image embeddings using OpenAI's CLIP model


In [None]:
import pandas as pd
import requests
import os
import gzip
import json
import numpy as np

import pprint
import tempfile

from typing import Dict, Text

from ast import literal_eval

# import faiss
import torch
import skimage
# import pinecone
import numpy as np
import pandas as pd
from PIL import Image
from io import BytesIO
import IPython.display
import matplotlib.pyplot as plt
# from datasets import load_dataset
from collections import OrderedDict
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer

In [None]:
import PIL

# Setting MAX_IMAGE_PIXELS to None switches off Pillow's decompression-bomb
# size limit, so arbitrarily large cover images can be opened without a
# warning or error. Only do this for image sources you trust.
PIL.Image.MAX_IMAGE_PIXELS = None

Step 1: Download the data dump from https://openlibrary.org/data, then process it in batches with pandas

In [None]:
def download_data_dump(
    file_path='/content/drive/MyDrive/ml_app/book_data/ol_dump_editions_latest.txt.gz',
    output_dir='/content/drive/MyDrive/ml_app/book_data/processed_batches/',
    chunk_size=1000000,
):
    """Split the gzipped, tab-separated editions dump into CSV batch files.

    Despite the name, nothing is fetched over the network: the dump must
    already exist at ``file_path``. It is streamed in chunks so the whole
    file never has to fit in memory.

    Args:
        file_path: Path to the gzip-compressed, tab-separated dump.
        output_dir: Directory receiving ``processed_chunk_<n>.csv`` files;
            created if it does not exist.
        chunk_size: Number of dump rows written to each batch file.
    """
    os.makedirs(output_dir, exist_ok=True)

    # header=None: every line of the dump is a record. With header=0 the first
    # record was consumed as column names and then re-emitted as the header
    # line of every chunk file; process_raw_files reads those files with
    # explicit names (all rows as data), so that record showed up once per
    # chunk.
    reader = pd.read_csv(
        file_path,
        chunksize=chunk_size,
        compression='gzip',
        header=None,
        sep='\t',
        quotechar='"',
    )

    for i, chunk in enumerate(reader):
        output_file_path = os.path.join(output_dir, f'processed_chunk_{i + 1}.csv')
        # No header line is written, matching the headerless read downstream.
        chunk.to_csv(output_file_path, index=False, header=False)

        print(f"Processed chunk {i + 1}. Output saved to {output_file_path}")

In [None]:

def process_raw_files(
    processed_chunks_dir='/content/drive/MyDrive/ml_app/book_data/processed_batches/',
    output_dir='/content/drive/MyDrive/ml_app/book_data/all_raw',
    min_pages=100,
):
    """Expand the JSON column of each chunk file and keep English books.

    Every ``.csv`` file in ``processed_chunks_dir`` is read as five unnamed
    columns; the fifth holds one JSON document per book. The documents are
    flattened, filtered to records whose only language is ``/languages/eng``
    and whose page count is at least ``min_pages``, reduced to a fixed set of
    columns and written to ``<output_dir>/books_<i>.csv``, where ``i`` is the
    file's position in the directory listing.

    Args:
        processed_chunks_dir: Directory holding the chunk CSV files.
        output_dir: Directory receiving the filtered files; created if absent.
        min_pages: Minimum ``number_of_pages`` for a book to be kept.
    """
    select_columns = ['title', 'isbn_10', 'isbn_13', 'publish_date', 'key', 'subjects', 'languages', 'description.value', 'genres']

    # The chunk files carry the dump's five positional columns.
    column_names = ['col1', 'col2', 'col3', 'col4', 'col5']

    os.makedirs(output_dir, exist_ok=True)

    for i, filename in enumerate(os.listdir(processed_chunks_dir)):
        if not filename.endswith('.csv'):
            continue

        print(f'Processing file {filename}....')
        file_path = os.path.join(processed_chunks_dir, filename)
        chunk_df = pd.read_csv(file_path, names=column_names)

        # Raw book data is stored as JSON text; blank cells come back from
        # read_csv as NaN floats and are skipped rather than crashing json.loads.
        books_info_lst = [
            json.loads(book_record)
            for book_record in chunk_df['col5']
            if isinstance(book_record, str)
        ]
        books_df = pd.json_normalize(books_info_lst)

        # json_normalize only creates columns for keys that actually occur in
        # this batch, so make sure every column used below exists.
        for column in select_columns + ['number_of_pages']:
            if column not in books_df.columns:
                books_df[column] = float('nan')

        # Narrow the data down to English-language books with enough pages.
        books_df['languages_unpacked'] = books_df['languages'].apply(
            lambda lst: ', '.join(d['key'] for d in lst) if isinstance(lst, list) else ''
        )
        is_english = books_df['languages_unpacked'] == '/languages/eng'
        page_counts = pd.to_numeric(books_df['number_of_pages'], errors='coerce')

        final_df = books_df.loc[is_english & (page_counts >= min_pages), select_columns]
        final_df.to_csv(os.path.join(output_dir, f'books_{i}.csv'))



Step 2: Read in the full data dump and filter for the desired data (in this case, recent adult fiction published between 2020 and 2023)

In [None]:
def read_csvs_in_directory(directory):
    """Load every ``.csv`` file in ``directory`` into one DataFrame.

    Files are read in sorted filename order so the row order of the result is
    reproducible (``os.listdir`` makes no ordering guarantee). Entries that do
    not end in ``.csv`` are ignored.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        The concatenation of all CSV files with a fresh 0..n-1 index, or an
        empty DataFrame when the directory contains no CSV files.
    """
    dataframes = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".csv"):
            continue
        print(f'Processing file {filename}....')
        dataframes.append(pd.read_csv(os.path.join(directory, filename)))

    # pd.concat raises ValueError on an empty list, so handle that case here.
    if not dataframes:
        return pd.DataFrame()
    return pd.concat(dataframes, ignore_index=True)

In [None]:
def get_image_id(row):
    """Return the first usable ISBN of a book row, preferring ISBN-10.

    The ISBN cells are expected to hold the text form of a Python list (for
    example ``"['0123456789']"``), which is how list values come back after a
    CSV round trip; real lists and tuples are accepted as well.

    Args:
        row: Mapping-like record (DataFrame row or dict) with ``isbn_10`` and
            ``isbn_13`` entries.

    Returns:
        The first element of ``isbn_10`` when that list is non-empty,
        otherwise the first element of ``isbn_13``, otherwise ``None``.
        Missing values (NaN/None), empty lists and cells that cannot be
        parsed are all treated as "no ISBN".
    """
    for column in ('isbn_10', 'isbn_13'):
        raw = row[column]
        if isinstance(raw, str):
            try:
                parsed = literal_eval(raw)
            except (ValueError, SyntaxError):
                # A malformed cell should not abort a whole DataFrame.apply run.
                continue
        else:
            # NaN and None fall through here and fail the list check below.
            # The old `is not np.nan` identity test only recognised the
            # np.nan singleton itself.
            parsed = raw

        if isinstance(parsed, (list, tuple)) and parsed:
            return parsed[0]

    return None

In [None]:
def get_image_url(image_id):
    """Build the Open Library cover URL for an ISBN.

    Args:
        image_id: ISBN used as the cover lookup key.

    Returns:
        The ``.jpg`` cover URL, or ``None`` when ``image_id`` is falsy.
    """
    if not image_id:
        return None
    return f'https://covers.openlibrary.org/b/isbn/{image_id}.jpg'

In [None]:
def get_filtered_data(
    parent_directory='/content/drive/MyDrive/ml_app/book_data/all_raw',
    output_path='/content/drive/MyDrive/ml_app/book_data/recent_fiction.csv',
    years=('2020', '2021', '2022', '2023'),
):
    """Filter the raw book files down to recent adult fiction with cover URLs.

    A book is kept when its ``subjects`` mention "Fiction" but not "Juvenile"
    (case-insensitive), its ``publish_date`` contains one of ``years``, and an
    ISBN is available to build a cover-image URL from. The result is written
    to ``output_path`` with ``image_id`` and ``image_url`` columns added.

    Args:
        parent_directory: Directory of CSV files to load.
        output_path: Destination CSV file.
        years: Year strings searched for inside ``publish_date``.
    """
    result_df = read_csvs_in_directory(parent_directory)

    subjects = result_df['subjects'].fillna('')
    is_fiction = subjects.str.contains('Fiction', case=False)
    is_juvenile = subjects.str.contains('Juvenile', case=False)

    # astype(str) keeps the .str accessor usable even if pandas parsed the
    # column as numbers (e.g. a file whose dates are all bare years).
    publish_dates = result_df['publish_date'].fillna('').astype(str)
    is_recent = publish_dates.str.contains('|'.join(years))

    # .copy() so the column assignments below act on an independent frame
    # instead of a slice of result_df (avoids SettingWithCopyWarning).
    recent_fiction_df = result_df[is_fiction & ~is_juvenile & is_recent].copy()

    # The image id is the isbn_10 or isbn_13 code; it is used to build the URL.
    # A list comprehension (rather than apply(axis=1)) also works when no rows
    # survived the filters.
    recent_fiction_df['image_id'] = [
        get_image_id(row) for _, row in recent_fiction_df.iterrows()
    ]
    recent_fiction_df['image_url'] = recent_fiction_df['image_id'].apply(get_image_url)

    recent_fiction_df = recent_fiction_df[recent_fiction_df['image_url'].notna()]

    recent_fiction_df.to_csv(output_path)

Step 3: Batch through filtered dataset, pull down images, and save embeddings

In [None]:
def get_image(image_URL, timeout=30):
    """Download a cover image and return it as an RGB PIL image.

    Args:
        image_URL: Address of the image; a falsy value short-circuits to None.
        timeout: Seconds to wait for the server before giving up, so one
            stalled request cannot hang a whole batch run.

    Returns:
        The decoded image converted to RGB, or ``None`` when no URL was given,
        the download or decoding failed, or the image is a 1x1 pixel
        (treated as "no cover"; presumably the service's placeholder for
        unknown ISBNs).
    """
    if not image_URL:
        return None

    try:
        response = requests.get(image_URL, timeout=timeout)
        # Surface HTTP errors directly instead of as an image-decoding failure.
        response.raise_for_status()
        image = Image.open(BytesIO(response.content)).convert("RGB")
    except Exception as exc:
        # Best effort: a bad cover must not abort the batch. Unlike the former
        # bare `except:`, this lets KeyboardInterrupt/SystemExit propagate and
        # reports why the image was skipped.
        print(f"Error: {image_URL} ({exc})")
        return None

    if image.size == (1, 1):
        return None
    return image

In [None]:
# Loaded CLIP (model, processor) pairs keyed by (model id, device), so the
# weights are read from disk once per session instead of once per batch.
_CLIP_CACHE = {}


def _load_clip(model_ID, device):
    """Return the cached CLIP model and processor, loading them on first use."""
    cache_key = (model_ID, device)
    if cache_key not in _CLIP_CACHE:
        model = CLIPModel.from_pretrained(model_ID).to(device)
        processor = CLIPProcessor.from_pretrained(model_ID)
        _CLIP_CACHE[cache_key] = (model, processor)
    return _CLIP_CACHE[cache_key]


def get_embedding_batch(images):
    """Compute CLIP image embeddings for a batch of PIL images.

    Args:
        images: Sequence (list or array) of PIL images.

    Returns:
        A NumPy array with one embedding row per input image, or an empty
        list when ``images`` is empty or the batch could not be embedded.
    """
    images = list(images)
    if not images:
        return []

    model_ID = "openai/clip-vit-base-patch32"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, processor = _load_clip(model_ID, device)

    try:
        inputs = processor(images=images, return_tensors="pt")
        # The inputs must live on the same device as the model; leaving them
        # on the CPU made every batch fail when running on CUDA.
        pixel_values = inputs['pixel_values'].to(device)
        # Inference only, so skip building the autograd graph.
        with torch.no_grad():
            embedding = model.get_image_features(pixel_values=pixel_values)

        return embedding.cpu().detach().numpy()
    except Exception as exc:
        # Best effort per batch; report the real cause so failures are visible.
        print(f'Error: failed to embed image batch: {exc}')
        return []

In [None]:
def batch_process_get_embeddings(
    input_csv='/content/drive/MyDrive/ml_app/book_data/recent_fiction.csv',
    directory_path='/content/drive/MyDrive/ml_app/book_data/batched_embeddings/',
    step_size=100,
    chunk_size=5,
    max_rows=10000,
):
    """Download covers and save CLIP embeddings for the filtered books in batches.

    Rows of ``input_csv`` are processed ``step_size`` at a time. For each batch
    the cover images are downloaded, embedded ``chunk_size`` images at a time,
    and the batch rows plus an ``image_embeddings`` column are written to
    ``<directory_path>/<start>_<start + step_size>_w_embeddings.csv``. Rows
    without a usable image or embedding keep an empty ``image_embeddings``
    cell. The PIL image objects themselves are not written to the CSV.

    The run is resumable: it restarts after the highest end index found among
    the batch files already present in ``directory_path``.

    Args:
        input_csv: CSV with at least an ``image_url`` column.
        directory_path: Output directory for batch files; created if absent.
        step_size: Number of rows per output file.
        chunk_size: Number of images sent to the model per call.
        max_rows: Upper bound on the row index at which a batch may start.
    """
    filtered_df = pd.read_csv(input_csv)
    os.makedirs(directory_path, exist_ok=True)

    # Resume point: the largest "<start>_<end>_..." end index already on disk.
    # Compared numerically because a lexicographic sort would rank "900_1000"
    # after "1000_1100". Names that do not follow the pattern are ignored.
    latest_ind = 0
    for existing_name in os.listdir(directory_path):
        parts = existing_name.split('_')
        if len(parts) >= 3 and parts[0].isdigit() and parts[1].isdigit():
            latest_ind = max(latest_ind, int(parts[1]))

    # Stop at the end of the data so no empty batch files are written.
    last_row = min(max_rows, len(filtered_df))

    for start_row in range(latest_ind, last_row, step_size):
        print(f'Processing from row {start_row}')

        # .copy() so adding columns does not touch (or warn about) filtered_df.
        selected_rows = filtered_df.iloc[start_row:start_row + step_size].copy()
        selected_rows['image'] = selected_rows['image_url'].apply(get_image)

        image_df = selected_rows[selected_rows['image'].notna()]

        image_embeddings = []
        for i in range(0, len(image_df), chunk_size):
            print(f'Processing images from {i}')
            chunk_images = image_df['image'].iloc[i:i + chunk_size].values

            chunk_embeddings = list(get_embedding_batch(chunk_images))
            # A failed chunk comes back empty; pad it so the remaining
            # embeddings still line up with their rows.
            if len(chunk_embeddings) != len(chunk_images):
                chunk_embeddings = [None] * len(chunk_images)

            image_embeddings.extend(chunk_embeddings)

        # Attach embeddings by row label rather than merging on 'key', so
        # duplicate keys cannot multiply rows.
        embedding_by_row = dict(zip(image_df.index, image_embeddings))
        selected_rows['image_embeddings'] = [
            embedding_by_row.get(row_label) for row_label in selected_rows.index
        ]

        # Saving happens once per batch, inside the loop.
        batch_df = selected_rows.drop(columns=['image']).reset_index(drop=True)
        batch_df.to_csv(
            os.path.join(directory_path, f'{start_row}_{start_row + step_size}_w_embeddings.csv')
        )


In [None]:
def pkl_embeddings(
    embeddings_dir='/content/drive/MyDrive/ml_app/book_data/batched_embeddings/',
    output_path='/content/drive/MyDrive/ml_app/book_data/image_embeddings.pkl',
):
    """Collect the batched embedding CSVs and pickle the rows that have one.

    Embeddings are stored in the CSV files as the text form of an array
    (``"[0.1 0.2 ...]"``). Each is parsed back into a float array of shape
    ``(1, n)``. Rows without an embedding are dropped.

    Args:
        embeddings_dir: Directory of batch CSV files containing an
            ``image_embeddings`` column.
        output_path: Destination pickle file.
    """
    result_df = read_csvs_in_directory(embeddings_dir)

    # Drop rows without an embedding BEFORE parsing. Parsing first turned each
    # NaN into a 0-d array, which notna() no longer recognises as missing, so
    # those rows were never filtered out. .copy() avoids assigning into a
    # slice of result_df.
    image_df = result_df[result_df['image_embeddings'].notna()].copy()

    def _parse_embedding(text):
        """Turn '[v1 v2 ...]' (space- or comma-separated) into a (1, n) float array."""
        values = text.strip('[]').replace(',', ' ').split()
        return np.array(values, dtype=float).reshape(1, -1)

    image_df['image_embeddings'] = image_df['image_embeddings'].apply(_parse_embedding)

    image_df.to_pickle(output_path)