In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time

In [2]:
raw = pd.read_csv("data_sets/Books_Raw.csv", na_values='None')

In [3]:
import csv
import requests

def fetch_isbn(title, author):
    url = f'https://www.googleapis.com/books/v1/volumes?q=intitle:{title}+inauthor:{author}&maxResults=1'
    response = requests.get(url)
    data = response.json()
    if 'items' in data and data['items']:
        volume_info = data['items'][0]['volumeInfo']
        if 'industryIdentifiers' in volume_info:
            return volume_info['industryIdentifiers'][0]['identifier']
    return None

In [4]:
isbn_list = []
m = 20

start_time = time.time()
for i in range(m):
    row = raw.loc[i]
    isbn = fetch_isbn(row['Title'], row['authors'])
    isbn_list.append(isbn)
end_time = time.time()

In [5]:
print("Time to fetch", m, "ISBN's:", end_time - start_time)
print("Time per ISBN:", (end_time - start_time) / m)
print("Time for all ISBN's:", len(raw) * (end_time - start_time) / m / 3600, "hours")

Time to fetch 20 ISBN's: 27.258432149887085
Time per ISBN: 1.3629216074943542
Time for all ISBN's: 80.41388919950856 hours


In [6]:
[(len(i) if i != None else 0) for i in isbn_list]
isbn_list

[None,
 '0826417086',
 '0829814000',
 '9780595344550',
 None,
 '0802841899',
 'OCLC:8772487',
 'LCCN:89106899',
 '9781618902917',
 '0963923099',
 None,
 None,
 '1510738614',
 '9781461536260',
 '9781787209596',
 'OCLC:939613762',
 '0414088107',
 '9781466895423',
 '9780307489272',
 '9781513262833']

In [7]:
def not_an_isbn(title, author):
    return len(title+author)

def update_csv(input_file, output_file):
    with open(input_file, 'r',encoding='utf-8') as csv_file:
        reader = csv.DictReader(csv_file)
        fieldnames = reader.fieldnames + ['ISBN']
        with open(output_file, 'w', newline='',encoding='utf-8') as output_csv:
            writer = csv.DictWriter(output_csv, fieldnames=fieldnames)
            writer.writeheader()
            for row in reader:
                isbn = not_an_isbn(row['Title'], row['authors'])
                row['ISBN'] = isbn if isbn else ''
                writer.writerow(row)

start_time = time.time()
update_csv("data_sets/Books_Raw.csv", "data_sets/iban_books.csv")
end_time = time.time()
print("Time for reading and writing csv's:", round(end_time - start_time, 1), "second")

Time for reading and writing csv's: 8.0 second
