# CSV + API

In this reboot, we are going to use:

- The [Goodreads books](https://www.kaggle.com/jealousleopard/goodreadsbooks) dataset from Kaggle.
- The [Open Library Books API](https://openlibrary.org/dev/docs/api/books)

The goal of this livecode is to load the data from a CSV file, then loop over its rows to enrich each one with information from the API, such as:

- List of subjects (Science, Humor, Travel, etc.)
- The cover URL of the book
- Other information you'd find useful in the JSON API

First, download the CSV into the local folder:

In [1]:
!curl -L https://gist.githubusercontent.com/ssaunier/351b17f5a7a009808b60aeacd1f4a036/raw/books.csv > books.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1509k  100 1509k    0     0   755k      0  0:00:01  0:00:01 --:--:--  759k


In [2]:
!ls -lh

total 4264
-rw-r--r--  1 francoisgirard  staff   579B Nov 29  2022 README.md
-rw-r--r--  1 francoisgirard  staff    14K Oct 31 15:43 Recap.ipynb
-rw-r--r--  1 francoisgirard  staff   1.5M Oct 31 15:44 books.csv


Then import the usual suspects!

In [3]:
import requests
import pandas as pd
import numpy as np

## Load books from CSV

In [4]:
# Load books from CSV.
# ISBNs are identifiers, not numbers: values such as 0439785960 start with a
# zero that an integer parse would drop. Pinning the dtype keeps the column
# textual regardless of what pandas would infer from the data.
books_df = pd.read_csv('books.csv', dtype={'isbn': str})

# Display the first few rows of the dataframe to check the data
books_df.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,# num_pages,ratings_count,text_reviews_count
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling-Mary GrandPré,4.56,0439785960,9780439785969,eng,652,1944099,26249
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling-Mary GrandPré,4.49,0439358078,9780439358071,eng,870,1996446,27613
2,3,Harry Potter and the Sorcerer's Stone (Harry P...,J.K. Rowling-Mary GrandPré,4.47,0439554934,9780439554930,eng,320,5629932,70390
3,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.41,0439554896,9780439554893,eng,352,6267,272
4,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling-Mary GrandPré,4.55,043965548X,9780439655484,eng,435,2149872,33964


Let's add a new column

In [5]:
# Add a constant-valued column: the scalar is broadcast to every row.
books_df['sample_column'] = 'sample_value'

## API - Open Library

In [None]:
# Reload the books so this cell does not depend on the state left by earlier
# cells. ISBNs are kept as text so leading zeros survive and match the API keys.
books_df = pd.read_csv('books.csv', dtype={'isbn': str})

print(books_df.head())

# Add a sample column
books_df['sample_column'] = 'sample_value'

# Define the base URL for the Open Library API
BASE_URL = "https://openlibrary.org/api/books?bibkeys=ISBN:{}&format=json&jscmd=data"

# Seconds to wait for the API before giving up; without a timeout a single
# stalled connection would block the whole loop forever.
REQUEST_TIMEOUT = 10

# Initialize the new columns as object dtype: they will hold strings, and
# writing a string into a float64 NaN column is deprecated in recent pandas.
for column in ('cover_url', 'subjects'):
    books_df[column] = pd.Series(np.nan, index=books_df.index, dtype=object)

# Loop through the DataFrame and make one API request per book
for index, isbn in books_df['isbn'].items():
    response = requests.get(BASE_URL.format(isbn), timeout=REQUEST_TIMEOUT)

    if response.status_code != 200:  # Skip books whose request was not successful
        continue

    # The JSON payload is keyed by "ISBN:<isbn>"; look the book up once.
    book = response.json().get(f'ISBN:{isbn}', {})

    try:
        # Extract the cover URL
        books_df.at[index, 'cover_url'] = book['cover']['medium']
    except KeyError:
        pass  # If cover URL is not present, keep the NaN placeholder

    try:
        # Extract subjects
        subjects = [subject['name'] for subject in book['subjects']]
        books_df.at[index, 'subjects'] = ', '.join(subjects)
    except KeyError:
        pass  # If subjects are not present, keep the NaN placeholder

# Display the updated DataFrame
print(books_df.head())


   bookID                                              title  \
0       1  Harry Potter and the Half-Blood Prince (Harry ...   
1       2  Harry Potter and the Order of the Phoenix (Har...   
2       3  Harry Potter and the Sorcerer's Stone (Harry P...   
3       4  Harry Potter and the Chamber of Secrets (Harry...   
4       5  Harry Potter and the Prisoner of Azkaban (Harr...   

                      authors  average_rating        isbn         isbn13  \
0  J.K. Rowling-Mary GrandPré            4.56  0439785960  9780439785969   
1  J.K. Rowling-Mary GrandPré            4.49  0439358078  9780439358071   
2  J.K. Rowling-Mary GrandPré            4.47  0439554934  9780439554930   
3                J.K. Rowling            4.41  0439554896  9780439554893   
4  J.K. Rowling-Mary GrandPré            4.55  043965548X  9780439655484   

  language_code  # num_pages  ratings_count  text_reviews_count  
0           eng          652        1944099               26249  
1           eng          8

In [None]:
BASE_URL = "https://openlibrary.org/api/books?bibkeys=ISBN:{}&format=json&jscmd=data"

# The enriched columns hold strings, so create them as object dtype: writing
# a string into a float64 NaN column is deprecated in recent pandas.
for column in ('subjects', 'cover_url'):
    books_df[column] = pd.Series(np.nan, index=books_df.index, dtype=object)

# Only the ISBN is needed per row, so iterate that column directly.
for idx, isbn in books_df['isbn'].items():
    # The timeout keeps one stalled connection from blocking the whole loop.
    response = requests.get(BASE_URL.format(isbn), timeout=10)
    if response.status_code == 200:
        # The JSON payload is keyed by "ISBN:<isbn>".
        data = response.json().get(f'ISBN:{isbn}', {})
        subjects = ', '.join(subject['name'] for subject in data.get('subjects', []))
        # Missing values are stored as NaN (not '' or None) so that both
        # columns use the same "no data" marker as the batched version below.
        books_df.at[idx, 'subjects'] = subjects or np.nan
        books_df.at[idx, 'cover_url'] = data.get('cover', {}).get('medium', np.nan)

def get_book_details_from_api(isbn, timeout=10):
    """Fetch the cover URL and subjects of one book from the Open Library API.

    Args:
        isbn: ISBN of the book to look up.
        timeout: Seconds to wait for the API before ``requests`` raises a
            timeout error instead of blocking indefinitely.

    Returns:
        A dict with two keys: ``cover_url`` (the medium cover URL, or ``None``
        when the response has none) and ``subjects`` (the subject names joined
        with ``', '``, or an empty string when the response lists none).
        An empty dict is returned when the ISBN is missing or when the API
        does not answer with HTTP 200.
    """
    # A missing ISBN (None, NaN or blank) cannot match any book: skip the call.
    if pd.isna(isbn) or not str(isbn).strip():
        return {}

    headers = {
        "User-Agent": "YourAppName/1.0"  # Replace with your application's name or another identifier
    }
    response = requests.get(BASE_URL.format(isbn), headers=headers, timeout=timeout)
    if response.status_code != 200:
        return {}

    # The JSON payload is keyed by "ISBN:<isbn>"; look the book up once.
    book = response.json().get(f'ISBN:{isbn}', {})
    return {
        'cover_url': book.get('cover', {}).get('medium'),
        'subjects': ', '.join(subject['name'] for subject in book.get('subjects', [])),
    }

# The enriched columns hold strings, so make sure they are object dtype before
# writing: assigning a string into a float64 NaN column is deprecated in
# recent pandas. Columns that do not exist yet are created empty.
for column in ('cover_url', 'subjects'):
    if column in books_df.columns:
        books_df[column] = books_df[column].astype(object)
    else:
        books_df[column] = pd.Series(np.nan, index=books_df.index, dtype=object)

# Loop through the DataFrame (only the ISBN of each row is needed)
for index, isbn in books_df['isbn'].items():
    details = get_book_details_from_api(isbn)
    # `details` is empty when the lookup failed, so .get() yields None then.
    books_df.at[index, 'cover_url'] = details.get('cover_url')
    books_df.at[index, 'subjects'] = details.get('subjects')

# Display the updated DataFrame
print(books_df.head())


## Calling the API with multiple ISBNs at a time

In [None]:
# Define the base URL for the Open Library API.
# The placeholder receives the complete `bibkeys` value, i.e. one or more
# comma-separated "ISBN:<isbn>" keys built by fetch_details_for_group.
BASE_URL = "https://openlibrary.org/api/books?bibkeys={}&format=json&jscmd=data"
# Headers sent with every request to identify the client.
HEADERS = {
    "User-Agent": "YourAppName/1.0"  # Replace with your application's name or another identifier
}

def fetch_details_for_group(isbn_group, timeout=10):
    """Fetch details for a group of ISBNs with a single API call.

    Args:
        isbn_group: Iterable of ISBNs to look up together.
        timeout: Seconds to wait for the API before ``requests`` raises a
            timeout error instead of blocking indefinitely.

    Returns:
        The decoded JSON response (a dict keyed by ``"ISBN:<isbn>"``) when the
        API answers with HTTP 200, otherwise an empty dict. An empty group
        returns an empty dict without calling the API.
    """
    bibkeys = ",".join(f"ISBN:{isbn}" for isbn in isbn_group)
    if not bibkeys:
        # Nothing to look up: avoid a pointless round trip.
        return {}

    response = requests.get(BASE_URL.format(bibkeys), headers=HEADERS, timeout=timeout)

    if response.status_code == 200:
        return response.json()
    return {}

# Batch the ISBNs to groups (e.g., groups of 10)
BATCH_SIZE = 10
isbn_groups = [
    books_df['isbn'].iloc[start:start + BATCH_SIZE].tolist()  # .iloc: explicit positional slice
    for start in range(0, len(books_df), BATCH_SIZE)
]

# The enriched columns hold strings, so make sure they are object dtype before
# writing: assigning a string into a float64 NaN column is deprecated in
# recent pandas. Columns that do not exist yet are created empty.
for column in ('cover_url', 'subjects'):
    if column in books_df.columns:
        books_df[column] = books_df[column].astype(object)
    else:
        books_df[column] = pd.Series(np.nan, index=books_df.index, dtype=object)

for group in isbn_groups:
    # One API call per group; the response is keyed by "ISBN:<isbn>".
    details = fetch_details_for_group(group)

    for isbn in group:
        key = f"ISBN:{isbn}"
        if key not in details:
            continue  # No data for this ISBN: leave the row untouched

        data = details[key]
        # Extract cover_url
        cover_url = data.get('cover', {}).get('medium', np.nan)
        # Extract subjects (an empty list becomes NaN rather than '')
        subjects_data = data.get('subjects', [])
        subjects = ', '.join(subject['name'] for subject in subjects_data) or np.nan

        # Update the DataFrame; the row mask is computed once and reused
        # for both columns.
        matching_rows = books_df['isbn'] == isbn
        books_df.loc[matching_rows, 'cover_url'] = cover_url
        books_df.loc[matching_rows, 'subjects'] = subjects

# Display the updated DataFrame
print(books_df.head())