Web scraping of collections data and general bibliography


# Query on data.bnf.fr

It's not possible to run the query directly from Python, so this query:

```sparql
PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX bif: <bif:>
SELECT * 
WHERE {
  ?work dct:title ?title ;
        dct:publisher ?publisher;
        dct:date ?date;
        rdfs:seeAlso ?uri ;
        bnf-onto:isbn ?isbn
  FILTER (bif:contains(?title, "De_Kooning"))}
```

was run on the data.bnf.fr web SPARQL endpoint, and the result was then downloaded as a CSV file (`databnf.csv`).

In [137]:
import pandas as pd
# Load the data.bnf.fr query result that was downloaded as CSV from the web endpoint.
# ISBNs are identifiers, not numbers: read the column as text so that ISBN-10
# values keep their leading zero whatever else the column happens to contain.
getty2 = pd.read_csv("databnf.csv", dtype={"isbn": str})

getty2.head()


Unnamed: 0,work,title,publisher,date,uri,isbn
0,http://data.bnf.fr/ark:/12148/cb45692271m#about,Le rire de De Kooning,"Bordeaux : Olympique , 2019",2019,https://catalogue.bnf.fr/ark:/12148/cb45692271m,978-2-9557550-6-8
1,http://data.bnf.fr/ark:/12148/cb45292173b#about,A way of living : the art of Willem De Kooning,"London : Phaidon Press Limited , 2017",2014,https://catalogue.bnf.fr/ark:/12148/cb45292173b,0714845817
2,http://data.bnf.fr/ark:/12148/cb45292173b#about,A way of living : the art of Willem De Kooning,"London : Phaidon Press Limited , 2017",2014,https://catalogue.bnf.fr/ark:/12148/cb45292173b,0714873160
3,http://data.bnf.fr/ark:/12148/cb45292173b#about,A way of living : the art of Willem De Kooning,"London : Phaidon Press Limited , 2017",2014,https://catalogue.bnf.fr/ark:/12148/cb45292173b,9780714845814
4,http://data.bnf.fr/ark:/12148/cb45292173b#about,A way of living : the art of Willem De Kooning,"London : Phaidon Press Limited , 2017",2014,https://catalogue.bnf.fr/ark:/12148/cb45292173b,9780714873169


In [138]:
# Remove the hyphens from the BnF ISBNs (e.g. "978-2-9557550-6-8") so that they
# use the same un-hyphenated form as the identifiers they are merged with on
# the `isbn` column further down.
# The vectorised string method handles the whole column at once: it does not
# rely on the index being 0..n-1 (as enumerate + .at did) and it leaves missing
# values untouched instead of raising on `"-" in value`.
getty2["isbn"] = getty2["isbn"].str.replace("-", "", regex=False)
getty2.head()

Unnamed: 0,work,title,publisher,date,uri,isbn
0,http://data.bnf.fr/ark:/12148/cb45692271m#about,Le rire de De Kooning,"Bordeaux : Olympique , 2019",2019,https://catalogue.bnf.fr/ark:/12148/cb45692271m,9782955755068
1,http://data.bnf.fr/ark:/12148/cb45292173b#about,A way of living : the art of Willem De Kooning,"London : Phaidon Press Limited , 2017",2014,https://catalogue.bnf.fr/ark:/12148/cb45292173b,714845817
2,http://data.bnf.fr/ark:/12148/cb45292173b#about,A way of living : the art of Willem De Kooning,"London : Phaidon Press Limited , 2017",2014,https://catalogue.bnf.fr/ark:/12148/cb45292173b,714873160
3,http://data.bnf.fr/ark:/12148/cb45292173b#about,A way of living : the art of Willem De Kooning,"London : Phaidon Press Limited , 2017",2014,https://catalogue.bnf.fr/ark:/12148/cb45292173b,9780714845814
4,http://data.bnf.fr/ark:/12148/cb45292173b#about,A way of living : the art of Willem De Kooning,"London : Phaidon Press Limited , 2017",2014,https://catalogue.bnf.fr/ark:/12148/cb45292173b,9780714873169


# Query on the Google Books API

In [139]:
import requests
import json

def fetch_books(query, max_results=40, timeout=30):
    """Collect all volumes the Google Books API returns for ``query``.

    The endpoint is paged: every request asks for ``max_results`` items
    starting at ``startIndex``. Paging stops at the first response that has
    no ``items`` or whose status code is not 200.

    Parameters
    ----------
    query : str
        Search expression sent as the ``q`` parameter.
    max_results : int, optional
        Page size sent as ``maxResults``. This is the size of each page, not a
        cap on the total number of results. Must be at least 1.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so that a stalled connection cannot
        block the notebook indefinitely.

    Returns
    -------
    list
        The ``items`` entries of every successful page, in the order received.
        If a request fails, the items collected so far are returned.

    Raises
    ------
    ValueError
        If ``max_results`` is smaller than 1; ``startIndex`` would then never
        move forward.
    """
    if max_results < 1:
        raise ValueError("max_results must be at least 1")

    base_url = "https://www.googleapis.com/books/v1/volumes"
    all_results = []
    start_index = 0

    while True:
        response = requests.get(
            base_url,
            params={
                "q": query,
                "startIndex": start_index,
                "maxResults": max_results,
            },
            timeout=timeout,
        )

        if response.status_code != 200:
            # Best effort: report the failure and keep what was already fetched.
            print("Failed to retrieve data. Status code:", response.status_code)
            break

        items = response.json().get("items", [])
        if not items:
            # A page without items marks the end of the result set.
            break

        all_results.extend(items)
        start_index += max_results

    return all_results

# Query the Google Books API for "De Kooning".
# The variable and file names still say "cezanne"; they are kept unchanged
# because the next cell reads the same file.
cezanne_books = fetch_books("De Kooning")

# Persist the raw API items as pretty-printed JSON.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
output_path = "C:/Users/crosi/Downloads/cezanne_books.json"
with open(output_path, "w") as json_file:
    json_file.write(json.dumps(cezanne_books, indent=4))

print("JSON data saved to cezanne_books.json")

JSON data saved to cezanne_books.json


In [140]:
import json
import pandas as pd

# Load the raw Google Books items written by the previous cell.
with open("C:/Users/crosi/Downloads/cezanne_books.json", "r") as json_file:
    cezanne_books_data = json.load(json_file)

# Flatten the "volumeInfo" part of every item into one record per book.
# Missing fields fall back to "N/A".
books_list = []
for book in cezanne_books_data:
    volume_info = book.get("volumeInfo", {})
    # `or [{}]` also covers an "industryIdentifiers" key that is present but
    # empty, which would otherwise raise IndexError on `[0]`.
    identifiers = volume_info.get("industryIdentifiers") or [{}]
    book_info = {
        "Title": volume_info.get("title", "N/A"),
        "Subtitle": volume_info.get("subtitle", "N/A"),
        "Authors": ", ".join(volume_info.get("authors", ["N/A"])),
        "Publisher": volume_info.get("publisher", "N/A"),
        "PublishedDate": volume_info.get("publishedDate", "N/A"),
        "Description": volume_info.get("description", "N/A"),
        # Only the first listed identifier is kept; it is not always an ISBN
        # (values such as "OCLC:..." or "UOM:..." also occur).
        "isbn": identifiers[0].get("identifier", "N/A"),
        "Categories": ", ".join(volume_info.get("categories", ["N/A"])),
    }
    books_list.append(book_info)

# One row per Google Books item.
books_df = pd.DataFrame(books_list)

books_df.head()


Unnamed: 0,Title,Subtitle,Authors,Publisher,PublishedDate,Description,isbn,Categories
0,"De Kooning, dipinti, disegni, sculture",,Willem De Kooning,,1985,,UOM:39015015825683,
1,De Kooning,,,,1985,,OCLC:12250843,
2,Willem De Kooning,late paintings,"Willem De Kooning, Museo Carlo Bilotti",Mondadori Electa,2006,A publication displaying the late works of one...,UOM:39015066851935,Art
3,De Kooning,A Retrospective,"Willem De Kooning, John Elderfield, Lauren Mah...",The Museum of Modern Art,2011,This publication offers an unparalleled opport...,9780870707971,Art
4,Willem de Kooning,,Carolyn Lanchner,The Museum of Modern Art,2011,Willem de Kooning was a pioneering figure amon...,9780870707889,Art


In [141]:
# Count the titles that spell the name as "de Kooning" or "De Kooning".
liss = []
if "Title" in books_df.columns:
    liss = [
        title
        for title in books_df["Title"]
        if "de Kooning" in title or "De Kooning" in title
    ]
print(len(liss))

148


In [142]:
import pandas as pd

# Keep the books whose title mentions "de Kooning" (any capitalisation) but not
# "Elaine de Kooning".
mentions_de_kooning = books_df["Title"].str.contains("de Kooning", case=False)
mentions_elaine = books_df["Title"].str.contains("Elaine de Kooning", case=False)

new_dataframe = books_df[mentions_de_kooning & ~mentions_elaine].copy().reset_index(drop=True)

new_dataframe.head()



Unnamed: 0,Title,Subtitle,Authors,Publisher,PublishedDate,Description,isbn,Categories
0,"De Kooning, dipinti, disegni, sculture",,Willem De Kooning,,1985,,UOM:39015015825683,
1,De Kooning,,,,1985,,OCLC:12250843,
2,Willem De Kooning,late paintings,"Willem De Kooning, Museo Carlo Bilotti",Mondadori Electa,2006,A publication displaying the late works of one...,UOM:39015066851935,Art
3,De Kooning,A Retrospective,"Willem De Kooning, John Elderfield, Lauren Mah...",The Museum of Modern Art,2011,This publication offers an unparalleled opport...,9780870707971,Art
4,Willem de Kooning,,Carolyn Lanchner,The Museum of Modern Art,2011,Willem de Kooning was a pioneering figure amon...,9780870707889,Art


In [143]:
import pandas as pd

# Inner join of the BnF records (getty2) with the filtered Google Books records
# (new_dataframe) on their shared 'isbn' column: only identifiers present in
# both tables are kept.
df_combined = getty2.merge(new_dataframe, how='inner', on='isbn')

df_combined


Unnamed: 0,work,title,publisher,date,uri,isbn,Title,Subtitle,Authors,Publisher,PublishedDate,Description,Categories
0,http://data.bnf.fr/ark:/12148/cb45292173b#about,A way of living : the art of Willem De Kooning,"London : Phaidon Press Limited , 2017",2014,https://catalogue.bnf.fr/ark:/12148/cb45292173b,0714873160,Willem de Kooning,A Way of Living,Judith Zilczer,Phaidon Press,2017-05-22,"In a new format, the bestselling monograph on ...",Art
1,http://data.bnf.fr/ark:/12148/cb347640459#about,Willem De Kooning : Whitney museum of American...,"Paris : Centre Georges-Pompidou , 1984",1984,https://catalogue.bnf.fr/ark:/12148/cb347640459,2858502552,De Kooning,,"Claire Stoullig, Catherine Bompuis, Akademie d...",,1984,,Art
2,http://data.bnf.fr/ark:/12148/cb37526494b#about,"Willem de Kooning : drawings, paintings, sculp...",New York : Whitney museum of American art ; Mu...,1983,https://catalogue.bnf.fr/ark:/12148/cb37526494b,0393018407,Willem de Kooning,"Drawings, Paintings, Sculpture, [mostra Itiner...","Paul Cummings, Willem De Kooning",,1983,,
3,http://data.bnf.fr/ark:/12148/cb45288067c#about,"Willem De Kooning, Zao Wou-Ki : [exposition Lé...",New York : Lévy Gorvy,2017,https://catalogue.bnf.fr/ark:/12148/cb45288067c,1944379126,De Kooning - Zao Wou-KI,,,Dominique Levy Gallery,2017-03-28,This volume explores the parallel careers of t...,Art
4,http://data.bnf.fr/ark:/12148/cb42265321b#about,Willem de Kooning : the artist's materials,"Los Angeles : Getty conservation institute , c...",2010,https://catalogue.bnf.fr/ark:/12148/cb42265321b,9781606060216,Willem de Kooning,The Artist's Materials,Susan Lake,Getty Publications,2010,This in-depth study of the paintings of Willem...,Art
5,http://data.bnf.fr/ark:/12148/cb34982520m#about,"Willem de Kooning, recent paintings, 1983-1986...","London : Anthony d'Offay gallery , 1986",1986,https://catalogue.bnf.fr/ark:/12148/cb34982520m,094756408X,Willem de Kooning,"Recent Paintings, 1983-1986",Willem De Kooning,Anthony D'Offay Gallery,1986-01-01,,"Art, Modern"
6,http://data.bnf.fr/ark:/12148/cb347686735#about,"De Kooning : petit journal de l'exposition, 28...","Paris : Centre Georges Pompidou , 1984",1984,https://catalogue.bnf.fr/ark:/12148/cb347686735,285850234X,De Kooning,petit journal de l'exposition : Musee national...,"Musée national d'art moderne (Paris), Whitney ...",,1984,,
7,http://data.bnf.fr/ark:/12148/cb38841654h#about,"Willem de Kooning : tracing the figure, [exhib...",Los Angeles : Museum of contemporary art ; Pri...,2002,https://catalogue.bnf.fr/ark:/12148/cb38841654h,069109618X,Willem de Kooning,Tracing the Figure,"Willem De Kooning, Cornelia H. Butler, Paul Sc...",Princeton University Press,2002,"Willem de Kooning, one of the great pioneers o...",Art
8,http://data.bnf.fr/ark:/12148/cb356992193#about,Willem De Kooning,"Paris : l'Échoppe , 1994",1994,https://catalogue.bnf.fr/ark:/12148/cb356992193,2840680297,Willem De Kooning,,Edwin Denby,,1994,,


In [144]:
import pandas as pd

# Give the Google Books columns the same names as their BnF counterparts so the
# shared fields line up when the two tables are stacked.
new_dataframe = new_dataframe.rename(
    columns={'Title': 'title', 'PublishedDate': 'date', 'Publisher': 'publisher'}
)

# Stack both tables; ignore_index=True already produces a fresh 0..n-1 index.
combined_df = pd.concat([getty2, new_dataframe], ignore_index=True)

# No de-duplication is applied here: rows that share an ISBN or a title are all kept.
combined_df


Unnamed: 0,work,title,publisher,date,uri,isbn,Subtitle,Authors,Description,Categories
0,http://data.bnf.fr/ark:/12148/cb45692271m#about,Le rire de De Kooning,"Bordeaux : Olympique , 2019",2019,https://catalogue.bnf.fr/ark:/12148/cb45692271m,9782955755068,,,,
1,http://data.bnf.fr/ark:/12148/cb45292173b#about,A way of living : the art of Willem De Kooning,"London : Phaidon Press Limited , 2017",2014,https://catalogue.bnf.fr/ark:/12148/cb45292173b,0714845817,,,,
2,http://data.bnf.fr/ark:/12148/cb45292173b#about,A way of living : the art of Willem De Kooning,"London : Phaidon Press Limited , 2017",2014,https://catalogue.bnf.fr/ark:/12148/cb45292173b,0714873160,,,,
3,http://data.bnf.fr/ark:/12148/cb45292173b#about,A way of living : the art of Willem De Kooning,"London : Phaidon Press Limited , 2017",2014,https://catalogue.bnf.fr/ark:/12148/cb45292173b,9780714845814,,,,
4,http://data.bnf.fr/ark:/12148/cb45292173b#about,A way of living : the art of Willem De Kooning,"London : Phaidon Press Limited , 2017",2014,https://catalogue.bnf.fr/ark:/12148/cb45292173b,9780714873169,,,,
5,http://data.bnf.fr/ark:/12148/cb45317929s#about,"Hartung et les peintres lyriques : Schneider, ...",Landerneau : Fonds Hélène & Édouard Leclerc po...,2016,https://catalogue.bnf.fr/ark:/12148/cb45317929s,9791096209002,,,,
6,http://data.bnf.fr/ark:/12148/cb37053892s#about,Willem de Kooning : drawing seeing-seeing draw...,"New York : Arena , 1998",1998,https://catalogue.bnf.fr/ark:/12148/cb37053892s,0965728080,,,,
7,http://data.bnf.fr/ark:/12148/cb47258002f#about,"Les irascibles : Pollock, De Kooning, Rothko e...","Paris : le Cherche midi , DL 2023",2023,https://catalogue.bnf.fr/ark:/12148/cb47258002f,9782749176703,,,,
8,http://data.bnf.fr/ark:/12148/cb40103400g#about,École de New York : expressionnisme abstrait a...,"[Nice] : Nice musées , impr. 2005",2005,https://catalogue.bnf.fr/ark:/12148/cb40103400g,2913548695,,,,
9,http://data.bnf.fr/ark:/12148/cb41263375n#about,"Action/abstraction : Pollock, de Kooning, and ...","New York : the Jewish museum , cop. 2008",2008,https://catalogue.bnf.fr/ark:/12148/cb41263375n,9780300122152,,,,


In [147]:
import pandas as pd

# Compare the size of the combined table with the number of rows whose title,
# subtitle or description contains one of the exhibition-related keywords.
# NOTE(review): the pattern has no French "exposition", although BnF titles
# such as "[exposition ..." occur in the data — confirm whether that is intended.
keyword_pattern = r'exhibition|exhib\.|mostra|museum'

total_rows = len(combined_df)

# Case-insensitive search; missing text counts as "no match".
title_hit = combined_df['title'].str.contains(keyword_pattern, case=False, na=False)
subtitle_hit = combined_df['Subtitle'].str.contains(keyword_pattern, case=False, na=False)
description_hit = combined_df['Description'].str.contains(keyword_pattern, case=False, na=False)

keyword_rows = combined_df[title_hit | subtitle_hit | description_hit]
keyword_rows_count = keyword_rows.shape[0]

print("Total rows in DataFrame:", total_rows)
print("Rows containing specified keywords:", keyword_rows_count)


Total rows in DataFrame: 214
Rows containing specified keywords: 49


In [150]:
import pandas as pd

# Split combined_df in two: rows whose title, subtitle or description contains
# an exhibition-related keyword, and all remaining rows.
keyword_pattern = r'exhibition|exhib\.|mostra|museum'

# A row is flagged as soon as any of the three text columns matches
# (case-insensitive; missing text counts as "no match").
mask = pd.Series(False, index=combined_df.index)
for text_column in ('title', 'Subtitle', 'Description'):
    mask = mask | combined_df[text_column].str.contains(keyword_pattern, case=False, na=False)

# Both halves get a fresh 0..n-1 index.
exhibitions_dataframe = combined_df[mask].reset_index(drop=True)
df = combined_df[~mask].reset_index(drop=True)

# Rows without any keyword.
print("Original DataFrame without rows containing keywords:")
print(df)

# Rows with at least one keyword.
print("\nExhibitions DataFrame:")
exhibitions_dataframe


Original DataFrame without rows containing keywords:
                                                work  \
0    http://data.bnf.fr/ark:/12148/cb45692271m#about   
1    http://data.bnf.fr/ark:/12148/cb45292173b#about   
2    http://data.bnf.fr/ark:/12148/cb45292173b#about   
3    http://data.bnf.fr/ark:/12148/cb45292173b#about   
4    http://data.bnf.fr/ark:/12148/cb45292173b#about   
5    http://data.bnf.fr/ark:/12148/cb45317929s#about   
6    http://data.bnf.fr/ark:/12148/cb47258002f#about   
7    http://data.bnf.fr/ark:/12148/cb40103400g#about   
8    http://data.bnf.fr/ark:/12148/cb37047286z#about   
9    http://data.bnf.fr/ark:/12148/cb349979951#about   
10   http://data.bnf.fr/ark:/12148/cb41133627t#about   
11   http://data.bnf.fr/ark:/12148/cb424991989#about   
12   http://data.bnf.fr/ark:/12148/cb44524427d#about   
13   http://data.bnf.fr/ark:/12148/cb44524427d#about   
14   http://data.bnf.fr/ark:/12148/cb354180675#about   
15   http://data.bnf.fr/ark:/12148/cb45288067c#abou

Unnamed: 0,work,title,publisher,date,uri,isbn,Subtitle,Authors,Description,Categories
0,http://data.bnf.fr/ark:/12148/cb37053892s#about,Willem de Kooning : drawing seeing-seeing draw...,"New York : Arena , 1998",1998,https://catalogue.bnf.fr/ark:/12148/cb37053892s,0965728080,,,,
1,http://data.bnf.fr/ark:/12148/cb41263375n#about,"Action/abstraction : Pollock, de Kooning, and ...","New York : the Jewish museum , cop. 2008",2008,https://catalogue.bnf.fr/ark:/12148/cb41263375n,9780300122152,,,,
2,http://data.bnf.fr/ark:/12148/cb45241886b#about,Burri : lo spazio di materia - tra Europa e US...,Città di Castello : Fondazione Palazzo Albizzi...,2016,https://catalogue.bnf.fr/ark:/12148/cb45241886b,8894063984,,,,
3,http://data.bnf.fr/ark:/12148/cb45241886b#about,Burri : lo spazio di materia - tra Europa e US...,Città di Castello : Fondazione Palazzo Albizzi...,2016,https://catalogue.bnf.fr/ark:/12148/cb45241886b,9788894063981,,,,
4,http://data.bnf.fr/ark:/12148/cb37225082g#about,"Willem De Kooning : the late paintings, the 19...",San Francisco : San Francisco museum of modern...,1995,https://catalogue.bnf.fr/ark:/12148/cb37225082g,0935640479,,,,
5,http://data.bnf.fr/ark:/12148/cb42562001p#about,"American vanguards : Graham, Davis, Gorky, De ...",Andover (Mass.) : Addison gallery of American ...,2011,https://catalogue.bnf.fr/ark:/12148/cb42562001p,0300121679,,,,
6,http://data.bnf.fr/ark:/12148/cb42562001p#about,"American vanguards : Graham, Davis, Gorky, De ...",Andover (Mass.) : Addison gallery of American ...,2011,https://catalogue.bnf.fr/ark:/12148/cb42562001p,9780300121674,,,,
7,http://data.bnf.fr/ark:/12148/cb38802554n#about,The impact of Chaim Soutine (1893-1943) : de K...,"Ostfildern-Ruit : Hatje Cantz , cop. 2002",2002,https://catalogue.bnf.fr/ark:/12148/cb38802554n,3775791035,,,,
8,http://data.bnf.fr/ark:/12148/cb347640459#about,Willem De Kooning : Whitney museum of American...,"Paris : Centre Georges-Pompidou , 1984",1984,https://catalogue.bnf.fr/ark:/12148/cb347640459,2858502552,,,,
9,http://data.bnf.fr/ark:/12148/cb374930625#about,"Willem de Kooning : Retrospektive, Zeichnungen...","München : Prestel , cop. 1984",1984,https://catalogue.bnf.fr/ark:/12148/cb374930625,3791306596,,,,
