webscraping collections data and general bibliography


# query on data.bnf 

it's not possible to do a query directly using python, so this query:
```PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX bnf-onto: <http://data.bnf.fr/ontology/bnf-onto/>
SELECT * 
WHERE {
  ?work dct:title ?title ;
        dct:publisher ?publisher;
        dct:date ?date;
        rdfs:seeAlso ?uri ;
        bnf-onto:isbn ?isbn
  FILTER (bif:contains(?title, "De_Kooning"))}```

has been done on the web endpoint and then the CSV was downloaded. 

In [12]:
import pandas as pd
getty2 = pd.read_csv("databnf_KLI.csv")

getty2.head()


Unnamed: 0,work,title,publisher,date,uri,isbn
0,http://data.bnf.fr/ark:/12148/cb45532401k#about,Klimt and Schiele : drawings from the Albertin...,"Boston, MA : Museum of Fine Arts, Boston : D.A...",2018,https://catalogue.bnf.fr/ark:/12148/cb45532401k,0878468528
1,http://data.bnf.fr/ark:/12148/cb45532401k#about,Klimt and Schiele : drawings from the Albertin...,"Boston, MA : Museum of Fine Arts, Boston : D.A...",2018,https://catalogue.bnf.fr/ark:/12148/cb45532401k,9780878468522
2,http://data.bnf.fr/ark:/12148/cb35356100b#about,"Art in Vienna, 1898-1918 : Klimt, Kokoschka, S...","London : Phaidon , 1975",1975,https://catalogue.bnf.fr/ark:/12148/cb35356100b,0-7148-1600-0
3,http://data.bnf.fr/ark:/12148/cb37494522j#about,"Art in Vienna 1898-1918 : Klimt, Kokoschka, Sc...","London : Phaidon , 1993",1993,https://catalogue.bnf.fr/ark:/12148/cb37494522j,0-7148-1600-0
4,http://data.bnf.fr/ark:/12148/cb39124684p#about,"Art in Vienna 1898-1918 : Klimt, Kokoschka, Sc...","London : Phaidon , 2001",2001,https://catalogue.bnf.fr/ark:/12148/cb39124684p,0-7148-2967-6


In [13]:
for column_name in getty2.columns:
    if column_name == "isbn":
        for i, value in enumerate(getty2[column_name]):
            if "-" in value:
                # Replacing hyphens with empty string
                getty2.at[i, column_name] = value.replace("-", "")
getty2.head()

Unnamed: 0,work,title,publisher,date,uri,isbn
0,http://data.bnf.fr/ark:/12148/cb45532401k#about,Klimt and Schiele : drawings from the Albertin...,"Boston, MA : Museum of Fine Arts, Boston : D.A...",2018,https://catalogue.bnf.fr/ark:/12148/cb45532401k,878468528
1,http://data.bnf.fr/ark:/12148/cb45532401k#about,Klimt and Schiele : drawings from the Albertin...,"Boston, MA : Museum of Fine Arts, Boston : D.A...",2018,https://catalogue.bnf.fr/ark:/12148/cb45532401k,9780878468522
2,http://data.bnf.fr/ark:/12148/cb35356100b#about,"Art in Vienna, 1898-1918 : Klimt, Kokoschka, S...","London : Phaidon , 1975",1975,https://catalogue.bnf.fr/ark:/12148/cb35356100b,714816000
3,http://data.bnf.fr/ark:/12148/cb37494522j#about,"Art in Vienna 1898-1918 : Klimt, Kokoschka, Sc...","London : Phaidon , 1993",1993,https://catalogue.bnf.fr/ark:/12148/cb37494522j,714816000
4,http://data.bnf.fr/ark:/12148/cb39124684p#about,"Art in Vienna 1898-1918 : Klimt, Kokoschka, Sc...","London : Phaidon , 2001",2001,https://catalogue.bnf.fr/ark:/12148/cb39124684p,714829676


# query on google books api

In [14]:
import requests
import json

def fetch_books(query, max_results=40):
    base_url = "https://www.googleapis.com/books/v1/volumes"
    start_index = 0
    all_results = []

    while True:
        params = {
            "q": query,
            "startIndex": start_index,
            "maxResults": max_results
        }
        response = requests.get(base_url, params=params)

        if response.status_code == 200:
            data = response.json()
            items = data.get("items", [])
            if not items:
                break
            all_results.extend(items)
            start_index += max_results
        else:
            print("Failed to retrieve data. Status code:", response.status_code)
            break

    return all_results

# Search for books related to Cezanne
cezanne_books = fetch_books("Klimt")

# Saving JSON data to a file
with open("klimt_books.json", "w") as json_file:
    json.dump(cezanne_books, json_file, indent=4)

print("JSON data saved to klimt_books.json")

JSON data saved to klimt_books.json


In [15]:
import json
import pandas as pd

# Load JSON data from file
with open("klimt_books.json", "r") as json_file:
    cezanne_books_data = json.load(json_file)

# Extract relevant fields from each book item
books_list = []
for book in cezanne_books_data:
    book_info = {
        "Title": book["volumeInfo"].get("title", "N/A"),
        "Subtitle": book["volumeInfo"].get("subtitle", "N/A"),
        "Authors": ", ".join(book["volumeInfo"].get("authors", ["N/A"])),
        "Publisher": book["volumeInfo"].get("publisher", "N/A"),
        "PublishedDate": book["volumeInfo"].get("publishedDate", "N/A"),
        "Description": book["volumeInfo"].get("description", "N/A"),
        "isbn": book["volumeInfo"].get("industryIdentifiers", [{}])[0].get("identifier", "N/A"),  # Retrieving ISBN
    }
    books_list.append(book_info)

# Create DataFrame
books_df = pd.DataFrame(books_list)

# Display DataFrame
books_df.head()


Unnamed: 0,Title,Subtitle,Authors,Publisher,PublishedDate,Description,isbn
0,Gustav Klimt,,Gustav Klimt,,1981,,8820204649
1,Klimt,la secessione e l'Italia,,,2021,,8857246574
2,Gustav Klimt. Tutti i dipinti,,Tobias G. Natter,,2018,,383656291X
3,Klimt,,"Gustav Klimt, Cecilia Bernardini",,1980,,UCSC:32106008329085
4,"Klimt. L'uomo, l'artista, il suo mondo. Ediz. ...",,"Gabriella Belli, Elena Pontiggia",,2022,,8857247511


In [16]:
liss = []
for column_name in books_df.columns:
    if column_name == "Title":
        for value in books_df[column_name]:
            if "Klimt" in value or "Klimt" in value:
                liss.append(value)
print(len(liss))

209


In [17]:
import pandas as pd

# Assuming books_df is your DataFrame
new_dataframe = books_df[books_df["Title"].str.contains("Klimt", case=False)]

new_dataframe.reset_index(drop=True, inplace=True)

new_dataframe.head()



Unnamed: 0,Title,Subtitle,Authors,Publisher,PublishedDate,Description,isbn
0,Gustav Klimt,,Gustav Klimt,,1981,,8820204649
1,Klimt,la secessione e l'Italia,,,2021,,8857246574
2,Gustav Klimt. Tutti i dipinti,,Tobias G. Natter,,2018,,383656291X
3,Klimt,,"Gustav Klimt, Cecilia Bernardini",,1980,,UCSC:32106008329085
4,"Klimt. L'uomo, l'artista, il suo mondo. Ediz. ...",,"Gabriella Belli, Elena Pontiggia",,2022,,8857247511


In [18]:
import pandas as pd

# Assuming df1 and df2 are your two DataFrames with the same column header 'ISBN'

# Merge the two DataFrames on the 'ISBN' column
df_combined = pd.merge(getty2, new_dataframe, on='isbn', how='inner')

# Display the new DataFrame with rows where ISBN is found in both DataFrames
df_combined


Unnamed: 0,work,title,publisher,date,uri,isbn,Title,Subtitle,Authors,Publisher,PublishedDate,Description
0,http://data.bnf.fr/ark:/12148/cb46858502q#about,"Klimt e i maestri ""segreti"" della Ricci Oddi :...",Piacenza : Galleria d'arte moderna Ricci Oddi ...,2021,https://catalogue.bnf.fr/ark:/12148/cb46858502q,9791259580344,Klimt e i maestri «segreti» della Ricci Oddi. ...,,E. Pontiggia,,2021,
1,http://data.bnf.fr/ark:/12148/cb46858502q#about,"Klimt e i maestri ""segreti"" della Ricci Oddi :...",Piacenza : Galleria d'arte moderna Ricci Oddi ...,2021,https://catalogue.bnf.fr/ark:/12148/cb46858502q,9791259580344,Klimt e i maestri «segreti» della Ricci Oddi. ...,,E. Pontiggia,,2021,
2,http://data.bnf.fr/ark:/12148/cb46855667v#about,"Klimt : la secessione e l'Italia, [mostra al P...","Milano : Skira , 2021",2021,https://catalogue.bnf.fr/ark:/12148/cb46855667v,8857246574,Klimt,la secessione e l'Italia,,,2021,
3,http://data.bnf.fr/ark:/12148/cb371857458#about,"Gustav Klimt, 1862-1918","Köln ; London ; Paris [etc.] : Taschen , cop. ...",2000,https://catalogue.bnf.fr/ark:/12148/cb371857458,3822859400,Gustav Klimt 1862-1918,,"Gilles Néret, Gustav Klimt",,2000,
4,http://data.bnf.fr/ark:/12148/cb43818271g#about,"Klimt : alle origini di un mito, [mostra, Mila...","Milano : 24 ore cultura , impr. 2014",2014,https://catalogue.bnf.fr/ark:/12148/cb43818271g,8866481939,Klimt,alle origini di un mito,Alfred Weidinger,24 Ore Cultura,2014,The career of an Austrian artist who has becom...
5,http://data.bnf.fr/ark:/12148/cb43818271g#about,"Klimt : alle origini di un mito, [mostra, Mila...","Milano : 24 ore cultura , impr. 2014",2014,https://catalogue.bnf.fr/ark:/12148/cb43818271g,8866481939,Klimt,alle origini di un mito,Alfred Weidinger,24 Ore Cultura,2014,The career of an Austrian artist who has becom...
6,http://data.bnf.fr/ark:/12148/cb442786023#about,"Klimt : up close and personal, paintings, lett...","Wien : Brandstätter , cop. 2012",2012,https://catalogue.bnf.fr/ark:/12148/cb442786023,3850336298,Klimt,"Up Close and Personal : Paintings, Letters, In...","Gustav Klimt, Tobias Günter Natter, Franz Smol...",Christian Brandstatter,2012,"The work of Gustav Klimt is world famous, but ..."
7,http://data.bnf.fr/ark:/12148/cb472430726#about,"Klimt e l'arte italiana : [mostra, Museo d'art...","Cinisello Balsamo, Milano : Silvana editoriale",2023,https://catalogue.bnf.fr/ark:/12148/cb472430726,8836654703,Klimt e l'arte italiana. Ediz. illustrata,,"Beatrice Avanzi, V. Sgarbi",,2023,
8,http://data.bnf.fr/ark:/12148/cb40941041p#about,"Schiele, Klimt, Kokoschka e gli amici viennesi...","Milano : Skira ; Trento : MART, Museo di arte ...",2006,https://catalogue.bnf.fr/ark:/12148/cb40941041p,8876249508,"Schiele, Klimt, Kokoschka e gli amici viennesi...",,"Tobias Günter Natter, Tomas Sharman, Thomas Tr...",,2006,


In [19]:
import pandas as pd

# Assuming df1 and df2 are your two DataFrames

# Rename the 'Title' column in df2 to 'title'
new_dataframe.rename(columns={'Title': 'title'}, inplace=True)
new_dataframe.rename(columns={'PublishedDate': 'date'}, inplace=True)
new_dataframe.rename(columns={'Publisher': 'publisher'}, inplace=True)
# Concatenate the DataFrames vertically
combined_df = pd.concat([getty2, new_dataframe], ignore_index=True)

# Drop duplicates based on 'isbn' column
new_df = combined_df.drop_duplicates(subset='isbn')
new_df = combined_df.drop_duplicates(subset=['title', 'publisher', 'date'], keep='first')
# Reset index of the new DataFrame
new_df.reset_index(drop=True, inplace=True)
new_df.drop(columns=['work', 'uri'], inplace=True)
index_column = new_df.columns.get_loc('Subtitle')

# Move the column to position 2
new_column_order = list(new_df.columns)
new_column_order.insert(1, new_column_order.pop(index_column))
new_df = new_df[new_column_order]

# Display the new DataFrame with unique rows based on ISBN and consistent column name 'title'
new_df


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_dataframe.rename(columns={'Title': 'title'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_dataframe.rename(columns={'PublishedDate': 'date'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_dataframe.rename(columns={'Publisher': 'publisher'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.ht

Unnamed: 0,title,Subtitle,publisher,date,isbn,Authors,Description
0,Klimt and Schiele : drawings from the Albertin...,,"Boston, MA : Museum of Fine Arts, Boston : D.A...",2018,0878468528,,
1,"Art in Vienna, 1898-1918 : Klimt, Kokoschka, S...",,"London : Phaidon , 1975",1975,0714816000,,
2,"Art in Vienna 1898-1918 : Klimt, Kokoschka, Sc...",,"London : Phaidon , 1993",1993,0714816000,,
3,"Art in Vienna 1898-1918 : Klimt, Kokoschka, Sc...",,"London : Phaidon , 2001",2001,0714829676,,
4,"Art in Vienna 1898-1918 : Klimt, Kokoschka, Sc...",,"London : Phaidon , 2015",2015,0714868787,,
...,...,...,...,...,...,...,...
304,Tout l'œuvre peint de Klimt,,,1983,UOM:39015042493455,"Gustav Klimt, Sergio Coradeschi",
305,"Gustav Klimt, Drawings",,,1983,UOM:39015027868796,"Gustav Klimt, Serge Sabarsky",
306,Gustav Klimt. La storia illustrata dei grandi ...,,24Ore Cultura,2020-08-05T00:00:00+02:00,9788866484851,"Stefano Zuffi, Massimiliano Aurelio",Un coloratissimo volume monografico per raccon...
307,Klimt,,Giunti Editore,1988,8809760832,Eva Di Stefano,


In [20]:
import pandas as pd

# Assuming df is your DataFrame

# Count total rows of the DataFrame
total_rows = len(new_df)

# Count rows where 'title', 'subtitle', or 'description' contain specified keywords
keyword_rows = new_df[new_df['title'].str.contains(r'exhibition|exhib\.|mostra|catalogue|catalogo|exposition|retrospective|Ausstellung', case=False, na=False) |
                  new_df['Subtitle'].str.contains(r'exhibition|exhib\.|mostra|catalogue|catalogo|exposition|retrospective|Ausstellung', case=False, na=False) ]

# Get the count of rows containing the specified keywords
keyword_rows_count = len(keyword_rows)

print("Total rows in DataFrame:", total_rows)
print("Rows containing specified keywords:", keyword_rows_count)


Total rows in DataFrame: 309
Rows containing specified keywords: 36


In [21]:
import pandas as pd

# Assuming df is your original DataFrame

# Create a mask for rows containing specified keywords in 'title', 'subtitle', or 'description' columns
mask = new_df['title'].str.contains(r'exhibition|exhib\.|mostra|catalogue |catalogo|exposition|retrospective|Ausstellung', case=False, na=False) | \
       new_df['Subtitle'].str.contains(r'exhibition|exhib\.|mostra|catalogue|catalogo|exposition|retrospective|Ausstellung', case=False, na=False) 

# Create the exhibitions DataFrame containing rows where keywords are present
exhibitions_dataframe = new_df[mask]

# Remove the rows where keywords are present from the original DataFrame
dfbooks = new_df[~mask]

# Reset index of the original DataFrame
dfbooks.reset_index(drop=True, inplace=True)

# Reset index of the exhibitions DataFrame
exhibitions_dataframe.reset_index(drop=True, inplace=True)


# Display the exhibitions DataFrame containing rows where keywords are present
print("\nExhibitions DataFrame:")
exhibitions_dataframe



Exhibitions DataFrame:


Unnamed: 0,title,Subtitle,publisher,date,isbn,Authors,Description
0,Klimt and Schiele : drawings from the Albertin...,,"Boston, MA : Museum of Fine Arts, Boston : D.A...",2018,0878468528,,
1,"Gustav Klimt : papiers érotiques, [exposition,...",,[Paris] : Gallimard : Fondation Dina Vierny-Mu...,2005,2070118053,,
2,"Il simbolismo : da Moreau a Gauguin a Klimt, m...",,"Ferrara : Ferrara arte , impr. 2007",2007,8889793066,,
3,Heiliger Frühling : Gustav Klimt und die Anfän...,,"Wien ; München : C. Brandstätter , 1999",1999,3854478569,,
4,"Klimt e i maestri ""segreti"" della Ricci Oddi :...",,Piacenza : Galleria d'arte moderna Ricci Oddi ...,2021,9791259580344,,
5,"Klimt, Kokoschka, Schiele : dall'art nouveau a...",,"Milano : Mazzotta , 2001",2001,8820215020,,
6,"Vienna 1900 : Klimt, Schiele, Moser, Kokoschka...",,"Paris : Réunion des musées nationaux , impr. 2005",2005,271185003X,,
7,"Klimt : la secessione e l'Italia, [mostra al P...",,"Milano : Skira , 2021",2021,8857246574,,
8,"De la scène au tableau : David, Füssli, Klimt,...",,Paris : Skira-Flammarion ; [Marseille] : MM ; ...,2009,9782081236912,,
9,"Vienna 1900 : Klimt, Schiele, and their times,...",,"Ostfildern (Allemagne) : Hatje Cantz , cop. 2010",2010,3775726853,,
