In [1]:
%load_ext sql

In [99]:
import pandas as pd

In [98]:
# connect to the MySQL database
import mysql.connector
import os

try:
    database_connect = mysql.connector.connect(
        host='localhost',
        user='root',
        password=os.getenv('MYSQL_PASSWORD'),
        database='art'
    )
    print("Successfully connected to MySQL!")

    cursor = database_connect.cursor()

except mysql.connector.Error as err:
    print(f"Error: {err}")

Successfully connected to MySQL!


### Top 10 Painted Subjects

In [13]:
# query for top 10 painted subjects
query = """
SELECT s.subject, 
       COUNT(w.work_id) AS work_count 
FROM subject AS s 
LEFT JOIN 
    work AS w 
    ON s.work_id = w.work_id
GROUP BY s.subject
ORDER BY work_count DESC
limit 10;
"""
# run SQL query
cursor.execute(query)
# fetch data from query
data = cursor.fetchall()
# fetch column names
columns = [desc[0] for desc in cursor.description]
# convert to df
df = pd.DataFrame(data, columns=columns)
# display first few rows of df
df.head(10)

Unnamed: 0,subject,work_count
0,Portraits,1070
1,Nude,525
2,Landscape Art,495
3,Rivers/Lakes,480
4,Flowers,457
5,Abstract/Modern Art,399
6,Still-Life,395
7,Seascapes,323
8,Marine Art/Maritime,268
9,Horses,265


In [5]:
# export to CSV
df.to_csv("exports/top_10_subjects.csv", index=False)
print("Exported top_10_subject.csv")

Exported top_10_subject.csv


### Artists With The Most Paintings

In [14]:
# query for artists with the most paintings
query = """
SELECT a.full_name, 
       COUNT(w.work_id) AS work_count 
FROM artist AS a 
LEFT JOIN 
    work AS w 
    ON a.artist_id = w.artist_id
GROUP BY a.full_name
ORDER BY work_count DESC
limit 10;
"""
# run SQL query
cursor.execute(query)
# fetch data from query
data = cursor.fetchall()
# fetch column names
columns = [desc[0] for desc in cursor.description]
# convert to df
df = pd.DataFrame(data, columns=columns)
# display first few rows of df
df.head(10)

Unnamed: 0,full_name,work_count
0,Pierre-Auguste Renoir,469
1,Claude Monet,378
2,Vincent Van Gogh,308
3,Maurice Utrillo,253
4,Albert Marquet,233
5,Henri Lebasque,200
6,Camille Pissarro,188
7,Louis Valtat,184
8,Maximilien Luce,174
9,Henri Le Sidaner,159


In [7]:
# export to CSV
df.to_csv("exports/top_10_artists.csv", index=False)
print("Exported top_10_artists.csv")

Exported top_10_artists.csv


### Artists Exhibited in Multiple Countries

In [101]:
# query for artists who have exhibited in more that one country
query = """
SELECT a.full_name, 
       COUNT(DISTINCT m.country) AS country_count 
FROM artist AS a
LEFT JOIN 
    work AS w 
    ON a.artist_id = w.artist_id
LEFT JOIN
    museum AS m
    ON w.museum_id = m.museum_id
GROUP BY a.full_name
HAVING country_count >= 1
ORDER BY country_count DESC
;
"""
# run SQL query
cursor.execute(query)
# fetch data from query
data = cursor.fetchall()
# fetch column names
columns = [desc[0] for desc in cursor.description]
# convert to df
df = pd.DataFrame(data, columns=columns)
# display first few rows of df
df.head(10)

Unnamed: 0,full_name,country_count
0,Vincent Van Gogh,8
1,Claude Monet,7
2,Paul Gauguin,7
3,Francois Boucher,6
4,Pierre-Auguste Renoir,6
5,Rembrandt Van Rijn,6
6,Alfred Sisley,5
7,André Derain,5
8,Camille Pissarro,5
9,Edgar Degas,5


In [103]:
# export to CSV
df.to_csv("exports/artists_countries.csv", index=False)
print("Exported artists_countries.csv")

Exported artists_countries.csv


### Artists and Their Exhibition Rates

In [34]:
# query for artists with the highest ratios of works exhibited to works produced
query = """
SELECT a.full_name, 
    COUNT(DISTINCT CASE WHEN w.museum_id IS NOT NULL THEN w.work_id END) AS exhibited_works,
    COUNT(DISTINCT w.work_id) AS total_works,
    (COUNT(DISTINCT CASE WHEN w.museum_id IS NOT NULL THEN w.work_id END)*100)/COUNT(DISTINCT w.work_id) AS exhibition_rate
FROM artist AS a 
LEFT JOIN 
    work AS w 
    ON a.artist_id = w.artist_id
GROUP BY a.full_name
HAVING exhibited_works >= 0
ORDER BY exhibition_rate DESC, exhibited_works DESC
;
"""
# run SQL query
cursor.execute(query)
# fetch data from query
data = cursor.fetchall()
# fetch column names
columns = [desc[0] for desc in cursor.description]
# convert to df
df = pd.DataFrame(data, columns=columns)
# display without decimal points
df["exhibition_rate"] = df["exhibition_rate"].astype(int)
# display first few rows of df
df.head(10)

Unnamed: 0,full_name,exhibited_works,total_works,exhibition_rate
0,Jean Baptiste Vanmour,60,60,100
1,Francesco Guardi,28,28,100
2,Gerard Van Honthorst,22,22,100
3,Pieter De Hooch,21,21,100
4,Cornelis Troost,19,19,100
5,Adriaan De Lelie,15,15,100
6,Jan Willem Pieneman,15,15,100
7,John Frederick Kensett,15,15,100
8,Ludolf Backhuysen,15,15,100
9,Bartholomeus Van Der Helst,14,14,100


In [35]:
# export to CSV
df.to_csv("exports/artist_exhibition_rates.csv", index=False)
print("Exported artist_exhibition_rates.csv")

Exported artist_exhibition_rates.csv


### Paintings Displayed vs. Not Displayed in Museums

In [43]:
# query for number of paintings displayed in museums vs number not displayed
query = """
SELECT 
    COUNT(DISTINCT CASE WHEN w.museum_id IS NOT NULL THEN w.work_id END) AS displayed_works,
    COUNT(DISTINCT CASE WHEN w.museum_id IS NULL THEN w.work_id END) AS undisplayed_works
FROM work w
;
"""
# run SQL query
cursor.execute(query)
# fetch data from query
data = cursor.fetchall()
# fetch column names
columns = [desc[0] for desc in cursor.description]
# convert to df
df = pd.DataFrame(data, columns=columns)
# display first few rows of df
df.head()

Unnamed: 0,displayed_works,undisplayed_works
0,4553,10163


In [44]:
# export to CSV
df.to_csv("exports/displayed_vs_not_displayed.csv", index=False)
print("Exported displayed_vs_not_displayed.csv")

Exported displayed_vs_not_displayed.csv


### Paintings Exhibited in Museums by Country

In [40]:
# query for number of exhibited painting in each country
query = """
SELECT m.country, 
    COUNT(DISTINCT CASE WHEN w.museum_id IS NOT NULL THEN w.work_id END) AS exhibited_works
FROM museum m
LEFT JOIN 
    work AS w 
    ON m.museum_id = w.museum_id
GROUP BY m.country
HAVING exhibited_works >= 0
ORDER BY exhibited_works DESC
;
"""
# run SQL query
cursor.execute(query)
# fetch data from query
data = cursor.fetchall()
# fetch column names
columns = [desc[0] for desc in cursor.description]
# convert to df
df = pd.DataFrame(data, columns=columns)
# display first few rows of df
df.head(20)

Unnamed: 0,country,exhibited_works
0,USA,2672
1,UK,508
2,Netherlands,471
3,France,371
4,Spain,196
5,Russia,164
6,United Kingdom,135
7,Germany,25
8,Switzerland,3
9,Australia,1


In [42]:
# export to CSV
df.to_csv("exports/displays_by_country.csv", index=False)
print("Exported displays_by_country.csv")

Exported displays_by_country.csv


### Number of Museums in Each Country

In [54]:
# query for number of exhibited painting in each country
query = """
SELECT m.country, 
    COUNT(DISTINCT m.museum_id) AS museums
FROM museum m
GROUP BY m.country
ORDER BY museums DESC
;
"""
# run SQL query
cursor.execute(query)
# fetch data from query
data = cursor.fetchall()
# fetch column names
columns = [desc[0] for desc in cursor.description]
# convert to df
df = pd.DataFrame(data, columns=columns)
# display first few rows of df
df.head(20)

Unnamed: 0,country,museums
0,USA,25
1,France,7
2,UK,5
3,Netherlands,4
4,Russia,2
5,Spain,2
6,Switzerland,2
7,Australia,1
8,Brazil,1
9,Czechia,1


In [55]:
# export to CSV
df.to_csv("exports/museums_by_country.csv", index=False)
print("Exported museums_by_country.csv")

Exported museums_by_country.csv


### Museum Open Days

In [48]:
# query for number of days open for museums
query = """
SELECT m.name, 
    COUNT(mh.museum_id) AS days_open
FROM museum m
LEFT JOIN 
    museum_hours AS mh 
    ON m.museum_id = mh.museum_id
GROUP BY m.name
ORDER BY days_open DESC
;
"""
# run SQL query
cursor.execute(query)
# fetch data from query
data = cursor.fetchall()
# fetch column names
columns = [desc[0] for desc in cursor.description]
# convert to df
df = pd.DataFrame(data, columns=columns)
# display first few rows of df
df.head(20)

Unnamed: 0,name,days_open
0,The Museum of Modern Art,7
1,Pushkin State Museum of Fine Arts,7
2,National Gallery of Victoria,7
3,Solomon R. Guggenheim Museum,7
4,The Prado Museum,7
5,The Metropolitan Museum of Art,7
6,Van Gogh Museum,7
7,Israel Museum,7
8,Smithsonian American Art Museum,7
9,The Tate Gallery,7


In [49]:
# export to CSV
df.to_csv("exports/museum_days_open.csv", index=False)
print("Exported museum_days_open.csv")

Exported museum_days_open.csv


### Museum Total Opening Hours

In [50]:
# query for total number of opening hours for each museum
query = """
SELECT m.name, 
    SUM(TIMESTAMPDIFF(HOUR, mh.open, mh.close)) AS total_hours
FROM museum m
LEFT JOIN 
    museum_hours AS mh 
    ON m.museum_id = mh.museum_id
GROUP BY m.name
ORDER BY total_hours DESC
;
"""
# run SQL query
cursor.execute(query)
# fetch data from query
data = cursor.fetchall()
# fetch column names
columns = [desc[0] for desc in cursor.description]
# convert to df
df = pd.DataFrame(data, columns=columns)
# display first few rows of df
df.head(20)

Unnamed: 0,name,total_hours
0,The Prado Museum,69
1,Van Gogh Museum,63
2,Uffizi Gallery Italy,60
3,Army Museum,59
4,National Gallery,59
5,The Metropolitan Museum of Art,57
6,Musée du Louvre,57
7,The Tate Gallery,56
8,National Gallery Prague,56
9,Rijksmuseum,56


In [51]:
# export to CSV
df.to_csv("exports/museum_open_hours.csv", index=False)
print("Exported museum_open_hours.csv")

Exported museum_open_hours.csv


### Museums by Number of Exhibits

In [52]:
# query for number of paintings in each museum
query = """
SELECT m.name, 
    COUNT(DISTINCT CASE WHEN w.museum_id IS NOT NULL THEN w.work_id END) AS paintings_count
FROM museum m
LEFT JOIN 
    work AS w
    ON m.museum_id = w.museum_id
GROUP BY m.name
ORDER BY paintings_count DESC
;
"""
# run SQL query
cursor.execute(query)
# fetch data from query
data = cursor.fetchall()
# fetch column names
columns = [desc[0] for desc in cursor.description]
# convert to df
df = pd.DataFrame(data, columns=columns)
# display first few rows of df
df.head(20)

Unnamed: 0,name,paintings_count
0,The Metropolitan Museum of Art,939
1,Rijksmuseum,452
2,National Gallery,423
3,National Gallery of Art,375
4,The Barnes Foundation,350
5,Musée d'Orsay,266
6,Philadelphia Museum of Art,244
7,Cleveland Museum Of Art,194
8,The J. Paul Getty Museum,168
9,Los Angeles County Museum of Art,157


In [53]:
# export to CSV
df.to_csv("exports/paintings_count.csv", index=False)
print("Exported paintings_count.csv")

Exported paintings_count.csv


### Museums with Most Expensive Collection of Works

In [70]:
# query for museums with most expensive works
query = """
SELECT m.name AS museum, 
    SUM(ps.regular_price) AS collection_value
FROM product_size AS ps
LEFT JOIN 
    work AS w
    ON w.work_id = ps.work_id
LEFT JOIN 
    museum AS m
    ON m.museum_id = w.museum_id
WHERE w.museum_id IS NOT NULL
GROUP BY m.name
ORDER BY collection_value DESC
;
"""
# run SQL query
cursor.execute(query)
# fetch data from query
data = cursor.fetchall()
# fetch column names
columns = [desc[0] for desc in cursor.description]
# convert to df
df = pd.DataFrame(data, columns=columns)
# display first few rows of df
df.head(10)

Unnamed: 0,museum,collection_value
0,The Metropolitan Museum of Art,5170895.0
1,National Gallery,2222930.0
2,Rijksmuseum,2161275.0
3,National Gallery of Art,2056600.0
4,The Barnes Foundation,1667760.0
5,Musée d'Orsay,1453575.0
6,Philadelphia Museum of Art,1332990.0
7,Cleveland Museum Of Art,960515.0
8,Los Angeles County Museum of Art,834730.0
9,The J. Paul Getty Museum,828520.0


In [None]:
# export to CSV
df.to_csv("exports/expensive_paintings.csv", index=False)
print("Exported expensive_paintings.csv")

### Most Discounted Artworks

In [118]:
# query for most discounted artworks
query = """
SELECT w.name, 
    a.full_name,
    CAST(MAX((ps.regular_price - ps.sale_price)*100/ps.regular_price) AS UNSIGNED) AS discount_rate
FROM work AS w
LEFT JOIN 
    product_size AS ps 
    ON ps.work_id = w.work_id
LEFT JOIN 
    artist as a
    ON a.artist_id = w.artist_id
WHERE ps.sale_price > 0
GROUP BY w.name, a.full_name
HAVING discount_rate >= 0.5
ORDER BY discount_rate DESC
;
"""
# run SQL query
cursor.execute(query)
# fetch data from query
data = cursor.fetchall()
# fetch column names
columns = [desc[0] for desc in cursor.description]
# convert to df
df = pd.DataFrame(data, columns=columns)
# remove decimal points
df["discount_rate"] = df["discount_rate"].astype(int)
# display first few rows of df
df.head(10)

Unnamed: 0,name,full_name,discount_rate
0,Portrait of Madame Labille-Guyard and Her Pupils,Adélaïde Labille-Guiard,92
1,The Chalon Family in London,Jacques Laurent Agasse,92
2,Portrait of Mr. and Mrs. Thomas Mifflin (Sarah...,John Singleton Copley,79
3,Sun Rising Through Vapour,Joseph M. W. Turner,76
4,"The Victory Returning from Trafalgar, in Three...",Joseph M. W. Turner,76
5,Disrobing of Christ (Espolio),El Greco,76
6,"Venice, The Bridge of Sighs",Joseph M. W. Turner,68
7,The Adoration of the Shepherds,Nicolas Poussin,68
8,The Companions of Rinaldo,Nicolas Poussin,68
9,Taking the Census,Francis William Edmonds,65


In [108]:
# export to CSV
df.to_csv("exports/most_discounted.csv", index=False)
print("Exported most_discounted.csv")

Exported most_discounted.csv
