# Loading and cleaning the raw dataset
The original dataset can be retrieved from the Metropolitan Museum of Art (The Met) Open Access GitHub repository: https://github.com/metmuseum/openaccess/tree/master

In [None]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

# Show every column when a DataFrame is displayed instead of truncating it.
pd.set_option('display.max_columns', None)

# The raw export is comma-separated text; low_memory=False makes pandas read
# the file in one pass so column dtypes are inferred from the whole column.
met_df = pd.read_csv('../data/raw/met_objects.txt', low_memory=False)

# Normalise the headers: trim whitespace, lowercase, spaces -> underscores,
# then strip every character that is not a lowercase letter, digit or
# underscore. `regex` is passed explicitly because pandas >= 2.0 treats the
# pattern as a literal string by default, which would make the last step a
# silent no-op.
met_df.columns = (
    met_df.columns.str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace(r'[^a-z0-9_]', '', regex=True)
)

# Quick sanity check: last rows and overall dimensions.
display(met_df.tail())
print(met_df.shape)

In [None]:
# Remove the columns that are not needed for the analysis.
# NOTE(review): 'object_number' and 'is_timeline_work' are dropped here, yet
# later cells still reference them on the frame reloaded from disk - confirm
# which version of the saved CSV those cells are expected to read.
met_df = met_df.drop(
    [
        'metadata_date',
        'is_timeline_work',
        'object_number',
        'rights_and_reproduction',
        'object_wikidata_url',
        'tags_aat_url',
        'tags_wikidata_url',
        'repository',
        'constituent_id',
    ],
    axis='columns',
)

# Number of missing values per remaining column.
met_df.isnull().sum()

# Fetching the missing values directly from the MET website
Most columns contain missing values. Because the dataset is intended for research, it is important to keep as many columns as possible, with as much relevant information about each artwork as possible. I therefore chose to fetch the missing values directly from each item's URL whenever possible.

In [None]:
def fetch_titles_for_bin(bin_df, met_df, timeout=10):
    """Scrape the title of every row in ``bin_df`` and store it in ``met_df``.

    For each row, the page at ``row['link_resource']`` is downloaded and the
    stripped text of the title ``<span>`` is written to
    ``met_df.at[index, 'title']``, where ``index`` is the row's index label in
    ``bin_df``. The function never raises for a single row; when no title can
    be obtained a placeholder string is stored instead:

    * ``"Title not found"`` - the page was fetched but has no title element;
    * ``"Error <status>"`` - the server replied with a non-200 status code;
    * ``"Error: <exception>"`` - fetching or parsing raised an exception.

    Args:
        bin_df: Rows to process. Needs a ``link_resource`` column and must use
            the same index labels as ``met_df``.
        met_df: DataFrame whose ``title`` column is updated in place.
        timeout: Seconds to wait for the server before giving up on a request.
            Without a timeout a single stalled connection would block the
            whole run indefinitely.

    Returns:
        None. ``met_df`` is modified in place.
    """
    for index, row in bin_df.iterrows():
        url = row['link_resource']  # The URL column
        try:
            response = requests.get(url, timeout=timeout)
            if response.status_code != 200:
                met_df.at[index, 'title'] = f"Error {response.status_code}"
                continue

            # Parse the HTML and look up the <span> that holds the title.
            soup = BeautifulSoup(response.text, 'html.parser')
            title_element = soup.find('span', class_="artwork__title--text js-artwork__title--text")
            if title_element:
                met_df.at[index, 'title'] = title_element.text.strip()
            else:
                met_df.at[index, 'title'] = "Title not found"
        except Exception as e:
            # Deliberately broad: this is a best-effort scrape over many rows,
            # so any failure (timeout, invalid URL, parsing problem) is
            # recorded on the row instead of aborting the whole bin.
            met_df.at[index, 'title'] = f"Error: {e}"

# Select the rows whose title is still missing; only these are scraped.
missing_title_df = met_df.loc[met_df['title'].isna()]

# Cut those rows into consecutive bins of at most 1000 rows each.
bins = []
for start in range(0, len(missing_title_df), 1000):
    bins.append(missing_title_df.iloc[start:start + 1000])

# Work through the bins sequentially, pausing briefly and saving after each
# one so an interrupted run does not lose the titles fetched so far.
for i, bin_df in enumerate(bins):
    print(f"Processing bin {i+1}/{len(bins)}...")
    fetch_titles_for_bin(bin_df, met_df)
    time.sleep(1)  # short pause between bins to go easy on the server

    # NOTE(review): this writes to the current working directory, whereas the
    # next cell reads '../data/clean/met_df_updated.csv' - confirm the
    # intended location.
    met_df.to_csv('met_df_updated.csv', index=False)
    print(f"Bin {i+1} completed and progress saved.")

# Final message
print("All bins processed!")

In [None]:
# Some object numbers occur more than once. Rows sharing an object number do
# not always agree in every column, so only fully identical rows are removed
# here, so as not to lose any objects.
pd.set_option('display.max_columns', None)

general_df = pd.read_csv(
    '../data/clean/met_df_updated.csv', low_memory=False
).drop_duplicates()

# Count how often each object number appears and keep those seen more than once.
duplicate_counts = general_df['object_number'].value_counts()
duplicates = duplicate_counts.loc[duplicate_counts.gt(1)]
print(duplicates)
print(general_df.shape)

In [4]:
# Any title that is still missing is labelled 'Untitled'.
general_df = general_df.assign(title=general_df['title'].fillna('Untitled'))

In [None]:
# Add a numeric 'accession_year' column derived from 'accessionyear';
# values that cannot be parsed as numbers become NaN (errors='coerce').
general_df = general_df.assign(
    accession_year=pd.to_numeric(general_df['accessionyear'], errors='coerce')
)
general_df.shape

In [None]:
# --- Disabled snippet, kept for reference only (nothing below runs) ---
# It would copy 'gallery_number' from met_df back into general_df and move it
# to the 7th position (index 6).
#missing_column = met_df['gallery_number']  # Replace 'column_name' with the actual name

# Add it back to the updated dataset
#general_df['gallery_number'] = missing_column

# Move the column to the 7th position (index 6)
#column_name = 'gallery_number'  # Replace with the actual column name
#columns = [col for col in general_df.columns if col != column_name]  # Exclude the column first
#columns.insert(6, column_name)  # Insert the column at the 7th position (index 6)
#general_df = general_df[columns]
#general_df.drop(columns = ['is_timeline_work'])

# The only live statement in this cell: list the current columns of general_df.
general_df.columns

In [None]:
# Remove the columns that are no longer needed; 'accessionyear' has been
# superseded by the numeric 'accession_year' column.
general_df = general_df.drop(
    ['is_timeline_work', 'object_number', 'accessionyear'],
    axis='columns',
)

# Discard every row that belongs to the department 'The Cloisters'.
cloisters_rows = general_df.index[general_df['department'] == 'The Cloisters']
general_df = general_df.drop(index=cloisters_rows)

# Show the columns that remain after the clean-up.
general_df.columns

In [10]:
# Cleaning is complete: write the result over the existing met_df_updated file
# (without the index column).
general_df.to_csv(path_or_buf='../data/clean/met_df_updated.csv', index=False)