In [None]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

# Render every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)

# Load the full dataset from the "clean" data folder.
general_df = pd.read_csv("../data/clean/met_df_updated.csv", low_memory=False)

# Number of missing values in each column.
missing_per_column = general_df.isna().sum()
display(missing_per_column)

# Splitting the dataset into separate tables
Splitting the main dataset into several smaller ones makes it easier to check each parameter (and lets them be imported into SQL for future work).

In [None]:
# Split the wide source DataFrame into narrower, topic-specific DataFrames.
# Every subset keeps 'object_id' so the tables can be joined back together,
# and each one is de-duplicated after the column selection.

# 1. Museum DataFrame: museum-side attributes of each object
museum_df = general_df[[
    'object_id',
    'is_highlight',
    'is_public_domain',
    'is_on_view',
    'department',
    'gallery_number',
    'accession_year',
]].drop_duplicates()

# 2. Artworks DataFrame: descriptive attributes of the artwork itself
artworks_df = general_df[[
    'object_id',
    'object_name',
    'title',
    # The comma after 'primary_images' matters: without it Python concatenates
    # the two adjacent string literals into 'primary_imagesculture', so neither
    # 'primary_images' nor 'culture' is selected.
    'primary_images',
    'culture',
    'period',
    'dynasty',
    'reign',
    'portfolio',
    'object_date',
    'object_begin_date',
    'object_end_date',
    'medium',
    'dimensions',
    'credit_line',
    'classification',
    'link_resource',
    'tags',
]].drop_duplicates()

# 3. Artists DataFrame: artist attributes as stored on each object row
artists_df = general_df[[
    'object_id',
    'artist_role',
    'artist_prefix',
    'artist_display_name',
    'artist_display_bio',
    'artist_suffix',
    'artist_alpha_sort',
    'artist_nationality',
    'artist_begin_date',
    'artist_end_date',
    'artist_gender',
    'artist_ulan_url',
    'artist_wikidata_url',
]].drop_duplicates()

# 4. Geographic Tags DataFrame
geo_df = general_df[[
    'object_id',
    'geography_type',
    'city',
    'state',
    'county',
    'country',
    'region',
    'subregion',
    'locale',
    'locus',
    'excavation',
    'river',
]].drop_duplicates()

# 5. Images and Tags DataFrame.
# Kept separately for possible future work: scraping the website for the
# images and extracting keywords from their descriptions.
img_tags_df = general_df[[
    'object_id',
    'link_resource',
    'tags',
]].drop_duplicates()

# Save each DataFrame to CSV; index=False keeps the pandas row index out of the files.
museum_df.to_csv('../data/clean/museum_df.csv', index=False)
artworks_df.to_csv('../data/clean/artworks_df.csv', index=False)
artists_df.to_csv('../data/clean/artists_df.csv', index=False)
geo_df.to_csv('../data/clean/geo_df.csv', index=False)
img_tags_df.to_csv('../data/clean/img_tags_df.csv', index=False)

# Confirmation message
print("DataFrames have been successfully split and saved.")


In [None]:
# Reload the artists table from disk. This is the same path the split cell
# writes to (and that the final export cell later overwrites).
artists_df = pd.read_csv('../data/clean/artists_df.csv', low_memory=False)
# Bare last expression: the notebook shows the DataFrame as this cell's output.
artists_df

In [3]:
# Records with several artists hold their values as one '|'-separated string
# per column. Turn the display name into a list and give each entry its own row.
artists_df['artist_display_name_split'] = artists_df['artist_display_name'].str.split('|')
exploded_artists_df = artists_df.explode('artist_display_name_split')

# Remaining artist attributes that receive the same split-and-explode treatment.
columns_to_split = [
    'artist_role', 'artist_prefix', 'artist_suffix',
    'artist_begin_date', 'artist_end_date', 'artist_gender',
    'artist_display_bio', 'artist_alpha_sort', 'artist_nationality',
    'artist_ulan_url', 'artist_wikidata_url',
]

# For each attribute: keep the list form on artists_df under '<column>_split',
# then overwrite the column on the exploded frame with one value per row.
# NOTE(review): this assignment presumably relies on every attribute having the
# same number of '|' pieces per record as the display name, so that both
# exploded indexes line up -- confirm against the data.
for col in columns_to_split:
    split_name = col + '_split'
    artists_df[split_name] = artists_df[col].str.split('|')
    exploded_artists_df[col] = artists_df[split_name].explode()

In [4]:
# Columns that describe an artist (the per-object columns are left behind).
artist_attribute_columns = [
    'artist_display_name_split',
    'artist_begin_date', 'artist_end_date', 'artist_display_bio',
    'artist_alpha_sort', 'artist_nationality', 'artist_ulan_url', 'artist_wikidata_url',
]

# Reduce the exploded rows to the distinct artist descriptions.
just_artists_df = (
    exploded_artists_df[artist_attribute_columns]
    .drop_duplicates()
    .rename(columns={'artist_display_name_split': 'artist_display_name'})
    # Map NaN to '' first so rows differing only by NaN vs '' count as duplicates ...
    .fillna('')
    .drop_duplicates()
    # ... then store every missing value as NaN again.
    .replace('', np.nan)
)

In [5]:
# Use the single label 'Unknown' for placeholder artist names.
# NOTE(review): the previous cell already replaced '' with NaN, so the ''
# entry below likely never matches and missing names stay NaN -- confirm
# whether NaN names should become 'Unknown' as well.
placeholder_names = ['', 'Unidentified artist']
just_artists_df['artist_display_name'] = just_artists_df['artist_display_name'].replace(placeholder_names, 'Unknown')

In [6]:
# Function that generates artists IDs 
from collections import defaultdict

def generate_artist_id(name, index, name_counts):
    """Build an artist ID from the name's initials plus a running counter.

    Args:
        name: Artist display name. Anything that is not a non-blank string
            (e.g. NaN or None) is treated as invalid.
        index: Position of the artist, used only for the fallback ID.
        name_counts: Mutable mapping from initials to the number of IDs already
            issued for them. It is updated in place.

    Returns:
        '<initials><counter>' with the counter zero-padded to at least three
        digits (e.g. 'VG001'), or 'UNKNOWN<index>' (index padded the same way)
        when the name is invalid.
    """
    # Guard clause: invalid names get the index-based fallback ID.
    if not isinstance(name, str) or not name.strip():
        return f"UNKNOWN{str(index).zfill(3)}"

    # Whitespace-separated words; non-empty here because the name is not blank.
    words = name.split()

    # First letter of the first word, plus the first letter of the last word
    # when the name has more than one word.
    initials = words[0][0].upper()
    if len(words) > 1:
        initials += words[-1][0].upper()

    # Advance the counter for these initials and use it as the numeric suffix.
    name_counts[initials] += 1
    suffix = str(name_counts[initials]).zfill(3)
    return f"{initials}{suffix}"

def apply_artist_ids(df):
    """Add an 'artist_id' column to ``df`` and return the same DataFrame.

    IDs are generated row by row from 'artist_display_name', with positions
    counted from 1. ``df`` is modified in place; the returned object is ``df``
    itself, not a copy.
    """
    # Running count of IDs issued per initials combination.
    issued_per_initials = defaultdict(int)

    artist_ids = []
    for position, artist_name in enumerate(df['artist_display_name'], start=1):
        artist_ids.append(generate_artist_id(artist_name, position, issued_per_initials))

    df['artist_id'] = artist_ids
    return df

# Applying the functions. apply_artist_ids adds 'artist_id' to the frame it is
# given and returns that same object, so artists_with_ids and just_artists_df
# both refer to the DataFrame that now carries the IDs.
artists_with_ids = apply_artist_ids(just_artists_df)

In [None]:
# Show the artists table, which now includes the 'artist_id' column.
artists_with_ids

In [None]:
# Number of rows each display name occupies in the artists table.
artist_name_counts = artists_with_ids['artist_display_name'].value_counts()

# Keep only the names found on more than one row.
duplicated_names = artist_name_counts.loc[artist_name_counts.gt(1)]

# Print the repeated names together with their row counts.
print(duplicated_names)

In [None]:
# Step 1: attach artist_id to the rows of general_df by matching on the
# artist display name. The inner join keeps only rows whose name has a match.
# NOTE(review): general_df still holds the raw '|'-joined names while
# just_artists_df holds one name per row, so rows with several artists
# presumably find no match here -- confirm.
name_to_id = just_artists_df[['artist_display_name', 'artist_id']]
linking_table = general_df.merge(name_to_id, on='artist_display_name', how='inner')

# Step 2: keep one row per (object_id, artist_id) pair.
linking_table = linking_table[['object_id', 'artist_id']].drop_duplicates()

# Step 3: count the artist_ids linked to each object_id ...
artist_counts_per_object = linking_table.groupby('object_id')['artist_id'].count()

# ... and keep only the objects linked to exactly one artist_id.
# NOTE(review): objects linked to more than one artist_id are dropped from the
# table entirely rather than resolved -- confirm this is intended.
single_artist_objects = artist_counts_per_object[artist_counts_per_object == 1].index
linking_table_fixed = linking_table[linking_table['object_id'].isin(single_artist_objects)]

# Step 4: the final table must not contain a repeated (object_id, artist_id) pair.
has_duplicate_pairs = linking_table_fixed.duplicated(subset=['object_id', 'artist_id']).any()
assert not has_duplicate_pairs, "Duplicates found in the linking table"

# Preview of the final linking table.
display(linking_table_fixed.head())

# Export the linking table and overwrite the artists_df file with the cleaner version

In [10]:
# Persist the final tables. Writing artists_with_ids to artists_df.csv replaces
# the file produced by the split cell with the version that carries artist IDs.
# index=False matches the other exports in this notebook and keeps the pandas
# row index from being written as an extra unnamed column.
artists_with_ids.to_csv('../data/clean/artists_df.csv', index=False)
linking_table_fixed.to_csv('../data/clean/linking_table.csv', index=False)