In [3]:
%load_ext sql

In [2]:
import pandas as pd

In [10]:
# connect to the MySQL database
import mysql.connector
import os

try:
    database_connect = mysql.connector.connect(
        host='localhost',
        user='root',
        password=os.getenv('MYSQL_PASSWORD'),
        database='ecomm'
    )
    print("✅ Successfully connected to MySQL!")

    cursor = database_connect.cursor()

except mysql.connector.Error as err:
    print(f"❌ Error: {err}")

✅ Successfully connected to MySQL!


In [3]:
# list of CSV files and their corresponding table names
csv_files = [
    ('work.csv', 'work'),
    ('artist.csv', 'artist'),
    ('canvas_size.csv', 'canvas_size'),
    ('image_link.csv', 'image_link'),
    ('museum.csv', 'museum'),
    ('museum_hours.csv', 'museum_hours'),
    ('product_size.csv', 'product_size'),
    ('subject.csv', 'subject')
]

In [11]:
def get_sql_type(dtype):
    if pd.api.types.is_integer_dtype(dtype):
        return 'INT'
    elif pd.api.types.is_float_dtype(dtype):
        return 'FLOAT'
    elif pd.api.types.is_bool_dtype(dtype):
        return 'BOOLEAN'
    elif pd.api.types.is_datetime64_any_dtype(dtype):
        return 'DATETIME'
    else:
        return 'TEXT'

for csv_file, table_name in csv_files:
    file_path = csv_file
    
    # read the CSV file into a pandas DataFrame
    df = pd.read_csv(file_path)
    
    # fill Nan's with None for SQL to mark as Null
    df = df.astype(object).where(pd.notnull(df), None)
    
    # show the number of missing values found
    print(f"Processing {csv_file}")
    print(f"NaN values before replacement:\n{df.isnull().sum()}\n")

    # clean column names
    df.columns = [col.replace(' ', '_').replace('-', '_').replace('.', '_') for col in df.columns]

    # generate the CREATE TABLE statement with appropriate data types
    columns = ', '.join([f'`{col}` {get_sql_type(df[col].dtype)}' for col in df.columns])
    create_table_query = f'CREATE TABLE IF NOT EXISTS `{table_name}` ({columns})'
    cursor.execute(create_table_query)

    # insert DataFrame data into the MySQL table
    for _, row in df.iterrows():
        values = tuple(None if pd.isna(x) else x for x in row)
        sql = f"INSERT INTO `{table_name}` ({', '.join(['`' + col + '`' for col in df.columns])}) VALUES ({', '.join(['%s'] * len(row))})"
        cursor.execute(sql, values)
        
    # commit the transaction for the current CSV file
    database_connect.commit()

Processing work.csv
NaN values before replacement:
work_id          0
name             0
artist_id        0
style         1286
museum_id    10223
dtype: int64

Processing artist.csv
NaN values before replacement:
artist_id         0
full_name         0
first_name        0
middle_names    273
last_name         0
nationality       0
style             0
birth             0
death             0
dtype: int64

Processing canvas_size.csv
NaN values before replacement:
size_id    0
width      0
height     7
label      0
dtype: int64

Processing image_link.csv
NaN values before replacement:
work_id                0
url                    0
thumbnail_small_url    2
thumbnail_large_url    2
dtype: int64

Processing museum.csv
NaN values before replacement:
museum_id     0
name          0
address       0
city          0
state        19
postal        7
country       0
phone         0
url           0
dtype: int64

Processing museum_hours.csv
NaN values before replacement:
museum_id    0
day          

In [15]:
cursor.execute(f"SELECT * FROM Work LIMIT 5;")
for row in cursor.fetchall():
    print(row)

('160228', 'Still Life with Flowers and a Watch', '615', 'Baroque', '43.0')
('160236', "Still Life with Fruit and a Beaker on a Cock's Foot", '615', 'Baroque', '43.0')
('160244', 'Still Life with Fruit and a Goldfinch', '615', 'Baroque', '43.0')
('160252', 'Still Life with Fruit and Oysters', '615', 'Baroque', '43.0')
('160260', 'Still Life with Fruit, Oysters, and a Porcelain Bowl', '615', 'Baroque', '43.0')


In [None]:
# close the connection
database_connect.close()