# Supervised neural networks: Data Cleaning

In [1]:
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy_utils import create_database, database_exists
import warnings
warnings.filterwarnings('ignore')

from config import usr, pwd, url, port, db, table

## Load Dataset

In [2]:
df = pd.read_csv('./data/Artworks.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155252 entries, 0 to 155251
Data columns (total 29 columns):
Title                 155213 non-null object
Artist                136662 non-null object
ConstituentID         136662 non-null object
ArtistBio             132559 non-null object
Nationality           136662 non-null object
BeginDate             136662 non-null object
EndDate               136662 non-null object
Gender                136662 non-null object
Date                  135743 non-null object
Medium                127156 non-null object
Dimensions            127253 non-null object
CreditLine            135354 non-null object
AccessionNumber       138118 non-null object
Classification        138118 non-null object
Department            138118 non-null object
DateAcquired          131389 non-null object
Cataloged             138118 non-null object
ObjectID              138118 non-null float64
URL                   79611 non-null object
ThumbnailURL          69206 non-nu

## Data Cleaning
### Columns of interest
For this drill, only consider a few columns.

In [4]:
columns_of_interest = ['Gender', 'URL', 'ThumbnailURL', 'Height (cm)', 'Width (cm)', 'DateAcquired', 'Department', 
                       'Date', 'Nationality']

In [5]:
df_clean = df.loc[:, columns_of_interest]

### URLs
Convert URLs to booleans

In [6]:
df_clean['URL'] = df_clean['URL'].notnull()
df_clean['ThumbnailURL'] = df_clean['ThumbnailURL'].notnull()

### Department column
Remove problematic rows from _Department_ column.

In [7]:
df_clean = df_clean[df_clean['Department']!='Film']
df_clean = df_clean[df_clean['Department']!='Media and Performance Art']
df_clean = df_clean[df_clean['Department']!='Fluxus Collection']

### Year Acquired column
Add _YearAcquired_ column.

In [8]:
df_clean['DateAcquired'] = pd.to_datetime(df_clean.DateAcquired)
df_clean['YearAcquired'] = df_clean.DateAcquired.dt.year
df_clean['YearAcquired'].dtype

# Drop DateAcquired column
df_clean.drop('DateAcquired', axis=1, inplace=True)

### Missing values
Drop rows with missing values.

In [9]:
df_clean.dropna(inplace=True)
df_clean.index = np.arange(len(df_clean))

### Gender column
Change value if multiple genders a listed for a row.

In [10]:
df_clean.loc[df_clean['Gender'].str.contains('\) \('), 'Gender'] = '\(multiple_persons\)'

Inspect unique values for _Gender_.

In [11]:
df_clean['Gender'].unique()

array(['(Male)', '\\(multiple_persons\\)', '(Female)', '()', '(male)'],
      dtype=object)

Male appears twice, but the case is different. I will replace _(male)_ with _(Male)_.

In [12]:
df_clean['Gender'] = df_clean['Gender'].replace('(male)', '(Male)')

In [13]:
df_clean['Gender'].unique()

array(['(Male)', '\\(multiple_persons\\)', '(Female)', '()'], dtype=object)

In [14]:
df_clean['Gender'].value_counts()

(Male)                  83648
(Female)                14473
\(multiple_persons\)     5528
()                       4954
Name: Gender, dtype: int64

### Nationality column
Change value if multiple nationalities a listed for a row.

In [15]:
df_clean.loc[df_clean['Nationality'].str.contains('\) \('), 'Nationality'] = '\(multiple_nationalities\)'

### Dimensions columns
Rename column titles so that data can be read into SQL database.

In [16]:
df_clean.rename(columns={'Height (cm)': 'Height', 'Width (cm)': 'Width'}, inplace=True)

## Save dataset
Save the cleaned dataset to a SQL database to use for modeling.

In [17]:
# Create the database if it doesn't exist
db_url = f"postgresql+psycopg2://{usr}:{pwd}@{url}:{port}/{db}"
if database_exists(db_url):
    pass
else:
    create_database(db_url)

In [18]:
engine = create_engine(f"postgresql+psycopg2://{usr}:{pwd}@{url}:{port}/{db}")
df_clean.to_sql(name=table, con=engine, index=False, if_exists='replace')
engine.dispose()