# Data Cleaning

The data we retrieved from Smartify in the previous notebook (d) was sparse and not completely standardized.
This notebook contains a few small functions that clean the dataframes holding the information about the different paintings.

In [1]:
# Some imports
import pandas as pd
import re

In [2]:
def is_dimensions(string):
    """Tell whether an info string looks like a dimensions string.

    The only signal used is the presence of an " x " (space, x, space)
    somewhere in the string; any string containing it is assumed to
    describe dimensions.
    """
    separator = ' x '
    return separator in string

# The specs are in theory in the format: technique | dimension | date
# But sometimes one or two of these fields are missing, so this function helps sorting which is which
def triple_specs(specs):
    """Sort a list of spec fields into a (technique, dimensions, date) triple.

    Args:
        specs: the pieces obtained by splitting a specs string on '|'.

    Returns:
        A tuple of three strings (technique, dimensions, date). A field that
        is missing from `specs` is returned as ''. Every returned field has
        its surrounding whitespace removed. When `specs` has no element or
        more than three, ('', '', '') is returned.
    """
    # Splitting 'a | b | c' on '|' leaves padding around each field; strip it
    # so the values written to the clean columns do not carry stray spaces.
    fields = [field.strip() for field in specs]
    count = len(fields)

    if count == 3:  # technique | dimension | date
        technique, dimensions, date = fields
        return technique, dimensions, date

    if count == 2:
        first, second = fields
        if is_dimensions(first):  # dimension | date
            return '', first, second
        if is_dimensions(second):  # technique | dimension
            return first, second, ''
        return first, '', second  # technique | date

    if count == 1:
        only = fields[0]
        if is_dimensions(only):  # dimension
            return '', only, ''
        # Four digits starting with 1, 2 or 3 are taken to be a year
        if re.search(r'[1-3][0-9]{3}', only) is not None:  # date
            return '', '', only
        return only, '', ''  # technique

    # No field at all, or more fields than the expected format allows
    return '', '', ''
        

In [5]:
# Organizing clear columns for illustration's attributes
def clean_dataframe(df_name, output_path='./data/illu_infos_clean.csv'):
    """Read a CSV of illustration info, split its specs and save a clean CSV.

    The 'specs' column is split on '|' and sorted by `triple_specs` into three
    new columns: 'technique', 'dimensions' and 'date'. The 'title' and 'author'
    columns are stripped of surrounding whitespace. The 'specs' column and the
    'Unnamed: 0' column (presumably an index saved by an earlier export, to be
    confirmed) are then removed before the result is written.

    Args:
        df_name: path of the CSV file to read.
        output_path: path of the CSV file to write; defaults to the location
            that was previously hard-coded.
    """
    df = pd.read_csv(df_name)

    # Parse every specs string once. Empty cells are read as NaN by pandas,
    # so they are replaced by '' and yield three empty fields.
    parsed = [
        triple_specs(spec.strip().split('|'))
        for spec in df['specs'].fillna('')
    ]
    for position, column in enumerate(('technique', 'dimensions', 'date')):
        df[column] = [fields[position] for fields in parsed]

    # The .str accessor leaves missing values untouched instead of raising
    df['title'] = df['title'].str.strip()
    df['author'] = df['author'].str.strip()

    # errors='ignore' keeps this working when 'Unnamed: 0' is not in the file
    df = df.drop(columns=['Unnamed: 0', 'specs'], errors='ignore')
    df.to_csv(output_path, index=False)

In [6]:
# Run the cleaning on the raw CSV; the cleaned result is written to ./data/illu_infos_clean.csv
clean_dataframe('./data/illu_infos.csv')