In [1]:
import numpy as np
import pandas as pd

pd.options.display.float_format = '{:.2f}'.format # change the display format of numbers

In [2]:
saffronart_df = pd.read_csv('saffronart_dataset.csv') 

# change the dates to Pandas datetime objects:
saffronart_df['auction_date'] = pd.to_datetime(saffronart_df['auction_date'])
saffronart_df['birth_date'] = pd.to_datetime(saffronart_df['birth_date'])
saffronart_df['death_date'] = pd.to_datetime(saffronart_df['death_date'])

saffronart_df.head(5)

Unnamed: 0,artist,birth_date,birth_place,death_date,artist_age,title,winning_bid,low_est,high_est,auction_date,category,style,size,medium
0,akhilesh,1956-01-01,Indore,NaT,63.0,Divine Conversation,9000.0,5000.0,7000.0,2019-06-01,painting,abstract,71.5 x 71.5 in,acrylic on canvas
1,akhilesh,1956-01-01,Indore,NaT,63.0,In search of Untitled forms -II,1120.0,5000.0,6670.0,2013-11-01,painting,abstract,32.5 x 44 in,acrylic on canvas
2,akhilesh,1956-01-01,Indore,NaT,63.0,In Search of Untitled Lines,6186.0,6900.0,8625.0,2013-08-01,painting,,47 x 47 in,acrylic on canvas
3,akhilesh,1956-01-01,Indore,NaT,63.0,Magadhi,2400.0,5770.0,7695.0,2013-02-01,painting,abstract,33 x 44.5 in,acrylic on canvas
4,akhilesh,1956-01-01,Indore,NaT,63.0,Untitled,2942.0,6735.0,8655.0,2013-02-01,painting,abstract,40 x 40 in,acrylic on canvas


In [3]:
import re

def get_area_vol(x):
    dimensions = re.findall('\d+\.\d+|\d+', x)
    dimensions = list(map(float, dimensions)) # turn all dimensions into float type
                       
    if 'cm' in x:
        np.array(dimensions)/2.54 # convert to inch
    
    return np.prod(dimensions) # multiply them altogether

In [4]:
saffronart_df['area_or_vol'] = saffronart_df['size'].apply(lambda x: get_area_vol(x))
saffronart_df['area_or_vol'].head(5)

0   5112.25
1   1430.00
2   2209.00
3   1468.50
4   1600.00
Name: area_or_vol, dtype: float64

In [5]:
# get the artist's age on the date of the auction:

saffronart_df['auction_dt_age'] = saffronart_df['auction_date'].sub(saffronart_df['birth_date'], axis=0)
saffronart_df.auction_dt_age

0       23162 days
1       21124 days
2       21032 days
3       20851 days
4       20851 days
           ...    
12416   18852 days
12417   36877 days
12418   27271 days
12419   21646 days
12420   25080 days
Name: auction_dt_age, Length: 12421, dtype: timedelta64[ns]

In [6]:
# divide the number of days by 365 to get the age:

saffronart_df['auction_dt_age'] = saffronart_df['auction_dt_age'].dt.days//365
saffronart_df.auction_dt_age

0        63.00
1        57.00
2        57.00
3        57.00
4        57.00
         ...  
12416    51.00
12417   101.00
12418    74.00
12419    59.00
12420    68.00
Name: auction_dt_age, Length: 12421, dtype: float64

In [7]:
# if the artist is no longer living at the time of the auction, auction_dt_age = artist_age

saffronart_df.loc[saffronart_df.auction_dt_age > saffronart_df.artist_age, 'auction_dt_age']=saffronart_df.artist_age

In [8]:
# verify the maximum value:
saffronart_df.iloc[saffronart_df.auction_dt_age.idxmax()]

artist                     b c sanyal
birth_date        1904-04-22 00:00:00
birth_place           Dibrugarh Assam
death_date        2003-01-01 00:00:00
artist_age                      99.00
title                        Untitled
winning_bid                       NaN
low_est                      11595.00
high_est                     14495.00
auction_date      2019-06-01 00:00:00
category                     painting
style                       landscape
size                  35.5 x 47.25 in
medium                 oil on canvas 
area_or_vol                   1677.38
auction_dt_age                  99.00
Name: 9879, dtype: object

In [9]:
# export the updated dataframe to a CSV for later use: 
saffronart_df.to_csv("updated_saffronart_dataset.csv", index = False)