In [1]:
import numpy as np
import pandas as pd
import re

pd.options.display.float_format = '{:.2f}'.format # change the display format of numbers

In [2]:
saffronart_df = pd.read_csv('saffronart_dataset.csv') 

# change the dates to Pandas datetime objects:
saffronart_df['auction_date'] = pd.to_datetime(saffronart_df['auction_date'])
saffronart_df['birth_date'] = pd.to_datetime(saffronart_df['birth_date'])
saffronart_df['death_date'] = pd.to_datetime(saffronart_df['death_date'])

saffronart_df.head(5)

Unnamed: 0,artist,birth_date,birth_place,death_date,artist_age,title,winning_bid,low_est,high_est,auction_date,category,style,size,medium
0,akhilesh,1956-01-01,Indore,NaT,63.0,Divine Conversation,9000.0,5000.0,7000.0,2019-06-01,painting,abstract,71.5 x 71.5 in,acrylic on canvas
1,akhilesh,1956-01-01,Indore,NaT,63.0,In search of Untitled forms -II,1120.0,5000.0,6670.0,2013-11-01,painting,abstract,32.5 x 44 in,acrylic on canvas
2,akhilesh,1956-01-01,Indore,NaT,63.0,In Search of Untitled Lines,6186.0,6900.0,8625.0,2013-08-01,painting,,47 x 47 in,acrylic on canvas
3,akhilesh,1956-01-01,Indore,NaT,63.0,Magadhi,2400.0,5770.0,7695.0,2013-02-01,painting,abstract,33 x 44.5 in,acrylic on canvas
4,akhilesh,1956-01-01,Indore,NaT,63.0,Untitled,2942.0,6735.0,8655.0,2013-02-01,painting,abstract,40 x 40 in,acrylic on canvas


In [3]:
# play with regex:

# if there're more than one piece, they're often marked with alphabet chars:
abc_order = '.*[a-z]\)' # ex: a), b), c), ... z)

# they can also mention the number of pieces in the lot:
pieces_num = ['.* two ', ' three ', ' four ', ' five ', ' six ', 
              ' seven ', ' eight ', ' nine ', ' ten ', ' eleven |']

multi_pi_ind = '|.*'.join(pieces_num) # multiple pieces indicator
multi_pi_ind += abc_order
print(multi_pi_ind)

s1 = 'height a): 14 x 11.5 in '
s2 = 'this work comprises of two parts'
s3 = 'c) and e) 11 x 7 in'

for i, s in enumerate([s1, s2, s3]):
    print('s{}:'.format(i+1), bool(re.match(multi_pi_ind, s)))

.* two |.* three |.* four |.* five |.* six |.* seven |.* eight |.* nine |.* ten |.* eleven |.*[a-z]\)
s1: True
s2: True
s3: True


In [4]:
def check_size(row):
    dimensions = re.findall('\d+\.\d+|\d+', row['size'])
    dimensions = list(map(float, dimensions)) # turn all dimensions into float type
    
    if not all(d > 0 for d in dimensions): # check if there's any dimensions not greater than 0
        print(row.name, row['size'], row['medium'], sep=' | ')
    
    if len(dimensions)<2 or len(dimensions)>3: #cannot have more than 2 or 3 dimensions
        print(row.name, row['size'], row['medium'], sep=' | ')
    
    if bool(re.match(multi_pi_ind, row['size'])):
        print(row.name, row['size'], row['medium'], sep=' | ')
           
saffronart_df.apply(check_size, axis=1)
print()

55 |  a) height : 14 x 11.5 in |  fiberglass and paint 
1254 |  b) 15 x 12.5 in |  a) 12 x 9.5 in (30.4 x 24.1 cm) 
2593 |  two panels measuring 13.5 x 8.5 in |  laminated photo-copies and fabric transfer mounted on board 
3041 |  10.5 x 0 x 3 in |  bronze 
7651 |   c) 3.5 x 5.5 in |  a) b) 5.5 x 3.5  in (14 x 8.8 cm)  
7669 |  published: exhibited: | â â 146 cm x 105.4 cm  
7714 |  this work comprises of two parts, a) measuring 7.5 x 7.25 x 15.5 inches |  depth: 15.5 in (39.4 cm) 
7961 |  c) and e) 11 x 7 in |  a), b), d) and f) 11 x 7.5 in (28 x 19 cm) 
8466 |  b) 4.5 x 4.5 in |  a) 10.5 x 8.5 in (26.6 x 21.5 cms) 
8515 |  b) 11 x 8.5 in |  a) & c) 10 x 8 in (25.4 x 20.3 cm) (each) 
8531 |  c) 8 x 7.5 in |  b) 8 x 7 in (20 x 18 cm) 
8617 |  9 in x 0 in | bronze 
9844 |  b) 17.5 x 17.5 in |  a) 17 x 17 in (43.1 x 43.1 cm) 
10122 |  24 x 0 in |  bronze 
10123 |  20 x 0 x 3 in | teracotta 
10124 |  13.5 x 0 in |  bronze 
10598 |  c) 24 x 37.5 in |  b) 30 x 11 in (76.2 x 27.9 cm) 
11945 

In [5]:
# play with regex: 

avoid_ex_list = ['^(\d+|\d+\.\d+)\sx\s(\d+|\d+\.\d+)\sin\.?$',            # ex: 9 x 3 in, 9.5 x 3 in
                 '^(\d+|\d+\.\d+)\sx\s(\d+|\d+\.\d+)\sin\.?\s\(each\)$',  # ex: 9 x 3 in (each)
                 '^(\d+|\d+\.\d+)\sin\sx\s(\d+|\d+\.\d+)\sin\sâ\sâ$']     # ex: 9 in x 3 in â â

avoid_ex = '|'.join(avoid_ex_list)

s1 = '14 x 10.5 in (each)'
s2 = '38 in x 43 in â â'
s3 = 'b) 11.5 x 9 in'
s4 = '75.5 x 22.5 in'
s5 = 'image size: 7.5 x 9.5 in'

for i, s in enumerate([s1, s2, s3, s4, s5]):
    print('s{}:'.format(i+1), bool(re.match(avoid_ex, s)))

s1: True
s2: True
s3: False
s4: True
s5: False


In [6]:
def check_medium(row):
    if bool(re.match(avoid_ex, row['medium'].strip())): # check if it matches the specified expression
        return                                          # if matched then pass
    
    if any(char.isdigit() for char in row['medium']) or bool(re.match(multi_pi_ind, row['medium'])): 
    # check if there's any number in medium or if it indicates multiple pieces
        print(row.name, row['size'], row['medium'], sep=' | ')
        
saffronart_df.apply(check_medium, axis=1)
print()

302 |  29.5 x 21.5 in |  270gsm, gold foil on gf smith paper and mounted veneer 
307 |  37 x 25.5 in | painted lithograph on 250 gsm velin d arches paper pasted on board 
308 |  37 x 25.5 in | painted lithograph on 250 gsm velin d arches paper pasted on board 
483 |  24 x 36 in |  ultra chrome k3 pigments on hahnemuhle photo acid free paper 
484 |  24 x 36 in |  ultra chrome k3 pigments on hahnemuhle photo acid free paper 
485 |  24 x 35.5 in |  ultra chrome k3 pigments on hahnemuhle photo acid free paper 
486 |  24.5 x 35.5 in |  ultra chrome k3 pigments on hahnemuhle photo acid free paper 
990 |  29.2 x 22.8 cm |  b) 11.5 x 9 in 
1254 |  b) 15 x 12.5 in |  a) 12 x 9.5 in (30.4 x 24.1 cm) 
1378 |  5 x 3.5 in |  b) ink on paper pasted on mountboard 
1439 |  10.5 x 14.5 in |  b) mixed media on handmade paper 
1552 |  44.25 x 30 in |  this work comprises of four panels 
1661 |  13.75 in x 34 in |  digital prints on duratrans/ clear films, 3 layers 
1662 |  13.75 in x 34 in |  digital pri

In [7]:
dirty_rows = [990, 1254, 1552, 2593, 3041, 4599, 7651, 7669, 7714, 7961, 8466, 8515, 8531, 8617, 9844, 10122, 
              11878, 10123, 10124, 10598, 11841, 11945, 11957, 11958, 11959, 12027, 12062, 12064, 12069, 12300]

saffronart_df.iloc[dirty_rows]

Unnamed: 0,artist,birth_date,birth_place,death_date,artist_age,title,winning_bid,low_est,high_est,auction_date,category,style,size,medium
990,deepak bhandari,NaT,,NaT,,"A) UNTITLED B) MONTH OF BAISAKHA, BARAMASA (KA...",91.0,385.0,465.0,2015-11-01,,,29.2 x 22.8 cm,b) 11.5 x 9 in
1254,sakti burman,1935-01-01,Kolkata,NaT,84.0,Untitled,13770.0,11480.0,14755.0,2015-02-12,painting,figurative,b) 15 x 12.5 in,a) 12 x 9.5 in (30.4 x 24.1 cm)
1552,phaneendra nath chaturvedi,1981-01-01,Varanasi,NaT,38.0,We Who Live by Myth,6900.0,6250.0,7815.0,2017-06-01,painting,figurative,44.25 x 30 in,this work comprises of four panels
2593,anita dube,1958-01-01,Lucknow,NaT,61.0,An Old (Oslo) Story,,20000.0,30000.0,2013-09-01,installation,,two panels measuring 13.5 x 8.5 in,laminated photo-copies and fabric transfer mo...
3041,k laxma goud,1940-01-01,Nizampur Andhra Pradesh,NaT,79.0,untitled,7920.0,4545.0,5685.0,2005-12-01,sculpture,figurative,10.5 x 0 x 3 in,bronze
4599,ranbir kaleka,1953-01-01,Patiala Punjab,NaT,66.0,Crossings 2,180000.0,150000.0,200000.0,2013-09-01,installation,,75 x 98 in,four channel video projection on painted canv...
7651,jagannath panda,1970-01-01,"Kotilingi, Orissa",NaT,49.0,"UNTITLED, 1995",480.0,1035.0,1380.0,2014-07-01,,,c) 3.5 x 5.5 in,a) b) 5.5 x 3.5 in (14 x 8.8 cm)
7669,aditya pande,1976-01-01,Lucknow,NaT,43.0,"HALF-LIFE FORM V, 2012",3722.0,8065.0,11295.0,2015-03-01,,,published: exhibited:,â â 146 cm x 105.4 cm
7714,manisha parekh,1964-01-01,,NaT,55.0,Beings - 5,3156.0,3125.0,4690.0,2017-12-01,sculpture,figurative,"this work comprises of two parts, a) measurin...",depth: 15.5 in (39.4 cm)
7961,baiju parthan,1956-01-01,"Kottayam, Kerala",NaT,63.0,Untitled,15813.0,12500.0,15000.0,2007-09-01,painting,figurative,c) and e) 11 x 7 in,"a), b), d) and f) 11 x 7.5 in (28 x 19 cm)"


**Problems:**
1. Empty sizes
2. Sizes pushed to medium:
    * Fix medium 
    * Fix size

In [8]:
def get_area_vol(x):
    dimensions = re.findall('\d+\.\d+|\d+', x)  
    dimensions = list(map(float, dimensions)) # turn all dimensions into float type
    dimensions = [d for d in dimensions if d > 0] # remove invalid dimension that's equal 
                       
    if 'cm' in x:
        dimensions = np.array(dimensions)/2.54 # convert to inch
    
    return np.prod(dimensions) # multiply them altogether

In [9]:
# verify the minimum and maximum values of area_or_vol:

# print('MIN AREA/VOLUME:\n', saffronart_df.iloc[saffronart_df.area_or_vol.idxmin()], sep='')
# print()
# print('MAX AREA/VOLUME:\n', saffronart_df.iloc[saffronart_df.area_or_vol.idxmax()], sep='')

---

In [10]:
# get the artist's age on the date of the auction:

saffronart_df['auction_dt_age'] = saffronart_df['auction_date'].sub(saffronart_df['birth_date'], axis=0)
saffronart_df.auction_dt_age

0       23162 days
1       21124 days
2       21032 days
3       20851 days
4       20851 days
           ...    
12416   18852 days
12417   36877 days
12418   27271 days
12419   21646 days
12420   25080 days
Name: auction_dt_age, Length: 12421, dtype: timedelta64[ns]

In [11]:
# divide the number of days by 365 to get the age:

saffronart_df['auction_dt_age'] = saffronart_df['auction_dt_age'].dt.days//365
saffronart_df.auction_dt_age

0        63.00
1        57.00
2        57.00
3        57.00
4        57.00
         ...  
12416    51.00
12417   101.00
12418    74.00
12419    59.00
12420    68.00
Name: auction_dt_age, Length: 12421, dtype: float64

In [12]:
# if the artist is no longer living at the time of the auction, auction_dt_age = artist_age

saffronart_df.loc[saffronart_df.auction_dt_age > saffronart_df.artist_age, 'auction_dt_age']=saffronart_df.artist_age

In [13]:
# verify the minimum and maximum values of auction_dt_age:

print('YOUNGEST ARTIST:\n', saffronart_df.iloc[saffronart_df.auction_dt_age.idxmin()], sep='')
print()
print('OLDEST ARTIST:\n', saffronart_df.iloc[saffronart_df.auction_dt_age.idxmax()], sep='')

YOUNGEST ARTIST:
artist                girish dahiwale
birth_date        1974-01-01 00:00:00
birth_place                       NaN
death_date        1998-01-01 00:00:00
artist_age                      24.00
title                        Untitled
winning_bid                   9178.00
low_est                       2900.00
high_est                      4350.00
auction_date      2018-12-01 00:00:00
category                     painting
style                      figurative
size                 95.75 x 113.5 in
medium              acrylic on cloth 
auction_dt_age                  24.00
Name: 2083, dtype: object

OLDEST ARTIST:
artist                     b c sanyal
birth_date        1904-04-22 00:00:00
birth_place           Dibrugarh Assam
death_date        2003-01-01 00:00:00
artist_age                      99.00
title                        Untitled
winning_bid                       NaN
low_est                      11595.00
high_est                     14495.00
auction_date      2019-06-01 

In [14]:
# export the updated dataframe to a CSV for later use: 
saffronart_df.to_csv("updated_saffronart_dataset.csv", index = False)