In [1]:
import numpy as np
import pandas as pd 
import scipy
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns; sns.set()
import thinkstats2
import thinkplot

pd.options.display.float_format = '{:.2f}'.format # change the display format of numbers

In [2]:
saffronart_df = pd.read_csv('updated_saffronart_dataset.csv', parse_dates=[1, 3, 9]) 

saffronart_df.head(5)

Unnamed: 0,artist,birth_date,birth_place,death_date,artist_age,title,winning_bid,low_est,high_est,auction_date,category,style,size,medium,area_or_vol,auction_dt_age
0,akhilesh,1956-01-01,Indore,NaT,63.0,Divine Conversation,9000.0,5000.0,7000.0,2019-06-01,painting,abstract,71.5 x 71.5 in,acrylic on canvas,5112.25,63.0
1,akhilesh,1956-01-01,Indore,NaT,63.0,In search of Untitled forms -II,1120.0,5000.0,6670.0,2013-11-01,painting,abstract,32.5 x 44 in,acrylic on canvas,1430.0,57.0
2,akhilesh,1956-01-01,Indore,NaT,63.0,In Search of Untitled Lines,6186.0,6900.0,8625.0,2013-08-01,painting,,47 x 47 in,acrylic on canvas,2209.0,57.0
3,akhilesh,1956-01-01,Indore,NaT,63.0,Magadhi,2400.0,5770.0,7695.0,2013-02-01,painting,abstract,33 x 44.5 in,acrylic on canvas,1468.5,57.0
4,akhilesh,1956-01-01,Indore,NaT,63.0,Untitled,2942.0,6735.0,8655.0,2013-02-01,painting,abstract,40 x 40 in,acrylic on canvas,1600.0,57.0


# Category

In [3]:
cat_means = saffronart_df.groupby('category').mean() # mean prices by category 
cat_mean_p = pd.DataFrame(cat_means['winning_bid'])
cat_mean_p

Unnamed: 0_level_0,winning_bid
category,Unnamed: 1_level_1
ceramics,10116.0
chemical alterations,8420.0
digital art,9398.9
drawing,8276.08
installation,54903.54
painting,49133.3
photography,5810.26
print making,8544.54
relief sculpture,14282.48
sculpture,29420.97


In [4]:
def Cohen_effect_size(gr1, gr2):
    mean_diff = np.mean(gr1) - np.mean(gr2)
    
    n1, n2 = len(gr1), len(gr2) # drop na values
    
    var1, var2 = np.var(gr1), np.var(gr2)
    pooled_var = (n1*var1 + n2*var2)/(n1 + n2)
    
    d = mean_diff/ np.sqrt(pooled_var)
    
    return mean_diff, d 

In [19]:
from collections import defaultdict

def mean_p_diff_by(df, var_name): # mean price difference
    groups = df.groupby(var_name)
    mean_diff_dict = defaultdict(list)

    for gr1_name, gr1 in groups:
        for gr2_name, gr2 in groups:
            mean_diff, d = Cohen_effect_size(gr1.winning_bid, gr2.winning_bid)
            mean_diff_dict[gr1_name].append((mean_diff, d))
            
        all_others = df.loc[df[var_name]!=gr1_name]
        mean_diff, d = Cohen_effect_size(gr1.winning_bid, all_others.winning_bid)
        mean_diff_dict[gr1_name].append((mean_diff, d))
        
    mean_diff_df = pd.DataFrame(mean_diff_dict)
        
    gr_names = list(groups.groups.keys())
    gr_names.append('all others')
    mean_diff_df[var_name] = gr_names
    mean_diff_df.set_index(var_name, inplace=True)
    
    return mean_diff_df

In [20]:
def highlight_diag(df):
    style_arr = np.full(df.shape, '', dtype='<U24') # create an empty np array of the same structure
    np.fill_diagonal(style_arr, 'background-color: pink')
    return pd.DataFrame(style_arr, index=df.index, columns=df.columns)

def format_pairwise_df(df, d_prec=2):
    # apply the decimal points precision for the df's values:
    df = df.applymap(lambda x: '{:.{prec}f}, {:.{prec}f}'.format(*x, prec=d_prec)) 
    
    return df.style.apply(highlight_diag, axis=None)

In [21]:
cat_mean_diff = mean_p_diff_by(saffronart_df, 'category')
format_pairwise_df(cat_mean_diff)

Unnamed: 0_level_0,ceramics,chemical alterations,digital art,drawing,installation,painting,photography,print making,relief sculpture,sculpture
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ceramics,"0.00, 0.00","-1696.00, -0.46","-717.10, -0.04","-1839.92, -0.20","44787.54, 0.59","39017.30, 0.25","-4305.74, -0.11","-1571.46, -0.14","4166.48, 0.30","19304.97, 0.32"
chemical alterations,"1696.00, 0.46","0.00, 0.00","978.90, 0.06","-143.92, -0.02","46483.54, 0.65","40713.30, 0.27","-2609.74, -0.07","124.54, 0.01","5862.48, 0.45","21000.97, 0.35"
digital art,"717.10, 0.04","-978.90, -0.06","0.00, 0.00","-1122.83, -0.11","45504.64, 0.83","39734.40, 0.26","-3588.64, -0.10","-854.36, -0.07","4883.58, 0.29","20022.07, 0.35"
drawing,"1839.92, 0.20","143.92, 0.02","1122.83, 0.11","0.00, 0.00","46627.47, 2.26","40857.22, 0.28","-2465.81, -0.12","268.47, 0.03","6006.40, 0.62","21144.89, 0.52"
installation,"-44787.54, -0.59","-46483.54, -0.65","-45504.64, -0.83","-46627.47, -2.26","0.00, 0.00","-5770.24, -0.04","-49093.28, -1.06","-46359.00, -1.58","-40621.06, -0.68","-25482.57, -0.42"
painting,"-39017.30, -0.25","-40713.30, -0.27","-39734.40, -0.26","-40857.22, -0.28","5770.24, 0.04","0.00, 0.00","-43323.04, -0.29","-40588.76, -0.27","-34850.82, -0.23","-19712.33, -0.13"
photography,"4305.74, 0.11","2609.74, 0.07","3588.64, 0.10","2465.81, 0.12","49093.28, 1.06","43323.04, 0.29","0.00, 0.00","2734.28, 0.10","8472.22, 0.23","23610.71, 0.44"
print making,"1571.46, 0.14","-124.54, -0.01","854.36, 0.07","-268.47, -0.03","46359.00, 1.58","40588.76, 0.27","-2734.28, -0.10","0.00, 0.00","5737.94, 0.51","20876.43, 0.43"
relief sculpture,"-4166.48, -0.30","-5862.48, -0.45","-4883.58, -0.29","-6006.40, -0.62","40621.06, 0.68","34850.82, 0.23","-8472.22, -0.23","-5737.94, -0.51","0.00, 0.00","15138.49, 0.26"
sculpture,"-19304.97, -0.32","-21000.97, -0.35","-20022.07, -0.35","-21144.89, -0.52","25482.57, 0.42","19712.33, 0.13","-23610.71, -0.44","-20876.43, -0.43","-15138.49, -0.26","0.00, 0.00"


**Instructions:** The first value of the tuple is the mean difference between the column category and the row category, i.e. $column - row$. The second value of the tuple is the Cohen effect size of the mean difference. 

# Style

In [8]:
style_mean_diff = mean_p_diff_by(saffronart_df, 'style')
format_pairwise_df(style_mean_diff)

Unnamed: 0_level_0,abstract,calligraphy,figurative,landscape,still life,unknown
style,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
abstract,"0.00, 0.00","-59502.32, -0.28","-30515.78, -0.22","-6303.54, -0.03","-24625.49, -0.12","-62818.72, -0.29"
calligraphy,"59502.32, 0.28","0.00, 0.00","28986.54, 0.25","53198.78, 0.35","34876.83, 0.31","-3316.40, -0.93"
figurative,"30515.78, 0.22","-28986.54, -0.25","0.00, 0.00","24212.24, 0.20","5890.30, 0.05","-32302.94, -0.28"
landscape,"6303.54, 0.03","-53198.78, -0.35","-24212.24, -0.20","0.00, 0.00","-18321.95, -0.13","-56515.18, -0.37"
still life,"24625.49, 0.12","-34876.83, -0.31","-5890.30, -0.05","18321.95, 0.13","0.00, 0.00","-38193.23, -0.34"
unknown,"62818.72, 0.29","3316.40, 0.93","32302.94, 0.28","56515.18, 0.37","38193.23, 0.34","0.00, 0.00"
all others,"42214.47, 0.37","-21937.25, -0.19","12418.74, 0.11","33382.36, 0.29","13314.25, 0.12","-25247.99, -0.22"


# Medium

In [9]:
med_counts = saffronart_df.groupby('medium').count() # counts of mediums
med_counts = pd.DataFrame(med_counts['artist'])
med_counts.rename(columns={'artist': 'count'}, inplace=True)

top_meds_ct = med_counts.nlargest(15, ['count']) # the top 15 mediums with highest counts
top_meds_ct

Unnamed: 0_level_0,count
medium,Unnamed: 1_level_1
oil on canvas,2189
acrylic on canvas,1239
watercolour on paper,820
mixed media on paper,489
acrylic on paper,243
ink on paper,241
mixed media on canvas,214
pen and ink on paper,200
bronze,171
gouache on paper,167


In [10]:
top_meds = top_meds_ct.index.to_list()
top_meds

[' oil on canvas ',
 ' acrylic on canvas ',
 ' watercolour on paper ',
 ' mixed media on paper ',
 ' acrylic on paper ',
 ' ink on paper ',
 ' mixed media on canvas ',
 ' pen and ink on paper ',
 ' bronze ',
 ' gouache on paper ',
 ' pencil on paper ',
 ' charcoal on paper ',
 ' oil on board ',
 ' watercolour and ink on paper ',
 ' acrylic and oil on canvas ']

In [11]:
top_meds_df = saffronart_df.loc[saffronart_df.medium.isin(top_meds)]
top_meds_df.reset_index(inplace=True, drop=True)
top_meds_df

Unnamed: 0,artist,birth_date,birth_place,death_date,artist_age,title,winning_bid,low_est,high_est,auction_date,category,style,size,medium,area_or_vol,auction_dt_age
0,akhilesh,1956-01-01,Indore,NaT,63.00,Divine Conversation,9000.00,5000.00,7000.00,2019-06-01,painting,abstract,71.5 x 71.5 in,acrylic on canvas,5112.25,63.00
1,akhilesh,1956-01-01,Indore,NaT,63.00,In search of Untitled forms -II,1120.00,5000.00,6670.00,2013-11-01,painting,abstract,32.5 x 44 in,acrylic on canvas,1430.00,57.00
2,akhilesh,1956-01-01,Indore,NaT,63.00,In Search of Untitled Lines,6186.00,6900.00,8625.00,2013-08-01,painting,,47 x 47 in,acrylic on canvas,2209.00,57.00
3,akhilesh,1956-01-01,Indore,NaT,63.00,Magadhi,2400.00,5770.00,7695.00,2013-02-01,painting,abstract,33 x 44.5 in,acrylic on canvas,1468.50,57.00
4,akhilesh,1956-01-01,Indore,NaT,63.00,Untitled,2942.00,6735.00,8655.00,2013-02-01,painting,abstract,40 x 40 in,acrylic on canvas,1600.00,57.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6592,jai zharotia,1945-01-01,New Delhi,NaT,74.00,"COMPOSITION, 1997",2326.00,6350.00,9525.00,2015-07-01,,,43.5 in x 59.5 in,oil on canvas,2588.25,70.00
6593,k h ara,1914-01-01,,1985-01-01,71.00,Untitled,16675.00,13500.00,15500.00,2007-12-01,painting,still life,21.5 x 29.5 in,gouache on paper,634.25,71.00
6594,gobardhan ash,1907-01-01,,1996-01-01,89.00,Untitled,,800.00,1200.00,2002-12-01,painting,figurative,11 x 18 in,gouache on paper,198.00,89.00
6595,s. dhanapal,1919-01-01,Chennai,2000-05-15,81.00,Mother and Child,,7955.00,10230.00,2011-08-01,sculpture,figurative,18.5 x 13 in,bronze,240.50,81.00


In [25]:
def subgr_mean_p_diff_by(big_df, small_df, var_name): # mean price difference
    groups = small_df.groupby(var_name)
    mean_diff_dict = defaultdict(list)

    for gr1_name, gr1 in groups:
        for gr2_name, gr2 in groups:
            mean_diff, d = Cohen_effect_size(gr1.winning_bid, gr2.winning_bid)
            mean_diff_dict[gr1_name].append((mean_diff, d))
            
        all_others = big_df.loc[big_df[var_name]!=gr1_name]
        mean_diff, d = Cohen_effect_size(gr1.winning_bid, all_others.winning_bid)
        mean_diff_dict[gr1_name].append((mean_diff, d))
        
    mean_diff_df = pd.DataFrame(mean_diff_dict)
        
    gr_names = list(groups.groups.keys())
    gr_names.append('all others')
    mean_diff_df[var_name] = gr_names
    mean_diff_df.set_index(var_name, inplace=True)
    
    return mean_diff_df

In [28]:
med_mean_diff = subgr_mean_p_diff_by(saffronart_df, top_meds_df, 'medium')
format_pairwise_df(med_mean_diff)

Unnamed: 0_level_0,acrylic and oil on canvas,acrylic on canvas,acrylic on paper,bronze,charcoal on paper,gouache on paper,ink on paper,mixed media on canvas,mixed media on paper,oil on board,oil on canvas,pen and ink on paper,pencil on paper,watercolour and ink on paper,watercolour on paper
medium,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
acrylic and oil on canvas,"0.00, 0.00","20609.32, 0.15","-17167.79, -0.48","-10546.72, -0.26","-22514.62, -0.64","-10003.61, -0.14","-22710.58, -0.77","-10417.32, -0.27","-21092.91, -0.83","55041.12, 0.58","52265.54, 0.24","-24096.42, -0.79","-24521.50, -0.75","-22297.45, -0.61","-19642.49, -0.90"
acrylic on canvas,"-20609.32, -0.15","0.00, 0.00","-37777.11, -0.28","-31156.04, -0.23","-43123.94, -0.31","-30612.94, -0.22","-43319.91, -0.32","-31026.65, -0.23","-41702.23, -0.33","34431.80, 0.24","31656.21, 0.16","-44705.74, -0.33","-45130.82, -0.33","-42906.77, -0.31","-40251.81, -0.35"
acrylic on paper,"17167.79, 0.48","37777.11, 0.28","0.00, 0.00","6621.07, 0.23","-5346.83, -0.24","7164.18, 0.13","-5542.79, -0.28","6750.46, 0.24","-3925.12, -0.20","72208.91, 0.99","69433.33, 0.32","-6928.63, -0.35","-7353.71, -0.35","-5129.66, -0.23","-2474.70, -0.14"
bronze,"10546.72, 0.26","31156.04, 0.23","-6621.07, -0.23","0.00, 0.00","-11967.90, -0.46","543.11, 0.01","-12163.86, -0.53","129.40, 0.00","-10546.19, -0.49","65587.84, 0.81","62812.26, 0.29","-13549.70, -0.59","-13974.78, -0.57","-11750.73, -0.44","-9095.77, -0.48"
charcoal on paper,"22514.62, 0.64","43123.94, 0.31","5346.83, 0.24","11967.90, 0.46","0.00, 0.00","12511.00, 0.20","-195.96, -0.02","12097.29, 0.50","1421.71, 0.10","77555.74, 0.93","74780.16, 0.34","-1581.80, -0.20","-2006.88, -0.25","217.17, 0.02","2872.13, 0.21"
gouache on paper,"10003.61, 0.14","30612.94, 0.22","-7164.18, -0.13","-543.11, -0.01","-12511.00, -0.20","0.00, 0.00","-12706.97, -0.23","-413.71, -0.01","-11089.29, -0.25","65044.73, 0.65","62269.15, 0.29","-14092.80, -0.25","-14517.89, -0.24","-12293.83, -0.19","-9638.87, -0.26"
ink on paper,"22710.58, 0.77","43319.91, 0.32","5542.79, 0.28","12163.86, 0.53","195.96, 0.02","12706.97, 0.23","0.00, 0.00","12293.26, 0.57","1617.67, 0.12","77751.70, 1.11","74976.12, 0.35","-1385.83, -0.16","-1810.92, -0.20","413.14, 0.04","3068.10, 0.23"
mixed media on canvas,"10417.32, 0.27","31026.65, 0.23","-6750.46, -0.24","-129.40, -0.00","-12097.29, -0.50","413.71, 0.01","-12293.26, -0.57","0.00, 0.00","-10675.58, -0.52","65458.45, 0.86","62682.86, 0.29","-13679.09, -0.63","-14104.18, -0.62","-11880.12, -0.48","-9225.16, -0.50"
mixed media on paper,"21092.91, 0.83","41702.23, 0.33","3925.12, 0.20","10546.19, 0.49","-1421.71, -0.10","11089.29, 0.25","-1617.67, -0.12","10675.58, 0.52","0.00, 0.00","76134.03, 1.37","73358.45, 0.36","-3003.51, -0.23","-3428.59, -0.25","-1204.54, -0.08","1450.42, 0.10"
oil on board,"-55041.12, -0.58","-34431.80, -0.24","-72208.91, -0.99","-65587.84, -0.81","-77555.74, -0.93","-65044.73, -0.65","-77751.70, -1.11","-65458.45, -0.86","-76134.03, -1.37","0.00, 0.00","-2775.58, -0.01","-79137.54, -1.07","-79562.62, -1.00","-77338.57, -0.89","-74683.61, -1.64"


In [29]:
# top 5:

top5_meds = med_counts.nlargest(5, ['count']).index.to_list()
top5_meds_df = saffronart_df.loc[saffronart_df.medium.isin(top5_meds)]

top5_mean_diff = subgr_mean_p_diff_by(saffronart_df, top5_meds_df, 'medium')
format_pairwise_df(top5_mean_diff)

Unnamed: 0_level_0,acrylic on canvas,acrylic on paper,mixed media on paper,oil on canvas,watercolour on paper
medium,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
acrylic on canvas,"0.00, 0.00","-37777.11, -0.28","-41702.23, -0.33","31656.21, 0.16","-40251.81, -0.35"
acrylic on paper,"37777.11, 0.28","0.00, 0.00","-3925.12, -0.20","69433.33, 0.32","-2474.70, -0.14"
mixed media on paper,"41702.23, 0.33","3925.12, 0.20","0.00, 0.00","73358.45, 0.36","1450.42, 0.10"
oil on canvas,"-31656.21, -0.16","-69433.33, -0.32","-73358.45, -0.36","0.00, 0.00","-71908.02, -0.37"
watercolour on paper,"40251.81, 0.35","2474.70, 0.14","-1450.42, -0.10","71908.02, 0.37","0.00, 0.00"
all others,"23499.83, 0.20","-17091.79, -0.15","-21474.27, -0.19","63278.16, 0.55","-20584.61, -0.18"


**Instructions:** The first value of the tuple is the mean difference between the column medium and the row medium, i.e. $column - row$. The second value of the tuple is the Cohen effect size of the mean difference. 

# Artist deceased or not

In [39]:
saffronart_df['deceased'] = ~saffronart_df.death_date.isnull()
saffronart_df['deceased'] = saffronart_df.deceased.astype(int) # convert to binary

saffronart_df.tail(5)

Unnamed: 0,artist,birth_date,birth_place,death_date,artist_age,title,winning_bid,low_est,high_est,auction_date,category,style,size,medium,area_or_vol,auction_dt_age,deceased
12416,atul dodiya,1959-01-20,Mumbai,NaT,60.0,Walking Man,30475.0,20000.0,26670.0,2010-09-01,painting,figurative,21 x 21 in,oil on paper,441.0,51.0,0
12417,m f husain,1913-09-17,"Pandharpur, Maharashtra",2011-06-09,98.0,Untitled,340000.0,166670.0,250000.0,2014-09-04,painting,figurative,118 x 46 in,oil on canvas,5428.0,98.0,1
12418,nasreen mohamedi,1937-01-01,Karachi India (now in Pakistan),1990-01-01,53.0,Untitled,11352.0,8700.0,10870.0,2011-09-01,drawing,abstract,10.5 x 13.5 in,ink and pencil on card paper,141.75,53.0,1
12419,baiju parthan,1956-01-01,"Kottayam, Kerala",NaT,63.0,Process - (Fruit),4426.0,2460.0,4100.0,2015-04-07,print making,still life,36.5 x 34.5 in,archival ink on hahnemuhle archival paper,1259.25,59.0,0
12420,thota vaikuntam,1942-01-01,Boorugupali Karimnagar Andhra Pradesh,NaT,77.0,Untitled,26554.0,11115.0,13335.0,2010-09-01,painting,figurative,36 x 24 in,acrylic on canvas board,864.0,68.0,0


In [49]:
deceased_means = saffronart_df.groupby('deceased').mean() 
deceased_mean_p = pd.DataFrame(deceased_means['winning_bid']) # mean prices by whether the artist is deceased or not 
deceased_mean_p.rename({0: 'No', 1: 'Yes'}, axis='index')

Unnamed: 0_level_0,winning_bid
deceased,Unnamed: 1_level_1
No,17137.68
Yes,50695.99


In [56]:
dead = saffronart_df.loc[saffronart_df.deceased==1]
alive = saffronart_df.loc[saffronart_df.deceased==0]

deceased_mean_diff = Cohen_effect_size(dead.winning_bid, alive.winning_bid)
print('Price mean difference bw living & deceased artists: {:.2f}'.format(deceased_mean_diff[0]))
print('Cohen effect size of the mean difference: {:.2f}'.format(deceased_mean_diff[1]))

Price mean difference bw living & deceased artists: 33558.31
Cohen effect size of the mean difference: 0.29


In [63]:
dec_price_formula = 'winning_bid ~ C(deceased)' 
dec_price_model = smf.ols(dec_price_formula, data=saffronart_df) 
dec_price_results = dec_price_model.fit()

dec_price_results.summary()

0,1,2,3
Dep. Variable:,winning_bid,R-squared:,0.019
Model:,OLS,Adj. R-squared:,0.019
Method:,Least Squares,F-statistic:,205.9
Date:,"Fri, 03 Jul 2020",Prob (F-statistic):,3.03e-46
Time:,21:12:20,Log-Likelihood:,-136590.0
No. Observations:,10456,AIC:,273200.0
Df Residuals:,10454,BIC:,273200.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.714e+04,1383.625,12.386,0.000,1.44e+04,1.98e+04
C(deceased)[T.1],3.356e+04,2338.945,14.348,0.000,2.9e+04,3.81e+04

0,1,2,3
Omnibus:,21187.807,Durbin-Watson:,1.447
Prob(Omnibus):,0.0,Jarque-Bera (JB):,75360128.203
Skew:,16.754,Prob(JB):,0.0
Kurtosis:,417.553,Cond. No.,2.42
