In [14]:
## IMPORTS

import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
# Models
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, median_absolute_error, mean_squared_error, mean_absolute_percentage_error

In [15]:
## MODELS

def get_decision_tree(X_train, y_train):
    decision_tree = DecisionTreeRegressor(random_state=42)
    decision_tree.fit(X_train, y_train)
    return decision_tree

def get_linear_regression(X_train, y_train):
    linear_regression = LinearRegression()
    linear_regression.fit(X_train, y_train)
    return linear_regression

def get_random_forest(X_train, y_train):
    rf_model = RandomForestRegressor(random_state=1)
    rf_model.fit(X_train, y_train)
    return rf_model

def get_gradient_boosting(X_train, y_train):
    # get gradient boosting model
    gb_model = GradientBoostingRegressor(random_state=1)
    gb_model.fit(X_train, y_train)
    return gb_model

# gives a dataframe taking models as columns and score as rows
def get_all_models(x, y):
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

    models = {'Linear regression': get_linear_regression(x_train, y_train),
              'Decision tree': get_decision_tree(x_train, y_train),
              'Random forest': get_random_forest(x_train, y_train),
              'Gradient boosting': get_gradient_boosting(x_train, y_train)}
    
    models_df = pd.DataFrame(columns=models.keys())
    for model_name, model in models.items():
        y_pred = model.predict(x_test)
        r2 = r2_score(y_test, y_pred)
        n = len(y_test)
        p = x_test.shape[1]
        adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
        mean_error = mean_absolute_error(y_test, y_pred)
        median_error = median_absolute_error(y_test, y_pred)
        mape = mean_absolute_percentage_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        pearson = np.corrcoef(y_test.to_numpy(), y_pred)[0, 1]

        models_df.loc['R² Score', model_name] = r2
        models_df.loc['Adjusted R² Score', model_name] = adj_r2
        models_df.loc['Pearson Correl Predicted-Actual', model_name] = pearson
        models_df.loc['Mean Absolute Error', model_name] = mean_error
        models_df.loc['Median Absolute Error', model_name] = median_error
        models_df.loc['Mean Squared Error', model_name] = mse

        # Round values
        models_df = models_df.round(2)

    return models_df, models

In [30]:
## INITIALIZING DATAFRAME

artworks = pd.read_csv('../temporary-files/artsy_artworks_info.csv')

Unnamed: 0,Gallery,Scarcity,URL,Artist,Materials,Title,Price,Dimensions
0,Wallector,Unique work,https://www.artsy.net/artwork/unknown-artist-p...,Unknown Artist,A mixed colored oil on canvas,Portrait of Noble Woman,Sold,67 3/10 × 47 1/5 × 1 3/5 in | 171 × 120 × 4 cm
1,Hashimoto Contemporary,Unique work,https://www.artsy.net/artwork/seonna-hong-verdant,Seonna Hong,Acrylic on canvas,Verdant,Sold,8 × 8 in | 20.3 × 20.3 cm
2,Dopeness Art Lab,Unique work,https://www.artsy.net/artwork/jon-burgerman-ne...,Jon Burgerman,Aerosol on canvas,Neoncat,Sold,29 9/10 × 29 9/10 in | 76 × 76 cm
3,Tomio Koyama Gallery,Unique work,https://www.artsy.net/artwork/masahiko-kuwahar...,Masahiko Kuwahara,Acrylic on canvas,Sky,Sold,78 3/10 × 31 3/5 in | 199 × 80.3 cm
4,Allouard Gallery,Unique work,https://www.artsy.net/artwork/joanna-glazer-sunny,Joanna Glazer,Acrylic on canvas,Sunny,US$195,31 1/2 × 31 1/2 × 2/5 in | 80 × 80 × 1 cm
...,...,...,...,...,...,...,...,...
895,Uprise Art,Unique work,https://www.artsy.net/artwork/holly-addi-rue-b...,Holly Addi,"Acrylic, charcoal, and watercolor on canvas",Rue Bonaparte No 2,Sold,40 × 30 in | 101.6 × 76.2 cm
896,Kyoto Art Gallery,Unique work,https://www.artsy.net/artwork/unknown-act-57,Unknown,Acrylics,act-57,US$595,15 3/5 × 12 in | 39.5 × 30.5 cm
897,Walter Wickiser Gallery,Unique work,https://www.artsy.net/artwork/ralph-wickiser-h...,Ralph Wickiser,Oil on linen,Heresy,"US$20,000",50 × 40 in | 127 × 101.6 cm
898,Gallery Delaive,Unique work,https://www.artsy.net/artwork/sam-francis-unti...,Sam Francis,Acrylic on paper,Untitled,Contact for price,13 2/5 × 18 9/10 in | 34 × 48 cm


In [17]:
## CLEAN

# Remove rows without numbers in Price
artworks = artworks[artworks['Price'].str.contains(r'\d', na=False)]
artworks = artworks[artworks['Scarcity'] == 'Unique work']

In [18]:
## FIX PRICE

artworks['Currency'] = artworks['Price'].str.extract(r'(\D+)')

artworks['Price_fix'] = artworks['Price'].apply(lambda x: ''.join(re.findall(r'\d+', x)))
artworks['Price_fix'] = artworks['Price_fix'].astype(float)

currency_rates = {
    'US$': 1,
    '€': 1.18,
    '£': 1.38,
    'KRW ₩': 0.00089,
    'C$': 0.8,
    'AU$': 0.76
}

artworks['Price (US$)'] = artworks.apply(lambda x: x['Price_fix'] * currency_rates[x['Currency']], axis=1)

In [19]:
## FIX DIMENSIONS

artworks = artworks[artworks['Dimensions'].str.contains('×')]

artworks['Dimensions (cm)'] = artworks['Dimensions'].apply(lambda x: x.split('|')[1])
artworks['Height (cm)'] = artworks['Dimensions (cm)'].apply(lambda x: x.split('×')[0].strip())
artworks['Width (cm)'] = artworks['Dimensions (cm)'].apply(lambda x: x.split('×')[1].split('cm')[0].strip())
artworks['Size (cm²)'] = artworks['Height (cm)'].astype(float) * artworks['Width (cm)'].astype(float)

In [20]:
## DROP COLUMNS

artworks.drop(columns=['Dimensions (cm)', 'Dimensions', 'Height (cm)', 'Width (cm)', 'Price', 'Price_fix', 'Currency', 'Scarcity', 'URL', 'Title'], inplace=True)

In [21]:
## SPLIT MATERIALS INTO MEDIUMS AND MATERIALS

artworks['Materials'] = artworks['Materials'].apply(lambda x: x.lower())
artworks = artworks[artworks['Materials'].str.contains(' on ')]
# split materials into mediums and materials using 'on' as separator
artworks['Mediums'] = artworks['Materials'].apply(lambda x: x.split(' on ')[0])
artworks['Materials'] = artworks['Materials'].apply(lambda x: x.split(' on ')[1])

In [22]:
artworks

Unnamed: 0,Materials,Artist,Gallery,Price (US$),Size (cm²),Mediums
0,canvas,Adrian Kay Wong,Uprise Art,6300.0,10296.55,oil
1,canvas,Adrian Kay Wong,Uprise Art,6700.0,10788.70,oil
2,arches oil paper,Adrian Kay Wong,Hashimoto Contemporary,850.0,645.16,oil
3,canvas,Adrian Kay Wong,Galerie Tracanelli,1770.0,5168.00,oil and acrylic
4,paper,Adrian Kay Wong,Uprise Art,1675.0,3016.20,oil
...,...,...,...,...,...,...
7548,die-cut rag paper,Damien Hirst,VINCE fine arts/ephemera,7500.0,3234.00,acrylic
7549,handmade paper,Damien Hirst,TGB Contemporary,12420.0,600.00,one shot enamel paint
7550,handmade paper,Damien Hirst,New Art Editions,16815.0,600.00,enamel paint
7553,handmade paper,Damien Hirst,Art Republic,12406.2,620.00,"enamel paint, watermark, microdot, hologam and..."


In [27]:
# get dummies for materials, mediums, artist and gallery
dummies_df = pd.get_dummies(artworks, columns=['Materials', 'Mediums', 'Artist', 'Gallery'])

x = dummies_df.drop(['Price (US$)'], axis=1)
y = dummies_df['Price (US$)']

models_df, models = get_all_models(x, y)
models_df

Unnamed: 0,Linear regression,Decision tree,Random forest,Gradient boosting
R² Score,-58808996642225.25,0.125745,0.677122,0.12718
Adjusted R² Score,-219954908701238.88,-2.269851,-0.207613,-2.264484
Pearson Correl Predicted-Actual,-0.007268,0.919927,0.998013,0.963677
Mean Absolute Error,5.964622930669166e+17,29553047142.928223,19351936927.37548,30172449695.232407
Median Absolute Error,20599394138.0,63.72,189.515473,30699815.616033
Mean Squared Error,1.3849641502353303e+37,2.0588888272986545e+23,7.603834299002493e+22,2.055509361125524e+23
