In [15]:
## IMPORTS

import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
# Models
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, median_absolute_error, mean_squared_error, mean_absolute_percentage_error

In [16]:
## INITIALIZING DATAFRAME

artworks = pd.read_csv('../temporary-files/artsy_artworks_info.csv')

In [17]:
## CLEAN

# Remove rows without numbers in Price
artworks = artworks[artworks['Price'].str.contains(r'\d', na=False)]
artworks = artworks[artworks['Scarcity'] == 'Unique work']

In [18]:
## FIX PRICE

artworks['Currency'] = artworks['Price'].str.extract(r'(\D+)')

artworks['Price_fix'] = artworks['Price'].apply(lambda x: ''.join(re.findall(r'\d+', x)))
artworks['Price_fix'] = artworks['Price_fix'].astype(float)

currency_rates = {
    'US$': 1,
    '€': 1.18,
    '£': 1.38,
    'KRW ₩': 0.00089,
    'C$': 0.8,
}

artworks['Price (US$)'] = artworks.apply(lambda x: x['Price_fix'] * currency_rates[x['Currency']], axis=1)

KeyError: 'C$'

In [None]:
## FIX DIMENSIONS

artworks['Dimensions (cm)'] = artworks['Dimensions'].apply(lambda x: x.split('|')[1])
artworks['Height (cm)'] = artworks['Dimensions (cm)'].apply(lambda x: x.split('×')[0].strip())
artworks['Width (cm)'] = artworks['Dimensions (cm)'].apply(lambda x: x.split('×')[1].split('cm')[0].strip())
artworks['Size (cm²)'] = artworks['Height (cm)'].astype(float) * artworks['Width (cm)'].astype(float)

In [None]:
## DROP COLUMNS

artworks.drop(columns=['Dimensions (cm)', 'Dimensions', 'Height (cm)', 'Width (cm)', 'Price', 'Price_fix', 'Currency', 'Scarcity', 'URL', 'Title'], inplace=True)

In [None]:
## SPLIT MATERIALS INTO MEDIUMS AND MATERIALS

artworks['Materials'] = artworks['Materials'].apply(lambda x: x.lower())
artworks = artworks[artworks['Materials'].str.contains(' on ')]
# split materials into mediums and materials using 'on' as separator
artworks['Mediums'] = artworks['Materials'].apply(lambda x: x.split(' on ')[0])
artworks['Materials'] = artworks['Materials'].apply(lambda x: x.split(' on ')[1])

In [None]:
## MODELS

def get_decision_tree(X_train, y_train):
    decision_tree = DecisionTreeRegressor(random_state=42)
    decision_tree.fit(X_train, y_train)
    return decision_tree

def get_linear_regression(X_train, y_train):
    linear_regression = LinearRegression()
    linear_regression.fit(X_train, y_train)
    return linear_regression

def get_random_forest(X_train, y_train):
    rf_model = RandomForestRegressor(random_state=1)
    rf_model.fit(X_train, y_train)
    return rf_model

def get_gradient_boosting(X_train, y_train):
    # get gradient boosting model
    gb_model = GradientBoostingRegressor(random_state=1)
    gb_model.fit(X_train, y_train)
    return gb_model

# gives a dataframe taking models as columns and score as rows
def get_all_models(x, y):
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

    models = {'Linear regression': get_linear_regression(x_train, y_train),
              'Decision tree': get_decision_tree(x_train, y_train),
              'Random forest': get_random_forest(x_train, y_train),
              'Gradient boosting': get_gradient_boosting(x_train, y_train)}
    
    models_df = pd.DataFrame(columns=models.keys())
    for model_name, model in models.items():
        y_pred = model.predict(x_test)
        r2 = r2_score(y_test, y_pred)
        n = len(y_test)
        p = x_test.shape[1]
        adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
        mean_error = mean_absolute_error(y_test, y_pred)
        median_error = median_absolute_error(y_test, y_pred)
        mape = mean_absolute_percentage_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        pearson = np.corrcoef(y_test.to_numpy(), y_pred)[0, 1]

        models_df.loc['R² Score', model_name] = r2
        models_df.loc['Adjusted R² Score', model_name] = adj_r2
        models_df.loc['Pearson Correl Predicted-Actual', model_name] = pearson
        models_df.loc['Mean Absolute Error', model_name] = mean_error
        models_df.loc['Median Absolute Error', model_name] = median_error
        models_df.loc['Mean Squared Error', model_name] = mse

        # Round values
        models_df = models_df.round(2)

    return models_df, models

In [None]:
# get dummies for materials, mediums, artist and gallery
dummies_df = pd.get_dummies(artworks, columns=['Materials', 'Mediums', 'Artist', 'Gallery'])

x = dummies_df.drop(['Price (US$)'], axis=1)
y = dummies_df['Price (US$)']

models_df, models = get_all_models(x, y)
models_df

Unnamed: 0,Linear regression,Decision tree,Random forest,Gradient boosting
R² Score,0.50731,-1.651306,-0.259625,-1.631817
Adjusted R² Score,-0.302705,-6.010232,-2.330534,-5.958702
Pearson Correl Predicted-Actual,0.747523,0.04695,0.452962,0.048949
Mean Absolute Error,92534104892.15942,57960148466.614075,37886254610.24955,56431435376.95646
Median Absolute Error,42712374605.57129,115.0,190.771119,14887048.46185
Mean Squared Error,8.866560431389847e+22,4.7713541416736235e+23,2.2668514229264854e+23,4.73628130268116e+23
