In [1]:
## IMPORTS

# External modules
import pandas as pd
import re
from collections import Counter
import matplotlib.pyplot as plt
# Data processing
from sklearn.model_selection import train_test_split
# Linear model
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
# decision tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
## Random forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
## Gradient boosting
from sklearn.ensemble import GradientBoostingRegressor

# Project modules
import filter

In [57]:
# FUNCTIONS


## DATAFRAME CREATION
def get_artworks_df():
    artworks = pd.read_csv('../saatchi_artworks_info.csv')
    artworks.rename(columns=lambda x: x.title(), inplace=True)
    artworks = calculate_area_and_price_per_area(artworks)
    for column_name in ['Styles', 'Mediums', 'Subjects']:
        artworks[column_name] = artworks[column_name].apply(lambda x: [str(value.strip()) for value in x.split(',')])
    return artworks

def get_artists_df(artworks_df):
    agg_df = artworks_df.dropna(subset=['Price'])
    agg_data = agg_df.groupby('Artist').agg({'Artist': 'count',
                                            'Price': ['mean', lambda x: x.std(skipna=False)],
                                            'Price / in²': ['mean', lambda x: x.std(skipna=False)],
                                            'Size': ['mean', lambda x: x.std(skipna=False)]})
    agg_data = agg_data.reset_index()

    # Flatten the column names
    agg_data.columns = ['Artist', 'NumArtworks', 'MeanPrice', 'StdPrice', 'MeanPricePerInch', 'StdPricePerInch', 'MeanSize', 'StdSize']

    # Create a new DataFrame with the aggregated data from artworks_info
    artists_from_artworks = pd.DataFrame(agg_data)
    artists_from_artworks = artists_from_artworks[artists_from_artworks['NumArtworks'] >= 3]
    # Get artists data from artists_info
    artists_from_artists = pd.read_json('../saatchi_artists_info.json')
    artists_from_artists.rename(columns=lambda x: x.title(), inplace=True)
    artists_from_artists.rename(columns={'Name': 'Artist'}, inplace=True)

    # New dataframe with all artists that are contained in both dataframes
    artists_from_artists = artists_from_artists[artists_from_artists['Artist'].isin(artists_from_artworks['Artist'])]
    artists = pd.merge(artists_from_artworks, artists_from_artists, on='Artist')

    return artists

def calculate_area_and_price_per_area(dataframe):
    # Iterate over the "Size" column
    df = dataframe.copy()
    for i, size in enumerate(dataframe['Size']):
        # Extract the dimensions using regular expression
        dimensions = re.findall(r'\d+(?:\.\d+)?', size)
        if len(dimensions) >= 2:
            # Extract the width and height dimensions
            try:
                width = float(dimensions[0])
                height = float(dimensions[1])
                total_area = width * height
                df.at[i, 'Size'] = total_area
            except:
                df.at[i, 'Size'] = 'NaN'
    df['Price / in²'] = df['Price'] / df['Size']

    return df




def get_unique_values(df, column_name):
    # Extract all unique styles from the column
    unique_values = set()
    for styles in df[column_name]:
        if isinstance(styles, str):
            styles_list = [style.strip() for style in styles.split(',')]
            unique_values.update(styles_list)
    
    return unique_values

def check_columns(df, columns_names):
    for column in columns_names:
        df[column] = df[column].str.title()
        unique_list = get_unique_values(df, column)
        print(column, len(unique_list), 'unique values')

def restart_df(dataframe):
    df = calculate_area_and_price_per_area(dataframe)
    df = df[['Styles', 'Mediums', 'Subjects', 'Artist', 'Size', 'Price']]
    df = df.dropna(subset='Styles')
    columns_names = ['Styles', 'Mediums', 'Subjects']

    # Turn the "Styles" column into a list of strings
    for column in columns_names:
        column_serie = df[column].apply(lambda x: x.split(','))
        df[column] = column_serie
        return df

## CLEAN DATA

def remove_words_from_list(lst, words_to_remove):
    return [word for word in lst if word not in words_to_remove]


## OOOOLD
def fix_column(dataframe, column_name, fix_dict, remove_list, split_list):
    # Remove whitespaces from the beginning and end of each string
    dataframe[column_name] = dataframe[column_name].apply(lambda lst: [item.strip() for item in lst])
    # Change '-' to whitespaces in each string
    dataframe[column_name] = dataframe[column_name].apply(lambda lst: [item.replace('-', ' ') for item in lst])
    # Capitalize each string
    dataframe[column_name] = dataframe[column_name].apply(lambda lst: [item.title() for item in lst])
    # Remove ' Art' and ' Painting' from the end of each string
    dataframe[column_name] = dataframe[column_name].apply(lambda lst: [style.replace(' Art', '') for style in lst])
    dataframe[column_name] = dataframe[column_name].apply(lambda lst: [style.replace(' Painting', '') for style in lst])
    # Splits items present the split_list into separate items
    dataframe[column_name] = dataframe[column_name].apply(lambda lst: [item for item in lst if item not in split_list] + [item for item in lst for split_item in split_list if split_item in item])
    return dataframe


## NEEEW
def fix_based_on_dict(column, fix_dict):
    column = column.apply(lambda lst: [next((key for key, values in fix_dict.items() if item in values), item) for item in lst])
    return column

def remove_words_from_column(column, remove_list):
    column = column.apply(lambda lst: remove_words_from_list(lst, remove_list))
    return column


def get_occurrence_count_on_col_dict(values):
    # Get all unique values from the column
    occurrences_counts_dict = dict(Counter(values))
    return occurrences_counts_dict


def group_by_segments(artworks_data, column_name, column, occurrences_threshold):

    ## GETTING ONE DF FOR SEGMENT
    segments_dfs = get_dfs_for_segments(filtered_artworks_data, column_name, occurrence_count_on_col_dict, occurrences_threshold)

    # New dataframe with unique segments as index and MeanPrice, MedianPrice, MeanSize, MedianSize as columns
    all_segments_df = pd.DataFrame(index=segments_dfs.keys(), columns=['MeanPrice', 'MedianPrice', 'MeanSize', 'MedianSize', 'MeanPricePerIn²', 'MedianPricePerIn²', 'Count'])
    # Populate dataframe with mean price, median price, mean size and median size for each segment
    for key, value in segments_dfs.items():
        all_segments_df.loc[key, 'MeanPrice'] = value['Price'].mean().round(0)
        all_segments_df.loc[key, 'MedianPrice'] = value['Price'].median().round(0)
        all_segments_df.loc[key, 'MeanSize'] = value['Size'].mean().round(0)
        all_segments_df.loc[key, 'MedianSize'] = value['Size'].median().round(0)
        all_segments_df.loc[key, 'MeanPricePerIn²'] = value['Price / in²'].mean().round(2)
        all_segments_df.loc[key, 'MedianPricePerIn²'] = value['Price / in²'].median().round(2)
        all_segments_df.loc[key, 'Count'] = len(value)

    all_segments_df.sort_values(by='MeanPrice', ascending=False, inplace=True)
    
    # New dataframe for each segment
    segments_dfs = {}
    for key, value in occurrence_count_on_col_dict.items():
        if value > occurrences_threshold:
            segments_dfs[key] = dataframe[dataframe[column_name].apply(lambda x: key in x)]
    return segments_dfs


def analyse_by_column(dataframe, column_name, threshold):
    artworks_count_by_segment = dataframe[column_name].value_counts()
    artworks_count_pct_by_segment = artworks_count_by_segment / dataframe[column_name].value_counts().sum()
    # filter out segments with less than [threshold] artworks
    threshold = 200
    selection = artworks_count_by_segment[artworks_count_by_segment > threshold].index
    print(selection)
    dataframe = dataframe[dataframe[column_name].isin(selection)]
    return dataframe

def compare_segments(dataframe, segments_to_compare, x_column_name, y_column_name):
    for segment in segments_to_compare:
        print(segment)

        if segment == 'All':
            segment_df = dataframe
        else:
            segment_df = segments_dfs[segment]

        x = segment_df[[x_column_name]]
        y = segment_df[y_column_name]
        
        # print('stats:', get_stats(segment_df, x, y))
        get_all_models(x, y)

        xlim = (0, 10000)
        ylim = (0, 40000)
        # segment_df.plot.scatter(x=segment_df[[x_column_name]], y=segment_df[y_column_name], title=segment+' artworks', figsize=(5, 3), xlim=xlim, ylim=ylim)

In [3]:
# MODELS


def get_stats(x, y):
    stats = {}
    for variable in [x, y]:
        max_variable = round(max(variable), 2)
        min_variable = round(min(variable), 2)
        mean_variable = round(variable.mean(), 2)
        median_variable = round(variable.median(), 2)
        stats_variable = {'Max': max_variable, 'Min': min_variable, 'Mean': mean_variable, 'Median': median_variable}
        # add stats_variable to stats
        stats[variable] = stats_variable
    return stats

## DECISION TREE
def get_decision_tree(x, y):
    # Create a DecisionTreeRegressor object
    decision_tree = DecisionTreeRegressor(random_state=42)
    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
    # Train the model
    decision_tree.fit(X_train, y_train)
    # Check the score of the model
    print('model score:', decision_tree.score(X_test, y_test))
    # Check the mean absolute error of the model
    print('model mean error:', mean_absolute_error(y_test, decision_tree.predict(X_test)))
    return decision_tree


## LINEAR REGRESSION 
def get_linear_regression(x, y):
    # Create a LinearRegression object
    linear_regression = LinearRegression()
    # train a model to predict the price of an artwork based on its size (use 0.3 as the test_size)
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
    linear_regression.fit(X_train, y_train)
    print('model score', linear_regression.score(X_test, y_test))
    print('model mean error:', mean_absolute_error(y_test, linear_regression.predict(X_test)))
    # predict the price of an artwork with a size of 1000 square inches
    # linear_regression.predict([[1000]])
    return linear_regression


def get_random_forest(x, y):
    rf_model = RandomForestRegressor(random_state=1)
    rf_model.fit(x, y)
    rf_val_score = rf_model.score(x, y)
    print("model score: {}".format(rf_val_score))
    predicted_prices = rf_model.predict(x)
    rf_val_mae = mean_absolute_error(y, predicted_prices)
    print("model mean avsolute error: {}".format(rf_val_mae))
    return rf_model

def get_gradient_boosting(x, y):
    # get gradient boosting model
    gb_model = GradientBoostingRegressor(random_state=1)
    gb_model.fit(x, y)
    gb_val_score = gb_model.score(x, y)
    print("model score: {}".format(gb_val_score))
    predicted_prices = gb_model.predict(x)
    gb_val_mae = mean_absolute_error(y, predicted_prices)
    print("model mean avsolute error: {}".format(gb_val_mae))
    return gb_model

# gives a dataframe taking models as columns and score as rows
def get_all_models(x, y):
    models = {'Linear regression': get_linear_regression(x, y),
              'Decision tree': get_decision_tree(x, y),
              'Random forest': get_random_forest(x, y),
              'Gradient boosting': get_gradient_boosting(x, y)}
    models_df = pd.DataFrame(columns=models.keys())
    for model_name, model in models.items():
        models_df.loc['score', model_name] = model.score(x, y)
        models_df.loc['mean error', model_name] = mean_absolute_error(y, model.predict(x))
    return models_df
    

## UPDATE WHEN ADDING MORE MODELS
# def get_all_models(x, y):
#     print('Linear regression')
#     get_linear_regression(x, y)
#     print()
#     print('Decision tree')
#     get_decision_tree(x, y)
#     print('------')
#     print()
#     print('Random forest')
#     get_random_forest(x, y)
#     print('------')
#     print()
#     print('Gradient boosting')
#     get_gradient_boosting(x, y)
#     print('------')

In [48]:
artworks = get_artworks_df()
# artworks = artworks[['Styles', 'Mediums', 'Subjects', 'Artist', 'Size', 'Price']]
artworks.columns

Index(['Title', 'Packaging', 'Collage', 'Multi-Paneled Collage', 'Mediums',
       'Ready To Hang', 'Original', 'Digital', 'Country',
       'Multi-Paneled Mixed Media', 'Number Of Pieces', 'Artist_Link',
       'Styles', 'Subjects', 'Multi-Paneled Printmaking', 'Mixed Media',
       'Multi-Paneled Painting', 'Handling', 'Installation', 'Delivery Time',
       'Number Of Panels', 'Sale_Status', 'Materials', 'Photography', 'Url',
       'Artist Produced Limited Edition Of', 'Img', 'Ships From', 'Artist',
       'Multi-Paneled Sculpture', 'Favorite', 'Multi-Paneled Installation',
       'Original Created', 'Price', 'Sculpture', 'Multi-Paneled Photography',
       'Customs', 'Views', 'Description', 'Frame', 'Printmaking',
       'Multi-Paneled Drawing', 'Painting', 'Drawing', 'Size', 'Price / in²'],
      dtype='object')

In [5]:
## ARTWORKS BY SEGMENT


column_name = 'Artist Produced Limited Edition Of'

artworks_by_segment = analyse_by_column(artworks, column_name, 200)

# Get mean price per segment
mean_price_by_segment = artworks.groupby(column_name)['Price'].mean().round(2).sort_values(ascending=False)
mean_price_by_segment

Index([10.0, 20.0, 25.0, 5.0], dtype='float64', name='Artist Produced Limited Edition Of')


Artist Produced Limited Edition Of
35.0     17705.50
135.0    14149.00
40.0     13558.65
85.0     12219.33
2.0      11310.73
77.0      9924.50
45.0      9521.17
75.0      9201.63
17.0      8686.67
250.0     8628.91
8.0       8434.32
16.0      7550.00
500.0     6438.00
50.0      6072.37
3.0       5376.68
100.0     4972.91
200.0     4718.56
5.0       4347.23
24.0      3810.00
7.0       3798.87
150.0     3309.75
25.0      3288.78
30.0      2825.20
11.0      2677.20
6.0       2645.37
12.0      2139.48
15.0      2122.91
58.0      2106.67
10.0      2079.82
999.0     2033.00
14.0      2020.00
9.0       2004.82
1.0       1980.84
4.0       1619.94
18.0      1599.38
28.0      1515.00
21.0      1232.50
76.0      1070.00
20.0       930.35
60.0       759.00
175.0      686.25
275.0      585.00
70.0       525.00
80.0       485.00
90.0       480.00
88.0       477.00
325.0      455.00
22.0       440.00
33.0       150.00
Name: Price, dtype: float64

In [55]:
artworks_data = get_artworks_df()
## COLUMN CHOSEN TO DEFINE SEGMENTS
column = artworks_data.Materials
occurrences_threshold = 200

In [56]:
## SEGMENT AND CLEAN DATA

# Define column to be segmented and cleaned
column_name = column.name

## Getting one dataframe for each segment
segments_dfs = {}
segments_in_column_list = [value for sublist in column for value in sublist]
occurrence_count_on_col_dict = get_occurrence_count_on_col_dict(segments_in_column_list)
# Remove rows that have [] in column
filtered_artworks_data = artworks_data[column.apply(lambda x: len(x) > 0)]
filtered_artworks_data = filtered_artworks_data.dropna(subset=['Price', 'Size'])
filtered_artworks_data[column_name] = column

for key, value in occurrence_count_on_col_dict.items():
    if value > occurrences_threshold:
        segments_dfs[key] = filtered_artworks_data[filtered_artworks_data[column_name].apply(lambda x: key in x)]
# New dataframe with unique segments as index and MeanPrice, MedianPrice, MeanSize, MedianSize as columns
all_segments_df = pd.DataFrame(index=segments_dfs.keys(), columns=['MeanPrice', 'MedianPrice', 'MeanSize', 'MedianSize', 'MeanPricePerIn²', 'MedianPricePerIn²', 'Count'])
# Populate dataframe with mean price, median price, mean size and median size for each segment
for key, value in segments_dfs.items():
    all_segments_df.loc[key, 'MeanPrice'] = value['Price'].mean().round(0)
    all_segments_df.loc[key, 'MedianPrice'] = value['Price'].median().round(0)
    all_segments_df.loc[key, 'MeanSize'] = value['Size'].mean().round(0)
    all_segments_df.loc[key, 'MedianSize'] = value['Size'].median().round(0)
    all_segments_df.loc[key, 'MeanPricePerIn²'] = value['Price / in²'].mean().round(2)
    all_segments_df.loc[key, 'MedianPricePerIn²'] = value['Price / in²'].median().round(2)
    all_segments_df.loc[key, 'Count'] = len(value)

all_segments_df.sort_values(by='MeanPrice', ascending=False, inplace=True)

all_segments_df = group_by_segments(artworks_data, column_name, column, 200)
all_segments_df

Unnamed: 0,MeanPrice,MedianPrice,MeanSize,MedianSize,MeanPricePerIn²,MedianPricePerIn²,Count
u,9693.0,5450.0,2506.0,1552.0,4.79,3.71,601
m,9366.0,4749.0,2486.0,1552.0,4.75,3.71,598
A,8360.0,3210.0,2340.0,1521.0,3.67,3.32,680
c,7965.0,2950.0,2265.0,1487.0,4.91,3.71,709
l,7795.0,3000.0,1991.0,1296.0,5.77,3.55,972
O,7304.0,2639.0,3786.0,902.0,5.52,3.42,1177
h,7110.0,2610.0,3725.0,902.0,5.39,3.32,1209
i,6992.0,2900.0,1967.0,1253.0,5.5,3.51,1079
t,6601.0,2270.0,3112.0,884.0,5.71,3.04,1627
S,6190.0,2000.0,1657.0,884.0,8.19,2.32,293


In [52]:
## PREPARE DATAFRAME: DUMMIES FOR SEGMENTS
dummies_for_segment = artworks_data[['Price', column_name]].dropna(subset=['Price'])
# Populate dataframe with 1 if the column contains the segment and 0 if it doesn't
for key, value in segments_dfs.items():
    dummies_for_segment[key] = filtered_artworks_data[column_name].apply(lambda x: True if key in x else False)

## APPLYING MODELS TO SEGMENTS

# multi variate linear regression with price as y and all segments as x
x = dummies_for_segment.drop(['Price', column_name], axis=1)
# dependent variable
y = dummies_for_segment['Price']

print('Models for',column_name ,'x','Price')
get_all_models(x, y)

Models for Subjects x Price
Linear regression
model score 0.01146555837672214
model mean error: 2815.483236023731

Decision tree
model score: 0.01146555837672214
model mean error: 2815.483236023731
------

Random forest
model score: 0.009613088424659466
model mean avsolute error: 2724.7143287602057
------

Gradient boosting
model score: 0.009566525574220441
model mean avsolute error: 2726.210493183388
------


In [None]:
## COMPARE SEGMENTS

segment1 = 'Architecture'
segment2 = 'Floral'

segments_to_compare = [segment1, segment2]

# Variables
x_column_name = 'Size'
y_column_name = 'Price'

compare_segments(filtered_artworks_data, segments_to_compare, x_column_name, y_column_name)

In [44]:
## ARTISTS

artists = get_artists_df(artworks)
artists.columns

Index(['Artist', 'NumArtworks', 'MeanPrice', 'StdPrice', 'MeanPricePerInch',
       'StdPricePerInch', 'MeanSize', 'StdSize', 'Link', 'Location',
       'Followers_Count', 'Artworks_Count', 'Badges', 'Info', 'Events',
       'Exhibitions', 'Education'],
      dtype='object')

In [46]:
## GET MODELS BY COLUMN


columns_names = [ 'NumArtworks', 'MeanPrice', 'StdPrice',
                  'MeanPricePerInch', 'StdPricePerInch',
                  'MeanSize', 'StdSize', 'Badges']

y_column_name = 'MeanPrice'

artists_to_model = artists[columns_names].dropna(subset=['MeanPrice'])
# Creates dummies' columns for the badges column
badges_to_model = artists_to_model['Badges'].str.join(',').str.get_dummies(sep=',')
artists_to_model = pd.concat([artists_to_model, badges_to_model], axis=1)

# List of badges
badges_list = badges_to_model.columns.tolist()

# Adjusts list of columns to test
x_columns_names = columns_names + badges_list
x_columns_names.remove('Badges')
x_columns_names.remove(y_column_name)

# Apply linear regression ad decision tree to each column individually
for column_name in x_columns_names:
    print('-----------------------------------')
    print(column_name)
    get_all_models(artists_to_model[[column_name]], artists_to_model[[y_column_name]])

-----------------------------------
NumArtworks
Linear regression
model score -0.2599340219857338
model mean error: 1928.6484901940896

Decision tree
model score: -2.332631316179829
model mean error: 2178.350290996676
------

Random forest


  return fit_method(estimator, *args, **kwargs)


model score: 0.18868896764829368
model mean avsolute error: 2269.273991493758
------

Gradient boosting
model score: 0.1903228228283993
model mean avsolute error: 2203.8299869723755
------
-----------------------------------
StdPrice
Linear regression
model score 0.19465740152017597
model mean error: 1199.243522777354

Decision tree
model score: -0.2586538233531799
model mean error: 1452.5601683775867
------

Random forest


  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


model score: 0.9103515871594271
model mean avsolute error: 671.2776535848818
------

Gradient boosting
model score: 0.9942648841524547
model mean avsolute error: 361.9660061040648
------
-----------------------------------
MeanPricePerInch
Linear regression
model score -0.7311609536595591
model mean error: 1581.8271300365714

Decision tree
model score: -0.007065002441216217
model mean error: 1246.1391663098157
------

Random forest


  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


model score: 0.8983434560322401
model mean avsolute error: 739.077948517208
------

Gradient boosting
model score: 0.9908758820704585
model mean avsolute error: 432.30269533209344
------
-----------------------------------
StdPricePerInch
Linear regression
model score -0.8596814279610816
model mean error: 1693.0606285398792

Decision tree


  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


model score: -6.017490856880565
model mean error: 2716.673707272666
------

Random forest
model score: 0.8566336325622614
model mean avsolute error: 938.6233806864965
------

Gradient boosting


  y = column_or_1d(y, warn=True)


model score: 0.9893567557581012
model mean avsolute error: 482.18487307116163
------
-----------------------------------
MeanSize
Linear regression
model score 0.1980503409745763
model mean error: 1553.4493474955773

Decision tree
model score: -28.209249861337042
model mean error: 3855.5264204432387
------

Random forest


  return fit_method(estimator, *args, **kwargs)


model score: 0.7672255038201998
model mean avsolute error: 1083.0921248297968
------

Gradient boosting
model score: 0.9938399215465497
model mean avsolute error: 373.86919708730125
------
-----------------------------------
StdSize
Linear regression
model score -0.04863203530737836
model mean error: 1788.1442494015075

Decision tree
model score: -25.56391425311417
model mean error: 3195.305369480766
------

Random forest


  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


model score: 0.754509069880301
model mean avsolute error: 1222.1255356390045
------

Gradient boosting
model score: 0.990912264980544
model mean avsolute error: 461.0642699165732
------
-----------------------------------
Artist featured in a collection
Linear regression
model score -0.2998777641180208
model mean error: 1974.4607983652195

Decision tree
model score: -0.29987776411802036
model mean error: 1974.4607983652193
------

Random forest


  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


model score: 0.010034539171852486
model mean avsolute error: 2525.0560594923036
------

Gradient boosting
model score: 0.01004251688624802
model mean avsolute error: 2532.795239863684
------
-----------------------------------
Featured in Inside The Studio
Linear regression
model score -0.3031222110072589
model mean error: 1948.17264561659

Decision tree
model score: -0.3031222110072589
model mean error: 1948.17264561659
------

Random forest


  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


model score: 2.9587292916355423e-05
model mean avsolute error: 2581.3863557647765
------

Gradient boosting
model score: 4.138379269758019e-05
model mean avsolute error: 2591.0665950596026
------
-----------------------------------
Featured in One to Watch
Linear regression
model score -0.3754287595788921
model mean error: 1987.7030263250563

Decision tree
model score: -0.3754287595788921
model mean error: 1987.7030263250565
------

Random forest


  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


model score: 0.001526207348242381
model mean avsolute error: 2554.2794884985615
------

Gradient boosting
model score: 0.001612202564354459
model mean avsolute error: 2556.3755849786585
------
-----------------------------------
Featured in Rising Stars
Linear regression
model score -0.2088751321472364
model mean error: 1831.3100648118063

Decision tree
model score: -0.20887513214723707
model mean error: 1831.3100648118073
------

Random forest


  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


model score: 0.0017618606478320142
model mean avsolute error: 2569.2389278931087
------

Gradient boosting
model score: 0.0017680612026913511
model mean avsolute error: 2576.7235847331845
------
-----------------------------------
Featured in the Catalog
Linear regression
model score -2.250867044009085
model mean error: 2895.425947875039

Decision tree
model score: -2.250867044009085
model mean error: 2895.4259478750396
------

Random forest


  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


model score: 0.039803007522636547
model mean avsolute error: 2424.599504043574
------

Gradient boosting
model score: 0.039837928926199306
model mean avsolute error: 2429.789510592641
------
-----------------------------------
NFT Artist
Linear regression
model score -0.29069450599268487
model mean error: 1825.661026350268

Decision tree
model score: -0.2906945059926853
model mean error: 1825.6610263502687
------

Random forest


  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


model score: 0.0002227902709504015
model mean avsolute error: 2572.1496767579238
------

Gradient boosting


  y = column_or_1d(y, warn=True)


model score: 0.0002586892554116149
model mean avsolute error: 2584.447689621879
------
-----------------------------------
Showed at the The Other Art Fair
Linear regression
model score -0.611016594546125
model mean error: 1841.770078883017

Decision tree
model score: -0.6110165945461257
model mean error: 1841.7700788830164
------

Random forest


  return fit_method(estimator, *args, **kwargs)


model score: 0.05691187588581603
model mean avsolute error: 2471.394564457679
------

Gradient boosting
model score: 0.05692581295337418
model mean avsolute error: 2479.489678218749
------


  y = column_or_1d(y, warn=True)


In [None]:
## GET MODELS FOR BADGES

x = artists_to_model[badges_list]
y = artists_to_model['MeanPrice']

get_all_models(x, y)