In [111]:
## IMPORTS

import pandas as pd
import spacy
import re
# SCI-KIT LEARN
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, median_absolute_error, mean_squared_error, mean_absolute_percentage_error

In [112]:
## FUNCTIONS

def get_number(string):
    numbers = re.findall(r'\d+', string)
    return ''.join(numbers)

In [17]:
## SPACY

nlp = spacy.load("pt_core_news_lg")

# Define lists of possible goals and interests
MATERIALS_OPTIONS = [
    "Acrílica", "Aquarela", "Carvão", "Cerâmica",
    "Colagem", "Couro", "Desenho", "Escultura",
    "Fotografia", "Gesso", "Óleo", "Papel",
    "Pastel", "Pedra", "Pintura", "Tela",
    "Aquarela", "Nanquim", "Tinta", "Vidro"
    "Impressão", "Cerâmica", "Bronze", "Madeira",
    "Algodão", "Grafite", "Tecido", "Ferro",
    "Mármore", "Papel", "Policromia", "Resina",
    "Serigrafia", "Tecido", "Tinta acrílica", "Tinta a óleo",
    "Sobre Tela", "Sobre Papel", "Sobre Madeira"
    ]

CATEGORIES_OPTIONS = [
    "Pintura", "Escultura", "Fotografia", "Desenho",
    "Gravura", "Objeto", "Instalação", "Vídeo"
]

# Define a function to extract structured information from user_json
def extract_structured_info(artwork_description):
    structured_info = []
    doc = nlp(artwork_description)
    for token in doc:
        if any(material.lower() in token.text.lower() for material in MATERIALS_OPTIONS):
            structured_info.append(token.text.capitalize())
    return structured_info

In [100]:
## MODELS


def get_stats(x, y):
    stats = {}
    for variable in [x, y]:
        max_variable = round(max(variable), 2)
        min_variable = round(min(variable), 2)
        mean_variable = round(variable.mean(), 2)
        median_variable = round(variable.median(), 2)
        stats_variable = {'Max': max_variable, 'Min': min_variable, 'Mean': mean_variable, 'Median': median_variable}
        # add stats_variable to stats
        stats[variable] = stats_variable
    return stats


## GET MODELS

def get_decision_tree(X_train, y_train):
    decision_tree = DecisionTreeRegressor(random_state=42)
    decision_tree.fit(X_train, y_train)
    return decision_tree

def get_linear_regression(X_train, y_train):
    linear_regression = LinearRegression()
    linear_regression.fit(X_train, y_train)
    return linear_regression

def get_random_forest(X_train, y_train):
    rf_model = RandomForestRegressor(random_state=1)
    rf_model.fit(X_train, y_train)
    return rf_model

def get_gradient_boosting(X_train, y_train):
    # get gradient boosting model
    gb_model = GradientBoostingRegressor(random_state=1)
    gb_model.fit(X_train, y_train)
    return gb_model

# gives a dataframe taking models as columns and score as rows
def get_all_models(x_train, x_test, y_train, y_test):
    
    models = {'Linear regression': get_linear_regression(x_train, y_train),
              'Decision tree': get_decision_tree(x_train, y_train),
              'Random forest': get_random_forest(x_train, y_train),
              'Gradient boosting': get_gradient_boosting(x_train, y_train)}
    
    models_df = pd.DataFrame(columns=models.keys())
    for model_name, model in models.items():
        y_pred = model.predict(x_test)
        r2 = r2_score(y_test, y_pred)
        n = len(y_test)
        p = x_test.shape[1]
        adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
        mean_error = mean_absolute_error(y_test, y_pred)
        median_error = median_absolute_error(y_test, y_pred)
        mape = mean_absolute_percentage_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        pearson = np.corrcoef(y_test.to_numpy(), y_pred)[0, 1]

        models_df.loc['R² Score', model_name] = r2
        models_df.loc['Adjusted R² Score', model_name] = adj_r2
        models_df.loc['Pearson Correl Predicted-Actual', model_name] = pearson
        models_df.loc['Mean Absolute Error', model_name] = mean_error
        models_df.loc['Median Absolute Error', model_name] = median_error
        models_df.loc['Mean Squared Error', model_name] = mse

        # Round values
        models_df = models_df.round(2)

    return models_df, models

In [140]:
## OPEN FILES


# ART RIO
artrio = pd.read_csv('../temporary-files/artrio_artworks_info.csv')
artrio['Materials'] = artrio['Ficha técnica'].apply(lambda x: extract_structured_info(x) if isinstance(x, str) else None)
artrio = artrio[['Title', 'Artist', 'Price', 'Materials', 'Height', 'Width', 'Depth', 'Gallery']]

# BLOMBO
blombo = pd.read_csv('../temporary-files/blombo_artworks_info.csv')
blombo['Materials'] = blombo['Description'].apply(lambda x: extract_structured_info(x) if isinstance(x, str) else None)
blombo[['Height', 'Width', 'Depth']] = blombo['Dimensões'].str.split(' x ', expand=True).replace('cm', '', regex=True).replace('m', '', regex=True).replace('mm', '', regex=True).replace(' ', '', regex=True)
blombo['Price'] = blombo['Price'].apply(lambda x: get_number(x) if isinstance(x, str) else x).astype(float) /100
blombo = blombo[['Title', 'Artist', 'Price', 'Materials', 'Height', 'Width', 'Depth']]

# NANO
nano = pd.read_csv('../temporary-files/nano_artworks_info.csv')
nano.rename(columns={'Título': 'Title', 'Galeria': 'Gallery'}, inplace=True)
nano['Materials'] = nano['Técnica'].apply(lambda x: extract_structured_info(x) if isinstance(x, str) else None)
nano.Categories.replace('Categoria: ', '', regex=True, inplace=True)
nano[['Height', 'Width', 'Depth']] = nano['Dimensões'].apply(lambda x: x.split(' cm')[0] if isinstance(x, str) else None).str.split(' x ', expand=True).replace('cm', '', regex=True).replace('m', '', regex=True).replace('mm', '', regex=True).replace(' ', '', regex=True)
nano['Price'] = nano['Price'].apply(lambda x: get_number(x) if isinstance(x, str) else x).astype(float) /100
nano = nano[['Title', 'Artist', 'Price', 'Materials', 'Height', 'Width', 'Depth', 'Gallery']]

# SP ARTE
sp_arte = pd.read_csv('../temporary-files/sparte_artworks_info.csv')
sp_arte['Materials'] = sp_arte['Description'].apply(lambda x: extract_structured_info(x) if isinstance(x, str) else None)
sp_arte[['Height', 'Width', 'Depth']] = sp_arte['Size'].apply(lambda x: x.split(' cm')[0]).str.split(' × ', expand=True).replace('cm', '', regex=True).replace('m', '', regex=True).replace('mm', '', regex=True).replace(' ', '', regex=True)
sp_arte = sp_arte[['Title', 'Artist', 'Price', 'Materials', 'Height', 'Width', 'Depth', 'Gallery']]

In [141]:
## FIX SP ARTE PRICES (SOME PRICES ARE RANGES)

# split values that contain '-' and keep only second value
sp_arte['Price'] = sp_arte['Price'].apply(lambda x: x.split('-')[1] if isinstance(x, str) else x)

IndexError: list index out of range

In [142]:
## CONCATENATE DATAFRAMES

artworks = pd.concat([artrio, blombo, nano], ignore_index=True)

artworks['Price']

0        500.0
1        500.0
2        500.0
3        530.0
4        800.0
         ...  
2715    2500.0
2716    5500.0
2717    1300.0
2718    2500.0
2719    3300.0
Name: Price, Length: 2720, dtype: float64

In [147]:
train_data, test_data = train_test_split(artworks, test_size=0.2, random_state=42)

# get dummies for artist and gallery
train_data = pd.get_dummies(train_data, columns=['Artist', 'Gallery'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['Artist', 'Gallery'], drop_first=True)
train_data.columns

Index(['Title', 'Price', 'Materials', 'Height', 'Width', 'Depth',
       'Artist_ALBERTO BITAR', 'Artist_ALBERTO MARTINS',
       'Artist_ALESSANDRA REHDER', 'Artist_ALESSANDRA REHDER 2',
       ...
       'Gallery_RV Cultura e Arte', 'Gallery_Raquel Arnaud',
       'Gallery_SILVIA CINTRA + BOX 4', 'Gallery_SIMÕES DE ASSIS GALERIA',
       'Gallery_Superfície', 'Gallery_SÉ', 'Gallery_Uncool Artist',
       'Gallery_Yaak Gallery', 'Gallery_Ybakatu', 'Gallery_Zilda Fraletti'],
      dtype='object', length=967)

In [160]:
## APPLY MODELS

x_train = train_data.drop(['Title', 'Price', 'Materials'], axis=1)
y_train = train_data['Price']

x_test = test_data.drop(['Title', 'Price', 'Materials'], axis=1)
y_test = test_data['Price']


# get all non-numeric values in x_train
non_numeric = x_train.select_dtypes(exclude=['int64', 'float64', 'bool']).columns

x_train['Height']

# models_df, models = get_all_models(x_train, x_test, y_train, y_test)

1029       35
2382    24X36
1196    50x40
1502       48
445      70.0
        ...  
1638    55x40
1095    21X21
1130      100
1294      153
860     103.0
Name: Height, Length: 2176, dtype: object

In [165]:
# function to get height, width and depth from description
# splits at 'cm', ignores everything to the right
# considers 'x', 'X', '×' and 'X' as possible separators
def get_dimensions(dimensions_str):
    separators = ['x', 'X', '×', 'X']
    
    dimensions_str = dimensions_str.split('cm')[0]
    for separator in separators:
        if separator in dimensions_str:
            height, width, depth = dimensions_str.split(separator)
            break
    height = float(height)
    width = float(width)
    depth = float(depth)

    return height, width, depth

dimensions_str = '10 x 10 x 10 cm'
print(get_dimensions(dimensions_str))

(10.0, 10.0, 10.0)
