In [263]:
# External modules
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import re
from collections import Counter
# Project modules
import filter

In [411]:
# FUNCTIONS

def calculate_area_and_price_per_area(dataframe):
    # Iterate over the "Size" column
    df = dataframe.copy()
    for i, size in enumerate(dataframe['Size']):
        # Extract the dimensions using regular expression
        dimensions = re.findall(r'\d+(?:\.\d+)?', size)
        if len(dimensions) >= 2:
            # Extract the width and height dimensions
            try:
                width = float(dimensions[0])
                height = float(dimensions[1])
                total_area = width * height
                df.at[i, 'Size'] = total_area
            except:
                df.at[i, 'Size'] = 'NaN'
    df['Price / in²'] = df['Price'] / df['Size']

    return df

def get_unique_values(df, column_name):
    # Extract all unique styles from the column
    unique_values = set()
    for styles in df[column_name]:
        if isinstance(styles, str):
            styles_list = [style.strip() for style in styles.split(',')]
            unique_values.update(styles_list)
    
    return unique_values

def check_columns(df, columns_names):
    for column in columns_names:
        df[column] = df[column].str.title()
        unique_list = get_unique_values(df, column)
        print(column, len(unique_list), 'unique values')

def restart_df(dataframe):
    df = calculate_area_and_price_per_area(dataframe)
    df = df[['Styles', 'Mediums', 'Subjects', 'Artist', 'Size', 'Price']]
    df = df.dropna(subset='Styles')
    columns_names = ['Styles', 'Mediums', 'Subjects']

    # Turn the "Styles" column into a list of strings
    for column in columns_names:
        column_serie = df[column].apply(lambda x: x.split(','))
        df[column] = column_serie
        return df

## CLEAN DATA

def remove_words_from_list(lst, words_to_remove):
    return [word for word in lst if word not in words_to_remove]

# fix styles
def replace_styles(styles):
    for key, values in fix_styles.items():
        for value in values:
            if value in styles:
                return key
    return styles


def fix_column(dataframe, column_name, fix_dict, remove_list, split_list):
    # Remove whitespaces from the beginning and end of each string
    dataframe[column_name] = dataframe[column_name].apply(lambda lst: [item.strip() for item in lst])
    # Change '-' to whitespaces in each string
    dataframe[column_name] = dataframe[column_name].apply(lambda lst: [item.replace('-', ' ') for item in lst])
    # Capitalize each string
    dataframe[column_name] = dataframe[column_name].apply(lambda lst: [item.title() for item in lst])
    # Remove ' Art' and ' Painting' from the end of each string
    dataframe[column_name] = dataframe[column_name].apply(lambda lst: [style.replace(' Art', '') for style in lst])
    dataframe[column_name] = dataframe[column_name].apply(lambda lst: [style.replace(' Painting', '') for style in lst])
    # Splits items present the split_list into separate items
    dataframe[column_name] = dataframe[column_name].apply(lambda lst: [item for item in lst if item not in split_list] + [item for item in lst for split_item in split_list if split_item in item])
    # Remove selected words from the list
    dataframe[column_name] = dataframe[column_name].apply(lambda lst: remove_words_from_list(lst, remove_list))
    # use fix_styles to replace all the values in the lists to the correct ones (the keys in the dictionary)
    dataframe[column_name] = dataframe[column_name].apply(lambda lst: [next((key for key, values in fix_dict.items() if item in values), item) for item in lst])
    # Fix styles
    dataframe[column_name] = dataframe[column_name].apply(replace_styles)

    return dataframe


def get_occurrence_count_on_col_dict(dataframe, column_name):
    column = dataframe[column_name]
    # Flatten the list of styles into a single list
    all_values = [value for values_list in column for value in values_list]
    # Count the occurrences of each style
    occurrences_counts = Counter(all_values)
    # Sort the styles by count in descending order
    sorted_occurrences_counts = sorted(occurrences_counts.items(), key=lambda x: x[1], reverse=True)
    # Create the final dictionary of style counts
    occurrences_counts_dict = {value: count for value, count in sorted_occurrences_counts}

    return occurrences_counts_dict


def column_str_to_list(dataframe, column_name):
    dataframe[column_name] = dataframe[column_name].apply(lambda x: x.split(',') if isinstance(x, str) else x)
    return dataframe


## LINEAR REGRESSIONS
    
def get_linear_regression(x, y):
    # Create a LinearRegression object
    linear_regression = LinearRegression()
    # train a model to predict the price of an artwork based on its size (use 0.3 as the test_size)
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
    linear_regression.fit(X_train, y_train)
    # Check coefficients and intercept of the model
    print('coeficient',linear_regression.coef_)
    print('intercept',linear_regression.intercept_)
    print('score', linear_regression.score(X_test, y_test))
    # predict the price of an artwork with a size of 1000 square inches
    # linear_regression.predict([[1000]])
    return linear_regression


def linear_regression_by_column(dataframe, x_column, y_column):
    x_artists = dataframe[[x_column]]
    y_artists = dataframe[[y_column]]
    # Create a LinearRegression object
    linear_regression = get_linear_regression(x_artists, y_artists)
    print('------')

In [408]:
## ARTWORKS

def get_artworks_df():
    artworks = pd.read_csv('../saatchi_artworks_info.csv')
    artworks.rename(columns=lambda x: x.title(), inplace=True)
    artworks = calculate_area_and_price_per_area(artworks)
    return artworks
    
get_artworks_df()
print(artworks.head())
print(artworks.shape)

              Title               Packaging Collage Multi-Paneled Collage  \
0     Off the Leash          Ships in a Box     NaN                   NaN   
1         Fluxus 02  Ships Rolled in a Tube     NaN                   NaN   
2   The Make-Up Box          Ships in a Box     NaN                   NaN   
3     Introspection  Ships Rolled in a Tube     NaN                   NaN   
4  Dogs on Oriental          Ships in a Box     NaN                   NaN   

     Mediums   Ready To Hang               Original Digital         Country  \
0    Acrylic              No  One-of-a-kind Artwork     NaN          Canada   
1        Oil  Not applicable  One-of-a-kind Artwork     NaN          France   
2  Ink, Gold              No  One-of-a-kind Artwork     NaN  United Kingdom   
3    Digital              No                    NaN     NaN   United States   
4        Oil  Not applicable  One-of-a-kind Artwork     NaN   United States   

  Multi-Paneled Mixed Media  ...  \
0                       Na

In [433]:
## SEGMENT BY SEGMENT AND CLEAN DATA

artworks_data = get_artworks_df()

## COLUMN CHOSEN TO DEFINE SEGMENTS
column_name = ['Styles']

artworks_data = column_str_to_list(artworks_data, column_name)

occurrence_count_on_col_dict = get_occurrence_count_on_col_dict(artworks_data, column_name)
occurrences_threshold = 400
# Get list with styles that appear more than style_occurrencies_threshold times
unique_values_filtered_by_threshold = set()
unique_values_filtered_by_threshold.update([value for value, count in occurrence_count_on_col_dict.items() if count >= occurrences_threshold])
print(len(styles_filtered_by_threshold), column_name)

fixed_df = fix_column(artworks_data, column_name, filter.fix_styles, styles_to_remove, filter.split_list)
# Remove rows that have [] in styles
fixed_df = fixed_df[fixed_df[column_name[0]].apply(lambda x: len(x) > 0)]
fixed_df = fixed_df.dropna(subset=['Price', 'Size'])


## ANALYSING SEGMENTS

# New dataframe for each style in styles_filtered_by_threshold
segments_dfs = {}
for segment in unique_values_filtered_by_threshold:
    normalized_key = segment.strip().capitalize()
    segments_dfs[normalized_key] = fixed_df[fixed_df[column_name[0]].apply(lambda x: segment in x)]

# Get a new dataframe with mean price, mean size, and mean price per in² for each style
segments_mean_data = []
for segment, df in segments_dfs.items():
    mean_price = df['Price'].mean()
    mean_size = df['Size'].mean()
    mean_price_per_in2 = df['Price / in²'].mean()
    styles_mean_data.append({
        'Style': style,
        'Mean Price': mean_price,
        'Mean Size': mean_size,
        'Mean Price / in²': mean_price_per_in2
    })

segments_mean_df = pd.DataFrame(segments_mean_data).sort_values(by='Mean Price', ascending=False).dropna().reset_index(drop=True)
segments_mean_df

28 ['Styles']


In [351]:
## TESTING LINEAR REGRESSION

artworks_to_model = artworks.dropna(subset=['Price'])

x = artworks_to_model[['Size']]
y = artworks_to_model[['Price']]

linear_regression = get_linear_regression(x, y)

coeficient [[0.54455032]]
intercept [2402.42082547]
score -8.687018343698641


In [241]:
## ARTISTS

def get_artists_df():
    agg_df = artworks.dropna(subset=['Price'])
    agg_data = agg_df.groupby('Artist').agg({'Artist': 'count',
                                            'Price': ['mean', lambda x: x.std(skipna=False)],
                                            'Price / in²': ['mean', lambda x: x.std(skipna=False)],
                                            'Size': ['mean', lambda x: x.std(skipna=False)]})
    agg_data = agg_data.reset_index()

    # Flatten the column names
    agg_data.columns = ['Artist', 'NumArtworks', 'MeanPrice', 'StdPrice', 'MeanPricePerInch', 'StdPricePerInch', 'MeanSize', 'StdSize']

    # Create a new DataFrame with the aggregated data from artworks_info
    artists_from_artworks = pd.DataFrame(agg_data)
    artists_from_artworks = artists_from_artworks[artists_from_artworks['NumArtworks'] >= 3]
    # Get artists data from artists_info
    artists_from_artists = pd.read_json('../saatchi_artists_info.json')
    artists_from_artists.rename(columns=lambda x: x.title(), inplace=True)
    artists_from_artists.rename(columns={'Name': 'Artist'}, inplace=True)

    # New dataframe with all artists that are contained in both dataframes
    artists_from_artists = artists_from_artists[artists_from_artists['Artist'].isin(artists_from_artworks['Artist'])]
    artists = pd.merge(artists_from_artworks, artists_from_artists, on='Artist')

    return artists


artists = get_artists_df()
artists.columns

Index(['Artist', 'NumArtworks', 'MeanPrice', 'StdPrice', 'MeanPricePerInch',
       'StdPricePerInch', 'MeanSize', 'StdSize', 'Link', 'Location',
       'Followers_Count', 'Artworks_Count', 'Badges', 'Info', 'Education',
       'Events', 'Exhibitions'],
      dtype='object')

In [261]:
## LINEAR REGRESSIONS BY COLUMN

columns_names = ['MeanPrice', 'StdPrice',
           'MeanPricePerInch', 'StdPricePerInch',
           'MeanSize', 'StdSize',
           'Followers_Count', 'Artworks_Count', 'Badges']

y_column = 'MeanPricePerInch'

artists_to_model = artists[columns_names].dropna(subset=['MeanPrice'])

badges_to_model = artists_to_model['Badges'].str.join(',').str.get_dummies(sep=',')
artists_to_model = pd.concat([artists_to_model, badges_to_model], axis=1)

print(artists_to_model.columns)
columns_names.extend(badges_to_model.columns)
columns_names.remove(y_column)
columns_names.remove('Badges')

for column_name in columns_names:
    print(column_name)
    linear_regression_by_column(artists_to_model, column_name, y_column)

Index(['MeanPrice', 'StdPrice', 'MeanPricePerInch', 'StdPricePerInch',
       'MeanSize', 'StdSize', 'Followers_Count', 'Artworks_Count', 'Badges',
       'Artist featured in a collection', 'Featured in Inside The Studio',
       'Featured in One to Watch', 'Featured in the Catalog', 'NFT Artist',
       'Showed at the The Other Art Fair'],
      dtype='object')
MeanPrice
coeficient [[0.00076669]]
intercept [1.84631072]
score 0.07449145760591869
------
StdPrice
coeficient [[0.0007321]]
intercept [2.21469889]
score -0.0608580572762325
------
StdPricePerInch
coeficient [[0.44080529]]
intercept [2.36421748]
score 0.197526503818383
------
MeanSize
coeficient [[-0.00134015]]
intercept [4.21938388]
score 0.15609781257546373
------
StdSize
coeficient [[0.00141876]]
intercept [2.72582114]
score -0.2630722694068348
------
Followers_Count
coeficient [[0.00013428]]
intercept [3.13746353]
score 0.00459725775542541
------
Artworks_Count
coeficient [[0.00115421]]
intercept [2.86830051]
score -0.2307

In [262]:
# decision tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

x_column = 'StdSize'

# Create a DecisionTreeRegressor object
decision_tree = DecisionTreeRegressor(random_state=42)
# Train the model
decision_tree.fit(X_train, y_train)
# Check the score of the model
print(decision_tree.score(X_test, y_test))
# Check the mean absolute error of the model
print(mean_absolute_error(y_test, decision_tree.predict(X_test)))
# Predict the price of an artwork with a size of 1000 square inches
# decision_tree.predict([[1000]])

0.4424046858765265
1625.5064461198062
