In [None]:
## IMPORTS

In [None]:
## FUNCTIONS

def get_artists_df(
    artworks_df,
    artists_info_path='../temporary-files/saatchi_artists_info_clean.json',
    min_artworks=3,
):
    """Build one row per artist: artwork aggregates merged with the artist profile data.

    Parameters
    ----------
    artworks_df : pandas.DataFrame
        One row per artwork. Must contain the columns 'Artist', 'Price',
        'Price (US$)', 'Price (US$/in²)' and 'Size'.
    artists_info_path : str, optional
        JSON file with the artists' profile data. Its column names are
        title-cased and 'Name' is renamed to 'Artist' before merging.
    min_artworks : int, optional
        Minimum number of priced artworks an artist needs to be kept.

    Returns
    -------
    pandas.DataFrame
        Columns 'Artist', 'NumArtworks', 'MeanPrice', 'StdPrice',
        'MeanPricePerInch', 'StdPricePerInch', 'MeanSize', 'StdSize',
        followed by the (title-cased) profile columns. Only artists present
        in both sources are returned (inner merge on 'Artist').
    """
    def std_keep_nan(values):
        # skipna=False: one missing value makes the whole spread NaN instead of
        # silently computing it from the remaining artworks.
        return values.std(skipna=False)

    # Artworks without a price carry no information for the aggregates.
    priced_artworks = artworks_df.dropna(subset=['Price'])

    # Named aggregation yields flat column names directly. They are the names
    # the modelling cells of this notebook index ('MeanPrice', 'StdSize', ...).
    artists_from_artworks = (
        priced_artworks
        .groupby('Artist')
        .agg(
            NumArtworks=('Artist', 'count'),
            MeanPrice=('Price (US$)', 'mean'),
            StdPrice=('Price (US$)', std_keep_nan),
            MeanPricePerInch=('Price (US$/in²)', 'mean'),
            StdPricePerInch=('Price (US$/in²)', std_keep_nan),
            MeanSize=('Size', 'mean'),
            StdSize=('Size', std_keep_nan),
        )
        .reset_index()
    )
    artists_from_artworks = artists_from_artworks[
        artists_from_artworks['NumArtworks'] >= min_artworks
    ]

    # Artist profile data; normalise its headers so the merge key is 'Artist'.
    artists_from_artists = (
        pd.read_json(artists_info_path)
        .rename(columns=str.title)
        .rename(columns={'Name': 'Artist'})
    )

    # The inner merge keeps exactly the artists contained in both dataframes.
    return pd.merge(artists_from_artworks, artists_from_artists, on='Artist')

In [None]:
## ARTISTS
# Build the per-artist table and list the columns it exposes.
artists = get_artists_df(artworks_df=artworks)
artists.columns

In [None]:
## GET MODELS BY COLUMN

# Columns taken from the artists table; the target is removed from the
# feature list further down.
columns_names = [
    'NumArtworks',
    'MeanPrice', 'StdPrice',
    'MeanPricePerInch', 'StdPricePerInch',
    'MeanSize', 'StdSize',
    'Badges',
]
y_column_name = 'MeanPrice'

# Keep only artists whose target value is known.
artists_to_model = artists[columns_names].dropna(subset=['MeanPrice'])

# One indicator (dummy) column per badge: each artist's badges are joined
# into a comma-separated string, which is then split into 0/1 columns.
badges_to_model = artists_to_model['Badges'].str.join(',').str.get_dummies(sep=',')
artists_to_model = pd.concat([artists_to_model, badges_to_model], axis=1)

# List of badges
badges_list = list(badges_to_model.columns)

# Features to test: every listed column plus the badge dummies, minus the
# target and the raw 'Badges' column.
x_columns_names = [*columns_names, *badges_list]
x_columns_names.remove(y_column_name)
x_columns_names.remove('Badges')

# Apply linear regression and decision tree to each column individually
for column_name in x_columns_names:
    print('-----------------------------------', column_name, sep='\n')
    get_all_models(artists_to_model[[column_name]], artists_to_model[[y_column_name]])

In [None]:
## PREPARE ARTISTS' TEXTS TO TRAIN MODEL

# Profile sections that make up the training text, in output order.
text_section_columns = ['Info', 'Events', 'Exhibitions', 'Education']


def _build_artist_text(artist_row):
    """Join one artist's non-empty sections as "<Section>: <content>" lines."""
    parts = []
    for section in text_section_columns:
        content = artist_row[section]
        # None covers JSON nulls; `content != content` is true only for NaN.
        if content is None or content != content:
            continue
        content_text = str(content).strip()
        if content_text:
            parts.append(f'{section}: {content_text}')
    return '\n'.join(parts)


artists_text_to_model = get_artists_df(artworks)

# remove artists with no price data
artists_text_to_model = artists_text_to_model.dropna(subset=['MeanPricePerInch'])
# remove artists with no text data in any of the columns ['Info', 'Events', 'Exhibitions', 'Education']
artists_text_to_model = artists_text_to_model.dropna(subset=text_section_columns, how='all')

# create text column with separators (title of the section before the text)
# and label column with the mean Price (US$/in²) of the artist, then remove
# all other columns. `.assign` returns a new frame, so no value is written
# into a filtered view of the original.
artists_text_to_model = artists_text_to_model.assign(
    Text=[_build_artist_text(row) for _, row in artists_text_to_model.iterrows()],
    Label=artists_text_to_model['MeanPricePerInch'],
)[['Text', 'Label']]

# save artists_json as json file
artists_text_to_model.to_json('../temporary-files/artists_text_to_model_json.json', orient='records')

artists_text_to_model

In [None]:
# Dummies based on info:
# Has an exhibition?
# Did a residency?

# Rebuild the artists table; later cells read `artists` again.
artists = get_artists_df(artworks)

# Keep only the artist name and the mean price per square inch, and save as csv
artists_mean_price_per_inch = artists.loc[:, ['Artist', 'MeanPricePerInch']]
artists_mean_price_per_inch.to_csv(
    './models/saatchi_artists_mean_price_per_inch.csv',
    index=False,
)

In [None]:
## CLEAN EXHIBITIONS DATA

# System prompt passed as the first argument to chat() when cleaning an
# artist's raw exhibitions text. It asks for a flat JSON structure with the
# keys 'event', 'event type', 'location', 'city/country' and 'date', or the
# literal answer NaN when the text describes no exhibition.
clean_exhibitions_system_message = {"role": "system", "content": "If user message doesnt contain any info of at least one exhibition, answer NaN. Otherwise, independently of original data structure, please organize the history of exhibitions in a single json format (without nesting) with keys 'event', 'event type', 'location', 'city/country', 'date'. 'event type' can take the following values: 'solo exhibition', 'collective exhibition', 'art fair'. In case of data absent or not clear, leave blank"}

def get_exhibitions_clean(exhibition_raw, delay_seconds=20):
    """Ask the chat helper to restructure one artist's raw exhibitions text.

    Parameters
    ----------
    exhibition_raw : object
        Raw exhibitions value of one artist. ``None`` and NaN mean "no data".
    delay_seconds : float, optional
        Pause before each request (presumably to respect the API rate limit;
        confirm against the service in use).

    Returns
    -------
    object
        ``exhibition_raw`` unchanged when it is ``None`` or NaN; otherwise the
        'content' of the last item returned by ``chat`` with newlines and
        double spaces removed, or the string ``'error'`` if the request or the
        clean-up raised an exception.
    """
    # None (JSON null) and NaN (`x != x`) both mean there is nothing to clean:
    # return early instead of waiting and spending a request on it.
    if exhibition_raw is None or exhibition_raw != exhibition_raw:
        return exhibition_raw
    time.sleep(delay_seconds)
    try:
        exhibition_clean = chat(clean_exhibitions_system_message, exhibition_raw)[-1]['content']
        exhibition_clean = exhibition_clean.replace('\n', '').replace('  ', '')
    except Exception:
        # Best effort: one failed request must not abort the whole column.
        # `Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit through, so a long run can still be stopped.
        exhibition_clean = 'error'
    return exhibition_clean

# Clean every artist's exhibitions text (one chat request per non-missing row).
artists['ExhibitionsClean'] = artists['Exhibitions'].apply(get_exhibitions_clean)

artists['ExhibitionsClean']

In [None]:
# flatten nested dicts

import ast
# count the exhibitions of artist
# artists['ExhibitionsCount'] = artists['ExhibitionsClean'].apply(lambda x: len(x) if x == x else 0)
# txt = artists.loc[17, 'ExhibitionsClean']
# get txt as dict

# print(txt_dict)

# Keys copied into every flattened exhibition record, in output order.
_EXHIBITION_FIELDS = ("event", "event type", "location", "date")


def _parse_exhibitions(raw):
    """Parse a cleaned exhibitions value; return None when it cannot be parsed."""
    import json  # local import: only this helper needs it

    if isinstance(raw, (dict, list)):
        return raw
    if isinstance(raw, str):
        raw = raw.strip()
    try:
        # The cleaning prompt asks for JSON, which may contain null/true/false.
        return json.loads(raw)
    except (TypeError, ValueError):
        pass
    try:
        # Fallback for Python-literal style replies (e.g. single-quoted keys).
        return ast.literal_eval(raw)
    except (TypeError, ValueError, SyntaxError):
        return None


def _iter_exhibition_records(node):
    """Yield every dict under `node` that carries at least one exhibition field."""
    if isinstance(node, dict):
        if any(field in node for field in _EXHIBITION_FIELDS):
            yield node
        else:
            for child in node.values():
                yield from _iter_exhibition_records(child)
    elif isinstance(node, list):
        for child in node:
            yield from _iter_exhibition_records(child)


def flatten_dict(dict_string):
    """Flatten a cleaned exhibitions value into a list of exhibition records.

    Parameters
    ----------
    dict_string : str or float
        JSON (or Python-literal) text describing exhibitions, or NaN.

    Returns
    -------
    list of dict or float
        One dict per exhibition with the keys 'event', 'event type',
        'location' and 'date' (``None`` for a key the record lacks).
        NaN is returned when the input is NaN or does not parse to a dict or
        list, e.g. the 'error' sentinel or a literal 'NaN' reply.
    """
    # `x != x` is true only for NaN: missing data is passed through.
    if not isinstance(dict_string, (dict, list)) and dict_string != dict_string:
        return dict_string

    parsed = _parse_exhibitions(dict_string)
    if not isinstance(parsed, (dict, list)):
        return float('nan')

    return [
        {field: record.get(field) for field in _EXHIBITION_FIELDS}
        for record in _iter_exhibition_records(parsed)
    ]

artists['ExhibitionsClean'].apply(flatten_dict)
# get unique values of locations in exhibitionsClean
# artists['ExhibitionsLocations'] = artists['ExhibitionsClean'].apply(lambda x: list(set([exhibition['location'] for exhibition in x])) if x == x else [])