In [30]:
import pandas as pd
import numpy as np
import re

# Carregar o dataset
df = pd.read_csv('raw_data/artDataset.csv')
# Verificar as primeiras linhas
print(df.head())

   Unnamed: 0       price                    artist                  title  \
0           0  28.500 USD           Tommaso Ottieri         Bayreuth Opera   
1           1   3.000 USD         Pavel Tchelitchew  Drawings of the Opera   
2           2   5.000 USD                 Leo Gabin        Two on Sidewalk   
3           3   5.000 USD         Matthias Dornfeld            Blumenszene   
4           4   2.500 USD  Alexis Marguerite Teplin   Feverish Embarkation   

               yearCreation  \
0                      2021   
1  First Half 20th Century    
2                      2016   
3                      2010   
4                      2001   

                                              signed  \
0                                    Signed on verso   
1                                  Signed and titled   
2                  Signed, titled and dated on verso   
3  Signed, titled and dated on the reverse with t...   
4                                    Signed on verso   

       

In [31]:
# Ver valores únicos e contagens
print("Valores únicos em 'signed':\n", df['signed'].value_counts(dropna=False))
print("\nValores únicos em 'condition':\n", df['condition'].value_counts(dropna=False))

Valores únicos em 'signed':
 signed
[nan]                                                                                               153
Signed lower right                                                                                   29
Signed verso                                                                                         15
Signed lower right recto                                                                             11
Signed and dated lower right recto                                                                    9
                                                                                                   ... 
Signed in ink on the photographer's label on verso                                                    1
Signed verso in pencil and with photographer's stamp                                                  1
Blindstamped on recto; Signed, titled and dated in pencil and with photographer's stamp on verso      1
Signed, stamped, titled and 

In [32]:
# Criar coluna binária
df['is_signed'] = df['signed'].notna().astype(int)  # 1 se há assinatura, 0 se NaN

# Opcional: Extrair detalhes específicos (ex.: 'numerado')
df['is_numbered'] = df['signed'].str.contains('numbered', case=False, na=False).astype(int)

In [33]:
# Definir regras para o score baseado em palavras-chave
conditions = [
    (df['condition'].str.contains('excellent|perfect', case=False, na=False)),  # Score 3
    (df['condition'].str.contains('very good|good|fine', case=False, na=False)),  # Score 2
    (df['condition'].str.contains('regular|fair', case=False, na=False)),  # Score 1
    (df['condition'].str.contains('poor|bad', case=False, na=False))  # Score 0
]

# Criar a coluna 'condition_score' com np.select
scores = [3, 2, 1, 0]  # Valores para cada condição acima
df['condition_score'] = np.select(conditions, scores, default=1)  # default=1 assume "regular" se não encontrar nada

# Verificar resultados
print(df[['condition', 'condition_score']].sample(10))  # Checar amostras aleatórias

# Ver distribuição
print(df['condition_score'].value_counts())

                                             condition  condition_score
697  This work is in very good condition.Not examin...                2
147  The work is in excellent condition, direct fro...                3
247  The work is in excellent condition, direct fro...                3
632  This work is in good condition.Not examined ou...                2
310    This print is in generally excellent condition.                3
198  Overall very good condition.Not examined out o...                2
457  Not examined outside of frame. Slight bowing t...                1
181                The work is in excellent condition.                3
72                                Excellent condition                 3
580  This work is in very good condition.Artwork no...                2
condition_score
2    295
3    278
1    181
Name: count, dtype: int64


In [34]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,price,artist,title,yearCreation,signed,condition,period,movement,is_signed,is_numbered,condition_score
0,0,28.500 USD,Tommaso Ottieri,Bayreuth Opera,2021,Signed on verso,This work is in excellent condition.,Contemporary,Baroque,1,0,3
1,1,3.000 USD,Pavel Tchelitchew,Drawings of the Opera,First Half 20th Century,Signed and titled,Not examined out of frame.No obvious signs of ...,Post-War,Surrealism,1,0,1
2,2,5.000 USD,Leo Gabin,Two on Sidewalk,2016,"Signed, titled and dated on verso",This work is in excellent condition.,Contemporary,Abstract,1,0,3
3,3,5.000 USD,Matthias Dornfeld,Blumenszene,2010,"Signed, titled and dated on the reverse with t...",This work is in excellent condition.There is m...,Contemporary,Abstract,1,0,3
4,4,2.500 USD,Alexis Marguerite Teplin,Feverish Embarkation,2001,Signed on verso,This work is in excellent condition.,Contemporary,Abstract,1,0,3


In [35]:
def clean_price(price_text):
    if pd.isna(price_text):  # Lidar com missings
        return None
    # Encontrar todos os números (com . ou , como separadores)
    matches = re.findall(r'[\d.,]+', str(price_text))
    if matches:
        # Pegar o primeiro número encontrado (assumindo que é o valor principal)
        num_str = matches[0].replace('.', '').replace(',', '.')  # Converter "28.500" -> "28500" e "1,200" -> "1.200"
        return float(num_str)
    return None  # Caso não encontre números

df['price_clean'] = df['price'].apply(clean_price)

In [36]:
df.head()

Unnamed: 0.1,Unnamed: 0,price,artist,title,yearCreation,signed,condition,period,movement,is_signed,is_numbered,condition_score,price_clean
0,0,28.500 USD,Tommaso Ottieri,Bayreuth Opera,2021,Signed on verso,This work is in excellent condition.,Contemporary,Baroque,1,0,3,28500.0
1,1,3.000 USD,Pavel Tchelitchew,Drawings of the Opera,First Half 20th Century,Signed and titled,Not examined out of frame.No obvious signs of ...,Post-War,Surrealism,1,0,1,3000.0
2,2,5.000 USD,Leo Gabin,Two on Sidewalk,2016,"Signed, titled and dated on verso",This work is in excellent condition.,Contemporary,Abstract,1,0,3,5000.0
3,3,5.000 USD,Matthias Dornfeld,Blumenszene,2010,"Signed, titled and dated on the reverse with t...",This work is in excellent condition.There is m...,Contemporary,Abstract,1,0,3,5000.0
4,4,2.500 USD,Alexis Marguerite Teplin,Feverish Embarkation,2001,Signed on verso,This work is in excellent condition.,Contemporary,Abstract,1,0,3,2500.0


In [None]:
def clean_year(year_text):
    if pd.isna(year_text):
        return None

    year_text = str(year_text).strip()

    # Caso 1: Ano exato (ex.: "2021", "c. 1895")
    match_exact = re.search(r'(?:c\.\s*)?(\d{4})', year_text, re.IGNORECASE)
    if match_exact:
        return int(match_exact.group(1))

    # Caso 2: Século (ex.: "20th century")
    match_century = re.search(r'(\d{1,2})(?:st|nd|rd|th)\s*century', year_text, re.IGNORECASE)
    if match_century:
        century = int(match_century.group(1))
        return (century - 1) * 100 + 50  # Ano médio do século (ex.: "20th century" → 1950)

    # Caso 3: Períodos (ex.: "First half of 20th century")
    match_period = re.search(r'(first|second|early|mid|late)\s*(?:half|part)?\s*(?:of\s*)?(\d{1,2})(?:st|nd|rd|th)?\s*century', year_text, re.IGNORECASE)
    if match_period:
        period, century = match_period.groups()
        century = int(century)
        base_year = (century - 1) * 100
        if 'first' in period.lower():
            return base_year + 25  # Primeira metade (ex.: 1925 para "First half of 20th century")
        elif 'second' in period.lower():
            return base_year + 75  # Segunda metade
        elif 'early' in period.lower():
            return base_year + 20  # ~20 anos após início
        elif 'mid' in period.lower():
            return base_year + 50
        elif 'late' in period.lower():
            return base_year + 80

    # Caso 4: Décadas (ex.: "1960s")
    match_decade = re.search(r'(\d{3})0s', year_text)
    if match_decade:
        return int(match_decade.group(1)) + 5  # 1960s → 1965

    # Caso 5: Intervalos (ex.: "1890-1900")
    match_range = re.search(r'(\d{4})\s*[-–]\s*(\d{4})', year_text)
    if match_range:
        start, end = map(int, match_range.groups())
        return (start + end) // 2  # Média do intervalo

    return None  # Se não encaixar em nenhum padrão

df['year_clean'] = df['yearCreation'].apply(clean_year)

In [40]:
df.head(20)

Unnamed: 0.1,Unnamed: 0,price,artist,title,yearCreation,signed,condition,period,movement,is_signed,is_numbered,condition_score,price_clean,year_clean
0,0,28.500 USD,Tommaso Ottieri,Bayreuth Opera,2021,Signed on verso,This work is in excellent condition.,Contemporary,Baroque,1,0,3,28500.0,2021.0
1,1,3.000 USD,Pavel Tchelitchew,Drawings of the Opera,First Half 20th Century,Signed and titled,Not examined out of frame.No obvious signs of ...,Post-War,Surrealism,1,0,1,3000.0,1950.0
2,2,5.000 USD,Leo Gabin,Two on Sidewalk,2016,"Signed, titled and dated on verso",This work is in excellent condition.,Contemporary,Abstract,1,0,3,5000.0,2016.0
3,3,5.000 USD,Matthias Dornfeld,Blumenszene,2010,"Signed, titled and dated on the reverse with t...",This work is in excellent condition.There is m...,Contemporary,Abstract,1,0,3,5000.0,2010.0
4,4,2.500 USD,Alexis Marguerite Teplin,Feverish Embarkation,2001,Signed on verso,This work is in excellent condition.,Contemporary,Abstract,1,0,3,2500.0,2001.0
5,5,7.575 USD,Kenzo Okada,Bamboo,1977,Signed lower right recto; numbered lower left ...,"This work is in excellent condition, direct fr...",Contemporary,Abstract Expressionism,1,1,3,7575.0,1977.0
6,6,7.550 USD,Francesco Clemente,Air,2007,"Numbered and signed on bottom corner, recto","This work is in excellent condition, direct fr...",Contemporary,Neo-Expressionism,1,1,3,7550.0,2007.0
7,7,3.550 USD,Günther Förg,Untitled (Green),1993,Signed lower right recto; numbered lower left ...,"This work is in very good condition, direct fr...",Contemporary,Modernism,1,1,2,3550.0,1993.0
8,8,3.075 USD,Dan Walsh,Manifold - Blue,2014,Signed lower right recto; numbered lower left ...,"This work is in excellent condition, direct fr...",Contemporary,Minimalism,1,1,3,3075.0,2014.0
9,9,3.550 USD,Günther Förg,Untitled (Orange/Black),1993,Signed lower right recto; numbered lower left ...,"This work is in excellent condition, direct fr...",Contemporary,Modernism,1,1,3,3550.0,1993.0
