In [None]:
# Load the prompt metadata tables and the reference list of artist names.
import pandas as pd

import my_utils

# Base metadata table and its larger variant, both stored as parquet files.
df = pd.read_parquet('sources/metadata.parquet', engine='pyarrow')
df_large = pd.read_parquet('sources/metadata-large.parquet', engine='pyarrow')

# Artist names read from the text file (presumably one name per line --
# confirm against my_utils.read_lines_as_list).
excel_artist_names = my_utils.read_lines_as_list("sources/excel_artists_copy_paste_name.txt")

# Fixed-size prefixes of the artist list for quick experiments. Slicing never
# raises on a short list, so the asserts report the actual size when the
# source file holds fewer names than expected.
hundred_artist_names = excel_artist_names[:100]
assert len(hundred_artist_names) == 100, (
    f'expected 100 artist names, got {len(hundred_artist_names)}')
ten_artist_names = excel_artist_names[:10]
assert len(ten_artist_names) == 10, (
    f'expected 10 artist names, got {len(ten_artist_names)}')


In [None]:
# Dimensions of the base metadata frame as a (rows, columns) tuple.
df.shape

In [None]:
# Number of distinct prompts in the base metadata frame. dropna=False keeps a
# missing prompt counted as its own entry, matching the length of unique().
df["prompt"].nunique(dropna=False)

In [None]:
# Write CSV copies of both metadata tables so they can be inspected by hand.
# A backslash is used as the escape character in the CSV output.
for frame, csv_path in (
    (df, "sources/metadata.csv"),
    (df_large, "sources/metadata-large.csv"),
):
    frame.to_csv(csv_path, escapechar="\\")

# Exact matching

## Extract artists from Prompts

In [None]:
# Exact-match artist extraction over the large metadata table.
cols = ["prompt", "image_name"]

# The artist names are lower-cased before being handed to the matcher.
artist_names = [name.lower() for name in excel_artist_names]

# Keep one row per distinct prompt so each prompt is matched only once.
cp = df_large[cols].copy().drop_duplicates(subset="prompt")

# Per prompt: the artists found by the matcher, and how many there are.
cp["artists"] = cp["prompt"].map(
    lambda prompt: my_utils.extract_artists_exact(prompt, artist_names))
cp["num_artists"] = cp["artists"].map(len)

# Persist the result as parquet (for later cells) and as CSV (for reading).
cp.to_parquet("results/artists_exact_match_large.parquet")
cp.to_csv("results/artists_exact_match_large.csv", escapechar="\\")

## Count Artist Mentions

In [1]:
import pandas as pd
import my_utils
import time

# Count, for each artist name, how many rows of the exact-match result
# mention that artist, and save the counts sorted by number of mentions.

# Upper bound on the number of artists analysed in one run. The loop is slow
# (the progress output estimates the remaining time for the complete list),
# so it is capped for trial runs. Set to None to process every artist.
MAX_ARTISTS = 14

excel_artist_names = my_utils.read_lines_as_list("sources/excel_artists_copy_paste_name.txt")
exact_matches = pd.read_parquet('results/artists_exact_match_large.parquet', engine='pyarrow')
print(exact_matches.dtypes)

cols = ["artist", "mentions"]

if MAX_ARTISTS is None:
    names_to_process = excel_artist_names
else:
    names_to_process = excel_artist_names[:MAX_ARTISTS]

# Collect one record per artist and build the frame once at the end;
# concatenating onto a DataFrame inside the loop copies it on every iteration.
records = []

starttime = time.time()
for c, name in enumerate(names_to_process, start=1):
    print(f'{c - 1} {name}', flush=True)

    # The number of rows returned for this name is taken as its mention count.
    mdf = my_utils.exact_match_dataframe(exact_matches, name)
    records.append({'artist': name, 'mentions': mdf.shape[0]})

    # Progress report; the remaining-time estimate refers to the complete
    # artist list, not only to the capped subset.
    duration = time.time() - starttime
    print(f'analysing {c} artists took {duration / 60} minutes', flush=True)
    print(f'time remaining estimate {(duration/c)*(len(excel_artist_names)-c)/60} minutes', flush=True)

# sort_values returns a new frame, so its result must be kept; otherwise the
# files below are written unsorted. The index is reset so that the row label
# in the CSV reflects the rank.
artist_mentions = (
    pd.DataFrame(records, columns=cols)
    .sort_values("mentions", ascending=False)
    .reset_index(drop=True)
)

artist_mentions.to_csv("results/artist_mentions.csv", escapechar = "\\")
artist_mentions.to_parquet("results/artist_mentions.parquet")
print("finished")

prompt         object
image_name     object
artists        object
num_artists     int64
dtype: object
0 Alvar Aalto
analysing 1 artists took 0.01518559455871582 minutes
time remaining estimate 52.8914258480072 minutes
1 Slim Aarons
analysing 2 artists took 0.030604982376098634 minutes
time remaining estimate 53.28327431678772 minutes
2 Edwin Austin Abbey
analysing 3 artists took 0.046196266015370684 minutes
time remaining estimate 53.603067333168454 minutes
3 Gertrude Abercrombie
analysing 4 artists took 0.06205739974975586 minutes
time remaining estimate 53.9899377822876 minutes
4 Marina Abramović
analysing 5 artists took 0.07790478467941284 minutes
time remaining estimate 54.20614917993546 minutes
5 Tomma Abts
analysing 6 artists took 0.09367370208104452 minutes
time remaining estimate 54.299522639645474 minutes
6 Vito Acconci
analysing 7 artists took 0.10877883831659953 minutes
time remaining estimate 54.03200297525951 minutes
7 Andreas Achenbach
analysing 8 artists took 0.123865548

# Fuzzy Matching

In [None]:
import time
import my_utils

# Benchmark fuzzy artist extraction on the first n prompts, extrapolate the
# runtime to a larger row count, and keep the rows where artists were found.
starttime = time.time()

n = 1000
similarity_threshold = 0.8

# Work on an independent copy of the first n rows of the metadata frame.
df_reduced = df.iloc[:n].copy()
assert df_reduced.shape[0] == n, f'shape is {df_reduced.shape[0]}'

artist_list = hundred_artist_names
df_reduced['artists'] = df_reduced['prompt'].map(
    lambda prompt: my_utils.extract_artists_fuzzy(prompt, artist_list, similarity_threshold))
df_reduced.to_parquet(f'{n}_entries.parquet')
df_reduced.to_csv(f'{n}_entries.csv')

# The measured time covers the matching and the two file writes above.
duration = time.time() - starttime
print(f'processing took {duration/60} minutes, average {duration/n} seconds per row for {len(artist_list)} artists')

# Linear extrapolation of the per-row cost to the target number of entries.
goal = 1600000
print(f'for {goal} entries it would take approximately {duration/n * goal / 60 / 60} hours')

# Number of extracted artists per row.
df_reduced['artists_amount'] = df_reduced['artists'].apply(len)

# Keep only the rows with at least one extracted artist.
has_artists = df_reduced['artists_amount'] > 0
output_columns = ["image_name", "prompt", "user_name", "artists", "artists_amount"]
entries_with_artists = df_reduced.loc[has_artists, output_columns]
entries_with_artists.to_parquet("artists_filtered.parquet")
entries_with_artists.to_csv("artists_filtered.csv")

# Estimate: 933.333333333 hours for the entire dataset (with all artist names)

# Estimate: 0.11 * 1600000 / 60 / 60 ≈ 49 hours when using only the names from 100_copy_paste_names.txt