In [15]:
from art_api import data, config
import pandas as pd

In [189]:
df = pd.read_csv(f"gs://{config.BUCKET_NAME}/{config.BUCKET_TRAIN_DATA_PATH}/{config.BUCKET_TRAIN_DATA_FILE}")
df.drop_duplicates(subset=["filename"], inplace=True)

In [190]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8255 entries, 0 to 8295
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   index         8255 non-null   int64 
 1   Image URL     8255 non-null   object
 2   Web page URL  8255 non-null   object
 3   Subset        8255 non-null   object
 4   Labels        8255 non-null   object
 5   filename      8255 non-null   object
 6   labels        8255 non-null   object
 7   aeroplane     8255 non-null   int64 
 8   bird          8255 non-null   int64 
 9   boat          8255 non-null   int64 
 10  chair         8255 non-null   int64 
 11  cow           8255 non-null   int64 
 12  diningtable   8255 non-null   int64 
 13  dog           8255 non-null   int64 
 14  horse         8255 non-null   int64 
 15  sheep         8255 non-null   int64 
 16  train         8255 non-null   int64 
dtypes: int64(11), object(6)
memory usage: 1.1+ MB


In [177]:
import os
from PIL import Image
import numpy as np
from tqdm import tqdm

In [179]:
df_sample.columns

Index(['index', 'Image URL', 'Web page URL', 'Subset', 'Labels', 'filename',
       'labels', 'aeroplane', 'bird', 'boat', 'chair', 'cow', 'diningtable',
       'dog', 'horse', 'sheep', 'train'],
      dtype='object')

In [191]:
def save_array(df):
    """Save a numpy array from df
    Args:
    df - dataframe to be read
    npy_file - name of numpy file
    Returns:
    .npy saved to raw_data directory based on npy_file
    """
    imgs = []
    for index, row in tqdm(df.iterrows()):
        img_file = str(row["filename"])
        image = Image.open(os.path.join(config.PATH_YOURPAINTINGS_SM, img_file))   
        imgs.append(np.array(image))
        X = np.array(imgs)
        np.save("../raw_data/X_array.npy", X)
        X.shape
        y = df.drop(columns=['index', 'Image URL', 'Web page URL', 'Subset', 'Labels', 'filename', 'labels'])
        y.shape
    return X, y

In [192]:
X, y = save_array(df)

8255it [1:06:11,  2.08it/s]


In [168]:
def sample(df):
    '''Pass in any dataframe, and be able to sample based on defined classes [can be pre-defined list] and number of samples
    Args:
    df - pass in the dataframe to be sampled
    
    Returns:
    df_sample - randomly select df w/ min_sample_num which is obtained from 
    '''
    min_sample_num = df.select_dtypes(include="number").sum().min()
    
    df_sample = pd.DataFrame()
    
    for cls in config.CLASSES:
        df_cls = df.query(f"{cls} == 1").sample(n=min_sample_num)
        df_sample = pd.concat([df_sample, df_cls])
        print(f"{min_sample_num} sampled per {cls}")
    print("\ndropping duplicates based on filename\n")    
    df_sample.drop_duplicates(subset=["filename"], inplace=True)
    print(f"After sampling, number of positive labels per class as follows:\n {df_sample.select_dtypes(include='number').sum()}, \n\nnumber of records in df_sample = {len(df_sample)}")
    
    return df_sample

In [169]:
df_sample = sample(df)

193 sampled per aeroplane
193 sampled per bird
193 sampled per boat
193 sampled per chair
193 sampled per cow
193 sampled per diningtable
193 sampled per dog
193 sampled per horse
193 sampled per sheep
193 sampled per train

dropping duplicates based on filename

After sampling, number of positive labels per class as follows:
 index          8075558
aeroplane          193
bird               236
boat               236
chair              263
cow                224
diningtable        263
dog                266
horse              248
sheep              227
train              199
dtype: int64, 

number of records in df_sample = 1884


In [193]:
df_sample

Unnamed: 0,index,Image URL,Web page URL,Subset,Labels,filename,labels,aeroplane,bird,boat,chair,cow,diningtable,dog,horse,sheep,train
5689,5846,https://d3d00swyhr67nd.cloudfront.net/w1200h12...,https://artuk.org/discover/artworks/shankar-at...,'train',' aeroplane bird diningtable',ABD_AAG_AG002362-001.jpg,"['aeroplane', 'bird', 'diningtable']",1,1,0,0,0,1,0,0,0,0
5576,5724,https://d3d00swyhr67nd.cloudfront.net/w1200h12...,https://artuk.org/discover/artworks/scuttling-...,'test',' aeroplane',IWM_IWM_LD_7488-001.jpg,['aeroplane'],1,0,0,0,0,0,0,0,0,0
661,669,https://d3d00swyhr67nd.cloudfront.net/w944h944...,https://artuk.org/discover/artworks/aircraft-r...,'train',' aeroplane',NWM_PST_AC2011_A_0024-001.jpg,['aeroplane'],1,0,0,0,0,0,0,0,0,0
1057,1071,https://d3d00swyhr67nd.cloudfront.net/w1200h12...,https://artuk.org/discover/artworks/blitz-plan...,'train',' aeroplane',TATE_TATE_T11790_10-001.jpg,['aeroplane'],1,0,0,0,0,0,0,0,0,0
896,906,https://d3d00swyhr67nd.cloudfront.net/w944h944...,https://artuk.org/discover/artworks/beach-scen...,'test',' aeroplane boat',DOR_WVH_PCF1-001.jpg,"['aeroplane', 'boat']",1,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4478,4583,https://d3d00swyhr67nd.cloudfront.net/w1200h12...,https://artuk.org/discover/artworks/midland-an...,'validation',' train',NY_NRM_1996_7369-001.jpg,['train'],0,0,0,0,0,0,0,0,0,1
4257,4348,https://d3d00swyhr67nd.cloudfront.net/w1200h12...,https://artuk.org/discover/artworks/locomotive...,'test',' train',NY_NRM_1998_10348-001.jpg,['train'],0,0,0,0,0,0,0,0,0,1
7171,7425,https://d3d00swyhr67nd.cloudfront.net/w1200h12...,https://artuk.org/discover/artworks/the-southb...,'test',' train',NY_NRM_1978_1320-001.jpg,['train'],0,0,0,0,0,0,0,0,0,1
3038,3103,https://d3d00swyhr67nd.cloudfront.net/w1200h12...,https://artuk.org/discover/artworks/great-west...,'validation',' train',NY_NRM_1986_9435-001.jpg,['train'],0,0,0,0,0,0,0,0,0,1
