In [52]:
import os, warnings
import wandb

import pandas as pd
from fastai.vision.all import *
from sklearn.model_selection import StratifiedKFold

import params
warnings.filterwarnings('ignore')

In [53]:
# Open a W&B run for the data-splitting job; the project and entity names
# are read from the local `params` module.
run = wandb.init(
    project=params.WANDB_PROJECT,
    entity=params.ENTITY,
    job_type="data_split",
)

In [54]:
# Ask the run for the ":latest" version of the raw-data artifact, then
# download its files and keep the resulting local directory as a Path.
raw_data_at = run.use_artifact(f'{params.RAW_DATA_AT}:latest')
path = Path(raw_data_at.download())

[34m[1mwandb[0m: Downloading large artifact raw_data_at:latest, 557.13MB. 8616 files... 
[34m[1mwandb[0m:   8616 of 8616 files downloaded.  
Done. 0:0:0.6


In [55]:
# Show the top-level entries of the downloaded artifact directory.
path.ls()

(#3) [Path('artifacts/raw_data_at:v6/images'),Path('artifacts/raw_data_at:v6/eda_table.table.json'),Path('artifacts/raw_data_at:v6/media')]

Retrieve the original table logged during the EDA phase; its `Filename` and `Class` columns are used below.

In [56]:
# Fetch the object stored in the artifact under the name "eda_table";
# the cells below read its columns through get_column().
orig_eda_table = raw_data_at.get("eda_table")

[34m[1mwandb[0m: Downloading large artifact raw_data_at:latest, 557.13MB. 8616 files... 
[34m[1mwandb[0m:   8616 of 8616 files downloaded.  
Done. 0:0:0.7


In [63]:
# Names of all entries found in the artifact's media/images directory.
fnames = os.listdir(os.path.join(path, "media", "images"))
# NOTE(review): `groups` is a second reference to the same list; neither
# `fnames` nor `groups` is read by any later cell visible in this notebook.
groups = fnames

Let's get the target variable (the `Class` column of the EDA table) and call it `y`; it is used below to stratify the folds.

In [64]:
# 'Class' column of the EDA table; passed to StratifiedKFold.split() below
# as the target that the folds are stratified on.
y = orig_eda_table.get_column('Class')

In [65]:
# One row per entry of the EDA table: its file name, its class label and a
# fold id that starts at the placeholder -1 until the folds are assigned.
df = pd.DataFrame(
    {
        'Filename': orig_eda_table.get_column('Filename'),
        'Class': orig_eda_table.get_column('Class'),
        'fold': -1,
    }
)

In [66]:
# Assign every row to one of 10 stratified folds: a row's fold id is the
# index of the split in which that row falls into the held-out (test) part.
# No shuffling is requested, so the assignment is deterministic for a given
# row order.
cv = StratifiedKFold(n_splits=10)
# cv.split() yields *positional* row indices, so write through .iloc rather
# than the label-based .loc; the two only coincide while df keeps its
# default 0..n-1 index.
fold_col = df.columns.get_loc('fold')
for i, (train_idxs, test_idxs) in enumerate(cv.split(df['Filename'], y)):
    df.iloc[test_idxs, fold_col] = i

# Every row must have left the -1 placeholder behind.
assert (df['fold'] >= 0).all(), "some rows were not assigned to a fold"

In [67]:
# Fold 0 is held out as the test set, fold 1 as the validation set, and
# every remaining fold is training data.
stage_by_fold = {0: 'test', 1: 'valid'}
df['Stage'] = df['fold'].map(lambda fold_id: stage_by_fold.get(fold_id, 'train'))
# The fold id was only an intermediate step; drop it from the frame.
df = df.drop(columns='fold')
# Displayed as the cell output: number of rows per stage.
df['Stage'].value_counts()

train    3680
test      460
valid     460
Name: Stage, dtype: int64

In [68]:
# Write the split assignment to data_split.csv in the current working
# directory, leaving out the DataFrame index.
df.to_csv('data_split.csv', index=False)

In [69]:
# New artifact of type "split_data" that bundles the split CSV with the
# raw data downloaded earlier.
processed_data_at = wandb.Artifact(params.PROCESSED_DATA_AT, type="split_data")
processed_data_at.add_file('data_split.csv')
# `path` is the local directory of the downloaded raw-data artifact.
processed_data_at.add_dir(path)

[34m[1mwandb[0m: Adding directory to artifact (./artifacts/raw_data_at:v6)... Done. 3.2s


In [70]:
# W&B table holding only each file name and its assigned stage.
data_split_table = wandb.Table(dataframe=df[['Filename', 'Stage']])
# Join the original EDA table with the split table on the "Filename" key.
join_table = wandb.JoinedTable(orig_eda_table, data_split_table, "Filename")

# Add the joined table to the artifact under the name "eda_table_data_split"
processed_data_at.add(join_table, "eda_table_data_split")

ArtifactManifestEntry(path='eda_table_data_split.joined-table.json', digest='vQAz34WVE8mZUVJM684GJA==', ref=None, birth_artifact_id=None, size=122, extra={}, local_path='/Users/chrisjarrett/Library/Application Support/wandb/artifacts/staging/tmpbqljoepi')

In [50]:
# Log the processed-data artifact to the run, then close the run.
# NOTE(review): this cell's recorded execution count (50) is lower than the
# cells above it, so it was last run before they were; re-run the notebook
# top to bottom before relying on the logged artifact.
run.log_artifact(processed_data_at)
run.finish()