In [1]:
import os
from fastai.vision.all import *
import zipfile

import params as params


import wandb

from kaggle.api.kaggle_api_extended import KaggleApi


In [2]:
# Local directory that receives the downloaded archive and its extracted contents.
PATH_TO_DATA = "data"
# Base name of the downloaded zip archive; the same string is the Kaggle dataset slug.
# NOTE(review): the "brian" spelling presumably mirrors the upstream slug - confirm before "fixing" it.
DATASET_NAME = 'brian-tumor-dataset'
# Folder name used (nested twice) to locate the image folders after extraction.
UNZIPPED_NAME = 'Brain Tumor Data Set'
# Sub-folder the tumour images are read from.
TUMOR_SUB_DIR = 'Brain Tumor'
# Sub-folder the healthy images are read from.
HEALTHY_SUB_DIR = 'Healthy'

In [3]:
# Download the dataset archive from Kaggle into PATH_TO_DATA.
kaggle_api = KaggleApi()
kaggle_api.authenticate()

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(PATH_TO_DATA, exist_ok=True)

# A Kaggle dataset identifier is "<owner>/<slug>" and is not a filesystem path,
# so it is built with a literal "/" rather than os.path.join (which would use
# "\\" on Windows). The slug is taken from DATASET_NAME so that the download and
# the later unzip step cannot drift apart.
kaggle_api.dataset_download_files(f"preetviradiya/{DATASET_NAME}", path=PATH_TO_DATA)

In [4]:
# Extract the downloaded archive into the data directory.
archive_path = os.path.join(PATH_TO_DATA, f"{DATASET_NAME}.zip")
with zipfile.ZipFile(archive_path, mode="r") as archive:
    archive.extractall(PATH_TO_DATA)


In [5]:

def label_func(fname):
    "Map an image path to `<grandparent>/labels/<stem>_mask.png`."
    labels_dir = fname.parent.parent / "labels"
    mask_name = f"{fname.stem}_mask.png"
    return labels_dir / mask_name

def get_classes_per_image(mask_data, class_labels):
    """Report which classes occur in a mask.

    `class_labels` maps a class value to its name. The result maps each class
    name to 1 if that value appears anywhere in `mask_data`, otherwise 0.
    """
    values_present = list(np.unique(mask_data))
    return {
        class_name: int(class_value in values_present)
        for class_value, class_name in class_labels.items()
    }

def _create_table(image_files, classification_label):
    """Create a `wandb.Table` with one row per image file.

    Args:
        image_files: sized iterable of path-like objects exposing `.name`.
        classification_label: label written (as a string) into the "Class"
            column of every row.

    Returns:
        The populated table with columns "Filename", "Image" and "Class".
    """
    # Name the columns explicitly so this table has the same schema as the
    # EDA table built elsewhere in this notebook.
    table = wandb.Table(columns=["Filename", "Image", "Class"])

    for image_file in progress_bar(image_files, total=len(image_files)):
        image = Image.open(image_file)
        table.add_data(
            str(image_file.name),
            wandb.Image(image),
            str(classification_label),
        )

    # Without this return the caller received None instead of the table.
    return table
    

In [6]:
# Open a W&B run for the upload job and create the artifact the raw data is added to.
run = wandb.init(
    project=params.WANDB_PROJECT,
    entity=params.ENTITY,
    job_type="upload",
)
raw_data_at = wandb.Artifact(params.RAW_DATA_AT, type="raw_data")

[34m[1mwandb[0m: Currently logged in as: [33mchrisgjarrett[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [7]:
# Both class folders sit below a doubly nested "<UNZIPPED_NAME>/<UNZIPPED_NAME>" directory.
images_root = os.path.join(PATH_TO_DATA, UNZIPPED_NAME, UNZIPPED_NAME)
tumour_dir = os.path.join(images_root, TUMOR_SUB_DIR)
healthy_dir = os.path.join(images_root, HEALTHY_SUB_DIR)

tumour_files = get_image_files(tumour_dir, recurse=False)
healthy_files = get_image_files(healthy_dir, recurse=False)

In [8]:
# Per-file labels ("0" for healthy, "1" for tumour), healthy first.
# Concatenation keeps this a flat list; list.append would have nested the
# second list as a single element.
labels = ["0"] * len(healthy_files) + ["1"] * len(tumour_files)

# Flat list of every image path, in the same healthy-then-tumour order as
# `labels` so that labels[k] describes all_files[k].
all_files = list(healthy_files) + list(tumour_files)

image_table = wandb.Table(columns=["Filename", "Image", "Class"])

# One pass per class: healthy images get params.CLASSES[0], tumour images
# get params.CLASSES[1]. Rows are added in that order.
files_and_class = (
    (healthy_files, params.CLASSES[0]),
    (tumour_files, params.CLASSES[1]),
)
for class_files, class_name in files_and_class:
    for image_file in progress_bar(class_files, total=len(class_files)):
        image = Image.open(image_file)
        image_table.add_data(
            str(image_file.name),
            wandb.Image(image),
            str(class_name),
        )

In [9]:
# Add both class folders to the artifact under the shared "images" name
# (healthy first), then attach the EDA table. The table is added last so its
# manifest entry remains the cell's displayed value.
images_root = os.path.join(PATH_TO_DATA, UNZIPPED_NAME, UNZIPPED_NAME)
for class_sub_dir in (HEALTHY_SUB_DIR, TUMOR_SUB_DIR):
    raw_data_at.add_dir(os.path.join(images_root, class_sub_dir), name="images")
raw_data_at.add(image_table, "eda_table")

[34m[1mwandb[0m: Adding directory to artifact (./data/Brain Tumor Data Set/Brain Tumor Data Set/Healthy)... Done. 0.7s
[34m[1mwandb[0m: Adding directory to artifact (./data/Brain Tumor Data Set/Brain Tumor Data Set/Brain Tumor)... Done. 0.8s


ArtifactManifestEntry(path='eda_table.table.json', digest='WfMP7GD4pVV4WxbN4aXePQ==', ref=None, birth_artifact_id=None, size=1071105, extra={}, local_path='/Users/chrisjarrett/Library/Application Support/wandb/artifacts/staging/tmpa5mbokb7')

In [10]:
# Log the populated artifact to the W&B run, then finish the run.
run.log_artifact(raw_data_at)
run.finish()